diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index e29881fcbac01..68aff793ae6aa 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import sys diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py index 270663c415c72..7045d8810493e 100644 --- a/.buildkite/generate_index.py +++ b/.buildkite/generate_index.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import os diff --git a/.buildkite/lm-eval-harness/conftest.py b/.buildkite/lm-eval-harness/conftest.py index 769d2efda4adc..c0d60dd5328f4 100644 --- a/.buildkite/lm-eval-harness/conftest.py +++ b/.buildkite/lm-eval-harness/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path import pytest diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index 409a6ca820082..930adfaf3e192 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ LM eval harness on model to compare vs HF baseline computed offline. Configs are found in configs/$MODEL.yaml diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index d3f5fc5cd4cee..72c52d5bb5e9b 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do ### Visualizing the results -The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results. +The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results. You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. If you do not see the table, please wait till the benchmark finish running. The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. diff --git a/.buildkite/nightly-benchmarks/nightly-annotation.md b/.buildkite/nightly-benchmarks/nightly-annotation.md index e43ea765f1556..ef11c040057c8 100644 --- a/.buildkite/nightly-benchmarks/nightly-annotation.md +++ b/.buildkite/nightly-benchmarks/nightly-annotation.md @@ -16,7 +16,7 @@ Please download the visualization scripts in the post - Download `nightly-benchmarks.zip`. 
- In the same folder, run the following code: - ```console + ```bash export HF_TOKEN= apt update apt install -y git diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 7f2a2d8dc2969..a4f1638c1adb8 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import os diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py index 778a3a8d87f63..8532ff7ef798c 100644 --- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py +++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py index 10a7a2f5a467e..053fd52c35ae9 100644 --- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py index e5f179a0f5b68..ddea1d2b1b1ed 100644 --- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py +++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from lmdeploy.serve.openai.api_client import APIClient diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index 2a7b37991f31a..fb3b9d5e34e03 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime import json diff --git a/.buildkite/pyproject.toml b/.buildkite/pyproject.toml index 083bb795caf5a..d5cad1c73c6f8 100644 --- a/.buildkite/pyproject.toml +++ b/.buildkite/pyproject.toml @@ -6,11 +6,6 @@ [tool.ruff] line-length = 88 -exclude = [ - # External file, leaving license intact - "examples/other/fp8/quantizer/quantize.py", - "vllm/vllm_flash_attn/flash_attn_interface.pyi" -] [tool.ruff.lint.per-file-ignores] "vllm/third_party/**" = ["ALL"] diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index b3c27e2c99c2b..55678b8936e04 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,5 +1,6 @@ steps: - label: "Build wheel - CUDA 12.8" + id: build-wheel-cuda-12-8 agents: queue: cpu_queue_postmerge commands: @@ -11,6 +12,7 @@ steps: DOCKER_BUILDKIT: "1" - label: "Build wheel - CUDA 12.6" + id: build-wheel-cuda-12-6 agents: queue: cpu_queue_postmerge commands: @@ -28,6 +30,7 @@ steps: - label: "Build wheel - CUDA 11.8" # depends_on: 
block-build-cu118-wheel + id: build-wheel-cuda-11-8 agents: queue: cpu_queue_postmerge commands: @@ -44,6 +47,7 @@ steps: - label: "Build release image" depends_on: block-release-image-build + id: build-release-image agents: queue: cpu_queue_postmerge commands: @@ -51,6 +55,18 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + - label: "Annotate release workflow" + depends_on: + - build-release-image + - build-wheel-cuda-12-8 + - build-wheel-cuda-12-6 + - build-wheel-cuda-11-8 + id: annotate-release-workflow + agents: + queue: cpu_queue_postmerge + commands: + - "bash .buildkite/scripts/annotate-release.sh" + - label: "Build and publish TPU release image" depends_on: ~ if: build.env("NIGHTLY") == "1" @@ -70,9 +86,10 @@ steps: DOCKER_BUILDKIT: "1" - input: "Provide Release version here" + id: input-release-version fields: - text: "What is the release version?" - key: "release-version" + key: release-version - block: "Build CPU release image" key: block-cpu-release-image-build @@ -85,6 +102,7 @@ steps: commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." + - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" env: DOCKER_BUILDKIT: "1" @@ -100,6 +118,7 @@ steps: commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ." + - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest" - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)" env: DOCKER_BUILDKIT: "1" diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh new file mode 100755 index 0000000000000..94e0ac2398f34 --- /dev/null +++ b/.buildkite/scripts/annotate-release.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +set -ex + +# Get release version and strip leading 'v' if present +RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//') + +if [ -z "$RELEASE_VERSION" ]; then + echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid." + exit 1 +fi + +buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF +To download the wheel: +\`\`\` +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl . 
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . +\`\`\` + +To download and upload the image: + +\`\`\` +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai +docker tag vllm/vllm-openai vllm/vllm-openai:latest +docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION} +docker push vllm/vllm-openai:latest +docker push vllm/vllm-openai:v${RELEASE_VERSION} +\`\`\` +EOF \ No newline at end of file diff --git a/.buildkite/scripts/ci-clean-log.sh b/.buildkite/scripts/ci-clean-log.sh new file mode 100644 index 0000000000000..69d8a3a288316 --- /dev/null +++ b/.buildkite/scripts/ci-clean-log.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Usage: ./ci_clean_log.sh ci.log +# This script strips timestamps and color codes from CI log files. + +# Check if argument is given +if [ $# -lt 1 ]; then + echo "Usage: $0 ci.log" + exit 1 +fi + +INPUT_FILE="$1" + +# Strip timestamps +sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE" + +# Strip colorization +sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE" diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index bbc896ec68190..6e9af1e721bb7 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -94,6 +94,10 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"} fi +if [[ $commands == *"pytest -v -s lora"* ]]; then + commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"} +fi + #ignore certain kernels tests if [[ $commands == *" kernels/core"* ]]; then commands="${commands} \ diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh index 077bd99149079..36bcb015d308e 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh @@ -7,6 +7,7 @@ set -ex # Setup cleanup remove_docker_container() { if [[ -n "$container_id" ]]; then + podman stop --all -t0 podman rm -f "$container_id" || true fi podman system prune -f @@ -37,7 +38,7 @@ function cpu_tests() { pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it] pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] - pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]" + pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" } # All of CPU tests are expected to be finished less than 40 mins. 
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 40f3df96065d1..8db8c3a05fb30 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -6,75 +6,82 @@ set -ex # allow to bind to different cores CORE_RANGE=${CORE_RANGE:-48-95} +OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95} NUMA_NODE=${NUMA_NODE:-1} +export CMAKE_BUILD_PARALLEL_LEVEL=32 + # Setup cleanup remove_docker_container() { set -e; - docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; - docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; + docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; } trap remove_docker_container EXIT remove_docker_container # Try building the docker image -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu . -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . # Run the image, setting --shm-size=4g for tensor parallel. -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 function cpu_tests() { set -e export NUMA_NODE=$2 - export BUILDKITE_BUILD_NUMBER=$3 + + # list packages + docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " + set -e + pip list" + + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pip list" # offline inference - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " set -e python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" # Run basic model test - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " 
set -e - pytest -v -s tests/kernels/test_cache.py -m cpu_model - pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model - pytest -v -s tests/models/decoder_only/language -m cpu_model - pytest -v -s tests/models/embedding/language -m cpu_model - pytest -v -s tests/models/encoder_decoder/language -m cpu_model - pytest -v -s tests/models/decoder_only/audio_language -m cpu_model - pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" + pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model + pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model + pytest -v -s tests/models/language/generation -m cpu_model + pytest -v -s tests/models/language/pooling -m cpu_model + pytest -v -s tests/models/multimodal/generation \ + --ignore=tests/models/multimodal/generation/test_mllama.py \ + --ignore=tests/models/multimodal/generation/test_pixtral.py \ + -m cpu_model" # Run compressed-tensor test - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" # Run AWQ test - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - pytest -s -v \ + VLLM_USE_V1=0 pytest -s -v \ tests/quantization/test_ipex_quant.py" # Run chunked-prefill and prefix-cache test - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v -k cpu_model \ tests/basic_correctness/test_chunked_prefill.py" # online serving - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - export VLLM_CPU_KVCACHE_SPACE=10 - export VLLM_CPU_OMP_THREADS_BIND=$1 python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 - python3 benchmarks/benchmark_serving.py \ + VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \ --backend vllm \ --dataset-name random \ --model facebook/opt-125m \ @@ -83,7 +90,7 @@ function cpu_tests() { --tokenizer facebook/opt-125m" # Run multi-lora tests - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/lora/test_qwen2vl.py" @@ -91,4 +98,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 40 mins. export -f cpu_tests -timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER" +timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/.buildkite/scripts/hardware_ci/run-hpu-test.sh b/.buildkite/scripts/hardware_ci/run-hpu-test.sh index 95b6ac37f1857..5efac3ddf469f 100644 --- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh @@ -10,15 +10,17 @@ docker build -t hpu-test-env -f docker/Dockerfile.hpu . # Setup cleanup # certain versions of HPU software stack have a bug that can # override the exit code of the script, so we need to use -# separate remove_docker_container and remove_docker_container_and_exit +# separate remove_docker_containers and remove_docker_containers_and_exit # functions, while other platforms only need one remove_docker_container # function. 
EXITCODE=1 -remove_docker_container() { docker rm -f hpu-test || true; } -remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; } -trap remove_docker_container_and_exit EXIT -remove_docker_container +remove_docker_containers() { docker rm -f hpu-test || true; docker rm -f hpu-test-tp2 || true; } +remove_docker_containers_and_exit() { remove_docker_containers; exit $EXITCODE; } +trap remove_docker_containers_and_exit EXIT +remove_docker_containers # Run the image and launch offline inference docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m +docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2 + EXITCODE=$? diff --git a/.buildkite/scripts/hardware_ci/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh index c0b9dd8dadba9..a397457c83261 100644 --- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh +++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh @@ -53,4 +53,12 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ ${image_name} \ - /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys" + /bin/bash -c " + set -e; # Exit on first error + python3 /workspace/vllm/examples/offline_inference/neuron.py; + python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys; + for f in /workspace/vllm/tests/neuron/2_core/*.py; do + echo \"Running test file: \$f\"; + python3 -m pytest \$f -v --capture=tee-sys; + done + " \ No newline at end of file diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 2d375d7e9d871..90cad506ab1e9 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -2,102 +2,186 @@ set -xu + +remove_docker_container() { + docker rm -f tpu-test || true; + docker rm -f vllm-tpu || true; +} + +trap remove_docker_container EXIT + +# Remove the container that might not be cleaned up in the previous run. +remove_docker_container + # Build the docker image. docker build -f docker/Dockerfile.tpu -t vllm-tpu . # Set up cleanup. -remove_docker_container() { docker rm -f tpu-test || true; } -trap remove_docker_container EXIT -# Remove the container that might not be cleaned up in the previous run. -remove_docker_container +cleanup_docker() { + # Get Docker's root directory + docker_root=$(docker info -f '{{.DockerRootDir}}') + if [ -z "$docker_root" ]; then + echo "Failed to determine Docker root directory." + exit 1 + fi + echo "Docker root directory: $docker_root" + # Check disk usage of the filesystem where Docker's root directory is located + disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') + # Define the threshold + threshold=70 + if [ "$disk_usage" -gt "$threshold" ]; then + echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." 
+ # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all + echo "Docker images and volumes cleanup completed." + else + echo "Disk usage is below $threshold%. No cleanup needed." + fi +} +cleanup_docker # For HF_TOKEN. source /etc/environment -# Run a simple end-to-end example. + docker run --privileged --net host --shm-size=16G -it \ -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ - vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ - && python3 -m pip install pytest pytest-asyncio tpu-info \ - && python3 -m pip install lm_eval[api]==0.4.4 \ - && export VLLM_XLA_CACHE_PATH= \ - && export VLLM_USE_V1=1 \ - && export VLLM_XLA_CHECK_RECOMPILATION=1 \ - && echo HARDWARE \ - && tpu-info \ - && { \ - echo TEST_0: Running test_perf.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \ - echo TEST_0_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_1: Running test_compilation.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \ - echo TEST_1_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_2: Running test_basic.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \ - echo TEST_2_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \ - python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \ - echo TEST_3_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_4: Running test_quantization_accuracy.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \ - echo TEST_4_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_5: Running examples/offline_inference/tpu.py; \ - python3 /workspace/vllm/examples/offline_inference/tpu.py; \ - echo TEST_5_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_6: Running test_tpu_model_runner.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \ - echo TEST_6_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_7: Running test_sampler.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \ - echo TEST_7_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_8: Running test_topk_topp_sampler.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \ - echo TEST_8_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_9: Running test_multimodal.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \ - echo TEST_9_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_10: Running test_pallas.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \ - echo TEST_10_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_11: Running test_struct_output_generate.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \ - echo TEST_11_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_12: Running test_moe_pallas.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \ - echo TEST_12_EXIT_CODE: \$?; \ - } & \ - # Disable the TPU LoRA tests until the feature is activated - # & { \ - # echo TEST_13: Running test_moe_pallas.py; \ - # python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \ - # echo TEST_13_EXIT_CODE: \$?; \ - # } & \ - wait \ - && echo 'All tests have attempted to run. 
Check logs for individual test statuses and exit codes.' \ -" + vllm-tpu /bin/bash -c ' +set -e # Exit immediately if a command exits with a non-zero status. +set -u # Treat unset variables as an error. +echo "--- Starting script inside Docker container ---" + +# Create results directory +RESULTS_DIR=$(mktemp -d) +# If mktemp fails, set -e will cause the script to exit. +echo "Results will be stored in: $RESULTS_DIR" + +# Install dependencies +echo "--- Installing Python dependencies ---" +python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ + && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ + && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 +echo "--- Python dependencies installed ---" +export VLLM_USE_V1=1 +export VLLM_XLA_CHECK_RECOMPILATION=1 +export VLLM_XLA_CACHE_PATH= +echo "Using VLLM V1" + +echo "--- Hardware Information ---" +tpu-info +echo "--- Starting Tests ---" +set +e +overall_script_exit_code=0 + +# --- Test Definitions --- +# If a test fails, this function will print logs and will not cause the main script to exit. +run_test() { + local test_num=$1 + local test_name=$2 + local test_command=$3 + local log_file="$RESULTS_DIR/test_${test_num}.log" + local actual_exit_code + + echo "--- TEST_$test_num: Running $test_name ---" + + # Execute the test command. + eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2) + actual_exit_code=$? + + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log + + if [ "$actual_exit_code" -ne 0 ]; then + echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2 + echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2 + if [ -f "$log_file" ]; then + cat "$log_file" >&2 + else + echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2 + fi + echo "--- End of log for TEST_$test_num ($test_name) ---" >&2 + return "$actual_exit_code" # Return the failure code + else + echo "TEST_$test_num ($test_name) PASSED." + return 0 # Return success + fi +} + +# Helper function to call run_test and update the overall script exit code +run_and_track_test() { + local test_num_arg="$1" + local test_name_arg="$2" + local test_command_arg="$3" + + # Run the test + run_test "$test_num_arg" "$test_name_arg" "$test_command_arg" + local test_specific_exit_code=$? + + # If the test failed, set the overall script exit code to 1 + if [ "$test_specific_exit_code" -ne 0 ]; then + # No need for extra echo here, run_test already logged the failure. 
+ overall_script_exit_code=1 + fi +} + +# --- Actual Test Execution --- +run_and_track_test 0 "test_perf.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py" +run_and_track_test 1 "test_compilation.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py" +run_and_track_test 2 "test_basic.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py" +run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \ + "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine" +run_and_track_test 4 "test_quantization_accuracy.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py" +run_and_track_test 5 "examples/offline_inference/tpu.py" \ + "python3 /workspace/vllm/examples/offline_inference/tpu.py" +run_and_track_test 6 "test_tpu_model_runner.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py" +run_and_track_test 7 "test_sampler.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" +run_and_track_test 8 "test_topk_topp_sampler.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py" +run_and_track_test 9 "test_multimodal.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py" +run_and_track_test 10 "test_pallas.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" +run_and_track_test 11 "test_struct_output_generate.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" +run_and_track_test 12 "test_moe_pallas.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" +run_and_track_test 13 "test_lora.py" \ + "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" +run_and_track_test 14 "test_tpu_qkv_linear.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" +run_and_track_test 15 "test_spmd_model_weight_loading.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" +run_and_track_test 16 "test_kv_cache_update_kernel.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py" + +# After all tests have been attempted, exit with the overall status. +if [ "$overall_script_exit_code" -ne 0 ]; then + echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---" +else + echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---" +fi +exit "$overall_script_exit_code" +' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct. + +# Capture the exit code of the docker run command +DOCKER_RUN_EXIT_CODE=$? + +# The trap will run for cleanup. +# Exit the main script with the Docker run command's exit code. +if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then + echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE." + exit "$DOCKER_RUN_EXIT_CODE" +else + echo "Docker run command completed successfully." 
+ exit 0 +fi # TODO: This test fails because it uses RANDOM_SEED sampling -# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ +# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index f54010c4231f9..827649bfcf548 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -28,4 +28,5 @@ docker run \ sh -c ' VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2 + VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager ' diff --git a/.buildkite/scripts/rerun-test.sh b/.buildkite/scripts/rerun-test.sh new file mode 100644 index 0000000000000..d79c0d5f381b1 --- /dev/null +++ b/.buildkite/scripts/rerun-test.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# Usage: ./rerun_test.sh path/to/test.py::test_name + +# Check if argument is given +if [ $# -lt 1 ]; then + echo "Usage: $0 path/to/test.py::test_name" + echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]" + exit 1 +fi + +TEST=$1 +COUNT=1 + +while pytest -sv "$TEST"; do + COUNT=$((COUNT + 1)) + echo "RUN NUMBER ${COUNT}" +done diff --git a/.buildkite/scripts/tpu/cleanup_docker.sh b/.buildkite/scripts/tpu/cleanup_docker.sh new file mode 100755 index 0000000000000..209d9c4341cdd --- /dev/null +++ b/.buildkite/scripts/tpu/cleanup_docker.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +set -euo pipefail + +docker_root=$(docker info -f '{{.DockerRootDir}}') +if [ -z "$docker_root" ]; then + echo "Failed to determine Docker root directory." + exit 1 +fi +echo "Docker root directory: $docker_root" +# Check disk usage of the filesystem where Docker's root directory is located +disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') +# Define the threshold +threshold=70 +if [ "$disk_usage" -gt "$threshold" ]; then + echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." + # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all + echo "Docker images and volumes cleanup completed." +else + echo "Disk usage is below $threshold%. No cleanup needed." +fi diff --git a/.buildkite/scripts/tpu/config_v6e_1.env b/.buildkite/scripts/tpu/config_v6e_1.env new file mode 100644 index 0000000000000..03ec116f698d2 --- /dev/null +++ b/.buildkite/scripts/tpu/config_v6e_1.env @@ -0,0 +1,14 @@ +# Environment config +TEST_NAME=llama8b +CONTAINER_NAME=vllm-tpu + +# vllm config +MODEL=meta-llama/Llama-3.1-8B-Instruct +MAX_NUM_SEQS=256 +MAX_NUM_BATCHED_TOKENS=1024 +TENSOR_PARALLEL_SIZE=1 +MAX_MODEL_LEN=2048 +DOWNLOAD_DIR=/mnt/disks/persist +EXPECTED_THROUGHPUT=8.0 +INPUT_LEN=1800 +OUTPUT_LEN=128 diff --git a/.buildkite/scripts/tpu/docker_run_bm.sh b/.buildkite/scripts/tpu/docker_run_bm.sh new file mode 100755 index 0000000000000..715afce5f71ab --- /dev/null +++ b/.buildkite/scripts/tpu/docker_run_bm.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +if [ ! -f "$1" ]; then + echo "Error: The env file '$1' does not exist." 
+ exit 1 # Exit the script with a non-zero status to indicate an error +fi + +ENV_FILE=$1 + +# For testing on local vm, use `set -a` to export all variables +source /etc/environment +source $ENV_FILE + +remove_docker_container() { + docker rm -f tpu-test || true; + docker rm -f vllm-tpu || true; + docker rm -f $CONTAINER_NAME || true; +} + +trap remove_docker_container EXIT + +# Remove the container that might not be cleaned up in the previous run. +remove_docker_container + +# Build docker image. +# TODO: build the image outside the script and share the image with other +# tpu test if building time is too long. +DOCKER_BUILDKIT=1 docker build \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=0 \ + --tag vllm/vllm-tpu-bm \ + --progress plain -f docker/Dockerfile.tpu . + +LOG_ROOT=$(mktemp -d) +# If mktemp fails, set -e will cause the script to exit. +echo "Results will be stored in: $LOG_ROOT" + +if [ -z "$HF_TOKEN" ]; then + echo "Error: HF_TOKEN is not set or is empty." + exit 1 +fi + +# Make sure mounted disk or dir exists +if [ ! -d "$DOWNLOAD_DIR" ]; then + echo "Error: Folder $DOWNLOAD_DIR does not exist. This is usually a mounted drive. If no mounted drive, just create a folder." + exit 1 +fi + +echo "Run model $MODEL" +echo + +echo "starting docker...$CONTAINER_NAME" +echo +docker run \ + -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \ + --env-file $ENV_FILE \ + -e HF_TOKEN="$HF_TOKEN" \ + -e TARGET_COMMIT=$BUILDKITE_COMMIT \ + -e MODEL=$MODEL \ + -e WORKSPACE=/workspace \ + --name $CONTAINER_NAME \ + -d \ + --privileged \ + --network host \ + -v /dev/shm:/dev/shm \ + vllm/vllm-tpu-bm tail -f /dev/null + +echo "run script..." +echo +docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh" + +echo "copy result back..." +VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt +BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt +docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG" +docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG" + +throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g') +echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput" + +if [ "$BUILDKITE" = "true" ]; then + echo "Running inside Buildkite" + buildkite-agent artifact upload "$VLLM_LOG" + buildkite-agent artifact upload "$BM_LOG" +else + echo "Not running inside Buildkite" +fi + +# +# compare the throughput with EXPECTED_THROUGHPUT +# and assert meeting the expectation +# +if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then + echo "Failed to get the throughput" + exit 1 +fi + +if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then + echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)" + exit 1 +fi diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh new file mode 100755 index 0000000000000..877669cd956ac --- /dev/null +++ b/.buildkite/scripts/tpu/run_bm.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +set -euo pipefail + +VLLM_LOG="$WORKSPACE/vllm_log.txt" +BM_LOG="$WORKSPACE/bm_log.txt" + +if [ -n "$TARGET_COMMIT" ]; then + head_hash=$(git rev-parse HEAD) + if [ "$TARGET_COMMIT" != "$head_hash" ]; then + echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash" + exit 1 + fi +fi + +echo "model: $MODEL" +echo + +# +# create a log folder +# +mkdir "$WORKSPACE/log" + +# TODO: Move to image building.
+pip install pandas +pip install datasets + +# +# create sonnet_4x +# +echo "Create sonnet_4x.txt" +echo "" > benchmarks/sonnet_4x.txt +for _ in {1..4} + do + cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt +done + +# +# start vllm service in the background +# +echo "launching vllm..." +echo "logging to $VLLM_LOG" +echo + +VLLM_USE_V1=1 vllm serve $MODEL \ + --seed 42 \ + --disable-log-requests \ + --max-num-seqs $MAX_NUM_SEQS \ + --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ + --tensor-parallel-size $TENSOR_PARALLEL_SIZE \ + --no-enable-prefix-caching \ + --download_dir $DOWNLOAD_DIR \ + --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 & + + +echo "wait for 20 minutes.." +echo +# sleep 1200 +# wait for 10 minutes... +for i in {1..120}; do + # TODO: detect other type of errors. + if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then + echo "Detected RuntimeError, exiting." + exit 1 + elif grep -Fq "Application startup complete" "$VLLM_LOG"; then + echo "Application started" + break + else + echo "wait for 10 seconds..." + sleep 10 + fi +done + +# +# run test +# +echo "run benchmark test..." +echo "logging to $BM_LOG" +echo +python benchmarks/benchmark_serving.py \ + --backend vllm \ + --model $MODEL \ + --dataset-name sonnet \ + --dataset-path benchmarks/sonnet_4x.txt \ + --sonnet-input-len $INPUT_LEN \ + --sonnet-output-len $OUTPUT_LEN \ + --ignore-eos > "$BM_LOG" + +echo "completed..." +echo + +throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g') +echo "throughput: $throughput" +echo diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 461fb6d30c45e..a13e2cb782182 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -33,14 +33,23 @@ steps: - label: Documentation Build # 2min mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/test_docs/docs" + working_dir: "/vllm-workspace/test_docs" fast_check: true no_gpu: True commands: - - pip install -r ../../requirements/docs.txt - - SPHINXOPTS=\"-W\" make html - # Check API reference (if it fails, you may have missing mock imports) - - grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html + - pip install -r ../requirements/docs.txt + # TODO: add `--strict` once warnings in docstrings are fixed + - mkdocs build + +- label: Pytorch Nightly Dependency Override Check # 2min + # if this test fails, it means the nightly torch version is not compatible with some + # of the dependencies.
Please check the error message and add the package to whitelist + # in /vllm/tools/generate_nightly_torch_test.py + soft_fail: true + source_file_dependencies: + - requirements/nightly_torch_test.txt + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh - label: Async Engine, Inputs, Utils, Worker Test # 24min mirror_hardwares: [amdexperimental] @@ -59,6 +68,7 @@ steps: - pytest -v -s async_engine # AsyncLLMEngine - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py - pytest -v -s multimodal - pytest -v -s test_utils.py # Utils - pytest -v -s worker # Worker @@ -89,7 +99,7 @@ steps: - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - label: Chunked Prefill Test - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] source_file_dependencies: - vllm/ - tests/basic_correctness/test_chunked_prefill @@ -125,7 +135,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ - pytest -v -s entrypoints/test_chat_utils.py - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests @@ -138,12 +148,14 @@ steps: - vllm/core/ - tests/distributed/test_utils - tests/distributed/test_pynccl + - tests/distributed/test_events - tests/spec_decode/e2e/test_integration_dist_tp4 - tests/compile/test_basic_correctness - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py - tests/v1/test_async_llm_dp.py + - tests/v1/engine/test_engine_core_client.py commands: # test with tp=2 and external_dp=2 - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py @@ -153,9 +165,11 @@ steps: # test with internal dp - python3 ../examples/offline_inference/data_parallel.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py # TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests @@ -164,6 +178,23 @@ steps: - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - popd +- label: EPLB Algorithm Test + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_algo.py + commands: + - pytest -v -s distributed/test_eplb_algo.py + +- label: EPLB Execution Test # 5min + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - 
vllm/distributed/eplb + - tests/distributed/test_eplb_execute.py + commands: + - pytest -v -s distributed/test_eplb_execute.py + - label: Metrics, Tracing Test # 10min mirror_hardwares: [amdexperimental, amdproduction] num_gpus: 2 @@ -173,6 +204,11 @@ steps: - tests/tracing commands: - pytest -v -s metrics + - "pip install \ + 'opentelemetry-sdk>=1.26.0' \ + 'opentelemetry-api>=1.26.0' \ + 'opentelemetry-exporter-otlp>=1.26.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1'" - pytest -v -s tracing ##### fast check tests ##### @@ -197,8 +233,9 @@ steps: - tests/test_sequence - tests/test_config - tests/test_logger + - tests/test_vllm_port commands: - - pytest -v -s engine test_sequence.py test_config.py test_logger.py + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py # OOM in the CI unless we run this separately - pytest -v -s tokenization @@ -220,6 +257,7 @@ steps: - pytest -v -s v1/test_serial_utils.py - pytest -v -s v1/test_utils.py - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_metrics_reader.py # TODO: accuracy does not match, whether setting # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - pytest -v -s v1/e2e @@ -244,7 +282,7 @@ steps: - python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language_embedding.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/encoder_decoder.py - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/basic/classify.py @@ -260,6 +298,15 @@ steps: commands: - pytest -v -s prefix_caching + +- label: Platform Tests (CUDA) + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - vllm/ + - tests/cuda + commands: + - pytest -v -s cuda/test_cuda_context.py + - label: Samplers Test # 36min mirror_hardwares: [amdexperimental] source_file_dependencies: @@ -271,17 +318,6 @@ steps: - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers -- label: LogitsProcessor Test # 5min - mirror_hardwares: [amdexperimental, amdproduction] - source_file_dependencies: - - vllm/model_executor/layers - - vllm/model_executor/guided_decoding - - tests/test_logits_processor - - tests/model_executor/test_guided_processors - commands: - - pytest -v -s test_logits_processor.py - - pytest -v -s model_executor/test_guided_processors.py - - label: Speculative decoding tests # 40min mirror_hardwares: [amdexperimental] source_file_dependencies: @@ -294,7 +330,7 @@ steps: - pytest -v -s spec_decode/e2e/test_eagle_correctness.py - label: LoRA Test %N # 15min each - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] source_file_dependencies: - vllm/lora - tests/lora @@ -310,8 +346,10 @@ steps: commands: - pytest -v -s compile/test_pass_manager.py - pytest -v -s compile/test_fusion.py + - pytest -v -s compile/test_fusion_attn.py - pytest -v -s compile/test_silu_mul_quant_fusion.py - 
pytest -v -s compile/test_sequence_parallelism.py + - pytest -v -s compile/test_async_tp.py - label: PyTorch Fullgraph Smoke Test # 9min mirror_hardwares: [amdexperimental, amdproduction] @@ -324,6 +362,7 @@ steps: # these tests need to be separated, cannot combine - pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_toy_llama.py + - pytest -v -s compile/piecewise/test_full_cudagraph.py - label: PyTorch Fullgraph Test # 18min mirror_hardwares: [amdexperimental, amdproduction] @@ -386,10 +425,23 @@ steps: source_file_dependencies: - vllm/model_executor/model_loader - tests/tensorizer_loader + - tests/entrypoints/openai/test_tensorizer_entrypoint.py commands: - apt-get update && apt-get install -y curl libsodium23 - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s tensorizer_loader + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py + +- label: Model Executor Test + mirror_hardwares: [amdexperimental, amdproduction] + soft_fail: true + source_file_dependencies: + - vllm/model_executor + - tests/model_executor + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor - label: Benchmarks # 9min mirror_hardwares: [amdexperimental, amdproduction] @@ -414,6 +466,9 @@ steps: - vllm/model_executor/layers/quantization - tests/quantization commands: + # temporary install here since we need nightly, will move to requirements/test.in + # after torchao 0.12 release + - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization - label: LM Eval Small Models # 53min @@ -467,10 +522,7 @@ steps: - pytest -v -s models/test_registry.py - pytest -v -s models/test_utils.py - pytest -v -s models/test_vision.py - # V1 Test: https://github.com/vllm-project/vllm/issues/14531 - - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2' - - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4' - - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2' + - pytest -v -s models/test_initialization.py - label: Language Models Test (Standard) mirror_hardwares: [amdexperimental] @@ -484,16 +536,36 @@ steps: - pip freeze | grep -E 'torch' - pytest -v -s models/language -m core_model -- label: Language Models Test (Extended) +- label: Language Models Test (Hybrid) # 35 min + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. + - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' + - pytest -v -s models/language/generation -m hybrid_model + +- label: Language Models Test (Extended Generation) # 1hr20min mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: - vllm/ - - tests/models/language + - tests/models/language/generation commands: # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. 
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - - pytest -v -s models/language -m 'not core_model' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + +- label: Language Models Test (Extended Pooling) # 36min + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' - label: Multi-Modal Models Test (Standard) mirror_hardwares: [amdexperimental] @@ -581,13 +653,18 @@ steps: - vllm/executor/ - vllm/model_executor/models/ - tests/distributed/ + - tests/examples/offline_inference/data_parallel.py commands: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - label: Distributed Tests (2 GPUs) # 40min mirror_hardwares: [amdexperimental] @@ -605,9 +682,11 @@ steps: - vllm/worker/model_runner.py - entrypoints/llm/test_collective_rpc.py - tests/v1/test_async_llm_dp.py + - tests/v1/entrypoints/openai/test_multi_api_servers.py - vllm/v1/engine/ commands: - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py @@ -648,7 +727,7 @@ steps: - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins - label: Multi-step Tests (4 GPUs) # 36min - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: @@ -709,7 +788,7 @@ steps: - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - label: Weight Loading Multiple GPU Test - Large Models # optional - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 gpu: a100 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index a37bdb0f4d9ef..da7f89747a16d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -10,15 +10,21 @@ /vllm/worker/worker.py 
@zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth -/vllm/model_executor/guided_decoding @mgoin @russellb +/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm /vllm/multimodal @DarkLight1337 @ywang96 /vllm/vllm_flash_attn @LucasWilkinson /vllm/lora @jeejeelee +/vllm/reasoning @aarnphm +/vllm/entrypoints @aarnphm CMakeLists.txt @tlrmchlsmth +# Any change to the VllmConfig changes can have a large user-facing impact, +# so spam a lot of people +/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor + # vLLM V1 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat -/vllm/v1/structured_output @mgoin @russellb +/vllm/v1/structured_output @mgoin @russellb @aarnphm # Test ownership /.buildkite/lm-eval-harness @mgoin @simon-mo @@ -27,8 +33,8 @@ CMakeLists.txt @tlrmchlsmth /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao -/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo -/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb +/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm +/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm /tests/kernels @tlrmchlsmth @WoosukKwon /tests/model_executor/test_guided_processors.py @mgoin @russellb /tests/models @DarkLight1337 @ywang96 @@ -38,7 +44,11 @@ CMakeLists.txt @tlrmchlsmth /tests/quantization @mgoin @robertgshaw2-redhat /tests/spec_decode @njhill @LiuXiaoxuanPKU /tests/test_inputs.py @DarkLight1337 @ywang96 -/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb -/tests/v1/structured_output @mgoin @russellb +/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm +/tests/v1/structured_output @mgoin @russellb @aarnphm /tests/weight_loading @mgoin @youkaichao /tests/lora @jeejeelee + +# Docs +/docs @hmellor +mkdocs.yaml @hmellor diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml index 00b0f024c0da5..8c5c28cd77cff 100644 --- a/.github/ISSUE_TEMPLATE/400-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml @@ -8,6 +8,16 @@ body: attributes: value: > #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: markdown + attributes: + value: | + ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as: + - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys) + - Passwords or authentication credentials + - Private URLs or endpoints + - Personal or confidential data + + Consider redacting or replacing sensitive values with placeholders like `` when sharing configuration or code examples. - type: textarea attributes: label: Your current environment @@ -81,14 +91,14 @@ body: required: true - type: markdown attributes: - value: > - ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output: + value: | + ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. 
If you think anything is wrong with the model's output: - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc). - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect. - Thanks for contributing 🎉! + Thanks for reporting 🙏! - type: checkboxes id: askllm attributes: diff --git a/.github/ISSUE_TEMPLATE/450-ci-failure.yml b/.github/ISSUE_TEMPLATE/450-ci-failure.yml new file mode 100644 index 0000000000000..7af0e0673a2f3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/450-ci-failure.yml @@ -0,0 +1,69 @@ +name: 🧪 CI failure report +description: Report a failing test. +title: "[CI Failure]: " +labels: ["ci-failure"] + +body: +- type: markdown + attributes: + value: > + #### Include the name of the failing Buildkite step and test file in the title. +- type: input + attributes: + label: Name of failing test + description: | + Paste in the fully-qualified name of the failing test from the logs. + placeholder: | + `path/to/test_file.py::test_name[params]` + validations: + required: true +- type: checkboxes + attributes: + label: Basic information + description: Select all items that apply to the failing test. + options: + - label: Flaky test + - label: Can reproduce locally + - label: Caused by external libraries (e.g. bug in `transformers`) +- type: textarea + attributes: + label: 🧪 Describe the failing test + description: | + Please provide a clear and concise description of the failing test. + placeholder: | + A clear and concise description of the failing test. + + ``` + The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present. + ``` + validations: + required: true +- type: textarea + attributes: + label: 📝 History of failing test + description: | + Since when did the test start to fail? + You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main). + + If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods: + + - Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally. + + - Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally. + + - Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only) + placeholder: | + Approximate timeline and/or problematic PRs + + A link to the Buildkite analytics of the failing test (if available) + validations: + required: true +- type: textarea + attributes: + label: CC List. + description: > + The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test. +- type: markdown + attributes: + value: > + Thanks for reporting 🙏! diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 7042e81a84daa..017ec7ca82da7 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,6 +1,18 @@ -FILL IN THE PR DESCRIPTION HERE +## Essential Elements of an Effective PR Description Checklist +- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". +- [ ] The test plan, such as providing test command. 
+- [ ] The test results, such as pasting the results comparison before and after, or e2e results +- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model. -FIX #xxxx (*link existing issues this PR will resolve*) +PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED. + +## Purpose + +## Test Plan + +## Test Result + +## (Optional) Documentation Update -**BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) +**BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) diff --git a/.github/mergify.yml b/.github/mergify.yml index ccfd571625b54..9c047bcaf95dc 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -36,6 +36,21 @@ pull_request_rules: add: - frontend +- name: label-llama + description: Automatically apply llama label + conditions: + - or: + - files~=^examples/.*llama.*\.py + - files~=^tests/.*llama.*\.py + - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py + - files~=^vllm/model_executor/models/.*llama.*\.py + - files~=^vllm/transformers_utils/configs/.*llama.*\.py + - title~=(?i)llama + actions: + label: + add: + - llama + - name: label-multi-modality description: Automatically apply multi-modality label conditions: @@ -51,6 +66,53 @@ pull_request_rules: add: - multi-modality +- name: label-performance + description: Automatically apply performance label + conditions: + - or: + - files~=^benchmarks/ + - files~=^vllm/benchmarks/ + - files~=^tests/benchmarks/ + - files~=^\.buildkite/nightly-benchmarks/ + actions: + label: + add: + - performance + +- name: label-qwen + description: Automatically apply qwen label + conditions: + - or: + - files~=^examples/.*qwen.*\.py + - files~=^tests/.*qwen.*\.py + - files~=^vllm/model_executor/models/.*qwen.*\.py + - files~=^vllm/reasoning/.*qwen.*\.py + - title~=(?i)Qwen + actions: + label: + add: + - qwen + +- name: label-rocm + description: Automatically apply rocm label + conditions: + - or: + - files~=^csrc/rocm/ + - files~=^docker/Dockerfile.rocm + - files~=^requirements/rocm.*\.txt + - files~=^vllm/attention/backends/rocm.*\.py + - files~=^vllm/attention/ops/rocm.*\.py + - files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py + - files~=^vllm/v1/attention/backends/mla/rocm.*\.py + - files~=^tests/kernels/.*_rocm.*\.py + - files=vllm/platforms/rocm.py + - title~=(?i)AMD + - title~=(?i)ROCm + actions: + label: + add: + - rocm + - name: label-structured-output description: Automatically apply structured-output label conditions: @@ -58,7 +120,7 @@ pull_request_rules: - files~=^benchmarks/structured_schemas/ - files=benchmarks/benchmark_serving_structured_output.py - files=benchmarks/run_structured_output_benchmark.sh - - files=docs/source/features/structured_outputs.md + - files=docs/features/structured_outputs.md - files=examples/offline_inference/structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py @@ -135,9 +197,7 @@ pull_request_rules: - files~=^tests/entrypoints/openai/tool_parsers/ - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py - files~=^vllm/entrypoints/openai/tool_parsers/ - - files=docs/source/features/tool_calling.md - - files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md - - 
files=docs/source/getting_started/examples/chat_with_tools.md + - files=docs/features/tool_calling.md - files~=^examples/tool_chat_* - files=examples/offline_inference/chat_with_tools.py - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh index 3246c6f9bc4b7..8d65936fba1d8 100755 --- a/.github/scripts/cleanup_pr_body.sh +++ b/.github/scripts/cleanup_pr_body.sh @@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}" # Remove HTML
section that includes text of "PR Checklist (Click to Expand)" python3 - <= 12.3, we recommend upgrading to CUDA 12.3 or later " + "if you intend on running FP8 quantized MoE models on Hopper or Blackwell.") + else() + message(STATUS "Not building moe_data as no compatible archs found " + "in CUDA target architectures.") endif() endif() @@ -632,6 +658,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # if CUDA endif endif() +if (VLLM_GPU_LANG STREQUAL "HIP") + # Add QuickReduce kernels + list(APPEND VLLM_EXT_SRC + "csrc/custom_quickreduce.cu" + ) +# if ROCM endif +endif() + message(STATUS "Enabling C extension.") define_gpu_extension_target( _C @@ -678,7 +712,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") # 9.0 for latest bf16 atomicAdd PTX - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}") if (MARLIN_MOE_ARCHS) # @@ -779,5 +813,7 @@ endif() # For CUDA we also build and ship some external projects. if (VLLM_GPU_LANG STREQUAL "CUDA") include(cmake/external_projects/flashmla.cmake) + + # vllm-flash-attn should be last as it overwrites some CMake functions include(cmake/external_projects/vllm_flash_attn.cmake) endif () diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6d46a6dca371d..2947aad75ee56 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,3 +1,3 @@ # Contributing to vLLM -You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html). +You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing). diff --git a/README.md b/README.md index 5b87ae838885c..3e6ae2acab2a9 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@

- - vLLM + + vLLM

@@ -58,8 +58,8 @@ vLLM is fast with: - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Continuous batching of incoming requests - Fast model execution with CUDA/HIP graph -- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8. -- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. +- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8 +- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer - Speculative decoding - Chunked prefill @@ -72,14 +72,14 @@ vLLM is flexible and easy to use with: - Tensor parallelism and pipeline parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron. +- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron - Prefix caching support - Multi-LoRA support vLLM seamlessly supports most popular open-source models on HuggingFace, including: - Transformer-like LLMs (e.g., Llama) - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3) -- Embedding Models (e.g. E5-Mistral) +- Embedding Models (e.g., E5-Mistral) - Multi-modal LLMs (e.g., LLaVA) Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html). @@ -100,14 +100,14 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more. ## Contributing We welcome and value any contributions and collaborations. -Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved. +Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved. ## Sponsors vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support! - + Cash Donations: - a16z - Dropbox @@ -154,12 +154,14 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ## Contact Us + - For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions) - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai) -- coordinating contributions and development, please use [Slack](https://slack.vllm.ai) +- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai) - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature - For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu) + ## Media Kit -- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit). 
+- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit) diff --git a/SECURITY.md b/SECURITY.md index 47196a1f1221e..6053cfb41f35b 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -8,4 +8,6 @@ Please report security issues privately using [the vulnerability submission form --- +Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations. + Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. diff --git a/benchmarks/README.md b/benchmarks/README.md index 4a8ab895e18e9..fb8690d42db98 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -4,7 +4,7 @@ This README guides you through running benchmark tests with the extensive datasets supported on vLLM. It’s a living document, updated as new features and datasets become available. -## Dataset Overview +**Dataset Overview** @@ -64,6 +64,12 @@ become available. + + + + + +
lmms-lab/LLaVA-OneVision-Data, Aeala/ShareGPT_Vicuna_unfiltered
Custom | Local file: data.jsonl
@@ -76,7 +82,10 @@ become available. **Note**: HuggingFace dataset's `dataset-name` should be set to `hf` --- -## Example - Online Benchmark +
+🚀 Example - Online Benchmark + +
First start serving your model @@ -124,7 +133,40 @@ P99 ITL (ms): 8.39 ================================================== ``` -### VisionArena Benchmark for Vision Language Models +**Custom Dataset** + +If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl + +``` +{"prompt": "What is the capital of India?"} +{"prompt": "What is the capital of Iran?"} +{"prompt": "What is the capital of China?"} +``` + +```bash +# start server +VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests +``` + +```bash +# run benchmarking script +python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \ + --backend vllm \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --endpoint /v1/completions \ + --dataset-name custom \ + --dataset-path \ + --custom-skip-chat-template \ + --num-prompts 80 \ + --max-concurrency 1 \ + --temperature=0.3 \ + --top-p=0.75 \ + --result-dir "./log/" +``` + +You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`. + +**VisionArena Benchmark for Vision Language Models** ```bash # need a model with vision capability here @@ -142,14 +184,13 @@ python3 vllm/benchmarks/benchmark_serving.py \ --num-prompts 1000 ``` -### InstructCoder Benchmark with Speculative Decoding +**InstructCoder Benchmark with Speculative Decoding** ``` bash VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ - --speculative-model "[ngram]" \ - --ngram_prompt_lookup_min 2 \ - --ngram-prompt-lookup-max 5 \ - --num_speculative_tokens 5 + --speculative-config $'{"method": "ngram", + "num_speculative_tokens": 5, "prompt_lookup_max": 5, + "prompt_lookup_min": 2}' ``` ``` bash @@ -160,7 +201,7 @@ python3 benchmarks/benchmark_serving.py \ --num-prompts 2048 ``` -### Other HuggingFaceDataset Examples +**Other HuggingFaceDataset Examples** ```bash vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests @@ -204,7 +245,17 @@ python3 vllm/benchmarks/benchmark_serving.py \ --seed 42 ``` -### Running With Sampling Parameters +**`philschmid/mt-bench`** + +``` bash +python3 vllm/benchmarks/benchmark_serving.py \ + --model Qwen/QwQ-32B \ + --dataset-name hf \ + --dataset-path philschmid/mt-bench \ + --num-prompts 80 +``` + +**Running With Sampling Parameters** When using OpenAI-compatible backends such as `vllm`, optional sampling parameters can be specified. Example client command: @@ -222,8 +273,27 @@ python3 vllm/benchmarks/benchmark_serving.py \ --num-prompts 10 ``` ---- -## Example - Offline Throughput Benchmark +**Running With Ramp-Up Request Rate** + +The benchmark tool also supports ramping up the request rate over the +duration of the benchmark run. This can be useful for stress testing the +server or finding the maximum throughput that it can handle, given some latency budget. + +Two ramp-up strategies are supported: +- `linear`: Increases the request rate linearly from a start value to an end value. +- `exponential`: Increases the request rate exponentially. + +The following arguments can be used to control the ramp-up: +- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`). +- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark. +- `--ramp-up-end-rps`: The request rate at the end of the benchmark. + +
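+For example, here is a minimal sketch of a linear ramp-up run (the model, dataset, and lengths are illustrative, and the server is assumed to be running as in the examples above):
+
+```bash
+python3 vllm/benchmarks/benchmark_serving.py \
+  --backend vllm \
+  --model meta-llama/Llama-3.1-8B-Instruct \
+  --dataset-name random \
+  --random-input-len 512 \
+  --random-output-len 128 \
+  --num-prompts 1000 \
+  --ramp-up-strategy linear \
+  --ramp-up-start-rps 1 \
+  --ramp-up-end-rps 20
+```
+
+Do not pass `--request-rate` together with a ramp-up strategy; the request rate is controlled entirely by the ramp-up arguments. With the `exponential` strategy the rate follows `start * (end / start) ^ progress`, so a ramp from 1 to 16 RPS passes through 2, 4, and 8 RPS at the quarter points of the run.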
+ +
+📈 Example - Offline Throughput Benchmark + +
```bash python3 vllm/benchmarks/benchmark_throughput.py \ @@ -241,7 +311,7 @@ Total num prompt tokens: 5014 Total num output tokens: 1500 ``` -### VisionArena Benchmark for Vision Language Models +**VisionArena Benchmark for Vision Language Models** ``` bash python3 vllm/benchmarks/benchmark_throughput.py \ @@ -261,7 +331,7 @@ Total num prompt tokens: 14527 Total num output tokens: 1280 ``` -### InstructCoder Benchmark with Speculative Decoding +**InstructCoder Benchmark with Speculative Decoding** ``` bash VLLM_WORKER_MULTIPROC_METHOD=spawn \ @@ -274,10 +344,9 @@ python3 vllm/benchmarks/benchmark_throughput.py \ --output-len=100 \ --num-prompts=2048 \ --async-engine \ - --speculative-model="[ngram]" \ - --ngram_prompt_lookup_min=2 \ - --ngram-prompt-lookup-max=5 \ - --num_speculative_tokens=5 + --speculative-config $'{"method": "ngram", + "num_speculative_tokens": 5, "prompt_lookup_max": 5, + "prompt_lookup_min": 2}' ``` ``` @@ -286,7 +355,7 @@ Total num prompt tokens: 261136 Total num output tokens: 204800 ``` -### Other HuggingFaceDataset Examples +**Other HuggingFaceDataset Examples** **`lmms-lab/LLaVA-OneVision-Data`** @@ -325,7 +394,7 @@ python3 benchmarks/benchmark_throughput.py \ --num-prompts 10 ``` -### Benchmark with LoRA Adapters +**Benchmark with LoRA Adapters** ``` bash # download dataset @@ -341,3 +410,196 @@ python3 vllm/benchmarks/benchmark_throughput.py \ --enable-lora \ --lora-path yard1/llama-2-7b-sql-lora-test ``` + +
+ +
+🛠️ Example - Structured Output Benchmark + +
+ +Benchmark the performance of structured output generation (JSON, grammar, regex). + +**Server Setup** + +```bash +vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests +``` + +**JSON Schema Benchmark** + +```bash +python3 benchmarks/benchmark_serving_structured_output.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset json \ + --structured-output-ratio 1.0 \ + --request-rate 10 \ + --num-prompts 1000 +``` + +**Grammar-based Generation Benchmark** + +```bash +python3 benchmarks/benchmark_serving_structured_output.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset grammar \ + --structure-type grammar \ + --request-rate 10 \ + --num-prompts 1000 +``` + +**Regex-based Generation Benchmark** + +```bash +python3 benchmarks/benchmark_serving_structured_output.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset regex \ + --request-rate 10 \ + --num-prompts 1000 +``` + +**Choice-based Generation Benchmark** + +```bash +python3 benchmarks/benchmark_serving_structured_output.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset choice \ + --request-rate 10 \ + --num-prompts 1000 +``` + +**XGrammar Benchmark Dataset** + +```bash +python3 benchmarks/benchmark_serving_structured_output.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset xgrammar_bench \ + --request-rate 10 \ + --num-prompts 1000 +``` + +
+ +
+📚 Example - Long Document QA Benchmark + +
+ +Benchmark the performance of long document question-answering with prefix caching. + +**Basic Long Document QA Test** + +```bash +python3 benchmarks/benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 16 \ + --document-length 2000 \ + --output-len 50 \ + --repeat-count 5 +``` + +**Different Repeat Modes** + +```bash +# Random mode (default) - shuffle prompts randomly +python3 benchmarks/benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 8 \ + --document-length 3000 \ + --repeat-count 3 \ + --repeat-mode random + +# Tile mode - repeat entire prompt list in sequence +python3 benchmarks/benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 8 \ + --document-length 3000 \ + --repeat-count 3 \ + --repeat-mode tile + +# Interleave mode - repeat each prompt consecutively +python3 benchmarks/benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 8 \ + --document-length 3000 \ + --repeat-count 3 \ + --repeat-mode interleave +``` + +
+ +
+🗂️ Example - Prefix Caching Benchmark + +
+ +Benchmark the efficiency of automatic prefix caching. + +**Fixed Prompt with Prefix Caching** + +```bash +python3 benchmarks/benchmark_prefix_caching.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-prompts 1 \ + --repeat-count 100 \ + --input-length-range 128:256 +``` + +**ShareGPT Dataset with Prefix Caching** + +```bash +# download dataset +# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +python3 benchmarks/benchmark_prefix_caching.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \ + --enable-prefix-caching \ + --num-prompts 20 \ + --repeat-count 5 \ + --input-length-range 128:256 +``` + +
+ +
+⚡ Example - Request Prioritization Benchmark + +
+ +Benchmark the performance of request prioritization in vLLM. + +**Basic Prioritization Test** + +```bash +python3 benchmarks/benchmark_prioritization.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --input-len 128 \ + --output-len 64 \ + --num-prompts 100 \ + --scheduling-policy priority +``` + +**Multiple Sequences per Prompt** + +```bash +python3 benchmarks/benchmark_prioritization.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --input-len 128 \ + --output-len 64 \ + --num-prompts 100 \ + --scheduling-policy priority \ + --n 2 +``` + +
diff --git a/benchmarks/auto_tune.sh b/benchmarks/auto_tune.sh index ea63c6f71a6c5..b257b57ce06f5 100644 --- a/benchmarks/auto_tune.sh +++ b/benchmarks/auto_tune.sh @@ -10,11 +10,16 @@ # 3. Set variables (ALL REQUIRED) # BASE: your directory for vllm repo # MODEL: the model served by vllm +# SYSTEM: the hardware, choice TPU or GPU, for other systems, "get best profile" might not support. +# TP: ways of tensor parallelism # DOWNLOAD_DIR: directory to download and load model weights. # INPUT_LEN: request input len # OUTPUT_LEN: request output len # MIN_CACHE_HIT_PCT: prefix cache rate # MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000 +# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with. +# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with. +# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST. # 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens. # 5. The final result will be saved in RESULT file. @@ -30,31 +35,31 @@ TAG=$(date +"%Y_%m_%d_%H_%M") BASE="" MODEL="meta-llama/Llama-3.1-8B-Instruct" +SYSTEM="TPU" +TP=1 DOWNLOAD_DIR="" INPUT_LEN=4000 OUTPUT_LEN=16 -MIN_CACHE_HIT_PCT_PCT=0 +MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 +NUM_SEQS_LIST="128 256" +NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096" LOG_FOLDER="$BASE/auto-benchmark/$TAG" RESULT="$LOG_FOLDER/result.txt" +PROFILE_PATH="$LOG_FOLDER/profile" -echo "result file$ $RESULT" +echo "result file: $RESULT" echo "model: $MODEL" -echo rm -rf $LOG_FOLDER +rm -rf $PROFILE_PATH mkdir -p $LOG_FOLDER +mkdir -p $PROFILE_PATH cd "$BASE/vllm" -# create sonnet-4x.txt so that we can sample 2048 tokens for input -echo "" > benchmarks/sonnet_4x.txt -for _ in {1..4} -do -cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt -done -pip install datasets +pip install -q datasets current_hash=$(git rev-parse HEAD) echo "hash:$current_hash" >> "$RESULT" @@ -64,53 +69,88 @@ best_throughput=0 best_max_num_seqs=0 best_num_batched_tokens=0 best_goodput=0 -run_benchmark() { - local max_num_seqs=$1 - local max_num_batched_tokens=$2 - echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" - local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" - echo "vllm_log: $vllm_log" - echo - rm -f $vllm_log - # start the server - VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \ +start_server() { + local gpu_memory_utilization=$1 + local max_num_seqs=$2 + local max_num_batched_tokens=$3 + local vllm_log=$4 + local profile_dir=$5 + + pkill -f vllm + + VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \ --disable-log-requests \ --port 8004 \ - --gpu-memory-utilization 0.98 \ + --gpu-memory-utilization $gpu_memory_utilization \ --max-num-seqs $max_num_seqs \ --max-num-batched-tokens $max_num_batched_tokens \ - --tensor-parallel-size 1 \ + --tensor-parallel-size $TP \ --enable-prefix-caching \ --load-format dummy \ - --download-dir $DOWNLOAD_DIR \ + --download-dir "$DOWNLOAD_DIR" \ --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 & - echo "wait for 10 minutes.." - echo + # wait for 10 minutes... 
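+    # Poll the /health endpoint every 10 seconds, for up to 60 attempts, until it returns HTTP 200.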
server_started=0 - for i in {1..60}; do - if grep -Fq "Application startup complete" "$vllm_log"; then - echo "Application started" + for i in {1..60}; do + RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout) + STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) + if [[ "$STATUS_CODE" -eq 200 ]]; then server_started=1 break else - # echo "wait for 10 seconds..." sleep 10 fi done - if (( ! server_started )); then - echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log" - echo "pkill -f vllm" - echo - pkill vllm - sleep 10 + echo "server did not start within 10 minutes. Please check server log at $vllm_log". return 1 + else + return 0 fi +} + +update_best_profile() { + local profile_dir=$1 + local profile_index=$2 + sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort)) + selected_profile_file= + if [[ "$SYSTEM" == "TPU" ]]; then + selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb" + fi + if [[ "$SYSTEM" == "GPU" ]]; then + selected_profile_file="${sorted_paths[$profile_index]}" + fi + rm -f $PROFILE_PATH/* + cp $selected_profile_file $PROFILE_PATH +} + +run_benchmark() { + local max_num_seqs=$1 + local max_num_batched_tokens=$2 + local gpu_memory_utilization=$3 + echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" + local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" + local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}" + echo "vllm_log: $vllm_log" + echo + rm -f $vllm_log + mkdir -p $profile_dir + pkill -f vllm + local profile_index=0 + + echo "starting server..." + start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir + result=$? + if [[ "$result" -eq 1 ]]; then + echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" + else + echo "server started." + fi + echo echo "run benchmark test..." - echo meet_latency_requirement=0 # get a basic qps by using request-rate inf bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" @@ -118,30 +158,32 @@ run_benchmark() { python benchmarks/benchmark_serving.py \ --backend vllm \ --model $MODEL \ - --dataset-name sonnet \ - --dataset-path benchmarks/sonnet_4x.txt \ - --sonnet-input-len $INPUT_LEN \ - --sonnet-output-len $OUTPUT_LEN \ + --dataset-name random \ + --random-input-len $INPUT_LEN \ + --random-output-len $OUTPUT_LEN \ --ignore-eos \ --disable-tqdm \ --request-rate inf \ --percentile-metrics ttft,tpot,itl,e2el \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ - --num-prompts 100 \ - --sonnet-prefix-len $prefix_len \ - --port 8004 > "$bm_log" - through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + --num-prompts 1000 \ + --random-prefix-len $prefix_len \ + --port 8004 \ + --profile &> "$bm_log" + throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then meet_latency_requirement=1 + request_rate=inf fi if (( ! 
meet_latency_requirement )); then - # start from request-rate as int(through_put) + 1 - request_rate=$((${through_put%.*} + 1)) + # start from request-rate as int(throughput) + 1 + request_rate=$((${throughput%.*} + 1)) while ((request_rate > 0)); do + profile_index=$((profile_index+1)) # clear prefix cache curl -X POST http://0.0.0.0:8004/reset_prefix_cache sleep 5 @@ -149,19 +191,18 @@ run_benchmark() { python benchmarks/benchmark_serving.py \ --backend vllm \ --model $MODEL \ - --dataset-name sonnet \ - --dataset-path benchmarks/sonnet_4x.txt \ - --sonnet-input-len $INPUT_LEN \ - --sonnet-output-len $OUTPUT_LEN \ - --ignore_eos \ + --dataset-name random \ + --random-input-len $INPUT_LEN \ + --random-output-len $OUTPUT_LEN \ + --ignore-eos \ --disable-tqdm \ --request-rate $request_rate \ --percentile-metrics ttft,tpot,itl,e2el \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --num-prompts 100 \ - --sonnet-prefix-len $prefix_len \ - --port 8004 > "$bm_log" - through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + --random-prefix-len $prefix_len \ + --port 8004 &> "$bm_log" + throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then @@ -173,13 +214,19 @@ run_benchmark() { fi # write the results and update the best result. if ((meet_latency_requirement)); then - echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" - echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT" - if (( $(echo "$through_put > $best_throughput" | bc -l) )); then - best_throughput=$through_put + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT" + if (( $(echo "$throughput > $best_throughput" | bc -l) )); then + best_throughput=$throughput best_max_num_seqs=$max_num_seqs best_num_batched_tokens=$max_num_batched_tokens best_goodput=$goodput + if [[ "$SYSTEM" == "TPU" ]]; then + update_best_profile "$profile_dir/plugins/profile" $profile_index + fi + if [[ "$SYSTEM" == "GPU" ]]; then + update_best_profile "$profile_dir" $profile_index + fi fi else echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" @@ -188,25 +235,42 @@ run_benchmark() { echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" - echo "pkill -f vllm" - echo pkill vllm sleep 10 - rm -f $vllm_log printf '=%.0s' $(seq 1 20) return 0 } +read -r -a num_seqs_list <<< "$NUM_SEQS_LIST" +read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST" -num_seqs_list="128 256" -num_batched_tokens_list="512 1024 2048 4096" -for num_seqs in $num_seqs_list; do - for num_batched_tokens in $num_batched_tokens_list; do - run_benchmark $num_seqs $num_batched_tokens - exit 0 +# first find out the max gpu-memory-utilization 
without HBM OOM. +gpu_memory_utilization=0.98 +find_gpu_memory_utilization=0 +while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do + start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" + result=$? + if [[ "$result" -eq 0 ]]; then + find_gpu_memory_utilization=1 + break + else + gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc) + fi +done + +if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then + echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model." +else + echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER." + exit 1 +fi + +for num_seqs in "${num_seqs_list[@]}"; do + for num_batched_tokens in "${num_batched_tokens_list[@]}"; do + run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization done done echo "finish permutations" -echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" -echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT" +echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" +echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT" diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 800d426c6d118..c7229dbb8e90d 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import io import json @@ -194,6 +195,11 @@ async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, ) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith(("completions", "profile")), ( + "OpenAI Completions API URL must end with 'completions' or 'profile'." + ) + async with aiohttp.ClientSession( trust_env=True, timeout=AIOHTTP_TIMEOUT ) as session: @@ -204,6 +210,8 @@ async def async_request_deepspeed_mii( "temperature": 0.01, # deepspeed-mii does not accept 0.0 temp. "top_p": 1.0, } + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -215,7 +223,7 @@ async def async_request_deepspeed_mii( st = time.perf_counter() try: async with session.post( - url=request_func_input.api_url, json=payload + url=api_url, json=payload, headers=headers ) as response: if response.status == 200: parsed_resp = await response.json() @@ -317,7 +325,7 @@ async def async_request_openai_completions( most_recent_timestamp = timestamp generated_text += text or "" - elif usage := data.get("usage"): + if usage := data.get("usage"): output.output_tokens = usage.get("completion_tokens") if first_chunk_received: output.success = True @@ -396,8 +404,14 @@ async def async_request_openai_chat_completions( chunk_bytes = chunk_bytes.strip() if not chunk_bytes: continue + chunk_bytes = chunk_bytes.decode("utf-8") + # NOTE: SSE comments (often used as pings) start with a colon. + # These are not JSON data payload and should be skipped. 
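+                        # e.g. ": keep-alive" heartbeat frames emitted by some OpenAI-compatible servers.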
+ if chunk_bytes.startswith(":"): + continue + + chunk = chunk_bytes.removeprefix("data: ") - chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") if chunk != "[DONE]": timestamp = time.perf_counter() data = json.loads(chunk) @@ -604,6 +618,7 @@ ASYNC_REQUEST_FUNCS = { "tensorrt-llm": async_request_trt_llm, "scalellm": async_request_openai_completions, "sglang": async_request_openai_completions, + "llama.cpp": async_request_openai_completions, } OPENAI_COMPATIBLE_BACKENDS = [ diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index d8f48644cc005..55c0cf851264f 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This module defines a framework for sampling benchmark requests from various datasets. Each dataset subclass of BenchmarkDataset must implement sample @@ -9,9 +10,6 @@ generation. Supported dataset types include: - BurstGPT - HuggingFace - VisionArena - -TODO: Implement CustomDataset to parse a JSON file and convert its contents into -SampleRequest instances, similar to the approach used in ShareGPT. """ import base64 @@ -35,6 +33,7 @@ from transformers import PreTrainedTokenizerBase from vllm.lora.request import LoRARequest from vllm.lora.utils import get_adapter_absolute_path from vllm.multimodal import MultiModalDataDict +from vllm.multimodal.image import convert_image_mode from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer logger = logging.getLogger(__name__) @@ -257,7 +256,7 @@ def process_image(image: Any) -> Mapping[str, Any]: if isinstance(image, dict) and "bytes" in image: image = Image.open(BytesIO(image["bytes"])) if isinstance(image, Image.Image): - image = image.convert("RGB") + image = convert_image_mode(image, "RGB") with io.BytesIO() as image_data: image.save(image_data, format="JPEG") image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") @@ -350,11 +349,12 @@ class RandomDataset(BenchmarkDataset): # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] # To avoid uncontrolled change of the prompt length, # the encoded sequence is truncated before being decode again. + total_input_len = prefix_len + int(input_lens[i]) re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[ - : input_lens[i] + :total_input_len ] prompt = tokenizer.decode(re_encoded_sequence) - total_input_len = prefix_len + int(input_lens[i]) + total_input_len = len(re_encoded_sequence) requests.append( SampleRequest( prompt=prompt, @@ -441,6 +441,97 @@ class ShareGPTDataset(BenchmarkDataset): return samples +# ----------------------------------------------------------------------------- +# Custom Dataset Implementation +# ----------------------------------------------------------------------------- + + +class CustomDataset(BenchmarkDataset): + """ + Implements the Custom dataset. Loads data from a JSONL file and generates + sample requests based on conversation turns. E.g., + ``` + {"prompt": "What is the capital of India?"} + {"prompt": "What is the capital of Iran?"} + {"prompt": "What is the capital of China?"} + ``` + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + # self.data will be a list of dictionaries + # e.g., [{"prompt": "What is the capital of India?"}, ...] 
+ # This will be the standardized format which load_data() + # has to convert into depending on the filetype of dataset_path. + # sample() will assume this standardized format of self.data + self.data = [] + + # Load the JSONL file + if self.dataset_path.endswith(".jsonl"): + jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True) + + # check if the JSONL file has a 'prompt' column + if "prompt" not in jsonl_data.columns: + raise ValueError("JSONL file must contain a 'prompt' column.") + + # Convert each row to a dictionary and append to self.data + # This will convert the DataFrame to a list of dictionaries + # where each dictionary corresponds to a row in the DataFrame. + # This is the standardized format we want for self.data + for _, row in jsonl_data.iterrows(): + self.data.append(row.to_dict()) + else: + raise NotImplementedError( + "Only JSONL format is supported for CustomDataset." + ) + + random.seed(self.random_seed) + random.shuffle(self.data) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + lora_path: Optional[str] = None, + max_loras: Optional[int] = None, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + skip_chat_template: bool = False, + **kwargs, + ) -> list: + sampled_requests = [] + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = item["prompt"] + + # apply template + if not skip_chat_template: + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + ) + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + + return sampled_requests + + # ----------------------------------------------------------------------------- # Sonnet Dataset Implementation # ----------------------------------------------------------------------------- @@ -775,7 +866,15 @@ class InstructCoderDataset(HuggingFaceDataset): for item in self.data: if len(sampled_requests) >= num_requests: break - prompt = f"{item['instruction']}:\n{item['input']}" + prompt = f"{item['input']}\n\n{item['instruction']} Just output \ + the code, do not include any explanation." 
+ + # apply template + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) prompt_len = len(tokenizer(prompt).input_ids) sampled_requests.append( SampleRequest( diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index d5aaceeb8c9c3..4d2ea126b24a5 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark the latency of processing a single batch of requests.""" import argparse @@ -6,13 +7,12 @@ import dataclasses import json import os import time -from pathlib import Path from typing import Any, Optional import numpy as np -import torch from tqdm import tqdm +import vllm.envs as envs from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs @@ -80,17 +80,9 @@ def main(args: argparse.Namespace): def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir) - ), - ) as p: - llm_generate() - print(p.key_averages().table(sort_by="self_cuda_time_total")) + llm.start_profile() + llm_generate() + llm.stop_profile() else: start_time = time.perf_counter() llm_generate() @@ -103,11 +95,7 @@ def main(args: argparse.Namespace): run_to_completion(profile_dir=None) if args.profile: - profile_dir = args.profile_result_dir - if not profile_dir: - profile_dir = ( - Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}" - ) + profile_dir = envs.VLLM_TORCH_PROFILER_DIR print(f"Profiling (results will be saved to '{profile_dir}')...") run_to_completion(profile_dir=profile_dir) return @@ -135,7 +123,7 @@ def main(args: argparse.Namespace): save_to_pytorch_benchmark_format(args, results) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the latency of processing a single batch of " "requests till completion." @@ -164,15 +152,6 @@ if __name__ == "__main__": action="store_true", help="profile the generation process of a single batch", ) - parser.add_argument( - "--profile-result-dir", - type=str, - default=None, - help=( - "path to save the pytorch profiler output. Can be visualized " - "with ui.perfetto.dev or Tensorboard." - ), - ) parser.add_argument( "--output-json", type=str, @@ -189,5 +168,19 @@ if __name__ == "__main__": ) parser = EngineArgs.add_cli_args(parser) + # V1 enables prefix caching by default which skews the latency + # numbers. We need to disable prefix caching by default. + parser.set_defaults(enable_prefix_caching=False) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() + if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: + raise OSError( + "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " + "Please set it to a valid path to use torch profiler." 
+ ) main(args) diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 109624c877891..6e0f3b51c9d28 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Offline benchmark to test the long document QA throughput. @@ -141,7 +142,7 @@ def main(args): ) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the performance with or " "without automatic prefix caching." @@ -191,5 +192,11 @@ if __name__ == "__main__": ) parser = EngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index ffaa8035797c1..b5e2613de1cd4 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Benchmark the efficiency of prefix caching. @@ -217,7 +218,7 @@ def main(args): ) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the performance with or without " "automatic prefix caching." @@ -267,5 +268,11 @@ if __name__ == "__main__": ) parser = EngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index a05dd24dece83..bb453791c1862 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark offline prioritization.""" import argparse @@ -160,7 +161,7 @@ def main(args: argparse.Namespace): json.dump(results, f, indent=4) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument( "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm" @@ -203,6 +204,12 @@ if __name__ == "__main__": ) parser = EngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index a887e7150dc78..886a51e1cbd9a 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project r"""Benchmark online serving throughput. 
On the server side, run one of the following commands: @@ -32,7 +33,7 @@ import warnings from collections.abc import AsyncGenerator, Iterable from dataclasses import dataclass from datetime import datetime -from typing import Any, Optional +from typing import Any, Literal, Optional import numpy as np from tqdm.asyncio import tqdm @@ -60,6 +61,7 @@ from benchmark_dataset import ( ASRDataset, BurstGPTDataset, ConversationDataset, + CustomDataset, HuggingFaceDataset, InstructCoderDataset, MTBenchDataset, @@ -105,14 +107,42 @@ class BenchmarkMetrics: percentiles_e2el_ms: list[tuple[float, float]] +def _get_current_request_rate( + ramp_up_strategy: Optional[Literal["linear", "exponential"]], + ramp_up_start_rps: Optional[int], + ramp_up_end_rps: Optional[int], + request_index: int, + total_requests: int, + request_rate: float, +) -> float: + if ( + ramp_up_strategy + and ramp_up_start_rps is not None + and ramp_up_end_rps is not None + ): + progress = request_index / max(total_requests - 1, 1) + if ramp_up_strategy == "linear": + increase = (ramp_up_end_rps - ramp_up_start_rps) * progress + return ramp_up_start_rps + increase + elif ramp_up_strategy == "exponential": + ratio = ramp_up_end_rps / ramp_up_start_rps + return ramp_up_start_rps * (ratio**progress) + else: + raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}") + return request_rate + + async def get_request( input_requests: list[SampleRequest], request_rate: float, burstiness: float = 1.0, -) -> AsyncGenerator[SampleRequest, None]: + ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None, + ramp_up_start_rps: Optional[int] = None, + ramp_up_end_rps: Optional[int] = None, +) -> AsyncGenerator[tuple[SampleRequest, float], None]: """ Asynchronously generates requests at a specified rate - with OPTIONAL burstiness. + with OPTIONAL burstiness and OPTIONAL ramp-up strategy. Args: input_requests: @@ -127,22 +157,44 @@ async def get_request( A lower burstiness value (0 < burstiness < 1) results in more bursty requests, while a higher burstiness value (burstiness > 1) results in a more uniform arrival of requests. + ramp_up_strategy (optional): + The ramp-up strategy. Can be "linear" or "exponential". + If None, uses constant request rate (specified by request_rate). + ramp_up_start_rps (optional): + The starting request rate for ramp-up. + ramp_up_end_rps (optional): + The ending request rate for ramp-up. """ - input_requests: Iterable[SampleRequest] = iter(input_requests) - - # Calculate scale parameter theta to maintain the desired request_rate. assert burstiness > 0, ( f"A positive burstiness factor is expected, but given {burstiness}." ) - theta = 1.0 / (request_rate * burstiness) + # Convert to list to get length for ramp-up calculations + if isinstance(input_requests, Iterable) and not isinstance(input_requests, list): + input_requests = list(input_requests) + + total_requests = len(input_requests) + request_index = 0 for request in input_requests: - yield request + current_request_rate = _get_current_request_rate( + ramp_up_strategy, + ramp_up_start_rps, + ramp_up_end_rps, + request_index, + total_requests, + request_rate, + ) - if request_rate == float("inf"): + yield request, current_request_rate + + request_index += 1 + + if current_request_rate == float("inf"): # If the request rate is infinity, then we don't need to wait. continue + theta = 1.0 / (current_request_rate * burstiness) + # Sample the request interval from the gamma distribution. # If burstiness is 1, it follows exponential distribution. 
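        # Note: E[Gamma(shape=burstiness, scale=theta)] = burstiness * theta
        # = 1 / current_request_rate, so the mean arrival rate still matches
        # the requested rate for any burstiness > 0.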
interval = np.random.gamma(shape=burstiness, scale=theta) @@ -288,6 +340,9 @@ async def benchmark( max_concurrency: Optional[int], lora_modules: Optional[Iterable[str]], extra_body: Optional[dict], + ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None, + ramp_up_start_rps: Optional[int] = None, + ramp_up_end_rps: Optional[int] = None, ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -351,7 +406,15 @@ async def benchmark( distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution" - print(f"Traffic request rate: {request_rate}") + if ramp_up_strategy is not None: + print( + f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase " + f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over " + "the duration of the benchmark." + ) + else: + print(f"Traffic request rate: {request_rate} RPS.") + print(f"Burstiness factor: {burstiness} ({distribution})") print(f"Maximum request concurrency: {max_concurrency}") @@ -371,7 +434,34 @@ async def benchmark( benchmark_start_time = time.perf_counter() tasks: list[asyncio.Task] = [] - async for request in get_request(input_requests, request_rate, burstiness): + + rps_change_events = [] + last_int_rps = -1 + if ramp_up_strategy is not None and ramp_up_start_rps is not None: + last_int_rps = ramp_up_start_rps + rps_change_events.append( + { + "rps": last_int_rps, + "timestamp": datetime.now().isoformat(), + } + ) + + async for request, current_request_rate in get_request( + input_requests, + request_rate, + burstiness, + ramp_up_strategy, + ramp_up_start_rps, + ramp_up_end_rps, + ): + if ramp_up_strategy is not None: + current_int_rps = int(current_request_rate) + if current_int_rps > last_int_rps: + timestamp = datetime.now().isoformat() + for rps_val in range(last_int_rps + 1, current_int_rps + 1): + rps_change_events.append({"rps": rps_val, "timestamp": timestamp}) + last_int_rps = current_int_rps + prompt, prompt_len, output_len, mm_content = ( request.prompt, request.prompt_len, @@ -395,11 +485,8 @@ async def benchmark( ignore_eos=ignore_eos, extra_body=extra_body, ) - tasks.append( - asyncio.create_task( - limited_request_func(request_func_input=request_func_input, pbar=pbar) - ) - ) + task = limited_request_func(request_func_input=request_func_input, pbar=pbar) + tasks.append(asyncio.create_task(task)) outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) if profile: @@ -475,6 +562,9 @@ async def benchmark( "errors": [output.error for output in outputs], } + if rps_change_events: + result["rps_change_events"] = rps_change_events + def process_one_metric( # E.g., "ttft" metric_attribute_name: str, @@ -608,6 +698,26 @@ def main(args: argparse.Namespace): tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model tokenizer_mode = args.tokenizer_mode + # Validate ramp-up arguments + if args.ramp_up_strategy is not None: + if args.request_rate != float("inf"): + raise ValueError( + "When using ramp-up, do not specify --request-rate. " + "The request rate will be controlled by ramp-up parameters. " + "Please remove the --request-rate argument." 
+ ) + if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None: + raise ValueError( + "When using --ramp-up-strategy, both --ramp-up-start-rps and " + "--ramp-up-end-rps must be specified" + ) + if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0: + raise ValueError("Ramp-up start and end RPS must be non-negative") + if args.ramp_up_start_rps > args.ramp_up_end_rps: + raise ValueError("Ramp-up start RPS must be less than end RPS") + if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0: + raise ValueError("For exponential ramp-up, the start RPS cannot be 0.") + if args.base_url is not None: api_url = f"{args.base_url}{args.endpoint}" base_url = f"{args.base_url}" @@ -627,7 +737,16 @@ def main(args: argparse.Namespace): "'--dataset-path' if required." ) - if args.dataset_name == "sonnet": + if args.dataset_name == "custom": + dataset = CustomDataset(dataset_path=args.dataset_path) + input_requests = dataset.sample( + num_requests=args.num_prompts, + tokenizer=tokenizer, + output_len=args.custom_output_len, + skip_chat_template=args.custom_skip_chat_template, + ) + + elif args.dataset_name == "sonnet": dataset = SonnetDataset(dataset_path=args.dataset_path) # For the "sonnet" dataset, formatting depends on the backend. if args.backend == "openai-chat": @@ -762,6 +881,10 @@ def main(args: argparse.Namespace): if "temperature" not in sampling_params: sampling_params["temperature"] = 0.0 # Default to greedy decoding. + if args.backend == "llama.cpp": + # Disable prompt caching in llama.cpp backend + sampling_params["cache_prompt"] = False + # Avoid GC processing "static" data - reduce pause times. gc.collect() gc.freeze() @@ -787,6 +910,9 @@ def main(args: argparse.Namespace): max_concurrency=args.max_concurrency, lora_modules=args.lora_modules, extra_body=sampling_params, + ramp_up_strategy=args.ramp_up_strategy, + ramp_up_start_rps=args.ramp_up_start_rps, + ramp_up_end_rps=args.ramp_up_end_rps, ) ) @@ -819,6 +945,11 @@ def main(args: argparse.Namespace): result_json["burstiness"] = args.burstiness result_json["max_concurrency"] = args.max_concurrency + if args.ramp_up_strategy is not None: + result_json["ramp_up_strategy"] = args.ramp_up_strategy + result_json["ramp_up_start_rps"] = args.ramp_up_start_rps + result_json["ramp_up_end_rps"] = args.ramp_up_end_rps + # Merge with benchmark result result_json = {**result_json, **benchmark_result} @@ -834,6 +965,8 @@ def main(args: argparse.Namespace): ]: if field in result_json: del result_json[field] + if field in benchmark_result: + del benchmark_result[field] # Save to file base_model_id = model_id.split("/")[-1] @@ -842,10 +975,14 @@ def main(args: argparse.Namespace): if args.max_concurrency is not None else "" ) - file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa + if args.ramp_up_strategy is not None: + file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa + else: + file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa if args.result_filename: file_name = args.result_filename if args.result_dir: + os.makedirs(args.result_dir, exist_ok=True) file_name = os.path.join(args.result_dir, file_name) with open( file_name, mode="a+" if args.append_result else "w", encoding="utf-8" @@ -857,7 +994,7 @@ def main(args: argparse.Namespace): save_to_pytorch_benchmark_format(args, 
result_json, file_name) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the online serving throughput." ) @@ -886,7 +1023,7 @@ if __name__ == "__main__": "--dataset-name", type=str, default="sharegpt", - choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"], + choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"], help="Name of the dataset to benchmark on.", ) parser.add_argument( @@ -1056,6 +1193,19 @@ if __name__ == "__main__": ) # group for dataset specific arguments + custom_group = parser.add_argument_group("custom dataset options") + custom_group.add_argument( + "--custom-output-len", + type=int, + default=256, + help="Number of output tokens per request, used only for custom dataset.", + ) + custom_group.add_argument( + "--custom-skip-chat-template", + action="store_true", + help="Skip applying chat template to prompt, used only for custom dataset.", + ) + sonnet_group = parser.add_argument_group("sonnet dataset options") sonnet_group.add_argument( "--sonnet-input-len", @@ -1194,6 +1344,35 @@ if __name__ == "__main__": "script chooses a LoRA module at random.", ) - args = parser.parse_args() + parser.add_argument( + "--ramp-up-strategy", + type=str, + default=None, + choices=["linear", "exponential"], + help="The ramp-up strategy. This would be used to " + "ramp up the request rate from initial RPS to final " + "RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps). " + "over the duration of the benchmark.", + ) + parser.add_argument( + "--ramp-up-start-rps", + type=int, + default=None, + help="The starting request rate for ramp-up (RPS). " + "Needs to be specified when --ramp-up-strategy is used.", + ) + parser.add_argument( + "--ramp-up-end-rps", + type=int, + default=None, + help="The ending request rate for ramp-up (RPS). " + "Needs to be specified when --ramp-up-strategy is used.", + ) + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() + args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 5088c805f53ef..e23a5a9e2233d 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project r"""Benchmark online serving throughput with structured outputs. On the server side, run one of the following commands: @@ -11,7 +12,6 @@ On the client side, run: --model \ --dataset json \ --structured-output-ratio 1.0 \ - --structured-output-backend auto \ --request-rate 10 \ --num-prompts 1000 @@ -672,7 +672,7 @@ async def benchmark( def evaluate(ret, args): def _eval_correctness_json(expected, actual): # extract json string from string using regex - import re + import regex as re actual = actual.replace("\n", "").replace(" ", "").strip() try: @@ -687,7 +687,7 @@ def evaluate(ret, args): return actual in args.choice def _eval_correctness_regex(expected, actual): - import re + import regex as re return re.match(args.regex, actual) is not None @@ -850,7 +850,7 @@ def main(args: argparse.Namespace): json.dump(results, outfile, indent=4) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the online serving throughput." 
) @@ -1034,5 +1034,10 @@ if __name__ == "__main__": help="Ratio of Structured Outputs requests", ) + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 7a13babda9d16..0ded34c70badd 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark offline inference throughput.""" import argparse @@ -96,7 +97,7 @@ def run_vllm( assert lora_requests is None, "BeamSearch API does not support LoRA" prompts = [request.prompt for request in requests] # output_len should be the same for all requests. - output_len = requests[0][2] + output_len = requests[0].expected_output_len for request in requests: assert request.expected_output_len == output_len start = time.perf_counter() @@ -594,7 +595,7 @@ def validate_args(args): ) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument( "--backend", @@ -716,6 +717,12 @@ if __name__ == "__main__": ) parser = AsyncEngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index b0c4fca92c3d0..283f938df50af 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json @@ -65,4 +66,9 @@ class InfEncoder(json.JSONEncoder): def write_to_json(filename: str, records: list) -> None: with open(filename, "w") as f: - json.dump(records, f, cls=InfEncoder) + json.dump( + records, + f, + cls=InfEncoder, + default=lambda o: f"<{type(o).__name__} object is not JSON serializable>", + ) diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py index da258f98e085f..9ec270bbd2e98 100644 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py index 7e9f5a7fc0f46..b4f3c6bf94eda 100644 --- a/benchmarks/cutlass_benchmarks/utils.py +++ b/benchmarks/cutlass_benchmarks/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Cutlass bench utils from collections.abc import Iterable diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 08e93837f7ddf..a5a5b52f60397 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy @@ -18,7 +19,7 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( w8a8_block_fp8_matmul, ) -from vllm.utils import 
FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser, cdiv DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -116,14 +117,9 @@ def bench_fp8( scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - def ceil_div(x: int, y: int) -> int: - return (x + y - 1) // y - - block_scale_a = torch.rand( - (m, ceil_div(k, 128)), device="cuda", dtype=torch.float32 - ) + block_scale_a = torch.rand((m, cdiv(k, 128)), device="cuda", dtype=torch.float32) block_scale_b = torch.rand( - ceil_div(k, 128), ceil_div(n, 128), device="cuda", dtype=torch.float32 + cdiv(k, 128), cdiv(n, 128), device="cuda", dtype=torch.float32 ) block_scale_a_M_major = block_scale_a.t().contiguous().t() block_scale_b_K_major = block_scale_b.t().contiguous().t() diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py index d31b623a1ee60..25b96ef56620e 100644 --- a/benchmarks/cutlass_benchmarks/weight_shapes.py +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Weight Shapes are in the format # ([K, N], TP_SPLIT_DIM) diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index fce156e1c96c6..f62d8102e2d9f 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py index fd19b40bf252c..b1df2f255822d 100644 --- a/benchmarks/disagg_benchmarks/round_robin_proxy.py +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import itertools diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py index 484d0cb3cba7d..74fa56d076cf1 100644 --- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py +++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index 37a9173a1a937..901524214469e 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pickle as pkl import time diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/bench_fp8_gemm.py new file mode 100644 index 0000000000000..d17443871cf66 --- /dev/null +++ b/benchmarks/kernels/bench_fp8_gemm.py @@ -0,0 +1,158 @@ +# SPDX-License-Identifier: Apache-2.0 +import argparse +import copy +import itertools + +import torch +from weight_shapes import WEIGHT_SHAPES + +from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm +from vllm._custom_ops import scaled_fp8_quant as 
vllm_scaled_fp8_quant +from vllm.triton_utils import triton + +PROVIDER_CFGS = { + "torch-bf16": dict(enabled=True), + "fp8-tensor-w-token-a": dict( + w="tensor", a="token", no_a_quant=False, enabled=False + ), + "fp8-tensor-w-tensor-a": dict( + w="tensor", a="tensor", no_a_quant=False, enabled=True + ), + "fp8-channel-w-token-a": dict( + w="channel", a="token", no_a_quant=False, enabled=True + ), + "fp8-channel-w-tensor-a": dict( + w="channel", a="tensor", no_a_quant=False, enabled=False + ), + "fp8-tensor-w-token-a-noquant": dict( + w="tensor", a="token", no_a_quant=True, enabled=False + ), + "fp8-tensor-w-tensor-a-noquant": dict( + w="tensor", a="tensor", no_a_quant=True, enabled=True + ), + "fp8-channel-w-token-a-noquant": dict( + w="channel", a="token", no_a_quant=True, enabled=True + ), + "fp8-channel-w-tensor-a-noquant": dict( + w="channel", a="tensor", no_a_quant=True, enabled=False + ), +} + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] + + +def _quant_weight_fp8(b: torch.Tensor, w_type: str, device: str): + if w_type == "tensor": + scale_b = torch.ones(1, device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + else: + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, use_per_token_if_dynamic=True) + return b_fp8.t(), scale_b_fp8 + + +def build_fp8_runner(cfg, a, b, dtype, device): + b_fp8, scale_b_fp8 = _quant_weight_fp8(b, cfg["w"], device) + + scale_a_const = ( + torch.ones(1, device=device, dtype=torch.float32) + if cfg["a"] == "tensor" + else None + ) + + if cfg["no_a_quant"]: + if cfg["a"] == "tensor": + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const) + else: + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True) + + def run(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + return run + + if cfg["a"] == "tensor": + + def run(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + else: + + def run(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + return run + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], + x_log=False, + line_arg="provider", + line_vals=_enabled, + line_names=_enabled, + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs FP8 GEMMs", + args={}, + ) +) +def benchmark(batch_size, provider, N, K): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + + a = torch.randn((M, K), device=device, dtype=dtype) + b = torch.randn((N, K), device=device, dtype=dtype) + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch-bf16": + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), quantiles=quantiles + ) + else: + cfg = PROVIDER_CFGS[provider] + run_quant = build_fp8_runner(cfg, a, b, dtype, device) + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: run_quant(), quantiles=quantiles + ) + + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) + + +def prepare_shapes(args): + out = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size + KN.append(model) + out.append(KN) + return out + + +if __name__ == 
"__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.1-8B-Instruct"], + choices=list(WEIGHT_SHAPES.keys()), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) + args = parser.parse_args() + + for K, N, model in prepare_shapes(args): + print(f"{model}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:") + benchmark.run( + print_data=True, + show_plots=True, + save_path=f"bench_fp8_res_n{N}_k{K}", + N=N, + K=K, + ) + + print("Benchmark finished!") diff --git a/benchmarks/kernels/bench_int8_gemm.py b/benchmarks/kernels/bench_int8_gemm.py new file mode 100644 index 0000000000000..e9c6d64404d0d --- /dev/null +++ b/benchmarks/kernels/bench_int8_gemm.py @@ -0,0 +1,169 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import copy +import itertools + +import torch +from weight_shapes import WEIGHT_SHAPES + +from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm +from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant +from vllm.triton_utils import triton + +PROVIDER_CFGS = { + "torch-bf16": dict(enabled=True), + "int8-tensor-w-token-a": dict( + w="tensor", a="token", no_a_quant=False, enabled=False + ), + "int8-tensor-w-tensor-a": dict( + w="tensor", a="tensor", no_a_quant=False, enabled=True + ), + "int8-channel-w-token-a": dict( + w="channel", a="token", no_a_quant=False, enabled=True + ), + "int8-channel-w-tensor-a": dict( + w="channel", a="tensor", no_a_quant=False, enabled=False + ), + "int8-tensor-w-token-a-noquant": dict( + w="tensor", a="token", no_a_quant=True, enabled=False + ), + "int8-tensor-w-tensor-a-noquant": dict( + w="tensor", a="tensor", no_a_quant=True, enabled=True + ), + "int8-channel-w-token-a-noquant": dict( + w="channel", a="token", no_a_quant=True, enabled=True + ), + "int8-channel-w-tensor-a-noquant": dict( + w="channel", a="tensor", no_a_quant=True, enabled=False + ), +} + + +def _quant_weight(b, w_type, device): + if w_type == "tensor": + scale_b = torch.ones(1, device=device, dtype=torch.float32) + b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b) + assert scale_b_int8.numel() == 1 + else: # channel + b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b) + assert scale_b_int8.numel() == b.shape[0] + return b_int8.t(), scale_b_int8 + + +def build_int8_runner(cfg, a, b, dtype, device): + # quant before running the kernel + b_int8, scale_b_int8 = _quant_weight(b, cfg["w"], device) + + scale_a_const = None + if cfg["a"] == "tensor": + scale_a_const = torch.ones(1, device=device, dtype=torch.float32) + + # no quant, create activation ahead + if cfg["no_a_quant"]: + if cfg["a"] == "tensor": + a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const) + else: # token + a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a) + + def run_quant(): + return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype) + + return run_quant + + # dynamic quant, create activation inside + if cfg["a"] == "tensor": + + def run_quant(): + a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const) + return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype) + + else: # token + + def run_quant(): + a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a) + return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype) + + return run_quant + + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v.get("enabled")] + + 
+@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], + x_log=False, + line_arg="provider", + line_vals=_enabled, + line_names=[k for k in _enabled], + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs INT8 GEMMs", + args={}, + ) +) +def benchmark(batch_size, provider, N, K): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + a = torch.randn((M, K), device=device, dtype=dtype) + b = torch.randn((N, K), device=device, dtype=dtype) + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch-bf16": + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), quantiles=quantiles + ) + else: + cfg = PROVIDER_CFGS[provider] + run_quant = build_int8_runner(cfg, a, b, dtype, device) + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: run_quant(), quantiles=quantiles + ) + + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) + + +def prepare_shapes(args): + KN_model_names = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size + KN.append(model) + KN_model_names.append(KN) + return KN_model_names + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.1-8B-Instruct"], + choices=list(WEIGHT_SHAPES.keys()), + help="List of models to benchmark", + ) + parser.add_argument( + "--tp-sizes", + nargs="+", + type=int, + default=[1], + help="List of tensor parallel sizes", + ) + args = parser.parse_args() + + for K, N, model in prepare_shapes(args): + print(f"{model}, N={N} K={K}, BF16 vs INT8 GEMMs TFLOP/s:") + benchmark.run( + print_data=True, + show_plots=True, + save_path=f"bench_int8_res_n{N}_k{K}", + N=N, + K=K, + ) + + print("Benchmark finished!") diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index e9934aa479dd6..42de062b08e42 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import sys diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py index d40ab70ec539b..97ee060341373 100644 --- a/benchmarks/kernels/benchmark_bitblas.py +++ b/benchmarks/kernels/benchmark_bitblas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py index d39d8a6e3aba3..35c20ee41b9a9 100644 --- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py +++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe kernel. 
The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit @@ -90,7 +91,7 @@ def bench_run( score = torch.randn((m, num_experts), device=device, dtype=dtype) - topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False) + topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False) quant_blocksize = 16 w1_blockscale = torch.empty( diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 2197bceabe6c0..acabe6c1ddb0a 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.utils.benchmark as benchmark @@ -6,8 +7,8 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 from vllm.model_executor.layers.fused_moe.fused_moe import ( - cutlass_moe_fp8, fused_experts, fused_topk, ) @@ -69,18 +70,9 @@ def bench_run( w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) - ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) - c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64) - ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64) - c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) - for expert in range(num_experts): w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert]) w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert]) - w1_q_notransp = w1_q.clone() - w2_q_notransp = w2_q.clone() - w1_q = w1_q.transpose(1, 2) - w2_q = w2_q.transpose(1, 2) score = torch.randn((m, num_experts), device="cuda", dtype=dtype) @@ -121,10 +113,6 @@ def bench_run( w2_scale: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - ab_strides1: torch.Tensor, - c_strides1: torch.Tensor, - ab_strides2: torch.Tensor, - c_strides2: torch.Tensor, num_repeats: int, ): for _ in range(num_repeats): @@ -132,14 +120,10 @@ def bench_run( a, w1, w2, - w1_scale, - w2_scale, topk_weights, topk_ids, - ab_strides1, - c_strides1, - ab_strides2, - c_strides2, + w1_scale, + w2_scale, a1_scale=a_scale, ) @@ -152,10 +136,6 @@ def bench_run( w2_scale: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - ab_strides1: torch.Tensor, - c_strides1: torch.Tensor, - ab_strides2: torch.Tensor, - c_strides2: torch.Tensor, ): with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) @@ -164,14 +144,10 @@ def bench_run( a, w1_q, w2_q, - w1_scale, - w2_scale, topk_weights, topk_ids, - ab_strides1, - c_strides1, - ab_strides2, - c_strides2, + w1_scale, + w2_scale, a1_scale=a_scale, ) @@ -217,10 +193,6 @@ def bench_run( w2_scale, topk_weights, topk_ids, - ab_strides1, - c_strides1, - ab_strides2, - c_strides2, ) torch.cuda.synchronize() @@ -229,8 +201,8 @@ def bench_run( with torch.cuda.graph(triton_graph, stream=triton_stream): run_triton_from_graph( a, - w1_q_notransp, - w2_q_notransp, + w1_q, + w2_q, topk_weights, topk_ids, w1_scale, @@ -249,18 +221,12 @@ def bench_run( "w2": w2, "score": score, "topk": topk, - "w1_q_notransp": w1_q_notransp, - "w2_q_notransp": 
w2_q_notransp, # Cutlass params "a_scale": a_scale, "w1_q": w1_q, "w2_q": w2_q, "w1_scale": w1_scale, "w2_scale": w2_scale, - "ab_strides1": ab_strides1, - "c_strides1": c_strides1, - "ab_strides2": ab_strides2, - "c_strides2": c_strides2, # cuda graph params "cutlass_graph": cutlass_graph, "triton_graph": triton_graph, @@ -278,8 +244,8 @@ def bench_run( # Warmup run_triton_moe( a, - w1_q_notransp, - w2_q_notransp, + w1_q, + w2_q, topk_weights, topk_ids, w1_scale, @@ -290,7 +256,7 @@ def bench_run( results.append( benchmark.Timer( - stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501 + stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, @@ -321,16 +287,12 @@ def bench_run( w2_scale, topk_weights, topk_ids, - ab_strides1, - c_strides1, - ab_strides2, - c_strides2, num_warmup, ) results.append( benchmark.Timer( - stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501 + stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index f21ca97eeb8a9..69978ec6b23e9 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 6c1284930c1ec..3d38d4b3534e8 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index f8f1db04790bf..0f896f187ecb9 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index b17baff2e5f5d..34cc45e94d76d 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.utils.benchmark as benchmark @@ -21,8 +22,16 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( MARLIN_SUPPORTED_GROUP_SIZES, query_marlin_supported_quant_types, ) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( + FP4_MARLIN_SUPPORTED_GROUP_SIZES, + rand_marlin_weight_fp4_like, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( + marlin_quant_fp8_torch, +) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( MarlinWorkspace, + awq_marlin_quantize, marlin_quantize, ) from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( @@ -34,7 +43,7 @@ from 
vllm.model_executor.layers.quantization.utils.quant_utils import ( quantize_weights, sort_weights, ) -from vllm.scalar_type import ScalarType +from vllm.scalar_type import ScalarType, scalar_types from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] @@ -56,80 +65,144 @@ def bench_run( size_n: int, ): label = "Quant Matmul" - sub_label = "{}, act={} k_full={}, q={}, g={}, MKN=({}x{}x{})".format( model, act_order, is_k_full, str(quant_type), group_size, size_m, size_k, size_n ) - print(f"Testing: {sub_label}") a = torch.randn(size_m, size_k).to(torch.half).cuda() b = torch.rand(size_k, size_n).to(torch.half).cuda() + has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8] + if act_order and (group_size == -1 or group_size == size_k or has_zp): + return + if size_k % group_size != 0: + return - a_tmp = torch.zeros(size_m, size_k).to(torch.half).cuda() - - # Marlin quant - ( - marlin_w_ref, - marlin_q_w, - marlin_s, - marlin_g_idx, - marlin_sort_indices, - marlin_rand_perm, - ) = marlin_quantize(b, quant_type, group_size, act_order) - - # Marlin_24 quant - (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = ( - marlin_24_quantize(b, quant_type, group_size) + marlin_24_supported = ( + quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES + and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES ) - - marlin_zp = torch.empty(0, dtype=torch.int, device=b.device) - - # GPTQ quant - (w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights( - b, quant_type, group_size, act_order + repack_supported = ( + quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES + and group_size in MARLIN_SUPPORTED_GROUP_SIZES ) - q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n) - - # For act_order, sort the "weights" and "g_idx" - # so that group ids are increasing - repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device) - if act_order: - (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx) - - # Prepare - marlin_workspace = MarlinWorkspace( - size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL - ) - - marlin_24_workspace = MarlinWorkspace( - size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL - ) - marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int) - - # AllSpark W8A16 quant - as_supported_case = ( + allspark_supported = ( quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES and group_size == -1 and not act_order and is_k_full ) - if as_supported_case: - properties = torch.cuda.get_device_properties(b.device.index) - sm_count = properties.multi_processor_count - sm_version = properties.major * 10 + properties.minor - supported_arch = sm_version >= 80 and sm_version < 90 - as_supported_case = as_supported_case and supported_arch - if supported_arch: - has_zp = False - w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp) - qw = qw.to(torch.uint8) - - qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight( - qw, s, zp, has_zp + def gen_marlin_params(): + # Marlin quant + marlin_g_idx = marlin_sort_indices = marlin_zp = marlin_s2 = None + if quant_type == scalar_types.float4_e2m1f: + if group_size != 16 or act_order: + return + marlin_w_ref, marlin_q_w, marlin_s, marlin_s2 = rand_marlin_weight_fp4_like( + b.T, group_size ) - CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD + elif quant_type == scalar_types.float8_e4m3fn: + if group_size not in [-1, 128] or act_order: + return + marlin_w_ref, marlin_q_w, marlin_s = marlin_quant_fp8_torch(b.T, group_size) + elif group_size == 
16: + return + elif has_zp: + marlin_w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize( + b, quant_type, group_size + ) + else: + marlin_w_ref, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, _ = ( + marlin_quantize(b, quant_type, group_size, act_order) + ) + return ( + marlin_w_ref, + marlin_q_w, + marlin_s, + marlin_s2, + marlin_zp, + marlin_g_idx, + marlin_sort_indices, + ) + + def gen_marlin_24_params(): + marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None + if marlin_24_supported: + (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = ( + marlin_24_quantize(b, quant_type, group_size) + ) + return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) + + def gen_repack_params(): + q_w_gptq = None + repack_sort_indices = None + if repack_supported: + (w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights( + b, quant_type, group_size, act_order + ) + q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n) + + # For act_order, sort the "weights" and "g_idx" + # so that group ids are increasing + repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device) + if act_order: + (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx) + return q_w_gptq, repack_sort_indices + + def gen_allspark_params(): + qw_reorder = s_reorder = zp_reorder = sm_count = sm_version = ( + CUBLAS_M_THRESHOLD + ) = None + nonlocal allspark_supported + if allspark_supported: + properties = torch.cuda.get_device_properties(b.device.index) + sm_count = properties.multi_processor_count + sm_version = properties.major * 10 + properties.minor + + supported_arch = sm_version >= 80 and sm_version < 90 + allspark_supported = allspark_supported and supported_arch + if supported_arch: + w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp) + qw = qw.to(torch.uint8) + + qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight( + qw, s, zp, has_zp + ) + CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD + return ( + qw_reorder, + s_reorder, + zp_reorder, + sm_count, + sm_version, + CUBLAS_M_THRESHOLD, + ) + + ( + marlin_w_ref, + marlin_q_w, + marlin_s, + marlin_s2, + marlin_zp, + marlin_g_idx, + marlin_sort_indices, + ) = gen_marlin_params() + marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = ( + gen_marlin_24_params() + ) + q_w_gptq, repack_sort_indices = gen_repack_params() + qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = ( + gen_allspark_params() + ) + + # Prepare + marlin_workspace = MarlinWorkspace( + size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL + ) + marlin_24_workspace = MarlinWorkspace( + size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL + ) globals = { # Gen params @@ -139,15 +212,14 @@ def bench_run( "size_n": size_n, "size_k": size_k, "a": a, - "a_tmp": a_tmp, # Marlin params "marlin_w_ref": marlin_w_ref, "marlin_q_w": marlin_q_w, "marlin_s": marlin_s, + "marlin_s2": marlin_s2, "marlin_zp": marlin_zp, "marlin_g_idx": marlin_g_idx, "marlin_sort_indices": marlin_sort_indices, - "marlin_rand_perm": marlin_rand_perm, "marlin_workspace": marlin_workspace, "is_k_full": is_k_full, # Marlin_24 params @@ -160,12 +232,12 @@ def bench_run( "q_w_gptq": q_w_gptq, "repack_sort_indices": repack_sort_indices, # AllSpark W8A16 params - "qw_reorder": qw_reorder if as_supported_case else None, - "s_reorder": s_reorder if as_supported_case else None, - "zp_reorder": zp_reorder if as_supported_case else None, - "sm_count": sm_count if 
as_supported_case else None, - "sm_version": sm_version if as_supported_case else None, - "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD if as_supported_case else None, + "qw_reorder": qw_reorder, + "s_reorder": s_reorder, + "zp_reorder": zp_reorder, + "sm_count": sm_count, + "sm_version": sm_version, + "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD, # Kernels "gptq_marlin_gemm": ops.gptq_marlin_gemm, "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm, @@ -176,7 +248,7 @@ def bench_run( min_run_time = 1 # Warmup pytorch - for i in range(5): + for _ in range(5): torch.matmul(a, marlin_w_ref) results.append( @@ -191,17 +263,17 @@ def bench_run( results.append( benchmark.Timer( - stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 + stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, - description="gptq_marlin_gemm_fp16", + description="gptq_marlin_gemm", ).blocked_autorange(min_run_time=min_run_time) ) results.append( benchmark.Timer( - stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 + stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, @@ -209,10 +281,7 @@ def bench_run( ).blocked_autorange(min_run_time=min_run_time) ) - if ( - quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES - and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES - ): + if marlin_24_supported: results.append( benchmark.Timer( stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501 @@ -223,17 +292,18 @@ def bench_run( ).blocked_autorange(min_run_time=min_run_time) ) - results.append( - benchmark.Timer( - stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501 - globals=globals, - label=label, - sub_label=sub_label, - description="gptq_marlin_repack", - ).blocked_autorange(min_run_time=min_run_time) - ) + if repack_supported: + results.append( + benchmark.Timer( + stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_repack", + ).blocked_autorange(min_run_time=min_run_time) + ) - if as_supported_case: + if allspark_supported: results.append( benchmark.Timer( stmt="output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)", # noqa: E501 @@ -249,7 +319,6 @@ def main(args): print("Benchmarking models:") for i, model in enumerate(args.models): print(f"[{i}] {model}") - results: list[benchmark.Measurement] = [] for model in args.models: @@ -277,14 +346,17 @@ def main(args): ): continue - for quant_type in query_marlin_supported_quant_types(False): + for quant_type in 
query_marlin_supported_quant_types(): if ( len(args.limit_num_bits) > 0 and quant_type.size_bits not in args.limit_num_bits ): continue - for group_size in MARLIN_SUPPORTED_GROUP_SIZES: + for group_size in ( + MARLIN_SUPPORTED_GROUP_SIZES + + FP4_MARLIN_SUPPORTED_GROUP_SIZES + ): if ( len(args.limit_group_size) > 0 and group_size not in args.limit_group_size diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c2f7660858f57..cef53b183cef3 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json @@ -6,7 +7,6 @@ import time from contextlib import nullcontext from datetime import datetime from itertools import product -from types import SimpleNamespace from typing import Any, TypedDict import ray @@ -42,7 +42,7 @@ def benchmark_config( use_fp8_w8a8: bool, use_int8_w8a16: bool, num_iters: int = 100, - block_quant_shape: List[int] = None, + block_quant_shape: list[int] = None, use_deep_gemm: bool = False, ) -> float: init_dtype = torch.float16 if use_fp8_w8a8 else dtype @@ -399,7 +399,7 @@ class BenchmarkWorker: dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, - block_quant_shape: List[int] = None, + block_quant_shape: list[int] = None, use_deep_gemm: bool = False, ) -> tuple[dict[str, int], float]: current_platform.seed_everything(self.seed) @@ -531,7 +531,7 @@ def save_configs( dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, - block_quant_shape: List[int], + block_quant_shape: list[int], ) -> None: dtype_str = get_config_dtype_str( dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 @@ -562,7 +562,6 @@ def main(args: argparse.Namespace): config = get_config(model=args.model, trust_remote_code=args.trust_remote_code) if args.model_prefix: config = getattr(config, args.model_prefix) - config = SimpleNamespace(**config) if config.architectures[0] == "DbrxForCausalLM": E = config.ffn_config.moe_num_experts @@ -594,11 +593,7 @@ def main(args: argparse.Namespace): shard_intermediate_size = 2 * intermediate_size // args.tp_size hidden_size = config.hidden_size - dtype = ( - torch.float16 - if current_platform.is_rocm() - else getattr(torch, config.torch_dtype) - ) + dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" block_quant_shape = get_weight_block_size_safety(config) diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py new file mode 100644 index 0000000000000..5170ac09dc42a --- /dev/null +++ b/benchmarks/kernels/benchmark_moe_align_block_size.py @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import itertools + +import torch + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( + moe_align_block_size_triton, +) +from vllm.triton_utils import triton + + +def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor: + return torch.stack( + [ + torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk] + for _ in range(num_tokens) + ] + ) + + +def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8): + """ + Verifies vllm vs. 
Triton + """ + topk_ids = get_topk_ids(num_tokens, num_experts, topk) + + # 1. malloc space for triton and vllm + # malloc enough space (max_num_tokens_padded) for the sorted ids + max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) + sorted_ids_triton = torch.empty( + (max_num_tokens_padded,), dtype=torch.int32, device="cuda" + ) + sorted_ids_triton.fill_(topk_ids.numel()) # fill with sentinel value + expert_ids_triton = torch.zeros( + (max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda" + ) + num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda") + + sorted_ids_vllm = torch.empty_like(sorted_ids_triton) + sorted_ids_vllm.fill_(topk_ids.numel()) + expert_ids_vllm = torch.zeros_like(expert_ids_triton) + num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton) + + # 2. run implementations + moe_align_block_size_triton( + topk_ids, + num_experts, + block_size, + sorted_ids_triton, + expert_ids_triton, + num_tokens_post_pad_triton, + ) + + ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_ids_vllm, + expert_ids_vllm, + num_tokens_post_pad_vllm, + ) + print(f"✅ VLLM implementation works with {num_experts} experts!") + + # 3. compare results + if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose( + num_tokens_post_pad_triton, num_tokens_post_pad_vllm + ): + print("✅ Triton and VLLM implementations match.") + else: + print("❌ Triton and VLLM implementations DO NOT match.") + print("Triton expert_ids:", expert_ids_triton) + print("VLLM expert_ids:", expert_ids_vllm) + print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton) + print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm) + + +# test configurations +num_tokens_range = [1, 16, 256, 4096] +num_experts_range = [16, 64, 224, 256, 280, 512] +topk_range = [1, 2, 8] +configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range)) + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["num_tokens", "num_experts", "topk"], + x_vals=configs, + line_arg="provider", + line_vals=["vllm", "triton"], # "triton" + line_names=["VLLM", "Triton"], # "Triton" + plot_name="moe-align-block-size-performance", + args={}, + ) +) +def benchmark(num_tokens, num_experts, topk, provider): + """Benchmark function for Triton.""" + block_size = 256 + topk_ids = get_topk_ids(num_tokens, num_experts, topk) + + max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) + sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda") + sorted_ids.fill_(topk_ids.numel()) + max_num_m_blocks = max_num_tokens_padded // block_size + expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda") + num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda") + + quantiles = [0.5, 0.2, 0.8] + + if provider == "vllm": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: ops.moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_ids.clone(), + expert_ids.clone(), + num_tokens_post_pad.clone(), + ), + quantiles=quantiles, + ) + elif provider == "triton": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: moe_align_block_size_triton( + topk_ids, + num_experts, + block_size, + sorted_ids.clone(), + expert_ids.clone(), + num_tokens_post_pad.clone(), + ), + quantiles=quantiles, + ) + + return 1000 * ms, 1000 * max_ms, 1000 * min_ms + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + 
parser.add_argument( + "--num_experts", + type=int, + default=64, + choices=[8, 16, 32, 64, 128, 256], + ) + parser.add_argument( + "--topk", + type=int, + default=8, + choices=[2, 4, 8], + help="Top-k value for correctness check.", + ) + args = parser.parse_args() + + print("Running correctness check...") + check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk) + benchmark.run(print_data=True, show_plots=True) diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index 333986fdf5eff..dba1f3943b96c 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse from typing import Any, TypedDict diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 17432159c94e7..7e0376c18ecc7 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random import time @@ -84,7 +85,10 @@ def main( if version == "v2": if current_platform.is_rocm(): global PARTITION_SIZE - PARTITION_SIZE = 1024 if not args.custom_paged_attn else PARTITION_SIZE_ROCM + if not args.custom_paged_attn and not current_platform.is_navi(): + PARTITION_SIZE = 1024 + else: + PARTITION_SIZE = PARTITION_SIZE_ROCM num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE tmp_output = torch.empty( size=(num_seqs, num_query_heads, num_partitions, head_size), @@ -159,6 +163,7 @@ def main( scale, block_tables, seq_lens, + None, block_size, max_seq_len, alibi_slopes, diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 2463dfebe83cc..6ab26f5f1adf7 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py index d720083b61503..4cf633a81358d 100644 --- a/benchmarks/kernels/benchmark_rmsnorm.py +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from typing import Optional, Union diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 110d36db157fd..b81baf17a8c67 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from itertools import accumulate from typing import Optional @@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora( seed: int, device: str, max_position: int = 8192, - base: int = 10000, + base: float = 10000, ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py index 70190ba24d9df..18c459c31d3f8 100644 --- a/benchmarks/kernels/benchmark_shapes.py +++ b/benchmarks/kernels/benchmark_shapes.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project WEIGHT_SHAPES = { "ideal": [[4 * 256 * 32, 256 * 32]], diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index 6315c1ee6cdd6..4fcdbadd65ecd 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from sglang quantization/tuning_block_wise_kernel.py import argparse diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py index e377648254512..43c54d56ca8c1 100644 --- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # fmt: off # ruff: noqa: E501 import time @@ -84,12 +85,6 @@ def benchmark_shape(m: int, # === DeepGEMM Implementation === def deepgemm_gemm(): - # A quantization is inside the loop as it depends on activations - # A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A) - # A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8( - # A, block_size[1]) - # A_scale_aligned = get_col_major_tma_aligned_tensor(A_scale_deepgemm) - # C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm) @@ -97,8 +92,6 @@ def benchmark_shape(m: int, # === vLLM Triton Implementation === def vllm_triton_gemm(): - # A quantization is inside the loop as it depends on activations - # A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) return w8a8_block_fp8_matmul(A_vllm, B_vllm, A_scale_vllm, @@ -108,9 +101,6 @@ def benchmark_shape(m: int, # === vLLM CUTLASS Implementation === def vllm_cutlass_gemm(): - # A quantization is inside the loop as it depends on activations - # A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( - # A, block_size[1], column_major_scales=True) return ops.cutlass_scaled_mm(A_vllm_cutlass, B_vllm.T, scale_a=A_scale_vllm_cutlass, diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index ab364a84d6cb2..9a4da0ef5a85d 100644 --- a/benchmarks/kernels/graph_machete_bench.py +++ b/benchmarks/kernels/graph_machete_bench.py @@ -1,12 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import pickle -import re from collections import defaultdict import matplotlib.pyplot as plt import pandas as pd +import regex as re import seaborn as sns from torch.utils.benchmark import Measurement as TMeasurement diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py index 877a29feed9df..4bbb36bb43592 100644 --- a/benchmarks/kernels/utils.py +++ b/benchmarks/kernels/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from collections.abc import Iterable diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py index 89b05d5882a38..a27f02394afbd 100644 --- a/benchmarks/kernels/weight_shapes.py +++ b/benchmarks/kernels/weight_shapes.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Weight Shapes are in the format # ([K, N], TP_SPLIT_DIM) @@ -48,4 +49,50 @@ WEIGHT_SHAPES = { ([16384, 106496], 1), ([53248, 16384], 0), ], + "meta-llama/Llama-3.1-8B-Instruct": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-3.3-70B-Instruct": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 57344], 1), + ([28672, 8192], 0), + ], + "mistralai/Mistral-Large-Instruct-2407": [ + ([12288, 14336], 1), + ([12288, 12288], 0), + ([12288, 57344], 1), + ([28672, 12288], 0), + ], + "Qwen/Qwen2.5-7B-Instruct": [ + ([3584, 4608], 1), + ([3584, 3584], 0), + ([3584, 37888], 1), + ([18944, 3584], 0), + ], + "Qwen/Qwen2.5-32B-Instruct": [ + ([5120, 7168], 1), + ([5120, 5120], 0), + ([5120, 55296], 1), + ([27648, 5120], 0), + ], + "Qwen/Qwen2.5-72B-Instruct": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 59136], 1), + ([29568, 8192], 0), + ], + "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [ + ([2048, 3072], 1), + ([2048, 4096], 1), + ([2048, 2048], 0), + ([2048, 576], 0), + ([2048, 21888], 1), + ([10944, 2048], 0), + ([2048, 2816], 1), + ([1408, 2048], 0), + ], } diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index d5701a8fbd6d8..0957a9c65f06c 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import cProfile import pstats diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml index f825cb203269c..65b1e09a247e2 100644 --- a/benchmarks/pyproject.toml +++ b/benchmarks/pyproject.toml @@ -6,11 +6,6 @@ [tool.ruff] line-length = 88 -exclude = [ - # External file, leaving license intact - "examples/other/fp8/quantizer/quantize.py", - "vllm/vllm_flash_attn/flash_attn_interface.pyi" -] [tool.ruff.lint.per-file-ignores] "vllm/third_party/**" = ["ALL"] diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index fb763db9fc359..5cd2c98f23438 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -75,6 +75,7 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") else() find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND) + find_isa(${CPUINFO} "Power11" POWER11_FOUND) find_isa(${CPUINFO} "POWER10" POWER10_FOUND) find_isa(${CPUINFO} "POWER9" POWER9_FOUND) find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support @@ -106,13 +107,19 @@ elseif (AVX2_FOUND) list(APPEND CXX_COMPILE_FLAGS "-mavx2") message(WARNING "vLLM CPU backend using AVX2 ISA") -elseif (POWER9_FOUND OR POWER10_FOUND) +elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) message(STATUS "PowerPC detected") - # Check for PowerPC VSX support - list(APPEND CXX_COMPILE_FLAGS - "-mvsx" - "-mcpu=native" - "-mtune=native") + if (POWER9_FOUND) + list(APPEND CXX_COMPILE_FLAGS + "-mvsx" + "-mcpu=power9" + "-mtune=power9") + elseif (POWER10_FOUND OR POWER11_FOUND) + list(APPEND CXX_COMPILE_FLAGS + "-mvsx" + "-mcpu=power10" + "-mtune=power10") + endif() elseif (ASIMD_FOUND) message(STATUS "ARMv8 or later architecture detected") diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index b04e4c2d06edc..7b17018f65ab4 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ 
b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491 + GIT_TAG 5f3644181c7a15345ce20bfc65af117d3601b524 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn @@ -46,22 +46,38 @@ else() endif() +# Ensure the vllm/vllm_flash_attn directory exists before installation +install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS) + +# Make sure vllm-flash-attn install rules are nested under vllm/ +# This is here to support installing all components under the same prefix with cmake --install. +# setup.py installs every component separately but uses the same prefix for all. +# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3, +# and these statements don't hurt when installing neither component. +install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS) +install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS) +install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS) + # Fetch the vllm-flash-attn library FetchContent_MakeAvailable(vllm-flash-attn) message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}") +# Restore the install prefix +install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS) +install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) + # Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in # case only one is built, in the case both are built redundant work is done) install( DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ - DESTINATION vllm_flash_attn + DESTINATION vllm/vllm_flash_attn COMPONENT _vllm_fa2_C FILES_MATCHING PATTERN "*.py" ) install( DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ - DESTINATION vllm_flash_attn + DESTINATION vllm/vllm_flash_attn COMPONENT _vllm_fa3_C FILES_MATCHING PATTERN "*.py" ) diff --git a/cmake/hipify.py b/cmake/hipify.py index a15577125eb1f..55d378f5b1113 100755 --- a/cmake/hipify.py +++ b/cmake/hipify.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # # A command line tool for running pytorch's hipify preprocessor on CUDA diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 12e4e39024f5d..621179a701692 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -76,7 +76,7 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) add_custom_target( hipify${NAME} - COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} + COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS} BYPRODUCTS ${HIP_SRCS} COMMENT "Running hipify on ${NAME} extension source files.") @@ -122,6 +122,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG) "-DENABLE_FP8" "-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_OPERATORS__" + "-Werror=unused-variable" "-fno-gpu-rdc") endif() @@ -264,8 +265,8 @@ macro(set_gencode_flags_for_srcs) endmacro() # -# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form -# `.[letter]` compute the "loose intersection" with the +# 
For the given `SRC_CUDA_ARCHS` list of gencode versions in the form +# `.[letter]` compute the "loose intersection" with the # `TGT_CUDA_ARCHS` list of gencodes. We also support the `+PTX` suffix in # `SRC_CUDA_ARCHS` which indicates that the PTX code should be built when there # is a CUDA_ARCH in `TGT_CUDA_ARCHS` that is equal to or larger than the @@ -277,7 +278,7 @@ endmacro() # in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`. # We have special handling for x.0a, if x.0a is in `SRC_CUDA_ARCHS` and x.0 is # in `TGT_CUDA_ARCHS` then we should remove x.0a from `SRC_CUDA_ARCHS` and add -# x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS). +# x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS). # The result is stored in `OUT_CUDA_ARCHS`. # # Example: @@ -312,21 +313,16 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS set(_CUDA_ARCHS) - if ("9.0a" IN_LIST _SRC_CUDA_ARCHS) - list(REMOVE_ITEM _SRC_CUDA_ARCHS "9.0a") - if ("9.0" IN_LIST TGT_CUDA_ARCHS) - list(REMOVE_ITEM _TGT_CUDA_ARCHS "9.0") - set(_CUDA_ARCHS "9.0a") + foreach(_arch ${_SRC_CUDA_ARCHS}) + if(_arch MATCHES "\\a$") + list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}") + string(REPLACE "a" "" _base "${_arch}") + if ("${_base}" IN_LIST TGT_CUDA_ARCHS) + list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}") + list(APPEND _CUDA_ARCHS "${_arch}") + endif() endif() - endif() - - if ("10.0a" IN_LIST _SRC_CUDA_ARCHS) - list(REMOVE_ITEM _SRC_CUDA_ARCHS "10.0a") - if ("10.0" IN_LIST TGT_CUDA_ARCHS) - list(REMOVE_ITEM _TGT_CUDA_ARCHS "10.0") - set(_CUDA_ARCHS "10.0a") - endif() - endif() + endforeach() list(SORT _SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) @@ -358,7 +354,7 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR endforeach() list(REMOVE_DUPLICATES _CUDA_ARCHS) - + # reapply +PTX suffix to architectures that requested PTX set(_FINAL_ARCHS) foreach(_arch ${_CUDA_ARCHS}) @@ -369,7 +365,7 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR endif() endforeach() set(_CUDA_ARCHS ${_FINAL_ARCHS}) - + set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE) endfunction() diff --git a/csrc/attention/merge_attn_states.cu b/csrc/attention/merge_attn_states.cu index 14e5edd7e283d..6bee9e4ce1166 100644 --- a/csrc/attention/merge_attn_states.cu +++ b/csrc/attention/merge_attn_states.cu @@ -143,6 +143,14 @@ void merge_attn_states_launcher(torch::Tensor& output, const uint pack_size = 16 / sizeof(scalar_t); TORCH_CHECK(head_size % pack_size == 0, "headsize must be multiple of pack_size:", pack_size); + TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1, + "output heads must be contiguous in memory"); + TORCH_CHECK( + prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1, + "prefix_output heads must be contiguous in memory"); + TORCH_CHECK( + suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1, + "suffix_output heads must be contiguous in memory"); float* output_lse_ptr = nullptr; if (output_lse.has_value()) { output_lse_ptr = output_lse.value().data_ptr(); diff --git a/csrc/attention/mla/cutlass_mla_kernels.cu b/csrc/attention/mla/cutlass_mla_kernels.cu index 6743af0cf2dba..9d05d910dd81f 100644 --- a/csrc/attention/mla/cutlass_mla_kernels.cu +++ b/csrc/attention/mla/cutlass_mla_kernels.cu @@ -119,7 +119,7 @@ typename T::Fmha::Arguments 
args_from_options( {static_cast(out.data_ptr()), stride_O, static_cast(nullptr), stride_LSE}, hw_info, - -1, // split_kv + 1, // split_kv nullptr, // is_var_split_kv }; // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute @@ -207,7 +207,7 @@ void cutlass_mla_decode_sm100a(torch::Tensor const& out, "page_table must be a 32-bit integer tensor"); auto in_dtype = q_nope.dtype(); - at::cuda::CUDAGuard device_guard{(char)q_nope.get_device()}; + const at::cuda::OptionalCUDAGuard device_guard(device_of(q_nope)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(q_nope.get_device()); if (in_dtype == at::ScalarType::Half) { diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu index 9b3a5c4b1014a..46108a32d719b 100644 --- a/csrc/attention/paged_attention_v1.cu +++ b/csrc/attention/paged_attention_v1.cu @@ -65,9 +65,6 @@ void paged_attention_v1_launcher( int kv_block_stride = key_cache.stride(0); int kv_head_stride = key_cache.stride(1); - [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - // NOTE: alibi_slopes is optional. const float* alibi_slopes_ptr = alibi_slopes @@ -193,4 +190,4 @@ void paged_attention_v1( #undef WARP_SIZE #undef MAX #undef MIN -#undef DIVIDE_ROUND_UP \ No newline at end of file +#undef DIVIDE_ROUND_UP diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu index 9935359e02fb1..9358c0d9f6a2a 100644 --- a/csrc/attention/paged_attention_v2.cu +++ b/csrc/attention/paged_attention_v2.cu @@ -66,9 +66,6 @@ void paged_attention_v2_launcher( int kv_block_stride = key_cache.stride(0); int kv_head_stride = key_cache.stride(1); - [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - // NOTE: alibi_slopes is optional. const float* alibi_slopes_ptr = alibi_slopes @@ -203,4 +200,4 @@ void paged_attention_v2( #undef WARP_SIZE #undef MAX #undef MIN -#undef DIVIDE_ROUND_UP \ No newline at end of file +#undef DIVIDE_ROUND_UP diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index 0257d8ff16baf..82862fea7f2be 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -137,8 +137,8 @@ FORCE_INLINE std::pair reduceSoftmaxAlibi(T* data, const int size, } template -FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data, - const int size) { +FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data, + const int size) { T max = max_data[0]; for (int i = 1; i < size; ++i) { max = max >= max_data[i] ? max : max_data[i]; @@ -634,7 +634,7 @@ struct paged_attention_v2_impl { if (partition_num == 1) continue; - reducePartitonSoftmax( + reducePartitionSoftmax( max_logits + seq_idx * num_heads * max_num_partitions + head_idx * max_num_partitions, exp_sums + seq_idx * num_heads * max_num_partitions + diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index cf67847b45ba0..3952c43cbc727 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -19,6 +19,7 @@ namespace vec_op { #define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...) \ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ @@ -82,7 +83,7 @@ struct FP16Vec16 : public Vec { explicit FP16Vec16(const void* ptr) : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - // non-temproal load + // non-temporal load explicit FP16Vec16(bool, void* ptr) : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} @@ -119,7 +120,7 @@ struct BF16Vec16 : public Vec { explicit BF16Vec16(const void* ptr) : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - // non-temproal load + // non-temporal load explicit BF16Vec16(bool, void* ptr) : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} @@ -326,7 +327,7 @@ struct FP32Vec16 : public Vec { // normal load explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {} - // non-temproal load + // non-temporal load explicit FP32Vec16(bool, void* ptr) : reg((__m512)_mm512_stream_load_si512(ptr)) {} @@ -575,7 +576,7 @@ struct INT8Vec64 : public Vec { // normal load explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {} - // non-temproal load + // non-temporal load explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {} void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); } @@ -586,7 +587,7 @@ struct INT8Vec64 : public Vec { _mm512_mask_storeu_epi8(ptr, mask, reg); } - // non-temproal save + // non-temporal save void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); } }; #endif diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 447e826bc1c09..60304d229a8f5 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -131,16 +131,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Quantization #ifdef __AVX512F__ + at::Tag stride_tag = at::Tag::needs_fixed_stride_order; // Compute int8 quantized tensor for given scaling factor. ops.def( "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," - "Tensor? azp) -> ()"); + "Tensor? azp) -> ()", + {stride_tag}); ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant); // Compute int8 quantized tensor and scaling factor ops.def( "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " - "Tensor!? azp) -> ()"); + "Tensor!? azp) -> ()", + {stride_tag}); ops.impl("dynamic_scaled_int8_quant", torch::kCPU, &dynamic_scaled_int8_quant); // W8A8 GEMM, supporting symmetric per-tensor or per-row/column @@ -148,7 +151,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def( "cutlass_scaled_mm(Tensor! out, Tensor a," " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor? bias) -> ()"); + " Tensor b_scales, Tensor? bias) -> ()", + {stride_tag}); ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm); // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column // quantization. @@ -156,7 +160,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "cutlass_scaled_mm_azp(Tensor! out, Tensor a," " Tensor b, Tensor a_scales," " Tensor b_scales, Tensor azp_adj," - " Tensor? azp, Tensor? bias) -> ()"); + " Tensor? azp, Tensor? bias) -> ()", + {stride_tag}); ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); #elif defined(__powerpc64__) // Compute int8 quantized tensor for given scaling factor. 
diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp index c17a8961629a6..02514edce8073 100644 --- a/csrc/cpu/utils.cpp +++ b/csrc/cpu/utils.cpp @@ -54,8 +54,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp); int page_num = numa_migrate_pages(pid, src_mask, mask); if (page_num == -1) { - TORCH_CHECK(false, - "numa_migrate_pages failed. errno: " + std::to_string(errno)); + TORCH_WARN("numa_migrate_pages failed. errno: " + std::to_string(errno)); } // restrict memory allocation node. @@ -105,4 +104,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { return ss.str(); } -#endif \ No newline at end of file +#endif diff --git a/csrc/custom_quickreduce.cu b/csrc/custom_quickreduce.cu new file mode 100644 index 0000000000000..33d0d4a7226e6 --- /dev/null +++ b/csrc/custom_quickreduce.cu @@ -0,0 +1,114 @@ +#include +#include +#include +#include + +#ifdef USE_ROCM + + #include "quickreduce/quick_reduce.h" + +quickreduce::fptr_t init_custom_qr(int64_t rank, int64_t world_size, + std::optional qr_max_size) { + if (world_size > 8) + throw std::invalid_argument("world size > 8 is not supported"); + if (world_size == 6) + throw std::invalid_argument("world size == 6 is not supported"); + if (world_size % 2 != 0) + throw std::invalid_argument("Odd num gpus is not supported for now"); + if (rank < 0 || rank >= world_size) + throw std::invalid_argument("invalid rank passed in"); + quickreduce::DeviceComms* fptr = new quickreduce::DeviceComms(); + fptr->init(world_size, rank, qr_max_size); + return (quickreduce::fptr_t)fptr; +} + +void qr_destroy(quickreduce::fptr_t _fa) { + if (_fa) { + auto fa = reinterpret_cast(_fa); + fa->destroy(); + delete fa; + } +} + +torch::Tensor qr_get_handle(quickreduce::fptr_t _fa) { + auto fa = reinterpret_cast(_fa); + hipIpcMemHandle_t handle = fa->get_handle(); + auto options = + torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU); + auto data_handle = + torch::empty({static_cast(sizeof(hipIpcMemHandle_t))}, options); + std::memcpy(data_handle.data_ptr(), &handle, sizeof(hipIpcMemHandle_t)); + return data_handle; +} + +void qr_open_handles(quickreduce::fptr_t _fa, + const std::vector& handles) { + auto fa = reinterpret_cast(_fa); + std::vector ipc_handles; + ipc_handles.reserve(handles.size()); + for (auto& handle : handles) { + // Ensure the tensor is on the same device as the current device. 
+ hipIpcMemHandle_t ipc_handle; + std::memcpy(&ipc_handle, handle.data_ptr(), sizeof(hipIpcMemHandle_t)); + ipc_handles.push_back(ipc_handle); + } + fa->open_ipc_handles(ipc_handles); +} + +void qr_all_reduce(quickreduce::fptr_t _fa, torch::Tensor& inp, + torch::Tensor& out, int64_t quant_level, bool cast_bf2half) { + auto fa = reinterpret_cast(_fa); + const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); + auto stream = at::cuda::getCurrentHIPStreamMasqueradingAsCUDA(); + + TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); + TORCH_CHECK_EQ(inp.numel(), out.numel()); + TORCH_CHECK_LE(out.numel(), fa->kMaxProblemSize); + if (out.scalar_type() == at::ScalarType::Half) { + fa->allreduce(reinterpret_cast(inp.data_ptr()), + reinterpret_cast(out.data_ptr()), + out.numel(), quant_level, stream); + } else if (out.scalar_type() == at::ScalarType::BFloat16) { + if (cast_bf2half) { + fa->allreduce(reinterpret_cast(inp.data_ptr()), + reinterpret_cast(out.data_ptr()), + out.numel(), quant_level, stream); + } else { + fa->allreduce( + reinterpret_cast(inp.data_ptr()), + reinterpret_cast(out.data_ptr()), + out.numel(), quant_level, stream); + } + } else { + throw std::runtime_error( + "quick allreduce only supports float16 and bfloat16"); + } +} + +int64_t qr_max_size() { + // The default is 2GB (2,147,483,648 bytes) + return static_cast(std::numeric_limits::max()) + 1; +} + + #define INSTANTIATE_FOR_WORLDSIZE(T, Codec, cast_bf2half) \ + template struct quickreduce::AllReduceTwoshot, \ + cast_bf2half>; \ + template struct quickreduce::AllReduceTwoshot, \ + cast_bf2half>; \ + template struct quickreduce::AllReduceTwoshot, cast_bf2half>; + +INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, false) +INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, false) +INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, false) +INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, false) +INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, true) +INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, true) +INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, true) +INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, true) + +INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecFP, false) +INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ4, false) +INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ6, false) +INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ8, false) + +#endif // USE_ROCM \ No newline at end of file diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py index d64f0d0a5c2a0..1dd7101acc27d 100644 --- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum from typing import Union diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu index 98daf1a1b8e6c..c83d72751a55c 100644 --- a/csrc/mamba/causal_conv1d/causal_conv1d.cu +++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu @@ -13,6 +13,10 @@ #include #include +#ifdef USE_ROCM + namespace cub = hipcub; +#endif + #include "static_switch.h" @@ -181,9 +185,7 @@ void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight, params.conv_states_ptr = 
nullptr; } - // Otherwise the kernel will be launched from cuda:0 device - // Cast to char to avoid compiler warning about narrowing - at::cuda::CUDAGuard device_guard{(char)x.get_device()}; + const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); auto stream = at::cuda::getCurrentCUDAStream().stream(); DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_fwd", [&] { causal_conv1d_fwd_cuda(params, stream); @@ -274,9 +276,7 @@ void causal_conv1d_update(const at::Tensor &x, params.conv_state_indices_ptr = nullptr; } - // Otherwise the kernel will be launched from cuda:0 device - // Cast to char to avoid compiler warning about narrowing - at::cuda::CUDAGuard device_guard{(char)x.get_device()}; + const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); auto stream = at::cuda::getCurrentCUDAStream().stream(); DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_update", [&] { causal_conv1d_update_cuda(params, stream); @@ -501,15 +501,9 @@ void causal_conv1d_fwd_launch(ConvParamsBase ¶ms, cudaStream_t stream) { auto kernel = &causal_conv1d_fwd_kernel; if (kSmemSize >= 48 * 1024) { - #ifndef USE_ROCM - C10_CUDA_CHECK(cudaFuncSetAttribute( - kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); - #else - // There is a slight signature discrepancy in HIP and CUDA "FuncSetAttribute" function. C10_CUDA_CHECK(cudaFuncSetAttribute( (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); std::cerr << "Warning (causal_conv1d fwd launch): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. \n" << std::endl; - #endif } kernel<<>>(params); diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu index bd0a34119c82b..785d316025eca 100644 --- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu +++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu @@ -321,7 +321,7 @@ void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) { auto kernel = &selective_scan_fwd_kernel; if (kSmemSize >= 48 * 1024) { C10_CUDA_CHECK(cudaFuncSetAttribute( - kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); + (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); } kernel<<>>(params); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -647,9 +647,7 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, ); - // Otherwise the kernel will be launched from cuda:0 device - // Cast to char to avoid compiler warning about narrowing - at::cuda::CUDAGuard device_guard{(char)u.get_device()}; + const at::cuda::OptionalCUDAGuard device_guard(device_of(u)); auto stream = at::cuda::getCurrentCUDAStream().stream(); DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] { selective_scan_fwd_cuda(params, stream); diff --git a/csrc/moe/marlin_moe_wna16/generate_kernels.py b/csrc/moe/marlin_moe_wna16/generate_kernels.py index 15f008d4f61ed..49f33718a21e8 100644 --- a/csrc/moe/marlin_moe_wna16/generate_kernels.py +++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import glob import itertools import os diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index 6b6a9d04a60f4..9335e2333b0d9 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -13,232 +13,45 @@ 
namespace vllm { namespace moe { -namespace { -__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, - int32_t col) { - // don't worry about overflow because num_experts is relatively small - return row * total_col + col; -} -} // namespace - -template -__global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, - int32_t* sorted_token_ids, - int32_t* expert_ids, - int32_t* total_tokens_post_pad, - int32_t num_experts, - int32_t block_size, size_t numel) { - const size_t tokens_per_thread = CEILDIV(numel, blockDim.x); - const size_t start_idx = threadIdx.x * tokens_per_thread; - - extern __shared__ int32_t shared_mem[]; - int32_t* cumsum = shared_mem; // 1d tensor with shape (num_experts + 1) - token_cnts_t* tokens_cnts = - (token_cnts_t*)(shared_mem + num_experts + - 1); // 2d tensor with shape (blockDim.x + 1, num_experts) - - for (int i = 0; i < num_experts; ++i) { - tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; - } - - /** - * In the first step we compute token_cnts[thread_index + 1][expert_index], - * which counts how many tokens in the token shard of thread_index are - * assigned to expert expert_index. - */ - for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { - ++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])]; - } - - __syncthreads(); - - // For each expert we accumulate the token counts from the different threads. - if (threadIdx.x < num_experts) { - tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0; - for (int i = 1; i <= blockDim.x; ++i) { - tokens_cnts[index(num_experts, i, threadIdx.x)] += - tokens_cnts[index(num_experts, i - 1, threadIdx.x)]; - } - } - - __syncthreads(); - - // We accumulate the token counts of all experts in thread 0. - if (threadIdx.x == 0) { - cumsum[0] = 0; - for (int i = 1; i <= num_experts; ++i) { - cumsum[i] = cumsum[i - 1] + - CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)], - block_size) * - block_size; - } - *total_tokens_post_pad = static_cast(cumsum[num_experts]); - } - - __syncthreads(); - - /** - * For each expert, each thread processes the tokens of the corresponding - * blocks and stores the corresponding expert_id for each block. - */ - if (threadIdx.x < num_experts) { - for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; - i += block_size) { - expert_ids[i / block_size] = threadIdx.x; - } - } - - /** - * Each thread processes a token shard, calculating the index of each token - * after sorting by expert number. Given the example topk_ids = - * [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *, - * *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a - * padding value(preset in python). - */ - for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { - int32_t expert_id = topk_ids[i]; - /** The cumsum[expert_id] stores the starting index of the tokens that the - * expert with expert_id needs to process, and - * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens - * processed by the expert with expert_id within the current thread's token - * shard. 
- */ - int32_t rank_post_pad = - tokens_cnts[index(num_experts, threadIdx.x, expert_id)] + - cumsum[expert_id]; - sorted_token_ids[rank_post_pad] = i; - ++tokens_cnts[index(num_experts, threadIdx.x, expert_id)]; - } -} - -// TODO(simon): this is temporarily adapted from -// https://github.com/sgl-project/sglang/commit/31548116a8dc8c6df7e146e0587335a59fc5b9d7 -// we did this to unblock Deepseek V3 but there should be a better -// implementation to manage shared memory. template -__global__ void moe_align_block_size_global_mem_kernel( - scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids, - int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts, - int32_t block_size, size_t numel, int32_t* tokens_cnts, int32_t* cumsum) { - const size_t tokens_per_thread = CEILDIV(numel, blockDim.x); - const size_t start_idx = threadIdx.x * tokens_per_thread; +__global__ void moe_align_block_size_kernel( + const scalar_t* __restrict__ topk_ids, + int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, + int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts, + int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size, + size_t numel, int32_t* __restrict__ cumsum) { + extern __shared__ int32_t shared_counts[]; - for (int i = 0; i < num_experts; ++i) { - tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; - } - - /** - * In the first step we compute token_cnts[thread_index + 1][expert_index], - * which counts how many tokens in the token shard of thread_index are - * assigned to expert expert_index. - */ - for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { - ++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])]; - } - - __syncthreads(); - - // For each expert we accumulate the token counts from the different threads. - if (threadIdx.x < num_experts) { - tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0; - for (int i = 1; i <= blockDim.x; ++i) { - tokens_cnts[index(num_experts, i, threadIdx.x)] += - tokens_cnts[index(num_experts, i - 1, threadIdx.x)]; - } - } - - __syncthreads(); - - // We accumulate the token counts of all experts in thread 0. - if (threadIdx.x == 0) { - cumsum[0] = 0; - for (int i = 1; i <= num_experts; ++i) { - cumsum[i] = cumsum[i - 1] + - CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)], - block_size) * - block_size; - } - *total_tokens_post_pad = cumsum[num_experts]; - } - - __syncthreads(); - - /** - * For each expert, each thread processes the tokens of the corresponding - * blocks and stores the corresponding expert_id for each block. - */ - if (threadIdx.x < num_experts) { - for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; - i += block_size) { - expert_ids[i / block_size] = threadIdx.x; - } - } - - /** - * Each thread processes a token shard, calculating the index of each token - * after sorting by expert number. Given the example topk_ids = - * [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *, - * *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a - * padding value(preset in python). - */ - for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { - int32_t expert_id = topk_ids[i]; - /** The cumsum[expert_id] stores the starting index of the tokens that the - * expert with expert_id needs to process, and - * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens - * processed by the expert with expert_id within the current thread's token - * shard. 
- */ - int32_t rank_post_pad = - tokens_cnts[index(num_experts, threadIdx.x, expert_id)] + - cumsum[expert_id]; - sorted_token_ids[rank_post_pad] = i; - ++tokens_cnts[index(num_experts, threadIdx.x, expert_id)]; - } -} - -// taken from -// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957 -template -__global__ void sgl_moe_align_block_size_kernel( - scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids, - int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts, - int32_t block_size, size_t numel, int32_t* cumsum) { - __shared__ int32_t shared_counts[32][8]; - - const int warp_id = threadIdx.x / 32; - const int experts_per_warp = 8; + const int warp_id = threadIdx.x / WARP_SIZE; const int my_expert_start = warp_id * experts_per_warp; - // Initialize shared_counts for this warp's experts for (int i = 0; i < experts_per_warp; ++i) { - if (my_expert_start + i < num_experts) { - shared_counts[warp_id][i] = 0; + if (my_expert_start + i < padded_num_experts) { + shared_counts[warp_id * experts_per_warp + i] = 0; } } __syncthreads(); - const size_t tokens_per_thread = CEILDIV(numel, blockDim.x); - const size_t start_idx = threadIdx.x * tokens_per_thread; + const size_t tid = threadIdx.x; + const size_t stride = blockDim.x; - for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { + for (size_t i = tid; i < numel; i += stride) { int expert_id = topk_ids[i]; int warp_idx = expert_id / experts_per_warp; int expert_offset = expert_id % experts_per_warp; - atomicAdd(&shared_counts[warp_idx][expert_offset], 1); + atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1); } __syncthreads(); - // Single thread computes cumulative sum and total tokens if (threadIdx.x == 0) { cumsum[0] = 0; for (int i = 1; i <= num_experts; ++i) { int expert_count = 0; int warp_idx = (i - 1) / experts_per_warp; int expert_offset = (i - 1) % experts_per_warp; - expert_count = shared_counts[warp_idx][expert_offset]; + expert_count = shared_counts[warp_idx * experts_per_warp + expert_offset]; cumsum[i] = cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size; @@ -248,7 +61,6 @@ __global__ void sgl_moe_align_block_size_kernel( __syncthreads(); - // Assign expert IDs to blocks if (threadIdx.x < num_experts) { for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; i += block_size) { @@ -257,13 +69,11 @@ __global__ void sgl_moe_align_block_size_kernel( } } -// taken from -// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957 template -__global__ void sgl_moe_token_sort_kernel(scalar_t* __restrict__ topk_ids, - int32_t* sorted_token_ids, - int32_t* cumsum_buffer, - size_t numel) { +__global__ void count_and_sort_expert_tokens_kernel( + const scalar_t* __restrict__ topk_ids, + int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer, + size_t numel) { const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; const size_t stride = blockDim.x * gridDim.x; @@ -290,132 +100,138 @@ __global__ void moe_sum_kernel( } } +template +__global__ void moe_align_block_size_small_batch_expert_kernel( + const scalar_t* __restrict__ topk_ids, + int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, + int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts, + int32_t block_size, size_t numel) { + const size_t tid = threadIdx.x; + const size_t stride = blockDim.x; + + extern __shared__ int32_t shared_mem[]; + int32_t* cumsum = shared_mem; + int32_t* 
tokens_cnts = (int32_t*)(shared_mem + num_experts + 1); + + for (int i = 0; i < num_experts; ++i) { + tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0; + } + + for (size_t i = tid; i < numel; i += stride) { + ++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i]]; + } + + __syncthreads(); + + if (threadIdx.x < num_experts) { + tokens_cnts[threadIdx.x] = 0; + for (int i = 1; i <= blockDim.x; ++i) { + tokens_cnts[i * num_experts + threadIdx.x] += + tokens_cnts[(i - 1) * num_experts + threadIdx.x]; + } + } + + __syncthreads(); + + if (threadIdx.x == 0) { + cumsum[0] = 0; + for (int i = 1; i <= num_experts; ++i) { + cumsum[i] = + cumsum[i - 1] + + CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) * + block_size; + } + *total_tokens_post_pad = static_cast(cumsum[num_experts]); + } + + __syncthreads(); + + if (threadIdx.x < num_experts) { + for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; + i += block_size) { + expert_ids[i / block_size] = threadIdx.x; + } + } + + for (size_t i = tid; i < numel; i += stride) { + int32_t expert_id = topk_ids[i]; + int32_t rank_post_pad = + tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id]; + sorted_token_ids[rank_post_pad] = i; + ++tokens_cnts[threadIdx.x * num_experts + expert_id]; + } +} + } // namespace moe } // namespace vllm +// taken from +// https://github.com/sgl-project/sglang/blob/8b5f83ed3b7d2a49ad5c5cd5aa61c5d502f47dbc void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int64_t block_size, torch::Tensor sorted_token_ids, torch::Tensor experts_ids, torch::Tensor num_tokens_post_pad) { const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - int device_max_shared_mem; - auto dev = topk_ids.get_device(); - cudaDeviceGetAttribute(&device_max_shared_mem, - cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); - - const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); - const int32_t shared_mem_i32 = - ((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t); - const int32_t shared_mem_i16 = - ((num_thread + 1) * num_experts) * sizeof(uint16_t) + - (num_experts + 1) * sizeof(int32_t); - - bool use_global_memory = false; - bool use_i16 = false; // Use uint16_t for shared memory token counts - if (shared_mem_i32 < device_max_shared_mem) { - // Do nothing in this case. 
We're all set to use int32_t token counts - } else if (shared_mem_i16 < device_max_shared_mem && - topk_ids.numel() <= 65535) { - // when nelements of topk_ids is smaller than 65535 (max value of uint16), - // element value of token_cnts would also smaller than 65535, - // so we can use uint16 as dtype of token_cnts - use_i16 = true; - } else { - use_global_memory = true; - } - - if (use_global_memory) { - VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES( - topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] { - // calc needed amount of shared mem for `tokens_cnts` and `cumsum` - // tensors - const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); - - auto options_int = torch::TensorOptions() - .dtype(torch::kInt) - .device(topk_ids.device()); - torch::Tensor token_cnts_buffer = - torch::empty({(num_experts + 1) * num_experts}, options_int); - torch::Tensor cumsum_buffer = - torch::empty({num_experts + 1}, options_int); - - auto kernel = - vllm::moe::moe_align_block_size_global_mem_kernel; - kernel<<<1, num_thread, 0, stream>>>( - topk_ids.data_ptr(), - sorted_token_ids.data_ptr(), - experts_ids.data_ptr(), - num_tokens_post_pad.data_ptr(), num_experts, block_size, - topk_ids.numel(), token_cnts_buffer.data_ptr(), - cumsum_buffer.data_ptr()); - }); - } else if (use_i16) { - VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES( - topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { - // set dynamic shared mem - auto kernel = - vllm::moe::moe_align_block_size_kernel; - AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( - (void*)kernel, shared_mem_i16)); - kernel<<<1, num_thread, shared_mem_i16, stream>>>( - topk_ids.data_ptr(), - sorted_token_ids.data_ptr(), - experts_ids.data_ptr(), - num_tokens_post_pad.data_ptr(), num_experts, block_size, - topk_ids.numel()); - }); - } else { - VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES( - topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { - auto kernel = - vllm::moe::moe_align_block_size_kernel; - AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( - (void*)kernel, shared_mem_i32)); - kernel<<<1, num_thread, shared_mem_i32, stream>>>( - topk_ids.data_ptr(), - sorted_token_ids.data_ptr(), - experts_ids.data_ptr(), - num_tokens_post_pad.data_ptr(), num_experts, block_size, - topk_ids.numel()); - }); - } -} - -void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, - int64_t block_size, - torch::Tensor sorted_token_ids, - torch::Tensor experts_ids, - torch::Tensor num_tokens_post_pad) { - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - TORCH_CHECK(num_experts == 256, - "sgl_moe_align_block_size kernel only supports deepseek v3."); + int64_t padded_num_experts = + ((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE; + int experts_per_warp = WARP_SIZE; + int threads = 1024; + threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE; VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES( - topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] { + topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { // calc needed amount of shared mem for `cumsum` tensors auto options_int = torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device()); torch::Tensor cumsum_buffer = torch::zeros({num_experts + 1}, options_int); + bool small_batch_expert_mode = + (topk_ids.numel() < 1024) && (num_experts <= 64); - auto align_kernel = - vllm::moe::sgl_moe_align_block_size_kernel; - align_kernel<<<1, 1024, 0, stream>>>( - topk_ids.data_ptr(), 
sorted_token_ids.data_ptr(), - experts_ids.data_ptr(), - num_tokens_post_pad.data_ptr(), num_experts, block_size, - topk_ids.numel(), cumsum_buffer.data_ptr()); + if (small_batch_expert_mode) { + const int32_t threads = max((int32_t)num_experts, WARP_SIZE); + const int32_t shared_mem_size = + ((threads + 1) * num_experts + (num_experts + 1)) * + sizeof(int32_t); - const int block_threads = 256; - const int num_blocks = - (topk_ids.numel() + block_threads - 1) / block_threads; - const int max_blocks = 65535; - const int actual_blocks = std::min(num_blocks, max_blocks); - auto sort_kernel = vllm::moe::sgl_moe_token_sort_kernel; - sort_kernel<<>>( - topk_ids.data_ptr(), sorted_token_ids.data_ptr(), - cumsum_buffer.data_ptr(), topk_ids.numel()); + auto small_batch_expert_kernel = + vllm::moe::moe_align_block_size_small_batch_expert_kernel< + scalar_t>; + small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>( + topk_ids.data_ptr(), + sorted_token_ids.data_ptr(), + experts_ids.data_ptr(), + num_tokens_post_pad.data_ptr(), num_experts, block_size, + topk_ids.numel()); + } else { + auto align_kernel = vllm::moe::moe_align_block_size_kernel; + + size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp); + size_t shared_mem_size = + num_warps * experts_per_warp * sizeof(int32_t); + + align_kernel<<<1, threads, shared_mem_size, stream>>>( + topk_ids.data_ptr(), + sorted_token_ids.data_ptr(), + experts_ids.data_ptr(), + num_tokens_post_pad.data_ptr(), num_experts, + padded_num_experts, experts_per_warp, block_size, + topk_ids.numel(), cumsum_buffer.data_ptr()); + + const int block_threads = std::min(256, (int)threads); + const int num_blocks = + (topk_ids.numel() + block_threads - 1) / block_threads; + const int max_blocks = 65535; + const int actual_blocks = std::min(num_blocks, max_blocks); + + auto sort_kernel = + vllm::moe::count_and_sort_expert_tokens_kernel; + sort_kernel<<>>( + topk_ids.data_ptr(), + sorted_token_ids.data_ptr(), + cumsum_buffer.data_ptr(), topk_ids.numel()); + } }); } diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index 0bae119a7c460..661730c96867e 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -12,12 +12,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int64_t block_size, torch::Tensor sorted_token_ids, torch::Tensor experts_ids, torch::Tensor num_tokens_post_pad); - -void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, - int64_t block_size, - torch::Tensor sorted_token_ids, - torch::Tensor experts_ids, - torch::Tensor num_tokens_post_pad); #ifndef USE_ROCM torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, torch::Tensor b_qweight, torch::Tensor b_scales, @@ -28,4 +22,10 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, torch::Tensor num_tokens_post_pad, int64_t top_k, int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N, int64_t BLOCK_SIZE_K, int64_t bit); -#endif \ No newline at end of file +#endif + +bool moe_permute_unpermute_supported(); + +void shuffle_rows(const torch::Tensor& input_tensor, + const torch::Tensor& dst2src_map, + torch::Tensor& output_tensor); \ No newline at end of file diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu index 76d5f0eab0218..a77471a7f2078 100644 --- a/csrc/moe/moe_permute_unpermute_op.cu +++ b/csrc/moe/moe_permute_unpermute_op.cu @@ -5,11 +5,14 @@ #include "permute_unpermute_kernels/dispatch.h" #include "core/registration.h" +// moe_permute kernels require at least CUDA 12.0 +#if 
defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) + void moe_permute( const torch::Tensor& input, // [n_token, hidden] const torch::Tensor& topk_weights, //[n_token, topk] torch::Tensor& topk_ids, // [n_token, topk] - const torch::Tensor& token_expert_indicies, // [n_token, topk] + const torch::Tensor& token_expert_indices, // [n_token, topk] const std::optional& expert_map, // [n_expert] int64_t n_expert, int64_t n_local_expert, int64_t topk, const std::optional& align_block_size, @@ -24,15 +27,15 @@ void moe_permute( "expert_first_token_offset must be int64"); TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int, "topk_ids must be int32"); - TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int, - "token_expert_indicies must be int32"); + TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int, + "token_expert_indices must be int32"); TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int, "src_row_id2dst_row_id_map must be int32"); TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1, "expert_first_token_offset shape != n_local_expert+1") TORCH_CHECK( - src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(), - "token_expert_indicies shape must be same as src_row_id2dst_row_id_map"); + src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(), + "token_expert_indices shape must be same as src_row_id2dst_row_id_map"); auto n_token = input.sizes()[0]; auto n_hidden = input.sizes()[1]; auto align_block_size_value = @@ -68,7 +71,7 @@ void moe_permute( expert_map_ptr, n_expert, stream); } // expert sort topk expert id and scan expert id get expert_first_token_offset - sortAndScanExpert(get_ptr(topk_ids), get_ptr(token_expert_indicies), + sortAndScanExpert(get_ptr(topk_ids), get_ptr(token_expert_indices), get_ptr(permuted_experts_id), get_ptr(dst_row_id2src_row_id_map), get_ptr(expert_first_token_offset), n_token, @@ -127,7 +130,101 @@ void moe_unpermute( }); } +template +__global__ void shuffleInputRowsKernel(const T* input, + const int32_t* dst2src_map, T* output, + int64_t num_src_rows, + int64_t num_dst_rows, int64_t num_cols) { + int64_t dest_row_idx = blockIdx.x; + int64_t const source_row_idx = dst2src_map[dest_row_idx]; + + if (blockIdx.x < num_dst_rows) { + // Load 128-bits per thread + constexpr int64_t ELEM_PER_THREAD = 128 / sizeof(T) / 8; + using DataElem = cutlass::Array; + + // Duplicate and permute rows + auto const* source_row_ptr = + reinterpret_cast(input + source_row_idx * num_cols); + auto* dest_row_ptr = + reinterpret_cast(output + dest_row_idx * num_cols); + + int64_t const start_offset = threadIdx.x; + int64_t const stride = blockDim.x; + int64_t const num_elems_in_col = num_cols / ELEM_PER_THREAD; + + for (int elem_index = start_offset; elem_index < num_elems_in_col; + elem_index += stride) { + dest_row_ptr[elem_index] = source_row_ptr[elem_index]; + } + } +} + +void shuffle_rows(const torch::Tensor& input_tensor, + const torch::Tensor& dst2src_map, + torch::Tensor& output_tensor) { + TORCH_CHECK(input_tensor.scalar_type() == output_tensor.scalar_type(), + "Input and output tensors must have the same data type"); + + auto stream = at::cuda::getCurrentCUDAStream().stream(); + int64_t const blocks = output_tensor.size(0); + int64_t const threads = 256; + int64_t const num_dest_rows = output_tensor.size(0); + int64_t const num_src_rows = input_tensor.size(0); + int64_t const num_cols = input_tensor.size(1); + + TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)), 
+ "num_cols must be divisible by 128 / " + "sizeof(input_tensor.scalar_type()) / 8"); + + MOE_DISPATCH(input_tensor.scalar_type(), [&] { + shuffleInputRowsKernel<<>>( + reinterpret_cast(input_tensor.data_ptr()), + dst2src_map.data_ptr(), + reinterpret_cast(output_tensor.data_ptr()), num_src_rows, + num_dest_rows, num_cols); + }); +} + +#else + +void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights, + torch::Tensor& topk_ids, + const torch::Tensor& token_expert_indices, + const std::optional& expert_map, + int64_t n_expert, int64_t n_local_expert, int64_t topk, + const std::optional& align_block_size, + torch::Tensor& permuted_input, + torch::Tensor& expert_first_token_offset, + torch::Tensor& src_row_id2dst_row_id_map, + torch::Tensor& m_indices) { + TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0"); +} + +void moe_unpermute(const torch::Tensor& input, + const torch::Tensor& topk_weights, torch::Tensor& topk_ids, + const torch::Tensor& token_expert_indices, + const std::optional& expert_map, + int64_t n_expert, int64_t n_local_expert, int64_t topk, + const std::optional& align_block_size, + torch::Tensor& permuted_input, + torch::Tensor& expert_first_token_offset, + torch::Tensor& src_row_id2dst_row_id_map, + torch::Tensor& m_indices) { + TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0"); +} + +#endif + +bool moe_permute_unpermute_supported() { +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) + return true; +#else + return false; +#endif +} + TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("moe_permute", &moe_permute); m.impl("moe_unpermute", &moe_unpermute); -} \ No newline at end of file +} diff --git a/csrc/moe/permute_unpermute_kernels/dispatch.h b/csrc/moe/permute_unpermute_kernels/dispatch.h index 41932cdd85bcd..d0f1ea4aded33 100644 --- a/csrc/moe/permute_unpermute_kernels/dispatch.h +++ b/csrc/moe/permute_unpermute_kernels/dispatch.h @@ -14,12 +14,13 @@ __VA_ARGS__(); \ break; \ } -#define MOE_DISPATCH_FLOAT_CASE(...) \ - MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ - MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ - MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \ - MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) +#define MOE_DISPATCH_FLOAT_CASE(...) \ + MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) #define MOE_DISPATCH(TYPE, ...) 
\ MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__)) @@ -39,6 +40,11 @@ template <> struct ScalarType2CudaType { using type = __nv_bfloat16; }; +// uint8 for packed fp4 +template <> +struct ScalarType2CudaType { + using type = uint8_t; +}; // #if __CUDA_ARCH__ >= 890 // fp8 diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu index aa353d0f0437f..de2c153882d93 100644 --- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu @@ -1,6 +1,9 @@ #include "moe_permute_unpermute_kernel.h" +// moe_permute kernels require at least CUDA 12.0 +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) + // CubKeyValueSorter definition begin CubKeyValueSorter::CubKeyValueSorter() : num_experts_(0), num_bits_(sizeof(int) * 8) {} @@ -131,9 +134,6 @@ __global__ void preprocessTopkIdKernel(int* topk_id_ptr, int size, int num_experts) { auto tidx = threadIdx.x; auto bidx = blockIdx.x; - auto lidx = tidx & 31; - auto widx = tidx >> 5; - auto warp_count = (blockDim.x + 31) >> 5; auto offset = bidx * blockDim.x; auto bound = min(offset + blockDim.x, size); extern __shared__ int smem_expert_map[]; @@ -226,4 +226,6 @@ void getMIndices(int64_t* expert_first_token_offset, expert_first_token_offset, align_expert_first_token_offset, m_indices, num_local_expert, align_block_size); } -} \ No newline at end of file +} + +#endif diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl index 42441800fb110..ad0d390665a00 100644 --- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl @@ -20,7 +20,6 @@ __global__ void expandInputRowsKernel( int expert_id = sorted_experts[expanded_dest_row]; extern __shared__ int64_t smem_expert_first_token_offset[]; - int64_t align_expanded_row_accumulate = 0; if constexpr (ALIGN_BLOCK_SIZE) { // load g2s for (int idx = threadIdx.x; idx < num_local_experts + 1; @@ -63,7 +62,6 @@ __global__ void expandInputRowsKernel( using DataElem = cutlass::Array; // Duplicate and permute rows - int64_t const source_k_rank = expanded_source_row / num_rows; int64_t const source_row = expanded_source_row % num_rows; auto const* source_row_ptr = @@ -160,7 +158,6 @@ __global__ void finalizeMoeRoutingKernel( elem_index += stride) { ComputeElem thread_output; thread_output.fill(0); - float row_rescale{0.f}; for (int k_idx = 0; k_idx < k; ++k_idx) { int64_t const expanded_original_row = original_row + k_idx * num_rows; int64_t const expanded_permuted_row = @@ -177,8 +174,6 @@ __global__ void finalizeMoeRoutingKernel( auto const* expanded_permuted_rows_row_ptr = expanded_permuted_rows_v + expanded_permuted_row * num_elems_in_col; - int64_t const expert_idx = expert_for_source_row[k_offset]; - ComputeElem expert_result = arrayConvert( expanded_permuted_rows_row_ptr[elem_index]); thread_output = thread_output + row_scale * (expert_result); diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index a9379032245d9..dea5b1f21ec27 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -425,7 +425,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \ topkGatingSoftmaxLauncherHelper( \ - gating_output, nullptr, 
topk_weights, topk_indicies, \ + gating_output, nullptr, topk_weights, topk_indices, \ token_expert_indices, num_tokens, topk, 0, num_experts, \ stream); @@ -433,7 +433,7 @@ template void topkGatingSoftmaxKernelLauncher( const float* gating_output, float* topk_weights, - IndType* topk_indicies, + IndType* topk_indices, int* token_expert_indices, float* softmax_workspace, const int num_tokens, @@ -476,7 +476,7 @@ void topkGatingSoftmaxKernelLauncher( moeSoftmax<<>>( gating_output, nullptr, softmax_workspace, num_experts); moeTopK<<>>( - softmax_workspace, nullptr, topk_weights, topk_indicies, token_expert_indices, + softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices, num_experts, topk, 0, num_experts); } } @@ -516,9 +516,8 @@ void topk_softmax( topk, stream); } - else + else if (topk_indices.scalar_type() == at::ScalarType::UInt32) { - assert(topk_indices.scalar_type() == at::ScalarType::UInt32); vllm::moe::topkGatingSoftmaxKernelLauncher( gating_output.data_ptr(), topk_weights.data_ptr(), @@ -530,4 +529,17 @@ void topk_softmax( topk, stream); } + else { + assert(topk_indices.scalar_type() == at::ScalarType::Int64); + vllm::moe::topkGatingSoftmaxKernelLauncher( + gating_output.data_ptr(), + topk_weights.data_ptr(), + topk_indices.data_ptr(), + token_expert_indices.data_ptr(), + softmax_workspace.data_ptr(), + num_tokens, + num_experts, + topk, + stream); + } } diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index 810026d034c07..97df311d04409 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -10,7 +10,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { // Calculate the result of moe by summing up the partial results // from all selected experts. - m.def("moe_sum(Tensor! input, Tensor output) -> ()"); + m.def("moe_sum(Tensor input, Tensor! output) -> ()"); m.impl("moe_sum", torch::kCUDA, &moe_sum); // Aligning the number of tokens to be processed by each expert such @@ -22,15 +22,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { " Tensor! num_tokens_post_pad) -> ()"); m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); - // temporarily adapted from - // https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a - m.def( - "sgl_moe_align_block_size(Tensor topk_ids, int num_experts," - " int block_size, Tensor! sorted_token_ids," - " Tensor! experts_ids," - " Tensor! num_tokens_post_pad) -> ()"); - m.impl("sgl_moe_align_block_size", torch::kCUDA, &sgl_moe_align_block_size); - #ifndef USE_ROCM m.def( "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, " @@ -66,7 +57,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { m.def( "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids," - "Tensor token_expert_indicies, Tensor? expert_map, int n_expert," + "Tensor token_expert_indices, Tensor? expert_map, int n_expert," "int n_local_expert," "int topk, int? align_block_size,Tensor! permuted_input, Tensor! " "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! " @@ -77,7 +68,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { "Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor " "expert_first_token_offset, int n_expert, int n_local_expert,int " "topk, Tensor! 
hidden_states)->()"); - // conditionally compiled so impl registration is in source file + + m.def("moe_permute_unpermute_supported() -> bool"); + m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported); + + // Row shuffle for MoE + m.def( + "shuffle_rows(Tensor input_tensor, Tensor dst2src_map, Tensor! " + "output_tensor) -> ()"); + m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows); #endif } diff --git a/csrc/ops.h b/csrc/ops.h index 7044b4588b81f..52c264d64ccad 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -92,6 +92,11 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, torch::Tensor& weight, double epsilon); +void apply_repetition_penalties_(torch::Tensor& logits, + const torch::Tensor& prompt_mask, + const torch::Tensor& output_mask, + const torch::Tensor& repetition_penalties); + void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, torch::Tensor& scale, double epsilon); @@ -231,7 +236,8 @@ void cutlass_moe_mm( torch::Tensor const& b_tensors, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, - torch::Tensor const& b_strides, torch::Tensor const& c_strides); + torch::Tensor const& b_strides, torch::Tensor const& c_strides, + bool per_act_token, bool per_out_ch); void cutlass_fp4_group_mm( torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, @@ -243,7 +249,16 @@ void get_cutlass_moe_mm_data( const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, torch::Tensor& input_permutation, torch::Tensor& output_permutation, - const int64_t num_experts, const int64_t n, const int64_t k); + const int64_t num_experts, const int64_t n, const int64_t k, + const std::optional& blockscale_offsets); + +void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, + torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, + const torch::Tensor& expert_num_tokens, + const int64_t num_local_experts, + const int64_t padded_m, const int64_t n, + const int64_t k); void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, @@ -345,3 +360,14 @@ std::tuple allocate_shared_buffer_and_handle( int64_t size); int64_t open_mem_handle(torch::Tensor& mem_handle); void free_shared_buffer(int64_t buffer); + +#ifdef USE_ROCM +fptr_t init_custom_qr(int64_t rank, int64_t world_size, + std::optional qr_max_size = std::nullopt); +void qr_destroy(fptr_t _fa); +torch::Tensor qr_get_handle(fptr_t _fa); +void qr_open_handles(fptr_t _fa, const std::vector& handles); +void qr_all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, + int64_t quant_level, bool cast_bf2half = false); +int64_t qr_max_size(); +#endif \ No newline at end of file diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu index fea4bc2ca0d8f..3d5077d9de461 100644 --- a/csrc/prepare_inputs/advance_step.cu +++ b/csrc/prepare_inputs/advance_step.cu @@ -274,7 +274,6 @@ void advance_step_flashinfer( cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); - [[maybe_unused]] int block_tables_stride = block_tables.stride(0); TORCH_CHECK((blocks * threads > num_queries), "multi-step: not enough threads to map to num_queries = ", num_queries, " 
block_tables.stride(0) = ", block_tables.stride(0), diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu index bf46cce60a233..87117a165fe92 100644 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -1,15 +1,17 @@ #include #include + #include #include "../../dispatch_utils.h" +#include "../vectorization_utils.cuh" #ifndef USE_ROCM - #include #include + #include #else - #include #include + #include #endif static inline __device__ int8_t float_to_int8_rn(float x) { @@ -103,134 +105,170 @@ static inline __device__ int8_t int32_to_int8(int32_t x) { namespace vllm { -template +template __global__ void static_scaled_int8_quant_kernel( - scalar_t const* __restrict__ input, int8_t* __restrict__ out, - scale_type const* scale_ptr, const int hidden_size) { - int const tid = threadIdx.x; - int64_t const token_idx = blockIdx.x; - scale_type const scale = *scale_ptr; + const scalar_t* __restrict__ input, int8_t* __restrict__ output, + const scale_t* scale_ptr, const int hidden_size) { + const int tid = threadIdx.x; + const int stride = blockDim.x; + const int64_t token_idx = blockIdx.x; + const float scale = *scale_ptr; // Must be performed using 64-bit math to avoid integer overflow. - out += token_idx * hidden_size; - input += token_idx * hidden_size; + const scalar_t* row_in = input + token_idx * hidden_size; + int8_t* row_out = output + token_idx * hidden_size; - for (int i = tid; i < hidden_size; i += blockDim.x) { - out[i] = float_to_int8_rn(static_cast(input[i]) / scale); - } + vectorize_with_alignment<16>( + row_in, row_out, hidden_size, tid, stride, + [=] __device__(int8_t& dst, const scalar_t& src) { + dst = float_to_int8_rn(static_cast(src) / scale); + }); } -template +template __global__ void static_scaled_int8_azp_quant_kernel( - scalar_t const* __restrict__ input, int8_t* __restrict__ out, - scale_type const* scale_ptr, azp_type const* azp_ptr, - const int hidden_size) { - int const tid = threadIdx.x; - int64_t const token_idx = blockIdx.x; - scale_type const scale = *scale_ptr; - azp_type const azp = *azp_ptr; + const scalar_t* __restrict__ input, int8_t* __restrict__ output, + const scale_t* scale_ptr, const azp_t* azp_ptr, const int hidden_size) { + const int tid = threadIdx.x; + const int stride = blockDim.x; + const int64_t token_idx = blockIdx.x; + const float scale = *scale_ptr; + const azp_t azp = *azp_ptr; + const float inv_s = 1.0f / scale; // Must be performed using 64-bit math to avoid integer overflow. 
- out += token_idx * hidden_size; - input += token_idx * hidden_size; + const scalar_t* row_in = input + token_idx * hidden_size; + int8_t* row_out = output + token_idx * hidden_size; - for (int i = tid; i < hidden_size; i += blockDim.x) { - auto const val = static_cast(input[i]); - auto const quant_val = int32_to_int8(float_to_int32_rn(val / scale) + azp); - out[i] = quant_val; - } + vectorize_with_alignment<16>( + row_in, row_out, hidden_size, tid, stride, + [=] __device__(int8_t& dst, const scalar_t& src) { + const auto v = static_cast(src) * inv_s; + dst = int32_to_int8(float_to_int32_rn(v) + azp); + }); } -template +template __global__ void dynamic_scaled_int8_quant_kernel( - scalar_t const* __restrict__ input, int8_t* __restrict__ out, - scale_type* scale, const int hidden_size) { - int const tid = threadIdx.x; - int64_t const token_idx = blockIdx.x; - float absmax_val = 0.0f; - float const zero = 0.0f; + const scalar_t* __restrict__ input, int8_t* __restrict__ output, + scale_t* scale_out, const int hidden_size) { + const int tid = threadIdx.x; + const int stride = blockDim.x; + const int64_t token_idx = blockIdx.x; // Must be performed using 64-bit math to avoid integer overflow. - out += token_idx * hidden_size; - input += token_idx * hidden_size; + const scalar_t* row_in = input + token_idx * hidden_size; + int8_t* row_out = output + token_idx * hidden_size; - for (int i = tid; i < hidden_size; i += blockDim.x) { - float val = static_cast(input[i]); - val = val > zero ? val : -val; - absmax_val = val > absmax_val ? val : absmax_val; + // calculate for absmax + float thread_max = 0.f; + for (int i = tid; i < hidden_size; i += stride) { + const auto v = fabsf(static_cast(row_in[i])); + thread_max = fmaxf(thread_max, v); } - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage reduceStorage; - float const block_absmax_val_maybe = - BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x); - __shared__ float block_absmax_val; + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage tmp; + float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x); + __shared__ float absmax; if (tid == 0) { - block_absmax_val = block_absmax_val_maybe; - scale[token_idx] = block_absmax_val / 127.0f; + absmax = block_max; + scale_out[blockIdx.x] = absmax / 127.f; } __syncthreads(); - float const tmp_scale = 127.0f / block_absmax_val; - for (int i = tid; i < hidden_size; i += blockDim.x) { - out[i] = float_to_int8_rn(static_cast(input[i]) * tmp_scale); - } + float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax; + + // 2. 
quantize + vectorize_with_alignment<16>( + row_in, row_out, hidden_size, tid, stride, + [=] __device__(int8_t& dst, const scalar_t& src) { + dst = float_to_int8_rn(static_cast(src) * inv_s); + }); } -template +// MinMax structure to hold min and max values in one go +struct MinMax { + float min, max; + + __host__ __device__ MinMax() + : min(std::numeric_limits::max()), + max(std::numeric_limits::lowest()) {} + + __host__ __device__ explicit MinMax(float v) : min(v), max(v) {} + + // add a value to the MinMax + __host__ __device__ MinMax& operator+=(float v) { + min = fminf(min, v); + max = fmaxf(max, v); + return *this; + } + + // merge two MinMax objects + __host__ __device__ MinMax& operator&=(const MinMax& other) { + min = fminf(min, other.min); + max = fmaxf(max, other.max); + return *this; + } +}; + +__host__ __device__ inline MinMax operator+(MinMax a, float v) { + return a += v; +} +__host__ __device__ inline MinMax operator&(MinMax a, const MinMax& b) { + return a &= b; +} + +template __global__ void dynamic_scaled_int8_azp_quant_kernel( - scalar_t const* __restrict__ input, int8_t* __restrict__ out, - scale_type* scale, azp_type* azp, const int hidden_size) { - int64_t const token_idx = blockIdx.x; + const scalar_t* __restrict__ input, int8_t* __restrict__ output, + scale_t* scale_out, azp_t* azp_out, const int hidden_size) { + const int tid = threadIdx.x; + const int stride = blockDim.x; + const int64_t token_idx = blockIdx.x; // Must be performed using 64-bit math to avoid integer overflow. - out += token_idx * hidden_size; - input += token_idx * hidden_size; + const scalar_t* row_in = input + token_idx * hidden_size; + int8_t* row_out = output + token_idx * hidden_size; - // Scan for the min and max value for this token - float max_val = std::numeric_limits::min(); - float min_val = std::numeric_limits::max(); - for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) { - auto val = static_cast(input[i]); - max_val = std::max(max_val, val); - min_val = std::min(min_val, val); + // 1. 
calculate min & max + MinMax thread_mm; + for (int i = tid; i < hidden_size; i += stride) { + thread_mm += static_cast(row_in[i]); } - // Reduce the max and min values across the block - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage reduceStorage; - max_val = BlockReduce(reduceStorage).Reduce(max_val, cub::Max{}, blockDim.x); - __syncthreads(); // Make sure min doesn't mess with max shared memory - min_val = BlockReduce(reduceStorage).Reduce(min_val, cub::Min{}, blockDim.x); + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage tmp; - __shared__ scale_type scale_sh; - __shared__ azp_type azp_sh; + MinMax mm = BlockReduce(tmp).Reduce( + thread_mm, + [] __device__(MinMax a, const MinMax& b) { + a &= b; + return a; + }, + blockDim.x); - // Compute the scale and zero point and store them, only on the first thread - if (threadIdx.x == 0) { - float const scale_val = (max_val - min_val) / 255.0f; - // Use rounding to even (same as torch.round) - auto const azp_float = std::nearbyint(-128.0f - min_val / scale_val); - auto const azp_val = static_cast(azp_float); - - // Store the scale and azp into shared and global - scale[token_idx] = scale_sh = scale_val; - azp[token_idx] = azp_sh = azp_val; + __shared__ float scale_sh; + __shared__ azp_t azp_sh; + if (tid == 0) { + float s = (mm.max - mm.min) / 255.f; + float zp = nearbyintf(-128.f - mm.min / s); // round-to-even + scale_sh = s; + azp_sh = azp_t(zp); + scale_out[blockIdx.x] = s; + azp_out[blockIdx.x] = azp_sh; } - - // Wait for the scale and azp to be computed __syncthreads(); - float const scale_val = scale_sh; - azp_type const azp_val = azp_sh; + const float inv_s = 1.f / scale_sh; + const azp_t azp = azp_sh; - // Quantize the values - for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) { - auto const val = static_cast(input[i]); - auto const quant_val = - int32_to_int8(float_to_int32_rn(val / scale_val) + azp_val); - out[i] = quant_val; - } + // 2. 
quantize + vectorize_with_alignment<16>( + row_in, row_out, hidden_size, tid, stride, + [=] __device__(int8_t& dst, const scalar_t& src) { + const auto v = static_cast(src) * inv_s; + dst = int32_to_int8(float_to_int32_rn(v) + azp); + }); } } // namespace vllm @@ -247,7 +285,7 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] int const hidden_size = input.size(-1); int const num_tokens = input.numel() / hidden_size; dim3 const grid(num_tokens); - dim3 const block(std::min(hidden_size, 1024)); + dim3 const block(std::min(hidden_size, 256)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); VLLM_DISPATCH_FLOATING_TYPES( input.scalar_type(), "static_scaled_int8_quant_kernel", [&] { @@ -278,7 +316,7 @@ void dynamic_scaled_int8_quant( int const hidden_size = input.size(-1); int const num_tokens = input.numel() / hidden_size; dim3 const grid(num_tokens); - dim3 const block(std::min(hidden_size, 1024)); + dim3 const block(std::min(hidden_size, 256)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); VLLM_DISPATCH_FLOATING_TYPES( input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] { diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu index 84492553c02f2..4a8a5ed02d6ce 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu @@ -9,10 +9,6 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales) { - TORCH_CHECK( - a.size(0) % 4 == 0, - "Input tensor must have a number of rows that is a multiple of 4. ", - "but got: ", a.size(0), " rows."); if (out.dtype() == torch::kBFloat16) { cutlass_gemm_blockwise_sm100_fp8_dispatch( out, a, b, a_scales, b_scales); diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh index ef324364c6d5e..c841125dbb734 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh @@ -1,5 +1,6 @@ #pragma once +#include "cuda_utils.h" #include "cutlass/cutlass.h" #include "cutlass/numeric_types.h" @@ -22,49 +23,49 @@ namespace vllm { using namespace cute; -template +// clang-format off +template struct cutlass_3x_gemm_fp8_blockwise { + static constexpr bool swap_ab = swap_ab_; using ElementAB = cutlass::float_e4m3_t; using ElementA = ElementAB; using LayoutA = cutlass::layout::RowMajor; + using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose::type; static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; using ElementB = ElementAB; using LayoutB = cutlass::layout::ColumnMajor; + using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose::type; static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; - using ElementC = void; using ElementD = OutType; using LayoutD = cutlass::layout::RowMajor; + using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose::type; static constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + using ElementC = void; // TODO: support bias using LayoutC = LayoutD; + using LayoutC_Transpose = LayoutD_Transpose; static constexpr int AlignmentC = AlignmentD; using ElementAccumulator = float; using ElementCompute = float; using 
ElementBlockScale = float; - // MMA and Cluster Tile Shapes - // Shape of the tile computed by tcgen05 MMA, could be across 2 SMs if Cluster - // Shape %2 == 0 using MmaTileShape_MNK = Shape<_128,_128,_128>; - static constexpr int ScaleMsPerTile = size<0>(ScalesPerTile{}); - static constexpr int ScaleGranularityM = - size<0>(MmaTileShape{}) / ScaleMsPerTile; - static constexpr int ScaleGranularityN = - size<1>(MmaTileShape{}) / size<1>(ScalesPerTile{}); - static constexpr int ScaleGranularityK = - size<2>(MmaTileShape{}) / size<2>(ScalesPerTile{}); + using ScaleConfig = conditional_t, + cutlass::detail::Sm100BlockwiseScaleConfig< + ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, + cute::UMMA::Major::MN, cute::UMMA::Major::K>>; - // Shape of the threadblocks in a cluster - using ClusterShape_MNK = ClusterShape; - - using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig< - ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, - cute::UMMA::Major::MN, cute::UMMA::Major::K>; + // layout_SFA and layout_SFB cannot be swapped since they are deduced. using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); @@ -73,7 +74,6 @@ struct cutlass_3x_gemm_fp8_blockwise { static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest; using ElementScalar = float; - // clang-format off using DefaultOperation = cutlass::epilogue::fusion::LinearCombination; using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, @@ -84,33 +84,47 @@ struct cutlass_3x_gemm_fp8_blockwise { ElementAccumulator, ElementCompute, ElementC, - LayoutC, + conditional_t, AlignmentC, ElementD, - LayoutD, + conditional_t, AlignmentD, EpilogueScheduler, DefaultOperation >::CollectiveOp; using StageCountType = cutlass::gemm::collective::StageCountAuto; - using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< - ArchTag, - OperatorClass, - ElementA, - cute::tuple, - AlignmentA, - ElementB, - cute::tuple, - AlignmentB, - ElementAccumulator, - MmaTileShape, - ClusterShape, - + using CollectiveMainloop = conditional_t, + AlignmentB, + ElementA, + cute::tuple, + AlignmentA, + ElementAccumulator, + MmaTileShape, + ClusterShape, cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>, - MainloopScheduler - >::CollectiveOp; - // clang-format on + MainloopScheduler + >::CollectiveOp, + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, + OperatorClass, + ElementA, + cute::tuple, + AlignmentA, + ElementB, + cute::tuple, + AlignmentB, + ElementAccumulator, + MmaTileShape, + ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>, + MainloopScheduler + >::CollectiveOp>; using KernelType = enable_sm100_only, CollectiveMainloop, CollectiveEpilogue>>; @@ -123,6 +137,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales) { + static constexpr bool swap_ab = Gemm::swap_ab; using GemmKernel = typename Gemm::GemmKernel; using StrideA = typename Gemm::GemmKernel::StrideA; using StrideB = typename Gemm::GemmKernel::StrideB; @@ -136,7 +151,6 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, using ElementD = typename Gemm::ElementD; int32_t m = a.size(0), n = b.size(1), k = a.size(1); - auto prob_shape = cute::make_shape(m, n, k, 1); 
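
(Editorial aside, not part of the diff.) The `swap_ab` template flag above, and the operand/stride swapping in the caller just below, rest on the identity C = A·B = (Bᵀ·Aᵀ)ᵀ: the heuristic later in this file swaps operands when M is small or not a multiple of 4, and the transposed layouts make the result still land correctly in `out`. A minimal host-side C++ sketch of that identity; the matrices, sizes, and the naive `gemm` helper are illustrative stand-ins, not taken from the CUTLASS code:

```cpp
// Sketch only: the linear-algebra identity behind swap_ab.
// D = A * B (M x N) equals (B^T * A^T)^T, so a kernel tuned for a "tall"
// left operand can run with A and B exchanged when M is small, as long as
// the output is interpreted transposed.
#include <cmath>
#include <cstdio>
#include <vector>

// Naive reference GEMM, all matrices row-major: C[MxN] = A[MxK] * B[KxN].
static void gemm(int M, int N, int K, const std::vector<float>& A,
                 const std::vector<float>& B, std::vector<float>& C) {
  for (int m = 0; m < M; ++m)
    for (int n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int k = 0; k < K; ++k) acc += A[m * K + k] * B[k * N + n];
      C[m * N + n] = acc;
    }
}

int main() {
  const int M = 3, N = 4, K = 5;  // small M, the case the heuristic targets
  std::vector<float> A(M * K), B(K * N);
  for (int i = 0; i < M * K; ++i) A[i] = 0.01f * i;
  for (int i = 0; i < K * N; ++i) B[i] = 0.02f * i - 0.1f;

  std::vector<float> D(M * N);
  gemm(M, N, K, A, B, D);  // direct product D = A * B

  // Swapped product: materialize A^T and B^T and compute D_swap = B^T * A^T.
  // (The real kernel avoids the copies by reinterpreting layouts.)
  std::vector<float> At(K * M), Bt(N * K), Dswap(N * M);
  for (int m = 0; m < M; ++m)
    for (int k = 0; k < K; ++k) At[k * M + m] = A[m * K + k];
  for (int k = 0; k < K; ++k)
    for (int n = 0; n < N; ++n) Bt[n * K + k] = B[k * N + n];
  gemm(N, M, K, Bt, At, Dswap);

  // Reading D_swap transposed recovers D.
  float max_err = 0.f;
  for (int m = 0; m < M; ++m)
    for (int n = 0; n < N; ++n)
      max_err = std::fmax(max_err, std::fabs(Dswap[n * M + m] - D[m * N + n]));
  std::printf("max |D - (B^T A^T)^T| = %g\n", max_err);  // prints 0
  return 0;
}
```

In the actual kernel no transposes are materialized; the `LayoutTranspose` aliases and the swapped mainloop arguments achieve the same effect in place.
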
StrideA a_stride; StrideB b_stride; @@ -146,11 +160,13 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, b_stride = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1)); c_stride = - cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1)); + cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1)); - LayoutSFA layout_SFA = + LayoutSFA layout_SFA = swap_ab ? + ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) : ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1)); - LayoutSFB layout_SFB = + LayoutSFB layout_SFB = swap_ab ? + ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) : ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1)); auto a_ptr = static_cast(a.data_ptr()); @@ -158,9 +174,22 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, auto a_scales_ptr = static_cast(a_scales.data_ptr()); auto b_scales_ptr = static_cast(b_scales.data_ptr()); - typename GemmKernel::MainloopArguments mainloop_args{ - a_ptr, a_stride, b_ptr, b_stride, - a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB}; + auto mainloop_args = [&](){ + // layout_SFA and layout_SFB cannot be swapped since they are deduced. + if (swap_ab) { + return typename GemmKernel::MainloopArguments{ + b_ptr, b_stride, a_ptr, a_stride, + b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB + }; + } + else { + return typename GemmKernel::MainloopArguments{ + a_ptr, a_stride, b_ptr, b_stride, + a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB + }; + } + }(); + auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1); auto c_ptr = static_cast(out.data_ptr()); typename GemmKernel::EpilogueArguments epilogue_args{ @@ -175,29 +204,74 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales) { - auto m = a.size(0); - auto k = a.size(1); - auto n = b.size(1); - int sms; + int32_t m = a.size(0), n = b.size(1), k = a.size(1), sms; cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device()); - auto should_use_2sm = [&sms](int m, int n, int tile1SM = 128) { - return std::ceil(static_cast(m) / tile1SM) * - std::ceil(static_cast(n) / tile1SM) >= - sms; - }; - bool use_2sm = should_use_2sm(m, n); - if (use_2sm) { - cutlass_gemm_caller_blockwise, Shape<_256, _1, _1>, - Shape<_2, _2, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm, - cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>( - out, a, b, a_scales, b_scales); + constexpr int TILE_K = 128; + // TODO: better heuristics + bool swap_ab = (m < 16) || (m % 4 != 0); + bool use_tma_epilogue = (m * n) % 4 == 0; + if (!swap_ab) { + constexpr int TILE_N = 128; + int tile_m = 256; + if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 64) <= sms) { + tile_m = 64; + } + else if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 128) <= sms) { + tile_m = 128; + } + if (tile_m == 64) { + if (use_tma_epilogue) { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } else { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } + } else if (tile_m == 128) { + if 
(use_tma_epilogue) { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } else { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } + } else { // tile_m == 256 + if (use_tma_epilogue) { + cutlass_gemm_caller_blockwise, Int>, + Shape<_2, _1, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>( + out, a, b, a_scales, b_scales); + } else { + cutlass_gemm_caller_blockwise, Int>, + Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>( + out, a, b, a_scales, b_scales); + } + } } else { + // TODO: Test more tile N configs + constexpr int TILE_M = 128; + constexpr int TILE_N = 16; + // TMA epilogue isn't compatible with Swap A/B cutlass_gemm_caller_blockwise, Shape<_128, _1, _1>, - Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm, - cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + OutType, TILE_M, 1, TILE_K, Shape, Int, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>( out, a, b, a_scales, b_scales); } } diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh index 468b77d9593bc..24564efbd21be 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh @@ -15,16 +15,59 @@ using c3x::cutlass_gemm_caller; template typename Epilogue> struct sm100_fp8_config_default { + // M in (256, inf) static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; - using TileShape = Shape<_256, _128, _64>; + using TileShape = Shape<_256, _128, _128>; using ClusterShape = Shape<_2, _2, _1>; using Cutlass3xGemm = cutlass_3x_gemm_sm100; }; +template typename Epilogue> +struct sm100_fp8_config_M256 { + // M in (64, 256] + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_128, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm_sm100; +}; + +template typename Epilogue> +struct sm100_fp8_config_M64 { + // M in (16, 64] + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_64, _64, _128>; + using ClusterShape = Shape<_1, _1, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm_sm100; +}; + +template typename Epilogue> +struct sm100_fp8_config_M16 { + // M in [1, 16] + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_64, _64, _128>; + using ClusterShape = Shape<_1, _4, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm_sm100; +}; + template typename Epilogue, typename... 
EpilogueArgs> @@ -39,8 +82,34 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out, using Cutlass3xGemmDefault = typename sm100_fp8_config_default<InType, OutType, Epilogue>::Cutlass3xGemm; - return cutlass_gemm_caller<Cutlass3xGemmDefault>( - out, a, b, std::forward<EpilogueArgs>(args)...); + using Cutlass3xGemmM16 = + typename sm100_fp8_config_M16<InType, OutType, Epilogue>::Cutlass3xGemm; + using Cutlass3xGemmM64 = + typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm; + using Cutlass3xGemmM256 = + typename sm100_fp8_config_M256<InType, OutType, Epilogue>::Cutlass3xGemm; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast<uint32_t>(16), next_pow_2(m)); // next power of 2 + + if (mp2 <= 16) { + // m in [1, 16] + return cutlass_gemm_caller<Cutlass3xGemmM16>( + out, a, b, std::forward<EpilogueArgs>(args)...); + } else if (mp2 <= 64) { + // m in (16, 64] + return cutlass_gemm_caller<Cutlass3xGemmM64>( + out, a, b, std::forward<EpilogueArgs>(args)...); + } else if (mp2 <= 256) { + // m in (64, 256] + return cutlass_gemm_caller<Cutlass3xGemmM256>( + out, a, b, std::forward<EpilogueArgs>(args)...); + } else { + // m in (256, inf) + return cutlass_gemm_caller<Cutlass3xGemmDefault>( + out, a, b, std::forward<EpilogueArgs>(args)...); + } } template
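
(Editorial aside, not part of the diff.) The dispatch above buckets the GEMM by M: round M up to a power of two, clamp at 16, then pick the tile/cluster configuration whose comment range contains it. A small self-contained C++ sketch of that bucketing; `next_pow_2` and `pick_bucket` here are local stand-ins written for this example, not the project's helpers:

```cpp
// Sketch only: the M-based shape bucketing used by the SM100 FP8 dispatch.
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Smallest power of two >= v (v >= 1).
static uint32_t next_pow_2(uint32_t v) {
  uint32_t p = 1;
  while (p < v) p <<= 1;
  return p;
}

// Ranges mirror the comments in the dispatch: [1,16], (16,64], (64,256], (256,inf).
static const char* pick_bucket(uint32_t m) {
  uint32_t mp2 = std::max(16u, next_pow_2(m));
  if (mp2 <= 16) return "M16     (TileShape 64x64x128,   Cluster 1x4x1)";
  if (mp2 <= 64) return "M64     (TileShape 64x64x128,   Cluster 1x1x1)";
  if (mp2 <= 256) return "M256    (TileShape 128x128x128, Cluster 2x1x1)";
  return "default (TileShape 256x128x128, Cluster 2x2x1)";
}

int main() {
  for (uint32_t m : {1u, 16u, 17u, 64u, 65u, 256u, 257u, 4096u})
    std::printf("m=%4u -> %s\n", m, pick_bucket(m));
  return 0;
}
```

Running it prints which config each sample M would select, matching the ranges annotated in the dispatch comments.
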
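
(Editorial aside, not part of the diff.) The rewritten int8 kernels earlier in this diff use two per-token schemes: symmetric quantization (scale = absmax / 127) and asymmetric quantization with a zero point (scale = (max - min) / 255, azp = round(-128 - min / scale)). A host-side reference sketch of both formulas, using plain `std::` rounding rather than the CUDA round-to-nearest intrinsics, so values are illustrative rather than bit-exact:

```cpp
// Sketch only: per-token int8 quantization math (symmetric and zero-point).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Symmetric: scale = absmax / 127, q = round(x / scale).
static void quant_symmetric(const std::vector<float>& row,
                            std::vector<int8_t>& q, float& scale) {
  float absmax = 0.f;
  for (float v : row) absmax = std::max(absmax, std::fabs(v));
  scale = absmax / 127.f;
  const float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax;
  q.resize(row.size());
  for (size_t i = 0; i < row.size(); ++i)
    q[i] = static_cast<int8_t>(std::lrintf(row[i] * inv_s));
}

// Asymmetric: scale = (max - min) / 255, azp = round(-128 - min / scale),
// q = clamp(round(x / scale) + azp, -128, 127).
static void quant_azp(const std::vector<float>& row, std::vector<int8_t>& q,
                      float& scale, int32_t& azp) {
  float mn = row[0], mx = row[0];
  for (float v : row) { mn = std::min(mn, v); mx = std::max(mx, v); }
  scale = (mx - mn) / 255.f;
  azp = static_cast<int32_t>(std::nearbyintf(-128.f - mn / scale));
  q.resize(row.size());
  for (size_t i = 0; i < row.size(); ++i) {
    int32_t v = static_cast<int32_t>(std::lrintf(row[i] / scale)) + azp;
    q[i] = static_cast<int8_t>(std::clamp(v, -128, 127));
  }
}

int main() {
  std::vector<float> row = {-1.5f, -0.25f, 0.0f, 0.75f, 2.0f};
  std::vector<int8_t> q;
  float s;
  int32_t azp;
  quant_symmetric(row, q, s);
  std::printf("symmetric: scale=%f q[last]=%d\n", s, q.back());   // 127
  quant_azp(row, q, s, azp);
  std::printf("azp: scale=%f azp=%d q[0]=%d\n", s, azp, (int)q[0]);  // -128
  return 0;
}
```

The CUDA kernels additionally vectorize the loads/stores (`vectorize_with_alignment<16>`) and compute the per-token max/min with a block-wide CUB reduction; the arithmetic is the same as above.
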