diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index 37e2980eea974..2ef36089b6afb 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -8,7 +8,7 @@ This benchmark aims to:
Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
## Setup
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 53b5b23db3c21..8c6ef7817aaf8 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,24 +1,22 @@
steps:
# aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
- label: "Build arm64 wheel - CUDA 12.9"
+ depends_on: ~
id: build-wheel-arm64-cuda-12-9
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- - block: "Build CUDA 12.8 wheel"
- key: block-build-cu128-wheel
-
- label: "Build wheel - CUDA 12.8"
- depends_on: block-build-cu128-wheel
+ depends_on: ~
id: build-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
@@ -30,12 +28,8 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- - block: "Build CUDA 12.6 wheel"
- key: block-build-cu126-wheel
- depends_on: ~
-
- label: "Build wheel - CUDA 12.6"
- depends_on: block-build-cu126-wheel
+ depends_on: ~
id: build-wheel-cuda-12-6
agents:
queue: cpu_queue_postmerge
@@ -102,8 +96,6 @@ steps:
depends_on:
- create-multi-arch-manifest
- build-wheel-cuda-12-8
- - build-wheel-cuda-12-6
- - build-wheel-cuda-12-9
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
@@ -150,18 +142,24 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- - block: "Build Neuron release image"
- key: block-neuron-release-image-build
- depends_on: ~
-
- - label: "Build and publish Neuron release image"
- depends_on: block-neuron-release-image-build
+ - label: "Build and publish nightly multi-arch image to DockerHub"
+ depends_on:
+ - create-multi-arch-manifest
+ if: build.env("NIGHTLY") == "1"
agents:
- queue: neuron-postmerge
+ queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
- - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
- - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
+ - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+ - "docker push vllm/vllm-openai:nightly"
+ - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+ # Clean up old nightly builds (keep only last 14)
+ - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
+ plugins:
+ - docker-login#v3.0.0:
+ username: vllmbot
+ password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
index 94e0ac2398f34..fde48603ad3cd 100755
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
-docker tag vllm/vllm-openai vllm/vllm-openai:latest
-docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
-docker push vllm/vllm-openai:latest
-docker push vllm/vllm-openai:v${RELEASE_VERSION}
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai:latest-x86_64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker push vllm/vllm-openai:latest-aarch64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest push vllm/vllm-openai:latest
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF
\ No newline at end of file
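A quick sanity check after pushing, offered as a hedged aside rather than part of the annotated instructions, is to inspect the manifests (with RELEASE_VERSION set as above) and confirm both architectures are listed:

```bash
# Each manifest should list an amd64 and an arm64 entry.
docker manifest inspect vllm/vllm-openai:latest
docker manifest inspect vllm/vllm-openai:v${RELEASE_VERSION}
```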
diff --git a/.buildkite/scripts/cleanup-nightly-builds.sh b/.buildkite/scripts/cleanup-nightly-builds.sh
new file mode 100755
index 0000000000000..1a82f7d085233
--- /dev/null
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+set -ex
+
+# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
+# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
+
+# DockerHub API endpoint for vllm/vllm-openai repository
+REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
+
+# Get DockerHub token from environment
+if [ -z "$DOCKERHUB_TOKEN" ]; then
+ echo "Error: DOCKERHUB_TOKEN environment variable is not set"
+ exit 1
+fi
+
+# Function to get all tags from DockerHub
+get_all_tags() {
+ local page=1
+ local all_tags=""
+
+ while true; do
+ local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
+ "$REPO_API_URL?page=$page&page_size=100")
+
+ # Get both last_updated timestamp and tag name, separated by |
+ local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
+
+ if [ -z "$tags" ]; then
+ break
+ fi
+
+ all_tags="$all_tags$tags"$'\n'
+ page=$((page + 1))
+ done
+
+ # Sort by timestamp (newest first) and extract just the tag names
+ echo "$all_tags" | sort -r | cut -d'|' -f2
+}
+
+delete_tag() {
+ local tag_name="$1"
+ echo "Deleting tag: $tag_name"
+
+ local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
+ local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
+
+ if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
+ echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
+ else
+ echo "Successfully deleted tag: $tag_name"
+ fi
+}
+
+# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
+echo "Fetching all tags from DockerHub..."
+all_tags=$(get_all_tags)
+
+if [ -z "$all_tags" ]; then
+ echo "No tags found to clean up"
+ exit 0
+fi
+
+# Count total tags
+total_tags=$(echo "$all_tags" | wc -l)
+echo "Found $total_tags tags"
+
+# Keep only the last 14 builds (including the current one)
+tags_to_keep=14
+tags_to_delete=$((total_tags - tags_to_keep))
+
+if [ $tags_to_delete -le 0 ]; then
+ echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
+ exit 0
+fi
+
+echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"
+
+# Get tags to delete (skip the first $tags_to_keep tags)
+tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1)))
+
+if [ -z "$tags_to_delete_list" ]; then
+ echo "No tags to delete"
+ exit 0
+fi
+
+# Delete old tags
+echo "Deleting old tags..."
+while IFS= read -r tag; do
+ if [ -n "$tag" ]; then
+ delete_tag "$tag"
+ # Add a small delay to avoid rate limiting
+ sleep 1
+ fi
+done <<< "$tags_to_delete_list"
+
+echo "Cleanup completed successfully"
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index c395011a24485..aa4cc7b35a543 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
-if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
- commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
-fi
-
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi
@@ -167,12 +163,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
-#Obsolete currently
-##ignore certain Entrypoints/llm tests
-#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
-# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
-#fi
-
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 0f734763f13fd..64943d2a15a79 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -66,7 +66,6 @@ function cpu_tests() {
pytest -x -v -s tests/models/language/pooling -m cpu_model
pytest -x -v -s tests/models/multimodal/generation \
- --ignore=tests/models/multimodal/generation/test_mllama.py \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
diff --git a/.buildkite/scripts/hardware_ci/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
deleted file mode 100644
index a397457c83261..0000000000000
--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-
-# This script build the Neuron docker image and run the API server inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -e
-set -v
-
-image_name="neuron/vllm-ci"
-container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
-HF_CACHE="$(realpath ~)/huggingface"
-mkdir -p "${HF_CACHE}"
-HF_MOUNT="/root/.cache/huggingface"
-HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
-
-NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
-mkdir -p "${NEURON_COMPILE_CACHE_URL}"
-NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
-
-# Try building the docker image
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
-
-# prune old image and containers to save disk space, and only once a day
-# by using a timestamp file in tmp.
-if [ -f /tmp/neuron-docker-build-timestamp ]; then
- last_build=$(cat /tmp/neuron-docker-build-timestamp)
- current_time=$(date +%s)
- if [ $((current_time - last_build)) -gt 86400 ]; then
- # Remove dangling images (those that are not tagged and not used by any container)
- docker image prune -f
- # Remove unused volumes / force the system prune for old images as well.
- docker volume prune -f && docker system prune -f
- echo "$current_time" > /tmp/neuron-docker-build-timestamp
- fi
-else
- date "+%s" > /tmp/neuron-docker-build-timestamp
-fi
-
-docker build -t "${image_name}" -f docker/Dockerfile.neuron .
-
-# Setup cleanup
-remove_docker_container() {
- docker image rm -f "${image_name}" || true;
-}
-trap remove_docker_container EXIT
-
-# Run the image
-docker run --rm -it --device=/dev/neuron0 --network bridge \
- -v "${HF_CACHE}:${HF_MOUNT}" \
- -e "HF_HOME=${HF_MOUNT}" \
- -e "HF_TOKEN=${HF_TOKEN}" \
- -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
- -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
- --name "${container_name}" \
- ${image_name} \
- /bin/bash -c "
- set -e; # Exit on first error
- python3 /workspace/vllm/examples/offline_inference/neuron.py;
- python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
- for f in /workspace/vllm/tests/neuron/2_core/*.py; do
- echo \"Running test file: \$f\";
- python3 -m pytest \$f -v --capture=tee-sys;
- done
- "
\ No newline at end of file
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
index 1073a4ee30afa..e76528a178205 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
- && python3 -m pip install --progress-bar off hf-transfer
+ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index 505664f3aecd0..69366cd503219 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
- && python3 -m pip install --progress-bar off hf-transfer
+ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index efcd10acf0b93..8c9b00990e995 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -30,6 +30,7 @@ docker run \
bash -c '
set -e
echo $ZE_AFFINITY_MASK
+ pip install tblib==3.1.0
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index b0d4c4456d339..c4ea4b675649c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -6,24 +6,28 @@
# to generate the final pipeline yaml file.
# Documentation
-# label(str): the name of the test. emoji allowed.
-# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
-# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
-# fast_check_only(bool): run this test on fastcheck pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
+# label(str): the name of the test. emojis allowed.
+# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
+# fast_check_only(bool): run this test on the fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
+# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
# command(str): the single command to run for tests. incompatible with commands.
-# commands(list): the list of commands to run for test. incompatbile with command.
-# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
-# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
-# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
-# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
-# in this case, commands must be specified. the first command runs on first host, the second
+# commands(list): the list of commands to run for the test. incompatible with command.
+# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
+# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
+# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
+# in this case, commands must be specified. the first command runs on the first host, the second
# command runs on the second host.
-# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
-# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
+# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
+# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
+# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
+# working_dir(str): specify the place where the command should execute, defaults to /vllm-workspace/tests
+# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
# When adding a test
-# - If the test belong to an existing group, add it there
+# - If the test belongs to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
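For reference, a hypothetical step combining several of the documented fields could look like the sketch below (the label, test directory, and shard counts are made up and do not correspond to a real entry in this pipeline):

```yaml
# Hypothetical example only; not a real step in this pipeline.
- label: Example Sharded Test %N
  timeout_in_minutes: 30
  gpu: a100
  num_gpus: 2
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/example_suite
  commands:
  - pytest -v -s example_suite --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2
```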
@@ -46,23 +50,19 @@ steps:
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- - tests/mq_llm_engine
- - tests/async_engine
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- tests/utils_
- - tests/worker
- tests/standalone_tests/lazy_imports.py
+ - tests/transformers_utils
commands:
- python3 standalone_tests/lazy_imports.py
- - pytest -v -s mq_llm_engine # MQLLMEngine
- - pytest -v -s async_engine # AsyncLLMEngine
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s multimodal
- pytest -v -s utils_ # Utils
- - pytest -v -s worker # Worker
+ - pytest -v -s transformers_utils # transformers_utils
- label: Python-only Installation Test # 10min
timeout_in_minutes: 20
@@ -82,27 +82,25 @@ steps:
- vllm/
- tests/basic_correctness/test_basic_correctness
- tests/basic_correctness/test_cpu_offload
- - tests/basic_correctness/test_preemption
- tests/basic_correctness/test_cumem.py
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-- label: Core Test # 22min
- timeout_in_minutes: 35
- mirror_hardwares: [amdexperimental]
+- label: Entrypoints Unit Tests # 5min
+ timeout_in_minutes: 10
+ working_dir: "/vllm-workspace/tests"
fast_check: true
source_file_dependencies:
- - vllm/core
- - vllm/distributed
- - tests/core
+ - vllm/entrypoints
+ - tests/entrypoints/
commands:
- - pytest -v -s core
+ - pytest -v -s entrypoints/openai/tool_parsers
+ - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-- label: Entrypoints Test (LLM) # 30min
+- label: Entrypoints Integration Test (LLM) # 30min
timeout_in_minutes: 40
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
@@ -114,12 +112,11 @@ steps:
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+ - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
-- label: Entrypoints Test (API Server) # 100min
+- label: Entrypoints Integration Test (API Server) # 100min
timeout_in_minutes: 130
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
@@ -132,9 +129,22 @@ steps:
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
- - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py
+- label: Entrypoints Integration Test (Pooling)
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/pooling
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/pooling
+
- label: Distributed Tests (4 GPUs) # 35min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
@@ -142,7 +152,6 @@ steps:
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- - vllm/core/
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
@@ -156,11 +165,18 @@ steps:
- tests/v1/test_hybrid_lb_dp.py
- tests/v1/engine/test_engine_core_client.py
commands:
- # test with tp=2 and external_dp=2
- - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- # test with tp=2 and pp=2
+ # test with torchrun tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=4 and dp=1
+ - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2, pp=2 and dp=1
+ - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=1 and dp=4 with ep
+ - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2 and dp=2 with ep
+ - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
@@ -204,16 +220,14 @@ steps:
num_gpus: 2
source_file_dependencies:
- vllm/
- - tests/metrics
- - tests/tracing
+ - tests/v1/tracing
commands:
- - pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0' \
'opentelemetry-api>=1.26.0' \
'opentelemetry-exporter-otlp>=1.26.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1'"
- - pytest -v -s tracing
+ - pytest -v -s v1/tracing
##### fast check tests #####
##### 1 GPU test #####
@@ -276,6 +290,7 @@ steps:
# split the test to avoid interference
- pytest -v -s v1/core
- pytest -v -s v1/executor
+ - pytest -v -s v1/kv_offload
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
@@ -309,13 +324,11 @@ steps:
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- - python3 offline_inference/encoder_decoder.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Platform Tests (CUDA) # 4min
timeout_in_minutes: 15
@@ -369,6 +382,7 @@ steps:
- pytest -v -s compile/test_async_tp.py
- pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py
+ - pytest -v -s compile/test_noop_elimination.py
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@@ -379,11 +393,7 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
- # these tests need to be separated, cannot combine
- - pytest -v -s compile/piecewise/test_simple.py
- - pytest -v -s compile/piecewise/test_toy_llama.py
- - pytest -v -s compile/piecewise/test_full_cudagraph.py
- - pytest -v -s compile/piecewise/test_multiple_graphs.py
+ - pytest -v -s compile/piecewise/
- label: PyTorch Fullgraph Test # 20min
timeout_in_minutes: 30
@@ -501,6 +511,10 @@ steps:
commands:
# temporary install here since we need nightly, will move to requirements/test.in
# after torchao 0.12 release, and pin a working version of torchao nightly here
+
+ # since torchao nightly is only compatible with torch nightly currently
+ # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+ # we can only upgrade after this is resolved
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
@@ -523,15 +537,6 @@ steps:
commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/
-- label: Encoder Decoder tests # 12min
- timeout_in_minutes: 20
- mirror_hardwares: [amdexperimental]
- source_file_dependencies:
- - vllm/
- - tests/encoder_decoder
- commands:
- - pytest -v -s encoder_decoder
-
- label: OpenAI-Compatible Tool Use # 23 min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
@@ -546,36 +551,85 @@ steps:
##### models test #####
-- label: Basic Models Test # 57min
- timeout_in_minutes: 75
+- label: Basic Models Tests (Initialization)
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- - tests/models
+ - tests/models/test_initialization.py
commands:
- - pytest -v -s models/test_transformers.py
- - pytest -v -s models/test_registry.py
- - pytest -v -s models/test_utils.py
- - pytest -v -s models/test_vision.py
- - pytest -v -s models/test_initialization.py
+ # Run a subset of model initialization tests
+ - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
-- label: Language Models Test (Standard) # 35min
+- label: Basic Models Tests (Extra Initialization) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/test_initialization.py
+ commands:
+  # Only run when vLLM model source is modified: test initialization of a large
+  # subset of supported models (the complement of the small subset in the above
+  # test). Also run if the model initialization test file is modified
+ - pytest -v -s models/test_initialization.py \
+ -k 'not test_can_initialize_small_subset' \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Basic Models Tests (Other)
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_transformers.py
+ - tests/models/test_registry.py
+ - tests/models/test_utils.py
+ - tests/models/test_vision.py
+ commands:
+ - pytest -v -s models/test_transformers.py \
+ models/test_registry.py \
+ models/test_utils.py \
+ models/test_vision.py
+
+- label: Language Models Tests (Standard)
+ timeout_in_minutes: 25
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
- vllm/
- tests/models/language
commands:
+ # Test standard language models, excluding a subset of slow tests
- pip freeze | grep -E 'torch'
- - pytest -v -s models/language -m core_model
+ - pytest -v -s models/language -m 'core_model and (not slow_test)'
-- label: Language Models Test (Hybrid) # 35 min
+- label: Language Models Tests (Extra Standard) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/language/pooling/test_embedding.py
+ - tests/models/language/generation/test_common.py
+ - tests/models/language/pooling/test_classification.py
+ commands:
+  # Shard the slow subset of standard language model tests. Only run when model
+ # source is modified, or when specified test files are modified
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and slow_test' \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
- vllm/
- tests/models/language/generation
commands:
@@ -583,7 +637,12 @@ steps:
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- - pytest -v -s models/language/generation -m hybrid_model
+ # Shard hybrid language model tests
+ - pytest -v -s models/language/generation \
+ -m hybrid_model \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
- label: Language Models Test (Extended Generation) # 80min
timeout_in_minutes: 110
@@ -597,6 +656,16 @@ steps:
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+- label: Language Models Test (PPL)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation_ppl_test
+ commands:
+ - pytest -v -s models/language/generation_ppl_test
+
- label: Language Models Test (Extended Pooling) # 36min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
@@ -607,6 +676,16 @@ steps:
commands:
- pytest -v -s models/language/pooling -m 'not core_model'
+- label: Language Models Test (MTEB)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling_mteb_test
+ commands:
+ - pytest -v -s models/language/pooling_mteb_test
+
- label: Multi-Modal Processor Test # 44min
timeout_in_minutes: 60
source_file_dependencies:
@@ -627,7 +706,7 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+ - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Models Test (Extended) 1
mirror_hardwares: [amdexperimental]
@@ -713,11 +792,12 @@ steps:
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- - pytest -v -s tests/kernels/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
+ - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
@@ -729,6 +809,20 @@ steps:
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+- label: GPT-OSS Eval (Blackwell)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true # disable while debugging
+ source_file_dependencies:
+ - tests/evals/gpt_oss
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - uv pip install --system 'gpt-oss[eval]==0.0.5'
+ - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
+
##### 1 GPU test #####
##### multi gpus test #####
@@ -743,6 +837,8 @@ steps:
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
+ - pytest -v -s distributed/test_shm_buffer.py
+ - pytest -v -s distributed/test_shm_storage.py
- label: 2 Node Tests (4 GPUs in total) # 16min
timeout_in_minutes: 30
@@ -782,8 +878,6 @@ steps:
- tests/distributed/
- vllm/compilation
- vllm/worker/worker_base.py
- - vllm/worker/worker.py
- - vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py
- tests/v1/test_async_llm_dp.py
- tests/v1/test_external_lb_dp.py
@@ -801,12 +895,13 @@ steps:
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
- - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
# TODO: investigate and fix
- - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s models/multimodal/generation/test_maverick.py
@@ -827,7 +922,7 @@ steps:
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
- pip install -e ./plugins/prithvi_io_processor_plugin
- pytest -v -s plugins_tests/test_io_processor_plugins.py
- - pip uninstall prithvi_io_processor_plugin -y
+ - pip uninstall prithvi_io_processor_plugin -y
# end io_processor plugins test
# other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
@@ -851,7 +946,6 @@ steps:
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
- # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
- label: LoRA TP Test (Distributed) # 17 min
timeout_in_minutes: 30
@@ -875,7 +969,7 @@ steps:
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
- num_gpus: 2
+ num_gpus: 2
optional: true
source_file_dependencies:
- vllm/
@@ -925,9 +1019,21 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
-- label: Qwen MoE EP Test # optional
+##### H200 test #####
+- label: Distributed Tests (H200) # optional
gpu: h200
optional: true
+ working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+
+##### B200 test #####
+- label: Distributed Tests (B200) # optional
+ gpu: b200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - pytest -v -s tests/distributed/test_context_parallel.py
diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000000000..bc6342956109b
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,32 @@
+[run]
+source = vllm
+omit =
+ */tests/*
+ */test_*
+ */__pycache__/*
+ */build/*
+ */dist/*
+ */vllm.egg-info/*
+ */third_party/*
+ */examples/*
+ */benchmarks/*
+ */docs/*
+
+[report]
+exclude_lines =
+ pragma: no cover
+ def __repr__
+ if self.debug:
+ if settings.DEBUG
+ raise AssertionError
+ raise NotImplementedError
+ if 0:
+ if __name__ == .__main__.:
+ class .*\bProtocol\):
+ @(abc\.)?abstractmethod
+
+[html]
+directory = htmlcov
+
+[xml]
+output = coverage.xml
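A hedged local-usage sketch, assuming coverage.py and pytest are installed (nothing in this diff wires it into CI): with this .coveragerc at the repo root, the standard invocations pick it up automatically and write reports to the locations configured above.

```bash
pip install coverage
coverage run -m pytest tests/test_outputs.py   # respects the [run] source/omit settings above
coverage report -m                             # terminal summary with missing lines
coverage html                                  # writes htmlcov/
coverage xml                                   # writes coverage.xml
```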
diff --git a/.github/.bc-linter.yml b/.github/.bc-linter.yml
new file mode 100644
index 0000000000000..443dfa45af22c
--- /dev/null
+++ b/.github/.bc-linter.yml
@@ -0,0 +1,24 @@
+# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
+version: 1
+paths:
+# We temporarily disable globally, and will only enable with `annotations.include`
+# include:
+#   - "vllm/v1/attention/*.py"
+# - "vllm/v1/core/*.py"
+exclude:
+ - "**/*.py"
+
+scan:
+ functions: true # check free functions and methods
+ classes: true # check classes/dataclasses
+ public_only: true # ignore names starting with "_" at any level
+
+annotations:
+ include: # decorators that force‑include a symbol
+ - name: "bc_linter_include" # matched by simple name or dotted suffix
+ propagate_to_members: false # for classes, include methods/inner classes
+ exclude: # decorators that force‑exclude a symbol
+ - name: "bc_linter_skip" # matched by simple name or dotted suffix
+ propagate_to_members: true # for classes, exclude methods/inner classes
+
+excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"]
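To illustrate the annotation hooks, a sketch only: the linter matches decorators by simple name or dotted suffix, so the no-op definitions below are hypothetical stand-ins rather than an existing vLLM API.

```python
# Hypothetical marker decorators; any callable with these names satisfies the
# "bc_linter_include" / "bc_linter_skip" name matching configured above.
def bc_linter_include(obj):
    return obj

def bc_linter_skip(obj):
    return obj

@bc_linter_include
def checked_public_api(x: int) -> int:
    # force-included in BC scanning even while path-based scanning is disabled
    return x + 1

@bc_linter_skip
def experimental_public_api(x: int) -> int:
    # force-excluded from BC scanning
    return x - 1
```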
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 682b27ac8986e..9d749fe8d3238 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -2,23 +2,24 @@
# for more info about CODEOWNERS file
# This lists cover the "core" components of vLLM that require careful review
+/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/fused_moe @mgoin
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
-/vllm/multimodal @DarkLight1337 @ywang96
+/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/v1/attention @LucasWilkinson
/vllm/v1/sample @22quinn @houseroad
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
-/vllm/reasoning @aarnphm
-/vllm/entrypoints @aarnphm
+/vllm/reasoning @aarnphm @chaunceyjiang
+/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
+/vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
@@ -28,41 +29,60 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
-/vllm/v1/spec_decode @benchislett
+/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/attention/backends/flashinfer.py @mgoin
/vllm/v1/attention/backends/triton_attn.py @tdoublep
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/kv_cache_interface.py @heheda12345
+/vllm/v1/offloading @ApostaC
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
-/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
-/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
+/tests/evals @mgoin
+/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
-/tests/multimodal @DarkLight1337 @ywang96
-/tests/prefix_caching @comaniac @KuntaiDu
+/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
+/tests/v1/kv_connector/nixl_integration @NickLucche
+/tests/v1/kv_connector @ApostaC
+/tests/v1/offloading @ApostaC
+
+# Transformers backend
+/vllm/model_executor/models/transformers.py @hmellor
+/tests/models/test_transformers.py @hmellor
# Docs
-/docs @hmellor
+/docs/mkdocs @hmellor
+/docs/**/*.yml @hmellor
+/requirements/docs.txt @hmellor
+.readthedocs.yaml @hmellor
mkdocs.yaml @hmellor
+# Linting
+.markdownlint.yaml @hmellor
+.pre-commit-config.yaml @hmellor
+/tools/pre_commit @hmellor
+
# CPU
-/vllm/v1/worker/^cpu @bigPYJ1151
+/vllm/v1/worker/cpu* @bigPYJ1151
/csrc/cpu @bigPYJ1151
/vllm/platforms/cpu.py @bigPYJ1151
/cmake/cpu_extension.cmake @bigPYJ1151
/docker/Dockerfile.cpu @bigPYJ1151
# Intel GPU
-/vllm/v1/worker/^xpu @jikunshang
+/vllm/v1/worker/xpu* @jikunshang
/vllm/platforms/xpu.py @jikunshang
/docker/Dockerfile.xpu @jikunshang
@@ -70,6 +90,9 @@ mkdocs.yaml @hmellor
/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
/vllm/model_executor/models/qwen* @sighingnow
+# MTP-specific files
+/vllm/model_executor/models/deepseek_mtp.py @luccafong
+
# Mistral-specific files
/vllm/model_executor/models/mistral*.py @patrickvonplaten
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
@@ -88,3 +111,12 @@ mkdocs.yaml @hmellor
/vllm/v1/attention/backends/mla/rocm*.py @gshtras
/vllm/attention/ops/rocm*.py @gshtras
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
+
+# TPU
+/vllm/v1/worker/tpu* @NickLucche
+/vllm/platforms/tpu.py @NickLucche
+/vllm/v1/sample/tpu @NickLucche
+/vllm/tests/v1/tpu @NickLucche
+
+# KVConnector installation files
+/requirements/kv_connectors.txt @NickLucche
diff --git a/.github/ISSUE_TEMPLATE/750-RFC.yml b/.github/ISSUE_TEMPLATE/750-RFC.yml
index 7ee57c42895ca..c0e009855964a 100644
--- a/.github/ISSUE_TEMPLATE/750-RFC.yml
+++ b/.github/ISSUE_TEMPLATE/750-RFC.yml
@@ -43,10 +43,6 @@ body:
Any other things you would like to mention.
validations:
required: false
-- type: markdown
- attributes:
- value: >
- Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
- type: checkboxes
id: askllm
attributes:
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 495d207d44260..75ee3e3c55b46 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -124,9 +124,16 @@ pull_request_rules:
- or:
- files~=^examples/.*gpt[-_]?oss.*\.py
- files~=^tests/.*gpt[-_]?oss.*\.py
+ - files~=^tests/entrypoints/openai/test_response_api_with_harmony.py
+ - files~=^tests/entrypoints/test_context.py
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
+ - files~=^vllm/entrypoints/harmony_utils.py
+ - files~=^vllm/entrypoints/tool_server.py
+ - files~=^vllm/entrypoints/tool.py
+ - files~=^vllm/entrypoints/context.py
- title~=(?i)gpt[-_]?oss
+ - title~=(?i)harmony
actions:
label:
add:
@@ -164,7 +171,7 @@ pull_request_rules:
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^tests/v1/structured_output/
- - files=tests/v1/entrypoints/llm/test_guided_generate.py
+ - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
- files~=^vllm/v1/structured_output/
actions:
label:
@@ -273,6 +280,20 @@ pull_request_rules:
users:
- "sangstar"
+- name: assign reviewer for modelopt changes
+ conditions:
+ - or:
+ - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
+ - files~=^vllm/model_executor/layers/quantization/__init__\.py$
+ - files~=^tests/models/quantization/test_modelopt\.py$
+ - files~=^tests/quantization/test_modelopt\.py$
+ - files~=^tests/models/quantization/test_nvfp4\.py$
+ - files~=^docs/features/quantization/modelopt\.md$
+ actions:
+ assign:
+ users:
+ - "Edwardf0t1"
+
- name: remove 'needs-rebase' label when conflict is resolved
conditions:
- -conflict
@@ -281,3 +302,20 @@ pull_request_rules:
label:
remove:
- needs-rebase
+
+- name: label-kv-connector
+ description: Automatically apply kv-connector label
+ conditions:
+ - or:
+ - files~=^examples/online_serving/disaggregated[^/]*/.*
+ - files~=^examples/offline_inference/disaggregated[^/]*/.*
+ - files~=^examples/others/lmcache/
+ - files~=^tests/v1/kv_connector/
+ - files~=^vllm/distributed/kv_transfer/
+ - title~=(?i)\bP/?D\b
+ - title~=(?i)NIXL
+ - title~=(?i)LMCache
+ actions:
+ label:
+ add:
+ - kv-connector
\ No newline at end of file
diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml
index 315042fbf5cf4..d8bbedef3174b 100644
--- a/.github/workflows/add_label_automerge.yml
+++ b/.github/workflows/add_label_automerge.yml
@@ -10,7 +10,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Add label
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
github.rest.issues.addLabels({
diff --git a/.github/workflows/bc-lint.yml b/.github/workflows/bc-lint.yml
new file mode 100644
index 0000000000000..823695a921321
--- /dev/null
+++ b/.github/workflows/bc-lint.yml
@@ -0,0 +1,29 @@
+name: BC Lint
+
+on:
+ pull_request:
+ types:
+ - opened
+ - synchronize
+ - reopened
+ - labeled
+ - unlabeled
+
+jobs:
+ bc_lint:
+ if: github.repository_owner == 'vllm-project'
+ runs-on: ubuntu-latest
+ steps:
+ - name: Run BC Lint Action
+ uses: pytorch/test-infra/.github/actions/bc-lint@main
+ with:
+ repo: ${{ github.event.pull_request.head.repo.full_name }}
+ base_sha: ${{ github.event.pull_request.base.sha }}
+ head_sha: ${{ github.event.pull_request.head.sha }}
+ suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
+ docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
+ config_dir: .github
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
+ cancel-in-progress: true
diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index d5c6b8d43a6ef..c3e132a536a42 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -16,7 +16,7 @@ jobs:
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: '3.12'
diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml
index e0ab3872d8fa3..c2b17abe811cd 100644
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Label issues based on keywords
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
// Configuration: Add new labels and keywords here
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 195579f206a2f..e21d13b8161f3 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -17,7 +17,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml
index 1ee605dc7bb0d..8884359fa0ce4 100644
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Remind to run full CI on PR
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
try {
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index 656f3d3fa7bc4..82844810a633a 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -13,7 +13,7 @@ jobs:
actions: write
runs-on: ubuntu-latest
steps:
- - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
+ - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months
diff --git a/.gitignore b/.gitignore
index 465935d488f84..b1df673e83ca8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,7 @@
# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
-# triton jit
+# triton jit
.triton
# Byte-compiled / optimized / DLL files
@@ -177,6 +177,14 @@ cython_debug/
# VSCode
.vscode/
+# Claude
+CLAUDE.md
+.claude/
+
+# Codex
+AGENTS.md
+.codex/
+
# DS Store
.DS_Store
@@ -209,4 +217,4 @@ shellcheck*/
csrc/moe/marlin_moe_wna16/kernel_*
# Ignore ep_kernels_workspace folder
-ep_kernels_workspace/
\ No newline at end of file
+ep_kernels_workspace/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c16bdeeecd07a..8ca414ee4269b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -49,7 +49,7 @@ repos:
rev: 0.6.17
hooks:
- id: pip-compile
- args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
+ args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
files: ^requirements/test\.(in|txt)$
- repo: local
hooks:
@@ -60,38 +60,32 @@ repos:
files: ^requirements/test\.(in|txt)$
- id: mypy-local
name: Run mypy for local Python installation
- entry: tools/mypy.sh 0 "local"
- language: python
- types: [python]
- additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
+ entry: python tools/pre_commit/mypy.py 0 "local"
stages: [pre-commit] # Don't run in CI
+ <<: &mypy_common
+ language: python
+ types_or: [python, pyi]
+ require_serial: true
+ additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9
- entry: tools/mypy.sh 1 "3.9"
- language: python
- types: [python]
- additional_dependencies: *mypy_deps
+ entry: python tools/pre_commit/mypy.py 1 "3.9"
+ <<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
- entry: tools/mypy.sh 1 "3.10"
- language: python
- types: [python]
- additional_dependencies: *mypy_deps
+ entry: python tools/pre_commit/mypy.py 1 "3.10"
+ <<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.11
- entry: tools/mypy.sh 1 "3.11"
- language: python
- types: [python]
- additional_dependencies: *mypy_deps
+ entry: python tools/pre_commit/mypy.py 1 "3.11"
+ <<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.12
- entry: tools/mypy.sh 1 "3.12"
- language: python
- types: [python]
- additional_dependencies: *mypy_deps
+ entry: python tools/pre_commit/mypy.py 1 "3.12"
+ <<: *mypy_common
stages: [manual] # Only run in CI
- id: shellcheck
name: Lint shell scripts
@@ -155,18 +149,15 @@ repos:
additional_dependencies: [regex]
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
- entry: python tools/check_pickle_imports.py
+ entry: python tools/pre_commit/check_pickle_imports.py
language: python
types: [python]
- pass_filenames: false
- additional_dependencies: [pathspec, regex]
+ additional_dependencies: [regex]
- id: validate-config
name: Validate configuration has default values and that each field has a docstring
entry: python tools/validate_config.py
language: python
- types: [python]
- pass_filenames: true
- files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
+ additional_dependencies: [regex]
# Keep `suggestion` last
- id: suggestion
name: Suggestion
diff --git a/.yapfignore b/.yapfignore
index 2d6dcf8380cac..38158259032a6 100644
--- a/.yapfignore
+++ b/.yapfignore
@@ -1 +1,2 @@
collect_env.py
+vllm/model_executor/layers/fla/ops/*.py
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3f1f9a781a07a..180b896a7abac 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,6 +13,10 @@ cmake_minimum_required(VERSION 3.26)
# cmake --install . --component _C
project(vllm_extensions LANGUAGES CXX)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
@@ -171,6 +175,16 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()
+#
+# Set CUDA include flags for CXX compiler.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include")
+ if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl")
+ endif()
+endif()
+
#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
@@ -294,7 +308,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp"
- "csrc/attention/mla/cutlass_mla_entry.cu"
"csrc/quantization/fp8/per_token_group_quant.cu")
set_gencode_flags_for_srcs(
@@ -581,7 +594,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
set(SRCS
- "csrc/attention/mla/cutlass_mla_kernels.cu"
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -779,6 +791,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
+ # Hadacore kernels
+ cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
+ if(HADACORE_ARCHS)
+ set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${SRCS}"
+ CUDA_ARCHS "${HADACORE_ARCHS}")
+ list(APPEND VLLM_EXT_SRC "${SRCS}")
+ message(STATUS "Building hadacore")
+ endif()
+
# if CUDA endif
endif()
diff --git a/MANIFEST.in b/MANIFEST.in
index 82fd22b845f09..fb3cccbb4a9c1 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,7 +2,6 @@ include LICENSE
include requirements/common.txt
include requirements/cuda.txt
include requirements/rocm.txt
-include requirements/neuron.txt
include requirements/cpu.txt
include CMakeLists.txt
diff --git a/README.md b/README.md
index 4e03df758c261..0c6e5aa6b31d2 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,9 @@ Easy, fast, and cheap LLM serving for everyone
| Documentation | Blog | Paper | Twitter/X | User Forum | Developer Slack |
+---
+Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year!
+
---
*Latest News* 🔥
@@ -78,7 +81,7 @@ vLLM is flexible and easy to use with:
- Tensor, pipeline, data and expert parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU, with additional support for diverse hardware plugins such as Intel Gaudi, IBM Spyre, and Huawei Ascend
- Prefix caching support
- Multi-LoRA support
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 98b3600d13635..269a4d51ec2ef 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,807 +1,20 @@
-# Benchmarking vLLM
+# Benchmarks
-This README guides you through running benchmark tests with the extensive
-datasets supported on vLLM. It’s a living document, updated as new features and datasets
-become available.
+This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation.
-## Dataset Overview
+## Contents
-| Dataset | Online | Offline | Data Path |
-|---------|--------|---------|-----------|
-| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
-| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images: `wget http://images.cocodataset.org/zips/train2017.zip` |
-| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
-| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
-| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
-| Random | ✅ | ✅ | synthetic |
-| RandomMultiModal (Image/Video) | 🟡 | 🚧 | synthetic |
-| Prefix Repetition | ✅ | ✅ | synthetic |
-| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
-| HuggingFace-InstructCoder | ✅ | ✅ | `likaixin/InstructCoder` |
-| HuggingFace-AIMO | ✅ | ✅ | `AI-MO/aimo-validation-aime`, `AI-MO/NuminaMath-1.5`, `AI-MO/NuminaMath-CoT` |
-| HuggingFace-Other | ✅ | ✅ | `lmms-lab/LLaVA-OneVision-Data`, `Aeala/ShareGPT_Vicuna_unfiltered` |
-| Custom | ✅ | ✅ | Local file: `data.jsonl` |
+- **Serving benchmarks**: Scripts for testing online inference performance (latency, throughput)
+- **Throughput benchmarks**: Scripts for testing offline batch inference performance
+- **Specialized benchmarks**: Tools for testing specific features like structured output, prefix caching, long document QA, request prioritization, and multi-modal inference
+- **Dataset utilities**: Framework for loading and sampling from various benchmark datasets (ShareGPT, HuggingFace datasets, synthetic data, etc.)
-✅: supported
+## Usage
-🟡: Partial support
+For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
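+
+For example, a minimal online serving benchmark (adapted from the examples previously kept in this README; the model and dataset path are illustrative) first starts a server with `vllm serve NousResearch/Hermes-3-Llama-3.1-8B` and then runs:
+
+```bash
+# download the dataset first, e.g.:
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+vllm bench serve \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --endpoint /v1/completions \
+  --dataset-name sharegpt \
+  --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
+  --num-prompts 10
+```
+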
-🚧: to be supported
+For full CLI reference see:
-**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
-For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
-
-```bash
---dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
-```
-
-## 🚀 Example - Online Benchmark
-
-
-Show more
-
-
-
-First start serving your model
-
-```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B
-```
-
-Then run the benchmarking script
-
-```bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-vllm bench serve \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --endpoint /v1/completions \
- --dataset-name sharegpt \
- --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \
- --num-prompts 10
-```
-
-If successful, you will see the following output
-
-```text
-============ Serving Benchmark Result ============
-Successful requests: 10
-Benchmark duration (s): 5.78
-Total input tokens: 1369
-Total generated tokens: 2212
-Request throughput (req/s): 1.73
-Output token throughput (tok/s): 382.89
-Total Token throughput (tok/s): 619.85
----------------Time to First Token----------------
-Mean TTFT (ms): 71.54
-Median TTFT (ms): 73.88
-P99 TTFT (ms): 79.49
------Time per Output Token (excl. 1st token)------
-Mean TPOT (ms): 7.91
-Median TPOT (ms): 7.96
-P99 TPOT (ms): 8.03
----------------Inter-token Latency----------------
-Mean ITL (ms): 7.74
-Median ITL (ms): 7.70
-P99 ITL (ms): 8.39
-==================================================
-```
-
-### Custom Dataset
-
-If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
-
-```json
-{"prompt": "What is the capital of India?"}
-{"prompt": "What is the capital of Iran?"}
-{"prompt": "What is the capital of China?"}
-```
-
-```bash
-# start server
-VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
-```
-
-```bash
-# run benchmarking script
-vllm bench serve --port 9001 --save-result --save-detailed \
- --backend vllm \
- --model meta-llama/Llama-3.1-8B-Instruct \
- --endpoint /v1/completions \
- --dataset-name custom \
- --dataset-path \
- --custom-skip-chat-template \
- --num-prompts 80 \
- --max-concurrency 1 \
- --temperature=0.3 \
- --top-p=0.75 \
- --result-dir "./log/"
-```
-
-You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
-
-### VisionArena Benchmark for Vision Language Models
-
-```bash
-# need a model with vision capability here
-vllm serve Qwen/Qwen2-VL-7B-Instruct
-```
-
-```bash
-vllm bench serve \
- --backend openai-chat \
- --endpoint-type openai-chat \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --endpoint /v1/chat/completions \
- --dataset-name hf \
- --dataset-path lmarena-ai/VisionArena-Chat \
- --hf-split train \
- --num-prompts 1000
-```
-
-### InstructCoder Benchmark with Speculative Decoding
-
-``` bash
-VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
- --speculative-config $'{"method": "ngram",
- "num_speculative_tokens": 5, "prompt_lookup_max": 5,
- "prompt_lookup_min": 2}'
-```
-
-``` bash
-vllm bench serve \
- --model meta-llama/Meta-Llama-3-8B-Instruct \
- --dataset-name hf \
- --dataset-path likaixin/InstructCoder \
- --num-prompts 2048
-```
-
-### Other HuggingFaceDataset Examples
-
-```bash
-vllm serve Qwen/Qwen2-VL-7B-Instruct
-```
-
-`lmms-lab/LLaVA-OneVision-Data`:
-
-```bash
-vllm bench serve \
- --backend openai-chat \
- --endpoint-type openai-chat \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --endpoint /v1/chat/completions \
- --dataset-name hf \
- --dataset-path lmms-lab/LLaVA-OneVision-Data \
- --hf-split train \
- --hf-subset "chart2text(cauldron)" \
- --num-prompts 10
-```
-
-`Aeala/ShareGPT_Vicuna_unfiltered`:
-
-```bash
-vllm bench serve \
- --backend openai-chat \
- --endpoint-type openai-chat \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --endpoint /v1/chat/completions \
- --dataset-name hf \
- --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
- --hf-split train \
- --num-prompts 10
-```
-
-`AI-MO/aimo-validation-aime`:
-
-``` bash
-vllm bench serve \
- --model Qwen/QwQ-32B \
- --dataset-name hf \
- --dataset-path AI-MO/aimo-validation-aime \
- --num-prompts 10 \
- --seed 42
-```
-
-`philschmid/mt-bench`:
-
-``` bash
-vllm bench serve \
- --model Qwen/QwQ-32B \
- --dataset-name hf \
- --dataset-path philschmid/mt-bench \
- --num-prompts 80
-```
-
-### Running With Sampling Parameters
-
-When using OpenAI-compatible backends such as `vllm`, optional sampling
-parameters can be specified. Example client command:
-
-```bash
-vllm bench serve \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --endpoint /v1/completions \
- --dataset-name sharegpt \
- --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \
- --top-k 10 \
- --top-p 0.9 \
- --temperature 0.5 \
- --num-prompts 10
-```
-
-### Running With Ramp-Up Request Rate
-
-The benchmark tool also supports ramping up the request rate over the
-duration of the benchmark run. This can be useful for stress testing the
-server or finding the maximum throughput that it can handle, given some latency budget.
-
-Two ramp-up strategies are supported:
-
-- `linear`: Increases the request rate linearly from a start value to an end value.
-- `exponential`: Increases the request rate exponentially.
-
-The following arguments can be used to control the ramp-up:
-
-- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
-- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
-- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
-
-
-
-## 📈 Example - Offline Throughput Benchmark
-
-
-Show more
-
-
-
-```bash
-vllm bench throughput \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset-name sonnet \
- --dataset-path vllm/benchmarks/sonnet.txt \
- --num-prompts 10
-```
-
-If successful, you will see the following output
-
-```text
-Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
-Total num prompt tokens: 5014
-Total num output tokens: 1500
-```
-
-### VisionArena Benchmark for Vision Language Models
-
-```bash
-vllm bench throughput \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --backend vllm-chat \
- --dataset-name hf \
- --dataset-path lmarena-ai/VisionArena-Chat \
- --num-prompts 1000 \
- --hf-split train
-```
-
-The `num prompt tokens` now includes image token counts
-
-```text
-Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
-Total num prompt tokens: 14527
-Total num output tokens: 1280
-```
-
-### InstructCoder Benchmark with Speculative Decoding
-
-``` bash
-VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_USE_V1=1 \
-vllm bench throughput \
- --dataset-name=hf \
- --dataset-path=likaixin/InstructCoder \
- --model=meta-llama/Meta-Llama-3-8B-Instruct \
- --input-len=1000 \
- --output-len=100 \
- --num-prompts=2048 \
- --async-engine \
- --speculative-config $'{"method": "ngram",
- "num_speculative_tokens": 5, "prompt_lookup_max": 5,
- "prompt_lookup_min": 2}'
-```
-
-```text
-Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
-Total num prompt tokens: 261136
-Total num output tokens: 204800
-```
-
-### Other HuggingFaceDataset Examples
-
-`lmms-lab/LLaVA-OneVision-Data`:
-
-```bash
-vllm bench throughput \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --backend vllm-chat \
- --dataset-name hf \
- --dataset-path lmms-lab/LLaVA-OneVision-Data \
- --hf-split train \
- --hf-subset "chart2text(cauldron)" \
- --num-prompts 10
-```
-
-`Aeala/ShareGPT_Vicuna_unfiltered`:
-
-```bash
-vllm bench throughput \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --backend vllm-chat \
- --dataset-name hf \
- --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
- --hf-split train \
- --num-prompts 10
-```
-
-`AI-MO/aimo-validation-aime`:
-
-```bash
-vllm bench throughput \
- --model Qwen/QwQ-32B \
- --backend vllm \
- --dataset-name hf \
- --dataset-path AI-MO/aimo-validation-aime \
- --hf-split train \
- --num-prompts 10
-```
-
-Benchmark with LoRA adapters:
-
-``` bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-vllm bench throughput \
- --model meta-llama/Llama-2-7b-hf \
- --backend vllm \
- --dataset_path /ShareGPT_V3_unfiltered_cleaned_split.json \
- --dataset_name sharegpt \
- --num-prompts 10 \
- --max-loras 2 \
- --max-lora-rank 8 \
- --enable-lora \
- --lora-path yard1/llama-2-7b-sql-lora-test
- ```
-
-
-
-## 🛠️ Example - Structured Output Benchmark
-
-
-Show more
-
-
-
-Benchmark the performance of structured output generation (JSON, grammar, regex).
-
-### Server Setup
-
-```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B
-```
-
-### JSON Schema Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset json \
- --structured-output-ratio 1.0 \
- --request-rate 10 \
- --num-prompts 1000
-```
-
-### Grammar-based Generation Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset grammar \
- --structure-type grammar \
- --request-rate 10 \
- --num-prompts 1000
-```
-
-### Regex-based Generation Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset regex \
- --request-rate 10 \
- --num-prompts 1000
-```
-
-### Choice-based Generation Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset choice \
- --request-rate 10 \
- --num-prompts 1000
-```
-
-### XGrammar Benchmark Dataset
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset xgrammar_bench \
- --request-rate 10 \
- --num-prompts 1000
-```
-
-
-
-## 📚 Example - Long Document QA Benchmark
-
-
-Show more
-
-
-
-Benchmark the performance of long document question-answering with prefix caching.
-
-### Basic Long Document QA Test
-
-```bash
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --enable-prefix-caching \
- --num-documents 16 \
- --document-length 2000 \
- --output-len 50 \
- --repeat-count 5
-```
-
-### Different Repeat Modes
-
-```bash
-# Random mode (default) - shuffle prompts randomly
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --enable-prefix-caching \
- --num-documents 8 \
- --document-length 3000 \
- --repeat-count 3 \
- --repeat-mode random
-
-# Tile mode - repeat entire prompt list in sequence
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --enable-prefix-caching \
- --num-documents 8 \
- --document-length 3000 \
- --repeat-count 3 \
- --repeat-mode tile
-
-# Interleave mode - repeat each prompt consecutively
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --enable-prefix-caching \
- --num-documents 8 \
- --document-length 3000 \
- --repeat-count 3 \
- --repeat-mode interleave
-```
-
-
-
-## 🗂️ Example - Prefix Caching Benchmark
-
-
-Show more
-
-
-
-Benchmark the efficiency of automatic prefix caching.
-
-### Fixed Prompt with Prefix Caching
-
-```bash
-python3 benchmarks/benchmark_prefix_caching.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --enable-prefix-caching \
- --num-prompts 1 \
- --repeat-count 100 \
- --input-length-range 128:256
-```
-
-### ShareGPT Dataset with Prefix Caching
-
-```bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-python3 benchmarks/benchmark_prefix_caching.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
- --enable-prefix-caching \
- --num-prompts 20 \
- --repeat-count 5 \
- --input-length-range 128:256
-```
-
-### Prefix Repetition Dataset
-
-```bash
-vllm bench serve \
- --backend openai \
- --model meta-llama/Llama-2-7b-chat-hf \
- --dataset-name prefix_repetition \
- --num-prompts 100 \
- --prefix-repetition-prefix-len 512 \
- --prefix-repetition-suffix-len 128 \
- --prefix-repetition-num-prefixes 5 \
- --prefix-repetition-output-len 128
-```
-
-
-
-## ⚡ Example - Request Prioritization Benchmark
-
-
-Show more
-
-
-
-Benchmark the performance of request prioritization in vLLM.
-
-### Basic Prioritization Test
-
-```bash
-python3 benchmarks/benchmark_prioritization.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --input-len 128 \
- --output-len 64 \
- --num-prompts 100 \
- --scheduling-policy priority
-```
-
-### Multiple Sequences per Prompt
-
-```bash
-python3 benchmarks/benchmark_prioritization.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --input-len 128 \
- --output-len 64 \
- --num-prompts 100 \
- --scheduling-policy priority \
- --n 2
-```
-
-
-
-## 👁️ Example - Multi-Modal Benchmark
-
-
-Show more
-
-
-
-Benchmark the performance of multi-modal requests in vLLM.
-
-### Images (ShareGPT4V)
-
-Start vLLM:
-
-```bash
-python -m vllm.entrypoints.openai.api_server \
- --model Qwen/Qwen2.5-VL-7B-Instruct \
- --dtype bfloat16 \
- --limit-mm-per-prompt '{"image": 1}' \
- --allowed-local-media-path /path/to/sharegpt4v/images
-```
-
-Send requests with images:
-
-```bash
-python benchmarks/benchmark_serving.py \
- --backend openai-chat \
- --model Qwen/Qwen2.5-VL-7B-Instruct \
- --dataset-name sharegpt \
- --dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
- --num-prompts 100 \
- --save-result \
- --result-dir ~/vllm_benchmark_results \
- --save-detailed \
- --endpoint /v1/chat/completion
-```
-
-### Videos (ShareGPT4Video)
-
-Start vLLM:
-
-```bash
-python -m vllm.entrypoints.openai.api_server \
- --model Qwen/Qwen2.5-VL-7B-Instruct \
- --dtype bfloat16 \
- --limit-mm-per-prompt '{"video": 1}' \
- --allowed-local-media-path /path/to/sharegpt4video/videos
-```
-
-Send requests with videos:
-
-```bash
-python benchmarks/benchmark_serving.py \
- --backend openai-chat \
- --model Qwen/Qwen2.5-VL-7B-Instruct \
- --dataset-name sharegpt \
- --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
- --num-prompts 100 \
- --save-result \
- --result-dir ~/vllm_benchmark_results \
- --save-detailed \
- --endpoint /v1/chat/completion
-```
-
-### Synthetic Random Images (random-mm)
-
-Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.
-
-Notes:
-
-- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
-- Video sampling is not yet implemented.
-
-Start the server (example):
-
-```bash
-vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
- --dtype bfloat16 \
- --max-model-len 16384 \
- --limit-mm-per-prompt '{"image": 3, "video": 0}' \
- --mm-processor-kwargs max_pixels=1003520
-```
-
-Benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `random-output-len`.
-
-Ex.1: Fixed number of items and a single image resolution, enforcing generation of approx 40 tokens:
-
-```bash
-vllm bench serve \
- --backend openai-chat \
- --model Qwen/Qwen2.5-VL-3B-Instruct \
- --endpoint /v1/chat/completions \
- --dataset-name random-mm \
- --num-prompts 100 \
- --max-concurrency 10 \
- --random-prefix-len 25 \
- --random-input-len 300 \
- --random-output-len 40 \
- --random-range-ratio 0.2 \
- --random-mm-base-items-per-request 2 \
- --random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
- --random-mm-bucket-config '{(224, 224, 1): 1.0}' \
- --request-rate inf \
- --ignore-eos \
- --seed 42
-```
-
-The number of items per request can be controlled by passing multiple image buckets:
-
-```bash
- --random-mm-base-items-per-request 2 \
- --random-mm-num-mm-items-range-ratio 0.5 \
- --random-mm-limit-mm-per-prompt '{"image": 4, "video": 0}' \
- --random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}' \
-```
-
-Flags specific to `random-mm`:
-
-- `--random-mm-base-items-per-request`: base number of multimodal items per request.
-- `--random-mm-num-mm-items-range-ratio`: vary item count uniformly in the closed integer range [floor(n·(1−r)), ceil(n·(1+r))]. Set r=0 to keep it fixed; r=1 allows 0 items.
-- `--random-mm-limit-mm-per-prompt`: per-modality hard caps, e.g. '{"image": 3, "video": 0}'.
-- `--random-mm-bucket-config`: dict mapping (H, W, T) → probability. Entries with probability 0 are removed; remaining probabilities are renormalized to sum to 1. Use T=1 for images. Set any T>1 for videos (video sampling not yet supported).
-
-Behavioral notes:
-
-- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
-
-How sampling works:
-
-- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
-- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
-- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing.
-This should be seen as an edge case, and if this behavior can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number. Note that this might result in errors due to engine config `--limit-mm-per-prompt`.
-- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`.
-
-
+-
+-
+-
diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md
index 3aa988aac2548..d1bdb4c43f10b 100644
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@@ -149,3 +149,70 @@ The script follows a systematic process to find the optimal parameters:
4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
+
+## Batched `auto_tune`
+
+The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file.
+
+### Prerequisites
+
+- **jq**: This script requires `jq` to parse the JSON configuration file.
+- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated.
+
+### How to Run
+
+1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run.
+
+2. **Execute the script**:
+
+ ```bash
+   bash batch_auto_tune.sh <config_file> [gcs_upload_path]
+ ```
+
+    - `<config_file>`: **Required.** Path to your JSON configuration file.
+ - `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`).
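+
+For example, with the configuration file from the next section saved as `runs_config.json`, a typical invocation (the GCS bucket is illustrative) might look like:
+
+```bash
+bash batch_auto_tune.sh runs_config.json gs://my-bucket/benchmark-results
+```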
+
+### Configuration File
+
+The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run.
+
+Here is an example `runs_config.json` with two benchmark configurations (the `system` field accepts either `TPU` or `GPU`):
+
+```json
+[
+ {
+ "base": "/home/user",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "system": "TPU",
+ "tp": 8,
+ "input_len": 128,
+ "output_len": 2048,
+ "max_model_len": 2300,
+ "num_seqs_list": "128 256",
+ "num_batched_tokens_list": "8192 16384"
+ },
+ {
+ "base": "/home/user",
+ "model": "meta-llama/Llama-3.1-70B-Instruct",
+    "system": "TPU",
+ "tp": 8,
+ "input_len": 4000,
+ "output_len": 16,
+ "max_model_len": 4096,
+ "num_seqs_list": "64 128",
+ "num_batched_tokens_list": "4096 8192",
+ "max_latency_allowed_ms": 500
+ }
+]
+```
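+
+Each key is uppercased (non-alphanumeric characters other than underscores are stripped) and passed to `auto_tune.sh` as an environment variable. As a rough sketch, the first entry above is executed approximately like this manual invocation:
+
+```bash
+BASE=/home/user MODEL=meta-llama/Llama-3.1-8B-Instruct SYSTEM=TPU TP=8 \
+INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 \
+NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="8192 16384" \
+bash auto_tune.sh
+```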
+
+### Output
+
+The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added:
+
+- `run_id`: A unique identifier for the run, derived from the timestamp.
+- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`).
+- `results`: The content of the `result.txt` file from the `auto_tune.sh` run.
+- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided).
+
+A summary of successful and failed runs is also printed to the console upon completion.
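+
+For example, once all runs finish, the recorded status of each run can be inspected with `jq` (a quick sketch, assuming the configuration file is named `runs_config.json`):
+
+```bash
+jq -r '.[] | [.status, .run_id, .gcs_results] | @tsv' runs_config.json
+```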
diff --git a/benchmarks/auto_tune/batch_auto_tune.sh b/benchmarks/auto_tune/batch_auto_tune.sh
new file mode 100755
index 0000000000000..57ef20daf6b71
--- /dev/null
+++ b/benchmarks/auto_tune/batch_auto_tune.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+
+INPUT_JSON="$1"
+GCS_PATH="$2" # Optional GCS path for uploading results for each run
+
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
+AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh"
+
+if [[ -z "$INPUT_JSON" ]]; then
+ echo "Error: Input JSON file not provided."
+  echo "Usage: $0 <config_file> [gcs_upload_path]"
+ exit 1
+fi
+
+if [[ ! -f "$INPUT_JSON" ]]; then
+ echo "Error: File not found at '$INPUT_JSON'"
+ exit 1
+fi
+
+if ! command -v jq &> /dev/null; then
+ echo "Error: 'jq' command not found. Please install jq to process the JSON input."
+ exit 1
+fi
+
+if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then
+ echo "Error: 'gcloud' command not found, but a GCS_PATH was provided."
+ exit 1
+fi
+
+SUCCESS_COUNT=0
+FAILURE_COUNT=0
+FAILED_RUNS=()
+SCRIPT_START_TIME=$(date +%s)
+
+json_content=$(cat "$INPUT_JSON")
+if ! num_runs=$(echo "$json_content" | jq 'length'); then
+ echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2
+ exit 1
+fi
+
+echo "Found $num_runs benchmark configurations in $INPUT_JSON."
+echo "Starting benchmark runs..."
+echo "--------------------------------------------------"
+
+for i in $(seq 0 $(($num_runs - 1))); do
+ run_object=$(echo "$json_content" | jq ".[$i]")
+
+ RUN_START_TIME=$(date +%s)
+ ENV_VARS_ARRAY=()
+ # Dynamically create env vars from the JSON object's keys
+ for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do
+ value=$(echo "$run_object" | jq -r ".$key")
+ var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_')
+ ENV_VARS_ARRAY+=("${var_name}=${value}")
+ done
+
+ echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}"
+
+ # Execute auto_tune.sh and capture output
+ RUN_OUTPUT_FILE=$(mktemp)
+ if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then
+ STATUS="SUCCESS"
+ ((SUCCESS_COUNT++))
+ else
+ STATUS="FAILURE"
+ ((FAILURE_COUNT++))
+ FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
+ fi
+
+ RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
+ rm "$RUN_OUTPUT_FILE"
+
+ # Parse results and optionally upload them to GCS
+ RUN_ID=""
+ RESULTS=""
+ GCS_RESULTS_URL=""
+ if [[ "$STATUS" == "SUCCESS" ]]; then
+ RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true)
+
+ if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then
+ RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")")
+ RESULT_DIR=$(dirname "$RESULT_FILE_PATH")
+ RESULTS=$(cat "$RESULT_FILE_PATH")
+
+ if [[ -n "$GCS_PATH" ]]; then
+ GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}"
+ echo "Uploading results to GCS..."
+ if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then
+ echo "GCS upload successful."
+ else
+ echo "Warning: GCS upload failed for RUN_ID $RUN_ID."
+ fi
+ fi
+ else
+ echo "Warning: Could not find result file for a successful run."
+ STATUS="WARNING_NO_RESULT_FILE"
+ fi
+ fi
+
+ # Add the results back into the JSON object for this run
+ json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \
+ '.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}')
+
+ RUN_END_TIME=$(date +%s)
+ echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS"
+ echo "--------------------------------------------------"
+
+ # Save intermediate progress back to the file
+ echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON"
+
+done
+
+SCRIPT_END_TIME=$(date +%s)
+echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds."
+echo
+echo "====================== SUMMARY ======================"
+echo "Successful runs: $SUCCESS_COUNT"
+echo "Failed runs: $FAILURE_COUNT"
+echo "==================================================="
+
+if [[ $FAILURE_COUNT -gt 0 ]]; then
+ echo "Details of failed runs (see JSON file for full parameters):"
+ for failed in "${FAILED_RUNS[@]}"; do
+ echo " - $failed"
+ done
+fi
+
+echo "Updated results have been saved to '$INPUT_JSON'."
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
deleted file mode 100644
index 64ffa62c04d85..0000000000000
--- a/benchmarks/benchmark_dataset.py
+++ /dev/null
@@ -1,1288 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-This module defines a framework for sampling benchmark requests from various
-datasets. Each dataset subclass of BenchmarkDataset must implement sample
-generation. Supported dataset types include:
- - ShareGPT
- - Random (synthetic)
- - Sonnet
- - BurstGPT
- - HuggingFace
- - VisionArena
-"""
-
-import base64
-import io
-import json
-import logging
-import random
-from abc import ABC, abstractmethod
-from collections.abc import Mapping
-from copy import deepcopy
-from dataclasses import dataclass
-from functools import cache
-from io import BytesIO
-from typing import Any, Callable, Optional, Union
-
-import numpy as np
-import pandas as pd
-from datasets import load_dataset
-from PIL import Image
-from transformers import PreTrainedTokenizerBase
-
-from vllm.lora.request import LoRARequest
-from vllm.lora.utils import get_adapter_absolute_path
-from vllm.multimodal import MultiModalDataDict
-from vllm.multimodal.image import convert_image_mode
-from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
-
-logger = logging.getLogger(__name__)
-
-# -----------------------------------------------------------------------------
-# Data Classes
-# -----------------------------------------------------------------------------
-
-
-@dataclass
-class SampleRequest:
- """
- Represents a single inference request for benchmarking.
- """
-
- prompt: Union[str, Any]
- prompt_len: int
- expected_output_len: int
- multi_modal_data: Optional[Union[MultiModalDataDict, dict, list[dict]]] = None
- lora_request: Optional[LoRARequest] = None
- request_id: Optional[str] = None
-
-
-# -----------------------------------------------------------------------------
-# Benchmark Dataset Base Class
-# -----------------------------------------------------------------------------
-
-
-class BenchmarkDataset(ABC):
- DEFAULT_SEED = 0
- IS_MULTIMODAL = False
-
- def __init__(
- self,
- dataset_path: Optional[str] = None,
- random_seed: int = DEFAULT_SEED,
- ) -> None:
- """
- Initialize the BenchmarkDataset with an optional dataset path and random
- seed. Args:
- dataset_path (Optional[str]): Path to the dataset. If None, it
- indicates that a default or random dataset might be used.
- random_seed (int): Seed value for reproducible shuffling or
- sampling. Defaults to DEFAULT_SEED.
- """
- self.dataset_path = dataset_path
- # Set the random seed, ensuring that a None value is replaced with the
- # default seed.
- self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
- self.data = None
-
- def apply_multimodal_chat_transformation(
- self, prompt: str, mm_content: Optional[MultiModalDataDict] = None
- ) -> list[dict]:
- """
- Transform a prompt and optional multimodal content into a chat format.
- This method is used for chat models that expect a specific conversation
- format.
- """
- content = [{"text": prompt, "type": "text"}]
- if mm_content is not None:
- content.append(mm_content)
- return [{"role": "user", "content": content}]
-
- def load_data(self) -> None:
- """
- Load data from the dataset path into self.data.
-
- This method must be overridden by subclasses since the method to load
- data will vary depending on the dataset format and source.
-
- Raises:
- NotImplementedError: If a subclass does not implement this method.
- """
- # TODO (jenniferzhao): add support for downloading data
- raise NotImplementedError("load_data must be implemented in subclasses.")
-
- def get_random_lora_request(
- self,
- tokenizer: PreTrainedTokenizerBase,
- max_loras: Optional[int] = None,
- lora_path: Optional[str] = None,
- ) -> tuple[Optional[LoRARequest], AnyTokenizer]:
- """
- Optionally select a random LoRA request and return its associated
- tokenizer.
-
- This method is used when LoRA parameters are provided. It randomly
- selects a LoRA based on max_loras and retrieves a cached tokenizer for
- that LoRA if available. Otherwise, it returns the base tokenizer.
-
- Args:
- tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
- LoRA is selected. max_loras (Optional[int]): The maximum number of
- LoRAs available. If None, LoRA is not used. lora_path
- (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA
- is not used.
-
- Returns:
- tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first
- element is a LoRARequest (or None if not applicable) and the second
- element is the tokenizer associated with the LoRA request (or the
- base tokenizer).
- """
- if max_loras is None or lora_path is None:
- return None, tokenizer
-
- # Generate a random LoRA ID in the range [1, max_loras].
- lora_id = random.randint(1, max_loras)
- lora_request = LoRARequest(
- lora_name=str(lora_id),
- lora_int_id=lora_id,
- lora_path=lora_path_on_disk(lora_path),
- )
- if lora_id not in lora_tokenizer_cache:
- lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
- # Return lora_request and the cached tokenizer if available; otherwise,
- # return the base tokenizer
- return lora_request, lora_tokenizer_cache[lora_id] or tokenizer
-
- @abstractmethod
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- request_id_prefix: str = "",
- ) -> list[SampleRequest]:
- """
- Abstract method to generate sample requests from the dataset.
-
- Subclasses must override this method to implement dataset-specific logic
- for generating a list of SampleRequest objects.
-
- Args:
- tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
- for processing the dataset's text.
- num_requests (int): The number of sample requests to generate.
- request_id_prefix (str) The prefix of request_id.
-
- Returns:
- list[SampleRequest]: A list of sample requests generated from the
- dataset.
- """
- raise NotImplementedError("sample must be implemented in subclasses.")
-
- def maybe_oversample_requests(
- self,
- requests: list[SampleRequest],
- num_requests: int,
- request_id_prefix: str = "",
- ) -> None:
- """
- Oversamples the list of requests if its size is less than the desired
- number.
-
- Args:
- requests (List[SampleRequest]): The current list of sampled
- requests.
- num_requests (int): The target number of requests.
- request_id_prefix (str) The prefix of the request ids.
- """
- if len(requests) < num_requests:
- random.seed(self.random_seed)
- additional = deepcopy(
- random.choices(requests, k=num_requests - len(requests))
- )
- for i in range(len(additional)):
- req = additional[i]
- req.request_id = request_id_prefix + str(len(requests) + i)
- requests.extend(additional)
- logger.info("Oversampled requests to reach %d total samples.", num_requests)
-
-
-# -----------------------------------------------------------------------------
-# Utility Functions and Global Caches
-# -----------------------------------------------------------------------------
-
-
-def is_valid_sequence(
- prompt_len: int,
- output_len: int,
- min_len: int = 4,
- max_prompt_len: int = 1024,
- max_total_len: int = 2048,
- skip_min_output_len_check: bool = False,
-) -> bool:
- """
- Validate a sequence based on prompt and output lengths.
-
- Default pruning criteria are copied from the original `sample_hf_requests`
- and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
- from `sample_requests` in benchmark_throughput.py.
- """
- # Check for invalid conditions
- prompt_too_short = prompt_len < min_len
- output_too_short = (not skip_min_output_len_check) and (output_len < min_len)
- prompt_too_long = prompt_len > max_prompt_len
- combined_too_long = (prompt_len + output_len) > max_total_len
-
- # Return True if none of the invalid conditions are met
- return not (
- prompt_too_short or output_too_short or prompt_too_long or combined_too_long
- )
-
-
-@cache
-def lora_path_on_disk(lora_path: str) -> str:
- return get_adapter_absolute_path(lora_path)
-
-
-# Global cache for LoRA tokenizers.
-lora_tokenizer_cache: dict[int, AnyTokenizer] = {}
-
-
-def process_image(image: Any) -> Mapping[str, Any]:
- """
- Process a single image input and return a multimedia content dictionary.
-
- Supports three input types:
-
- 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
- containing raw image data. - Loads the bytes as a PIL.Image.Image.
-
- 2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
- a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
- a dictionary with the image as a base64 data URL.
-
- 3. String input: - Treats the string as a URL or local file path. -
- Prepends "file://" if the string doesn't start with "http://" or
- "file://". - Returns a dictionary with the image URL.
-
- Raises:
- ValueError: If the input is not a supported type.
- """
- if isinstance(image, dict) and "bytes" in image:
- image = Image.open(BytesIO(image["bytes"]))
- if isinstance(image, Image.Image):
- image = convert_image_mode(image, "RGB")
- with io.BytesIO() as image_data:
- image.save(image_data, format="JPEG")
- image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
- return {
- "type": "image_url",
- "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
- }
-
- if isinstance(image, str):
- image_url = (
- image if image.startswith(("http://", "file://")) else f"file://{image}"
- )
- return {"type": "image_url", "image_url": {"url": image_url}}
-
- raise ValueError(
- f"Invalid image input {image}. Must be a PIL.Image.Image"
- " or str or dictionary with raw image bytes."
- )
-
-
-def process_video(video: Any) -> Mapping[str, Any]:
- """
- Process a single video input and return a multimedia content dictionary.
-
- Supports the following input types:
-
- 1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
- containing raw video data.
-
- 2. String input: - Treats the string as a URL or local file path. -
- Prepends "file://" if the string doesn't start with "http://" or
- "file://". - Returns a dictionary with the image URL.
-
- Raises:
- ValueError: If the input is not a supported type.
- """
- if isinstance(video, dict) and "bytes" in video:
- video_bytes = video["bytes"]
- video_base64 = base64.b64encode(video_bytes).decode("utf-8")
- return {
- "type": "video_url",
- "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
- }
-
- if isinstance(video, str):
- video_url = (
- video if video.startswith(("http://", "file://")) else f"file://{video}"
- )
- return {"type": "video_url", "video_url": {"url": video_url}}
-
- raise ValueError(
- f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501
- )
-
-
-# -----------------------------------------------------------------------------
-# Random Dataset Implementation (Synthetic Data)
-# -----------------------------------------------------------------------------
-
-
-class RandomDataset(BenchmarkDataset):
- # Default values copied from benchmark_serving.py for the random dataset.
- DEFAULT_PREFIX_LEN = 0
- DEFAULT_RANGE_RATIO = 0.0
- DEFAULT_INPUT_LEN = 1024
- DEFAULT_OUTPUT_LEN = 128
-
- def __init__(
- self,
- **kwargs,
- ) -> None:
- super().__init__(**kwargs)
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- prefix_len: int = DEFAULT_PREFIX_LEN,
- range_ratio: float = DEFAULT_RANGE_RATIO,
- input_len: int = DEFAULT_INPUT_LEN,
- output_len: int = DEFAULT_OUTPUT_LEN,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list[SampleRequest]:
- # Enforce range_ratio < 1
- assert range_ratio < 1.0, (
- "random_range_ratio must be < 1.0 to ensure a valid sampling range"
- )
-
- vocab_size = tokenizer.vocab_size
- num_special_tokens = tokenizer.num_special_tokens_to_add()
- real_input_len = input_len - num_special_tokens
-
- prefix_token_ids = (
- np.random.randint(0, vocab_size, size=prefix_len).tolist()
- if prefix_len > 0
- else []
- )
-
- # New sampling logic: [X * (1 - b), X * (1 + b)]
- input_low = int(real_input_len * (1 - range_ratio))
- input_high = int(real_input_len * (1 + range_ratio))
- output_low = int(output_len * (1 - range_ratio))
- # Ensure the lower bound for output length is at least 1 to prevent
- # sampling 0 tokens, which can cause request failures.
- output_low = max(output_low, 1)
- output_high = int(output_len * (1 + range_ratio))
-
- # Add logging for debugging
- logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
- logger.info("Sampling output_len from [%s, %s]", output_low, output_high)
-
- input_lens = np.random.randint(input_low, input_high + 1, size=num_requests)
- output_lens = np.random.randint(output_low, output_high + 1, size=num_requests)
- offsets = np.random.randint(0, vocab_size, size=num_requests)
-
- requests = []
- for i in range(num_requests):
- inner_seq = (
- (offsets[i] + i + np.arange(input_lens[i])) % vocab_size
- ).tolist()
- token_sequence = prefix_token_ids + inner_seq
- prompt = tokenizer.decode(token_sequence)
- # After decoding the prompt we have to encode and decode it again.
- # This is done because in some cases N consecutive tokens
- # give a string tokenized into != N number of tokens.
- # For example for GPT2Tokenizer:
- # [6880, 6881] -> ['Ġcalls', 'here'] ->
- # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
- # To avoid uncontrolled change of the prompt length,
- # the encoded sequence is truncated before being decoded again.
- total_input_len = prefix_len + int(input_lens[i])
- re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
- :total_input_len
- ]
- prompt = tokenizer.decode(re_encoded_sequence)
- total_input_len = len(re_encoded_sequence)
- requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=total_input_len,
- expected_output_len=int(output_lens[i]),
- request_id=request_id_prefix + str(i),
- )
- )
-
- return requests
-
-
-# -----------------------------------------------------------------------------
-# ShareGPT Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class ShareGPTDataset(BenchmarkDataset):
- """
- Implements the ShareGPT dataset. Loads data from a JSON file and generates
- sample requests based on conversation turns.
- """
-
- def __init__(self, **kwargs) -> None:
- super().__init__(**kwargs)
- self.load_data()
-
- def load_data(self) -> None:
- if self.dataset_path is None:
- raise ValueError("dataset_path must be provided for loading data.")
-
- with open(self.dataset_path, encoding="utf-8") as f:
- self.data = json.load(f)
- # Filter entries with at least two conversation turns.
- self.data = [
- entry
- for entry in self.data
- if "conversations" in entry and len(entry["conversations"]) >= 2
- ]
- random.seed(self.random_seed)
- random.shuffle(self.data)
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- lora_path: Optional[str] = None,
- max_loras: Optional[int] = None,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- samples: list = []
- ind = 0
- for entry in self.data:
- if len(samples) >= num_requests:
- break
- prompt, completion = (
- entry["conversations"][0]["value"],
- entry["conversations"][1]["value"],
- )
-
- lora_request, tokenizer = self.get_random_lora_request(
- tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path
- )
- prompt_ids = tokenizer(prompt).input_ids
- completion_ids = tokenizer(completion).input_ids
- prompt_len = len(prompt_ids)
- new_output_len = len(completion_ids) if output_len is None else output_len
- if not is_valid_sequence(
- prompt_len,
- new_output_len,
- skip_min_output_len_check=output_len is not None,
- ):
- continue
- if image_path := entry.get("image"):
- mm_content = process_image(image_path)
- elif video_path := entry.get("video"):
- mm_content = process_video(video_path)
- else:
- mm_content = None
- if enable_multimodal_chat:
- prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
- samples.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=new_output_len,
- lora_request=lora_request,
- multi_modal_data=mm_content,
- request_id=request_id_prefix + str(ind),
- )
- )
- ind += 1
- self.maybe_oversample_requests(samples, num_requests, request_id_prefix)
- return samples
-
-
-# -----------------------------------------------------------------------------
-# Custom Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class CustomDataset(BenchmarkDataset):
- """
- Implements the Custom dataset. Loads data from a JSONL file and generates
- sample requests based on conversation turns. E.g.,
- ```
- {"prompt": "What is the capital of India?"}
- {"prompt": "What is the capital of Iran?"}
- {"prompt": "What is the capital of China?"}
- ```
- """
-
- def __init__(self, **kwargs) -> None:
- super().__init__(**kwargs)
- self.load_data()
-
- def load_data(self) -> None:
- if self.dataset_path is None:
- raise ValueError("dataset_path must be provided for loading data.")
-
- # self.data will be a list of dictionaries
- # e.g., [{"prompt": "What is the capital of India?"}, ...]
- # This will be the standardized format which load_data()
- # has to convert into depending on the filetype of dataset_path.
- # sample() will assume this standardized format of self.data
- self.data = []
-
- # Load the JSONL file
- if self.dataset_path.endswith(".jsonl"):
- jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
-
- # check if the JSONL file has a 'prompt' column
- if "prompt" not in jsonl_data.columns:
- raise ValueError("JSONL file must contain a 'prompt' column.")
-
- # Convert each row to a dictionary and append to self.data
- # This will convert the DataFrame to a list of dictionaries
- # where each dictionary corresponds to a row in the DataFrame.
- # This is the standardized format we want for self.data
- for _, row in jsonl_data.iterrows():
- self.data.append(row.to_dict())
- else:
- raise NotImplementedError(
- "Only JSONL format is supported for CustomDataset."
- )
-
- random.seed(self.random_seed)
- random.shuffle(self.data)
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- lora_path: Optional[str] = None,
- max_loras: Optional[int] = None,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- skip_chat_template: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- sampled_requests = []
- for i, item in enumerate(self.data):
- if len(sampled_requests) >= num_requests:
- break
- prompt = item["prompt"]
-
- # apply template
- if not skip_chat_template:
- prompt = tokenizer.apply_chat_template(
- [{"role": "user", "content": prompt}],
- add_generation_prompt=True,
- tokenize=False,
- )
-
- prompt_len = len(tokenizer(prompt).input_ids)
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- request_id=request_id_prefix + str(i),
- )
- )
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
-
- return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# Sonnet Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class SonnetDataset(BenchmarkDataset):
- """
- Simplified implementation of the Sonnet dataset. Loads poem lines from a
- text file and generates sample requests. Default values here copied from
- `benchmark_serving.py` for the sonnet dataset.
- """
-
- DEFAULT_PREFIX_LEN = 200
- DEFAULT_INPUT_LEN = 550
- DEFAULT_OUTPUT_LEN = 150
-
- def __init__(
- self,
- **kwargs,
- ) -> None:
- super().__init__(**kwargs)
- self.load_data()
-
- def load_data(self) -> None:
- if not self.dataset_path:
- raise ValueError("dataset_path must be provided.")
- with open(self.dataset_path, encoding="utf-8") as f:
- self.data = f.readlines()
-
- def sample(
- self,
- tokenizer,
- num_requests: int,
- prefix_len: int = DEFAULT_PREFIX_LEN,
- input_len: int = DEFAULT_INPUT_LEN,
- output_len: int = DEFAULT_OUTPUT_LEN,
- return_prompt_formatted: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- # Calculate average token length for a poem line.
- tokenized_lines = [tokenizer(line).input_ids for line in self.data]
- avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines)
-
- # Build the base prompt.
- base_prompt = "Pick as many lines as you can from these poem lines:\n"
- base_msg = [{"role": "user", "content": base_prompt}]
- base_fmt = tokenizer.apply_chat_template(
- base_msg, add_generation_prompt=True, tokenize=False
- )
- base_offset = len(tokenizer(base_fmt).input_ids)
- if input_len <= base_offset:
- raise ValueError(
- f"'input_len' must be higher than the base prompt length "
- f"({base_offset})."
- )
-
- # Determine how many poem lines to use.
- num_input_lines = round((input_len - base_offset) / avg_len)
- num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
- prefix_lines = self.data[:num_prefix_lines]
-
- samples = []
- ind = 0
- while len(samples) < num_requests:
- extra_lines = random.choices(
- self.data, k=num_input_lines - num_prefix_lines
- )
- prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
- msg = [{"role": "user", "content": prompt}]
- prompt_formatted = tokenizer.apply_chat_template(
- msg, add_generation_prompt=True, tokenize=False
- )
- prompt_len = len(tokenizer(prompt_formatted).input_ids)
-
- if prompt_len <= input_len:
- samples.append(
- SampleRequest(
- prompt=prompt_formatted if return_prompt_formatted else prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- request_id=request_id_prefix + str(ind),
- )
- )
- ind += 1
- return samples
-
-
-# -----------------------------------------------------------------------------
-# BurstGPT Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class BurstGPTDataset(BenchmarkDataset):
- """
- Implements the BurstGPT dataset. Loads request/response token counts from a
- CSV file and generates sample requests with synthetic prompts of matching
- length. Only rows with Model "GPT-4" and positive response tokens are used.
- """
-
- def __init__(self, **kwargs) -> None:
- super().__init__(**kwargs)
- self.load_data()
-
- def load_data(
- self,
- ):
- if self.dataset_path is None:
- raise ValueError("dataset_path must be provided for loading data.")
-
- df = pd.read_csv(self.dataset_path)
- # Filter to keep only GPT-4 rows.
- gpt4_df = df[df["Model"] == "GPT-4"]
- # Remove failed requests (where Response tokens is 0 or less).
- gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
- # Keep all filtered rows; sampling happens later in _sample_loaded_data().
- self.data = gpt4_df
-
- def _sample_loaded_data(self, num_requests: int) -> list:
- if num_requests <= len(self.data):
- data = self.data.sample(n=num_requests, random_state=self.random_seed)
- else:
- data = self.data.sample(
- n=num_requests,
- random_state=self.random_seed,
- replace=True,
- )
- # Convert the dataframe to a list of lists.
- return data.values.tolist()
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- max_loras: Optional[int] = None,
- lora_path: Optional[str] = None,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list[SampleRequest]:
- samples = []
- data = self._sample_loaded_data(num_requests=num_requests)
- for i in range(num_requests):
- input_len = int(data[i][2])
- output_len = int(data[i][3])
- lora_req, tokenizer = self.get_random_lora_request(
- tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path
- )
- vocab_size = tokenizer.vocab_size
- # Generate a synthetic prompt: a list of token IDs computed as (i +
- # j) modulo vocab_size.
- token_ids = [(i + j) % vocab_size for j in range(input_len)]
- prompt = tokenizer.decode(token_ids)
- samples.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=input_len,
- expected_output_len=output_len,
- lora_request=lora_req,
- request_id=request_id_prefix + str(i),
- )
- )
- return samples
-
-
-# -----------------------------------------------------------------------------
-# HuggingFace Dataset Base Implementation
-# -----------------------------------------------------------------------------
-class HuggingFaceDataset(BenchmarkDataset):
- """Base class for datasets hosted on HuggingFace."""
-
- SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
-
- def __init__(
- self,
- dataset_path: str,
- dataset_split: str,
- no_stream: bool = False,
- dataset_subset: Optional[str] = None,
- **kwargs,
- ) -> None:
- super().__init__(dataset_path=dataset_path, **kwargs)
-
- self.dataset_split = dataset_split
- self.dataset_subset = dataset_subset
- self.load_stream = not no_stream
- self.load_data()
-
- def load_data(self) -> None:
- """Load data from HuggingFace datasets."""
- self.data = load_dataset(
- self.dataset_path,
- name=self.dataset_subset,
- split=self.dataset_split,
- streaming=self.load_stream,
- )
- self.data = self.data.shuffle(seed=self.random_seed)
-
-
-# -----------------------------------------------------------------------------
-# Conversation Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class ConversationDataset(HuggingFaceDataset):
- """Dataset for conversation data with multimodal support."""
-
- SUPPORTED_DATASET_PATHS = {
- "lmms-lab/LLaVA-OneVision-Data",
- "Aeala/ShareGPT_Vicuna_unfiltered",
- }
- IS_MULTIMODAL = True
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- # Filter examples with at least 2 conversations
- filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
- sampled_requests = []
- dynamic_output = output_len is None
- ind = 0
-
- for item in filtered_data:
- if len(sampled_requests) >= num_requests:
- break
- conv = item["conversations"]
- prompt, completion = conv[0]["value"], conv[1]["value"]
-
- prompt_ids = tokenizer(prompt).input_ids
- completion_ids = tokenizer(completion).input_ids
- prompt_len = len(prompt_ids)
- completion_len = len(completion_ids)
- output_len = completion_len if dynamic_output else output_len
- assert isinstance(output_len, int) and output_len > 0
- if dynamic_output and not is_valid_sequence(prompt_len, completion_len):
- continue
- mm_content = process_image(item["image"]) if "image" in item else None
- if enable_multimodal_chat:
- # Note: when multimodal chat is enabled, the recorded prompt_len is
- # no longer accurate; the request output is used to count the
- # actual prompt and output lengths.
- prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- multi_modal_data=mm_content,
- request_id=request_id_prefix + str(ind),
- )
- )
- ind += 1
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# Vision Arena Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class VisionArenaDataset(HuggingFaceDataset):
- """
- Vision Arena Dataset.
- """
-
- DEFAULT_OUTPUT_LEN = 128
- SUPPORTED_DATASET_PATHS = {
- "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"],
- "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"],
- }
- IS_MULTIMODAL = True
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
- sampled_requests = []
- for i, item in enumerate(self.data):
- if len(sampled_requests) >= num_requests:
- break
- parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
- if parser_fn is None:
- raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
- prompt = parser_fn(item)
- mm_content = process_image(item["images"][0])
- prompt_len = len(tokenizer(prompt).input_ids)
- if enable_multimodal_chat:
- # Note: when multimodal chat is enabled, the recorded prompt_len is
- # no longer accurate; the request output is used to count the
- # actual prompt length.
- prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- multi_modal_data=mm_content,
- request_id=request_id_prefix + str(i),
- )
- )
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# Instruct Coder Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class InstructCoderDataset(HuggingFaceDataset):
- """
- InstructCoder Dataset.
- https://huggingface.co/datasets/likaixin/InstructCoder
-
- InstructCoder is a dataset designed for general code editing. It consists
- of 114,239 instruction-input-output triplets and covers multiple distinct
- code editing scenarios.
- """
-
- DEFAULT_OUTPUT_LEN = 200 # this is the average default output length
- SUPPORTED_DATASET_PATHS = {
- "likaixin/InstructCoder",
- }
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
- sampled_requests = []
- for i, item in enumerate(self.data):
- if len(sampled_requests) >= num_requests:
- break
- prompt = (
- f"{item['input']}\n\n{item['instruction']} Just output "
- "the code, do not include any explanation."
- )
-
- # apply template
- prompt = tokenizer.apply_chat_template(
- [{"role": "user", "content": prompt}],
- add_generation_prompt=True,
- tokenize=False,
- )
- prompt_len = len(tokenizer(prompt).input_ids)
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- request_id=request_id_prefix + str(i),
- )
- )
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# MT-Bench Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class MTBenchDataset(HuggingFaceDataset):
- """
- MT-Bench Dataset.
- https://huggingface.co/datasets/philschmid/mt-bench
-
- We create a single-turn dataset for MT-Bench.
- This is similar to the speculative decoding benchmark setup in vLLM:
- https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
- """ # noqa: E501
-
- DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM
- SUPPORTED_DATASET_PATHS = {
- "philschmid/mt-bench",
- }
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
- sampled_requests = []
-
- for i, item in enumerate(self.data):
- if len(sampled_requests) >= num_requests:
- break
- prompt = item["turns"][0]
-
- # apply template
- prompt = tokenizer.apply_chat_template(
- [{"role": "user", "content": prompt}],
- add_generation_prompt=True,
- tokenize=False,
- )
-
- prompt_len = len(tokenizer(prompt).input_ids)
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- request_id=request_id_prefix + str(i),
- )
- )
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# AIMO Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class AIMODataset(HuggingFaceDataset):
- """
- Dataset class for processing an AIMO dataset with reasoning questions.
- """
-
- SUPPORTED_DATASET_PATHS = {
- "AI-MO/aimo-validation-aime",
- "AI-MO/NuminaMath-1.5",
- "AI-MO/NuminaMath-CoT",
- }
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- sampled_requests = []
- dynamic_output = output_len is None
- ind = 0
-
- for item in self.data:
- if len(sampled_requests) >= num_requests:
- break
- prompt, completion = item["problem"], item["solution"]
-
- prompt_ids = tokenizer(prompt).input_ids
- completion_ids = tokenizer(completion).input_ids
- prompt_len = len(prompt_ids)
- completion_len = len(completion_ids)
- output_len = completion_len if dynamic_output else output_len
- assert isinstance(output_len, int) and output_len > 0
- if dynamic_output and not is_valid_sequence(
- prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000
- ):
- continue
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- multi_modal_data=None,
- request_id=request_id_prefix + str(ind),
- )
- )
- ind += 1
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# Next Edit Prediction Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-zeta_prompt = """### Instruction:
-You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.
-
-### User Edits:
-
-{}
-
-### User Excerpt:
-
-{}
-
-### Response:
-
-""" # noqa: E501
-
-
-def _format_zeta_prompt(
- sample: dict, original_start_marker: str = "<|editable_region_start|>"
-) -> dict:
- """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
-
- This function formats examples from the NEP dataset
- into prompts and expected outputs. It could be
- further extended to support more NEP datasets.
-
- Args:
- sample: The dataset sample containing events,
- inputs, and outputs.
- original_start_marker: The marker indicating the
- start of the editable region. Defaults to
- "<|editable_region_start|>".
-
- Returns:
- A dictionary with the formatted prompts and expected outputs.
- """
- events = sample["events"]
- input = sample["input"]
- output = sample["output"]
- prompt = zeta_prompt.format(events, input)
-
- # following the original implementation, extract the focused region
- # from the raw output
- output_start_index = output.find(original_start_marker)
- output_focused_region = output[output_start_index:]
- expected_output = output_focused_region
-
- return {"prompt": prompt, "expected_output": expected_output}
-
-
-class NextEditPredictionDataset(HuggingFaceDataset):
- """
- Dataset class for processing a Next Edit Prediction dataset.
- """
-
- SUPPORTED_DATASET_PATHS = {
- "zed-industries/zeta",
- }
- MAPPING_PROMPT_FUNCS = {
- "zed-industries/zeta": _format_zeta_prompt,
- }
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- request_id_prefix: str = "",
- **kwargs,
- ):
- formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path)
- if formatting_prompt_func is None:
- raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
- samples = []
- for i, sample in enumerate(self.data):
- sample = formatting_prompt_func(sample)
- samples.append(
- SampleRequest(
- prompt=sample["prompt"],
- prompt_len=len(tokenizer(sample["prompt"]).input_ids),
- expected_output_len=len(
- tokenizer(sample["expected_output"]).input_ids
- ),
- request_id=request_id_prefix + str(i),
- )
- )
- if len(samples) >= num_requests:
- break
- self.maybe_oversample_requests(samples, num_requests, request_id_prefix)
- return samples
-
-
-# -----------------------------------------------------------------------------
-# ASR Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class ASRDataset(HuggingFaceDataset):
- """
- Dataset class for processing an ASR dataset for transcription.
- Tested on the following datasets:
-
- +----------------+----------------------------------------+--------------------------+-----------------------------+
- | Dataset | Domain | Speaking Style | hf-subset |
- +----------------+----------------------------------------+--------------------------+-----------------------------+
- | TED-LIUM | TED talks | Oratory | release1, release2, release3|
- | | | | release3-speaker-adaptation |
- | VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... |
- | LibriSpeech | Audiobook | Narrated | clean, other, all |
- | GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test |
- | SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test |
- | AMI | Meetings | Spontaneous | ihm, sdm |
- +----------------+----------------------------------------+--------------------------+-----------------------------+
-
- """ # noqa: E501
-
- SUPPORTED_DATASET_PATHS = {
- "openslr/librispeech_asr",
- "facebook/voxpopuli",
- "LIUM/tedlium",
- "edinburghcstr/ami",
- "speechcolab/gigaspeech",
- "kensho/spgispeech",
- }
-
- DEFAULT_OUTPUT_LEN = 128
- IS_MULTIMODAL = True
-
- # TODO Whisper-specific. Abstract interface when more models are supported.
- TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
- skip_long_audios: bool = True
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- import librosa
-
- output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
- prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
- prompt_len = len(tokenizer(prompt).input_ids)
- sampled_requests = []
- skipped = 0
- ind = 0
- for item in self.data:
- if len(sampled_requests) >= num_requests:
- break
- audio = item["audio"]
- y, sr = audio["array"], audio["sampling_rate"]
- duration_s = librosa.get_duration(y=y, sr=sr)
- # Whisper max supported duration
- if self.skip_long_audios and duration_s > 30:
- skipped += 1
- continue
-
- mm_content = {"audio": (y, sr)}
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- multi_modal_data=mm_content,
- request_id=request_id_prefix + str(ind),
- )
- )
- ind += 1
- if skipped:
- logger.warning(
- "%d samples discarded from dataset due to"
- " their length being greater than"
- " what Whisper supports.",
- skipped,
- )
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
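
For reference, a minimal sketch of how the CustomDataset loader above is driven; it mirrors the CustomDataset call in the benchmark_serving.py hunk below. The file name, prompt text, and tokenizer are placeholders, and the import assumes the old benchmarks/ layout where benchmark_dataset.py sits next to the caller:

    import json

    from transformers import AutoTokenizer

    from benchmark_dataset import CustomDataset

    # CustomDataset.load_data() only accepts .jsonl files with a "prompt" column.
    with open("prompts.jsonl", "w") as f:
        f.write(json.dumps({"prompt": "Summarize the plot of Hamlet."}) + "\n")

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any HF tokenizer works here
    requests = CustomDataset(dataset_path="prompts.jsonl").sample(
        tokenizer=tokenizer,
        num_requests=1,
        output_len=128,
        skip_chat_template=True,  # gpt2 has no chat template to apply
    )
    print(requests[0].prompt, requests[0].prompt_len)
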
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index d8b960edaa468..a7892f3f71243 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -1,191 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Benchmark the latency of processing a single batch of requests."""
-
-import argparse
-import dataclasses
-import json
-import os
-import time
-from typing import Any, Optional
-
-import numpy as np
-from tqdm import tqdm
-from typing_extensions import deprecated
-
-import vllm.envs as envs
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
-from vllm.inputs import PromptType
-from vllm.sampling_params import BeamSearchParams
-from vllm.utils import FlexibleArgumentParser
-
-
-def save_to_pytorch_benchmark_format(
- args: argparse.Namespace, results: dict[str, Any]
-) -> None:
- pt_records = convert_to_pytorch_benchmark_format(
- args=args,
- metrics={"latency": results["latencies"]},
- extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
- )
- if pt_records:
- pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
- write_to_json(pt_file, pt_records)
-
-
-@deprecated(
- "benchmark_latency.py is deprecated and will be removed in a "
- "future version. Please use 'vllm bench latency' instead.",
-)
-def main(args: argparse.Namespace):
- print(args)
-
- engine_args = EngineArgs.from_cli_args(args)
-
- # NOTE(woosuk): If the request cannot be processed in a single batch,
- # the engine will automatically process the request in multiple batches.
- llm = LLM(**dataclasses.asdict(engine_args))
- assert llm.llm_engine.model_config.max_model_len >= (
- args.input_len + args.output_len
- ), (
- "Please ensure that max_model_len is greater than"
- " the sum of input_len and output_len."
- )
-
- sampling_params = SamplingParams(
- n=args.n,
- temperature=1.0,
- top_p=1.0,
- ignore_eos=True,
- max_tokens=args.output_len,
- detokenize=not args.disable_detokenize,
- )
- print(sampling_params)
- dummy_prompt_token_ids = np.random.randint(
- 10000, size=(args.batch_size, args.input_len)
- )
- dummy_prompts: list[PromptType] = [
- {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
- ]
-
- def llm_generate():
- if not args.use_beam_search:
- llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
- else:
- llm.beam_search(
- dummy_prompts,
- BeamSearchParams(
- beam_width=args.n,
- max_tokens=args.output_len,
- ignore_eos=True,
- ),
- )
-
- def run_to_completion(profile_dir: Optional[str] = None):
- if profile_dir:
- llm.start_profile()
- llm_generate()
- llm.stop_profile()
- else:
- start_time = time.perf_counter()
- llm_generate()
- end_time = time.perf_counter()
- latency = end_time - start_time
- return latency
-
- print("Warming up...")
- for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
- run_to_completion(profile_dir=None)
-
- if args.profile:
- profile_dir = envs.VLLM_TORCH_PROFILER_DIR
- print(f"Profiling (results will be saved to '{profile_dir}')...")
- run_to_completion(profile_dir=profile_dir)
- return
-
- # Benchmark.
- latencies = []
- for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
- latencies.append(run_to_completion(profile_dir=None))
- latencies = np.array(latencies)
- percentages = [10, 25, 50, 75, 90, 99]
- percentiles = np.percentile(latencies, percentages)
- print(f"Avg latency: {np.mean(latencies)} seconds")
- for percentage, percentile in zip(percentages, percentiles):
- print(f"{percentage}% percentile latency: {percentile} seconds")
-
- # Output JSON results if specified
- if args.output_json:
- results = {
- "avg_latency": np.mean(latencies),
- "latencies": latencies.tolist(),
- "percentiles": dict(zip(percentages, percentiles.tolist())),
- }
- with open(args.output_json, "w") as f:
- json.dump(results, f, indent=4)
- save_to_pytorch_benchmark_format(args, results)
-
-
-def create_argument_parser():
- parser = FlexibleArgumentParser(
- description="Benchmark the latency of processing a single batch of "
- "requests till completion."
- )
- parser.add_argument("--input-len", type=int, default=32)
- parser.add_argument("--output-len", type=int, default=128)
- parser.add_argument("--batch-size", type=int, default=8)
- parser.add_argument(
- "--n",
- type=int,
- default=1,
- help="Number of generated sequences per prompt.",
- )
- parser.add_argument("--use-beam-search", action="store_true")
- parser.add_argument(
- "--num-iters-warmup",
- type=int,
- default=10,
- help="Number of iterations to run for warmup.",
- )
- parser.add_argument(
- "--num-iters", type=int, default=30, help="Number of iterations to run."
- )
- parser.add_argument(
- "--profile",
- action="store_true",
- help="profile the generation process of a single batch",
- )
- parser.add_argument(
- "--output-json",
- type=str,
- default=None,
- help="Path to save the latency results in JSON format.",
- )
- parser.add_argument(
- "--disable-detokenize",
- action="store_true",
- help=(
- "Do not detokenize responses (i.e. do not include "
- "detokenization time in the latency measurement)"
- ),
- )
-
- parser = EngineArgs.add_cli_args(parser)
- # V1 enables prefix caching by default which skews the latency
- # numbers. We need to disable prefix caching by default.
- parser.set_defaults(enable_prefix_caching=False)
-
- return parser
-
+import sys
if __name__ == "__main__":
- parser = create_argument_parser()
- args = parser.parse_args()
- if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
- raise OSError(
- "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
- "Please set it to a valid path to use torch profiler."
- )
- main(args)
+ print("""DEPRECATED: This script has been moved to the vLLM CLI.
+
+Please use the following command instead:
+ vllm bench latency
+
+For help with the new command, run:
+ vllm bench latency --help
+
+Alternatively, you can run the new command directly with:
+ python -m vllm.entrypoints.cli.main bench latency --help
+""")
+ sys.exit(1)
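
For reference, a typical invocation of the replacement command, assuming the old script's flags (--input-len, --output-len, --batch-size, --num-iters) carry over unchanged to the new subcommand:

    vllm bench latency \
        --model <your_model> \
        --input-len 32 \
        --output-len 128 \
        --batch-size 8 \
        --num-iters 30
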
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 934df05efac17..76cf51498020b 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -1,1324 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-r"""Benchmark online serving throughput.
-
-On the server side, run one of the following commands:
- vLLM OpenAI API server
- vllm serve <your_model> \
- --swap-space 16
-
-On the client side, run:
- python benchmarks/benchmark_serving.py \
- --backend <backend> \
- --model <your_model> \
- --dataset-name sharegpt \
- --dataset-path <path to dataset> \
- --request-rate <request_rate> \ # By default <request_rate> is inf
- --num-prompts <num_prompts> # By default <num_prompts> is 1000
-
- when using tgi backend, add
- --endpoint /generate_stream
- to the end of the command above.
-"""
-
-import argparse
-import asyncio
-import gc
-import json
-import os
-import random
-import time
-import warnings
-from collections.abc import Iterable
-from dataclasses import dataclass
-from datetime import datetime
-from typing import Any, Literal, Optional
-
-import numpy as np
-from tqdm.asyncio import tqdm
-from transformers import PreTrainedTokenizerBase
-from typing_extensions import deprecated
-
-from backend_request_func import (
- ASYNC_REQUEST_FUNCS,
- OPENAI_COMPATIBLE_BACKENDS,
- RequestFuncInput,
- RequestFuncOutput,
-)
-
-try:
- from vllm.transformers_utils.tokenizer import get_tokenizer
-except ImportError:
- from backend_request_func import get_tokenizer
-
-try:
- from vllm.utils import FlexibleArgumentParser
-except ImportError:
- from argparse import ArgumentParser as FlexibleArgumentParser
-
-from benchmark_dataset import (
- AIMODataset,
- ASRDataset,
- BurstGPTDataset,
- ConversationDataset,
- CustomDataset,
- HuggingFaceDataset,
- InstructCoderDataset,
- MTBenchDataset,
- NextEditPredictionDataset,
- RandomDataset,
- SampleRequest,
- ShareGPTDataset,
- SonnetDataset,
- VisionArenaDataset,
-)
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm.benchmarks.serve import get_request
-
-MILLISECONDS_TO_SECONDS_CONVERSION = 1000
-
-
-@dataclass
-class BenchmarkMetrics:
- completed: int
- total_input: int
- total_output: int
- request_throughput: float
- request_goodput: float
- output_throughput: float
- total_token_throughput: float
- mean_ttft_ms: float
- median_ttft_ms: float
- std_ttft_ms: float
- percentiles_ttft_ms: list[tuple[float, float]]
- mean_tpot_ms: float
- median_tpot_ms: float
- std_tpot_ms: float
- percentiles_tpot_ms: list[tuple[float, float]]
- mean_itl_ms: float
- median_itl_ms: float
- std_itl_ms: float
- percentiles_itl_ms: list[tuple[float, float]]
- # E2EL stands for end-to-end latency per request.
- # It is the time taken on the client side from sending
- # a request to receiving a complete response.
- mean_e2el_ms: float
- median_e2el_ms: float
- std_e2el_ms: float
- percentiles_e2el_ms: list[tuple[float, float]]
-
-
-def calculate_metrics(
- input_requests: list[SampleRequest],
- outputs: list[RequestFuncOutput],
- dur_s: float,
- tokenizer: PreTrainedTokenizerBase,
- selected_percentile_metrics: list[str],
- selected_percentiles: list[float],
- goodput_config_dict: dict[str, float],
-) -> tuple[BenchmarkMetrics, list[int]]:
- actual_output_lens: list[int] = []
- total_input = 0
- completed = 0
- good_completed = 0
- itls: list[float] = []
- tpots: list[float] = []
- all_tpots: list[float] = []
- ttfts: list[float] = []
- e2els: list[float] = []
- for i in range(len(outputs)):
- if outputs[i].success:
- output_len = outputs[i].output_tokens
-
- if not output_len:
- # We use the tokenizer to count the number of output tokens
- # for some serving backends instead of looking at
- # len(outputs[i].itl) since multiple output tokens may be
- # bundled together
- # Note : this may inflate the output token count slightly
- output_len = len(
- tokenizer(
- outputs[i].generated_text, add_special_tokens=False
- ).input_ids
- )
- actual_output_lens.append(output_len)
- total_input += input_requests[i].prompt_len
- tpot = 0
- if output_len > 1:
- latency_minus_ttft = outputs[i].latency - outputs[i].ttft
- tpot = latency_minus_ttft / (output_len - 1)
- tpots.append(tpot)
- # Note: if output_len <= 1, we regard tpot as 0 for goodput
- all_tpots.append(tpot)
- itls += outputs[i].itl
- ttfts.append(outputs[i].ttft)
- e2els.append(outputs[i].latency)
- completed += 1
- else:
- actual_output_lens.append(0)
-
- if goodput_config_dict:
- valid_metrics = []
- slo_values = []
-
- if "ttft" in goodput_config_dict:
- valid_metrics.append(ttfts)
- slo_values.append(
- goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
- )
- if "tpot" in goodput_config_dict:
- valid_metrics.append(all_tpots)
- slo_values.append(
- goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
- )
- if "e2el" in goodput_config_dict:
- valid_metrics.append(e2els)
- slo_values.append(
- goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
- )
-
- for req_metric in zip(*valid_metrics):
- is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
- if is_good_req:
- good_completed += 1
-
- if completed == 0:
- warnings.warn(
- "All requests failed. This is likely due to a misconfiguration "
- "on the benchmark arguments.",
- stacklevel=2,
- )
- metrics = BenchmarkMetrics(
- completed=completed,
- total_input=total_input,
- total_output=sum(actual_output_lens),
- request_throughput=completed / dur_s,
- request_goodput=good_completed / dur_s,
- output_throughput=sum(actual_output_lens) / dur_s,
- total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
- mean_ttft_ms=np.mean(ttfts or 0)
- * 1000, # ttfts is empty if streaming is not supported by backend
- std_ttft_ms=np.std(ttfts or 0) * 1000,
- median_ttft_ms=np.median(ttfts or 0) * 1000,
- percentiles_ttft_ms=[
- (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
- ],
- mean_tpot_ms=np.mean(tpots or 0) * 1000,
- std_tpot_ms=np.std(tpots or 0) * 1000,
- median_tpot_ms=np.median(tpots or 0) * 1000,
- percentiles_tpot_ms=[
- (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
- ],
- mean_itl_ms=np.mean(itls or 0) * 1000,
- std_itl_ms=np.std(itls or 0) * 1000,
- median_itl_ms=np.median(itls or 0) * 1000,
- percentiles_itl_ms=[
- (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
- ],
- mean_e2el_ms=np.mean(e2els or 0) * 1000,
- std_e2el_ms=np.std(e2els or 0) * 1000,
- median_e2el_ms=np.median(e2els or 0) * 1000,
- percentiles_e2el_ms=[
- (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
- ],
- )
-
- return metrics, actual_output_lens
-
-
-async def benchmark(
- backend: str,
- api_url: str,
- base_url: str,
- model_id: str,
- model_name: str,
- tokenizer: PreTrainedTokenizerBase,
- input_requests: list[SampleRequest],
- logprobs: Optional[int],
- request_rate: float,
- burstiness: float,
- disable_tqdm: bool,
- profile: bool,
- selected_percentile_metrics: list[str],
- selected_percentiles: list[float],
- ignore_eos: bool,
- goodput_config_dict: dict[str, float],
- max_concurrency: Optional[int],
- lora_modules: Optional[Iterable[str]],
- extra_body: Optional[dict],
- ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
- ramp_up_start_rps: Optional[int] = None,
- ramp_up_end_rps: Optional[int] = None,
-):
- if backend in ASYNC_REQUEST_FUNCS:
- request_func = ASYNC_REQUEST_FUNCS[backend]
- else:
- raise ValueError(f"Unknown backend: {backend}")
-
- print("Starting initial single prompt test run...")
- test_prompt, test_prompt_len, test_output_len, test_mm_content = (
- input_requests[0].prompt,
- input_requests[0].prompt_len,
- input_requests[0].expected_output_len,
- input_requests[0].multi_modal_data,
- )
-
- assert (
- test_mm_content is None
- or isinstance(test_mm_content, dict)
- or (
- isinstance(test_mm_content, list)
- and all(isinstance(item, dict) for item in test_mm_content)
- )
- ), "multi_modal_data must be a dict or list[dict]"
- test_input = RequestFuncInput(
- model=model_id,
- model_name=model_name,
- prompt=test_prompt,
- api_url=api_url,
- prompt_len=test_prompt_len,
- output_len=test_output_len,
- logprobs=logprobs,
- multi_modal_content=test_mm_content,
- ignore_eos=ignore_eos,
- extra_body=extra_body,
- )
-
- test_output = await request_func(request_func_input=test_input)
- if not test_output.success:
- raise ValueError(
- "Initial test run failed - Please make sure benchmark arguments "
- f"are correctly specified. Error: {test_output.error}"
- )
- else:
- print("Initial test run completed. Starting main benchmark run...")
-
- if lora_modules:
- # For each input request, choose a LoRA module at random.
- lora_modules = iter(
- [random.choice(lora_modules) for _ in range(len(input_requests))]
- )
-
- if profile:
- print("Starting profiler...")
- profile_input = RequestFuncInput(
- model=model_id,
- model_name=model_name,
- prompt=test_prompt,
- api_url=base_url + "/start_profile",
- prompt_len=test_prompt_len,
- output_len=test_output_len,
- logprobs=logprobs,
- multi_modal_content=test_mm_content,
- ignore_eos=ignore_eos,
- extra_body=extra_body,
- )
- profile_output = await request_func(request_func_input=profile_input)
- if profile_output.success:
- print("Profiler started")
-
- distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
-
- if ramp_up_strategy is not None:
- print(
- f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase "
- f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over "
- "the duration of the benchmark."
- )
- else:
- print(f"Traffic request rate: {request_rate} RPS.")
-
- print(f"Burstiness factor: {burstiness} ({distribution})")
- print(f"Maximum request concurrency: {max_concurrency}")
-
- pbar = None if disable_tqdm else tqdm(total=len(input_requests))
-
- # This can be used once the minimum Python version is 3.10 or higher,
- # and it will simplify the code in limited_request_func.
- # semaphore = (asyncio.Semaphore(max_concurrency)
- # if max_concurrency else contextlib.nullcontext())
- semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
-
- async def limited_request_func(request_func_input, pbar):
- if semaphore is None:
- return await request_func(request_func_input=request_func_input, pbar=pbar)
- async with semaphore:
- return await request_func(request_func_input=request_func_input, pbar=pbar)
-
- benchmark_start_time = time.perf_counter()
- tasks: list[asyncio.Task] = []
-
- rps_change_events = []
- last_int_rps = -1
- if ramp_up_strategy is not None and ramp_up_start_rps is not None:
- last_int_rps = ramp_up_start_rps
- rps_change_events.append(
- {
- "rps": last_int_rps,
- "timestamp": datetime.now().isoformat(),
- }
- )
-
- async for request, current_request_rate in get_request(
- input_requests,
- request_rate,
- burstiness,
- ramp_up_strategy,
- ramp_up_start_rps,
- ramp_up_end_rps,
- ):
- if ramp_up_strategy is not None:
- current_int_rps = int(current_request_rate)
- if current_int_rps > last_int_rps:
- timestamp = datetime.now().isoformat()
- for rps_val in range(last_int_rps + 1, current_int_rps + 1):
- rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
- last_int_rps = current_int_rps
-
- prompt, prompt_len, output_len, mm_content, request_id = (
- request.prompt,
- request.prompt_len,
- request.expected_output_len,
- request.multi_modal_data,
- request.request_id,
- )
- req_model_id, req_model_name = model_id, model_name
- if lora_modules:
- req_lora_module = next(lora_modules)
- req_model_id, req_model_name = req_lora_module, req_lora_module
-
- request_func_input = RequestFuncInput(
- model=req_model_id,
- model_name=req_model_name,
- prompt=prompt,
- api_url=api_url,
- prompt_len=prompt_len,
- output_len=output_len,
- logprobs=logprobs,
- multi_modal_content=mm_content,
- ignore_eos=ignore_eos,
- extra_body=extra_body,
- request_id=request_id,
- )
- task = limited_request_func(request_func_input=request_func_input, pbar=pbar)
- tasks.append(asyncio.create_task(task))
- outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
-
- if pbar is not None:
- pbar.close()
-
- benchmark_duration = time.perf_counter() - benchmark_start_time
-
- metrics, actual_output_lens = calculate_metrics(
- input_requests=input_requests,
- outputs=outputs,
- dur_s=benchmark_duration,
- tokenizer=tokenizer,
- selected_percentile_metrics=selected_percentile_metrics,
- selected_percentiles=selected_percentiles,
- goodput_config_dict=goodput_config_dict,
- )
-
- print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
- print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
- if max_concurrency is not None:
- print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
- if request_rate != float("inf"):
- print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
- print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
- print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
- print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
- print(
- "{:<40} {:<10.2f}".format(
- "Request throughput (req/s):", metrics.request_throughput
- )
- )
- if goodput_config_dict:
- print(
- "{:<40} {:<10.2f}".format(
- "Request goodput (req/s):", metrics.request_goodput
- )
- )
- print(
- "{:<40} {:<10.2f}".format(
- "Output token throughput (tok/s):", metrics.output_throughput
- )
- )
- print(
- "{:<40} {:<10.2f}".format(
- "Total Token throughput (tok/s):", metrics.total_token_throughput
- )
- )
-
- result = {
- "duration": benchmark_duration,
- "completed": metrics.completed,
- "total_input_tokens": metrics.total_input,
- "total_output_tokens": metrics.total_output,
- "request_throughput": metrics.request_throughput,
- "request_goodput": metrics.request_goodput if goodput_config_dict else None,
- "output_throughput": metrics.output_throughput,
- "total_token_throughput": metrics.total_token_throughput,
- "input_lens": [output.prompt_len for output in outputs],
- "output_lens": actual_output_lens,
- "ttfts": [output.ttft for output in outputs],
- "itls": [output.itl for output in outputs],
- "generated_texts": [output.generated_text for output in outputs],
- "errors": [output.error for output in outputs],
- }
-
- if rps_change_events:
- result["rps_change_events"] = rps_change_events
-
- def process_one_metric(
- # E.g., "ttft"
- metric_attribute_name: str,
- # E.g., "TTFT"
- metric_name: str,
- # E.g., "Time to First Token"
- metric_header: str,
- ):
- # This function prints and adds statistics of the specified
- # metric.
- if metric_attribute_name not in selected_percentile_metrics:
- return
- print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
- print(
- "{:<40} {:<10.2f}".format(
- f"Mean {metric_name} (ms):",
- getattr(metrics, f"mean_{metric_attribute_name}_ms"),
- )
- )
- print(
- "{:<40} {:<10.2f}".format(
- f"Median {metric_name} (ms):",
- getattr(metrics, f"median_{metric_attribute_name}_ms"),
- )
- )
- result[f"mean_{metric_attribute_name}_ms"] = getattr(
- metrics, f"mean_{metric_attribute_name}_ms"
- )
- result[f"median_{metric_attribute_name}_ms"] = getattr(
- metrics, f"median_{metric_attribute_name}_ms"
- )
- result[f"std_{metric_attribute_name}_ms"] = getattr(
- metrics, f"std_{metric_attribute_name}_ms"
- )
- for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
- p_word = str(int(p)) if int(p) == p else str(p)
- print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
- result[f"p{p_word}_{metric_attribute_name}_ms"] = value
-
- process_one_metric("ttft", "TTFT", "Time to First Token")
- process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
- process_one_metric("itl", "ITL", "Inter-token Latency")
- process_one_metric("e2el", "E2EL", "End-to-end Latency")
-
- print("=" * 50)
-
- if profile:
- print("Stopping profiler...")
- profile_input = RequestFuncInput(
- model=model_id,
- prompt=test_prompt,
- api_url=base_url + "/stop_profile",
- prompt_len=test_prompt_len,
- output_len=test_output_len,
- logprobs=logprobs,
- )
- profile_output = await request_func(request_func_input=profile_input)
- if profile_output.success:
- print("Profiler stopped")
-
- return result
-
-
-def check_goodput_args(args):
- # Check and parse goodput arguments
- goodput_config_dict = {}
- VALID_NAMES = ["ttft", "tpot", "e2el"]
- if args.goodput:
- goodput_config_dict = parse_goodput(args.goodput)
- for slo_name, slo_val in goodput_config_dict.items():
- if slo_name not in VALID_NAMES:
- raise ValueError(
- f"Invalid metric name found, {slo_name}: {slo_val}. "
- "The service level objective name should be one of "
- f"{str(VALID_NAMES)}. "
- )
- if slo_val < 0:
- raise ValueError(
- f"Invalid value found, {slo_name}: {slo_val}. "
- "The service level objective value should be "
- "non-negative."
- )
- return goodput_config_dict
-
-
-def parse_goodput(slo_pairs):
- goodput_config_dict = {}
- try:
- for slo_pair in slo_pairs:
- slo_name, slo_val = slo_pair.split(":")
- goodput_config_dict[slo_name] = float(slo_val)
- except ValueError as err:
- raise argparse.ArgumentTypeError(
- "Invalid format found for service level objectives. "
- 'Specify service level objectives for goodput as "KEY:VALUE" '
- "pairs, where the key is a metric name, and the value is a "
- "number in milliseconds."
- ) from err
- return goodput_config_dict
-
-
-def save_to_pytorch_benchmark_format(
- args: argparse.Namespace, results: dict[str, Any], file_name: str
-) -> None:
- metrics = [
- "median_ttft_ms",
- "mean_ttft_ms",
- "std_ttft_ms",
- "p99_ttft_ms",
- "mean_tpot_ms",
- "median_tpot_ms",
- "std_tpot_ms",
- "p99_tpot_ms",
- "median_itl_ms",
- "mean_itl_ms",
- "std_itl_ms",
- "p99_itl_ms",
- ]
- # These raw data might be useful, but they are rather big. They can be added
- # later if needed
- ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
- pt_records = convert_to_pytorch_benchmark_format(
- args=args,
- metrics={k: [results[k]] for k in metrics},
- extra_info={
- k: results[k]
- for k in results
- if k not in metrics and k not in ignored_metrics
- },
- )
- if pt_records:
- # Don't use json suffix here as we don't want CI to pick it up
- pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
- write_to_json(pt_file, pt_records)
-
-
-@deprecated(
- "benchmark_serving.py is deprecated and will be removed in a future "
- "version. Please use 'vllm bench serve' instead.",
-)
-def main(args: argparse.Namespace):
- print(args)
- random.seed(args.seed)
- np.random.seed(args.seed)
-
- backend = args.backend
- model_id = args.model
- model_name = args.served_model_name
- tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
- tokenizer_mode = args.tokenizer_mode
-
- # Validate ramp-up arguments
- if args.ramp_up_strategy is not None:
- if args.request_rate != float("inf"):
- raise ValueError(
- "When using ramp-up, do not specify --request-rate. "
- "The request rate will be controlled by ramp-up parameters. "
- "Please remove the --request-rate argument."
- )
- if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
- raise ValueError(
- "When using --ramp-up-strategy, both --ramp-up-start-rps and "
- "--ramp-up-end-rps must be specified"
- )
- if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
- raise ValueError("Ramp-up start and end RPS must be non-negative")
- if args.ramp_up_start_rps > args.ramp_up_end_rps:
- raise ValueError("Ramp-up start RPS must be less than end RPS")
- if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
- raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
-
- if args.base_url is not None:
- api_url = f"{args.base_url}{args.endpoint}"
- base_url = f"{args.base_url}"
- else:
- api_url = f"http://{args.host}:{args.port}{args.endpoint}"
- base_url = f"http://{args.host}:{args.port}"
-
- tokenizer = get_tokenizer(
- tokenizer_id,
- tokenizer_mode=tokenizer_mode,
- trust_remote_code=args.trust_remote_code,
- )
-
- if args.dataset_name is None:
- raise ValueError(
- "Please specify '--dataset-name' and the corresponding "
- "'--dataset-path' if required."
- )
-
- if args.dataset_name == "custom":
- dataset = CustomDataset(dataset_path=args.dataset_path)
- input_requests = dataset.sample(
- num_requests=args.num_prompts,
- tokenizer=tokenizer,
- output_len=args.custom_output_len,
- skip_chat_template=args.custom_skip_chat_template,
- request_id_prefix=args.request_id_prefix,
- )
-
- elif args.dataset_name == "sonnet":
- dataset = SonnetDataset(dataset_path=args.dataset_path)
- # For the "sonnet" dataset, formatting depends on the backend.
- if args.backend == "openai-chat":
- input_requests = dataset.sample(
- num_requests=args.num_prompts,
- input_len=args.sonnet_input_len,
- output_len=args.sonnet_output_len,
- prefix_len=args.sonnet_prefix_len,
- tokenizer=tokenizer,
- return_prompt_formatted=False,
- request_id_prefix=args.request_id_prefix,
- )
- else:
- assert tokenizer.chat_template or tokenizer.default_chat_template, (
- "Tokenizer/model must have chat template for sonnet dataset."
- )
- input_requests = dataset.sample(
- num_requests=args.num_prompts,
- input_len=args.sonnet_input_len,
- output_len=args.sonnet_output_len,
- prefix_len=args.sonnet_prefix_len,
- tokenizer=tokenizer,
- return_prompt_formatted=True,
- request_id_prefix=args.request_id_prefix,
- )
-
- elif args.dataset_name == "hf":
- # all of the following datasets are subclasses of the
- # HuggingFaceDataset base class
- if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = VisionArenaDataset
- args.hf_split = "train"
- args.hf_subset = None
- elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = InstructCoderDataset
- args.hf_split = "train"
- elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = MTBenchDataset
- args.hf_split = "train"
- elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = ConversationDataset
- elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
- dataset_class = AIMODataset
- args.hf_split = "train"
- elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501
- dataset_class = NextEditPredictionDataset
- args.hf_split = "train"
- elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = ASRDataset
- args.hf_split = "train"
- else:
- supported_datasets = set(
- [
- dataset_name
- for cls in HuggingFaceDataset.__subclasses__()
- for dataset_name in cls.SUPPORTED_DATASET_PATHS
- ]
- )
- raise ValueError(
- f"Unsupported dataset path: {args.dataset_path}. "
- "Huggingface dataset only supports dataset_path"
- f" from one of following: {supported_datasets}. "
- "Please consider contributing if you would "
- "like to add support for additional dataset formats."
- )
-
- if dataset_class.IS_MULTIMODAL and backend not in [
- "openai-chat",
- "openai-audio",
- ]:
- # multi-modal benchmarks are only available on the
- # OpenAI Chat and Audio backends.
- raise ValueError(
- "Multi-modal content is only supported on 'openai-chat' and "
- "'openai-audio' backend."
- )
- input_requests = dataset_class(
- dataset_path=args.dataset_path,
- dataset_subset=args.hf_subset,
- dataset_split=args.hf_split,
- random_seed=args.seed,
- no_stream=args.no_stream,
- ).sample(
- num_requests=args.num_prompts,
- tokenizer=tokenizer,
- output_len=args.hf_output_len,
- request_id_prefix=args.request_id_prefix,
- )
-
- else:
- # For datasets that follow a similar structure, use a mapping.
- dataset_mapping = {
- "sharegpt": lambda: ShareGPTDataset(
- random_seed=args.seed, dataset_path=args.dataset_path
- ).sample(
- tokenizer=tokenizer,
- num_requests=args.num_prompts,
- output_len=args.sharegpt_output_len,
- request_id_prefix=args.request_id_prefix,
- ),
- "burstgpt": lambda: BurstGPTDataset(
- random_seed=args.seed, dataset_path=args.dataset_path
- ).sample(
- tokenizer=tokenizer,
- num_requests=args.num_prompts,
- request_id_prefix=args.request_id_prefix,
- ),
- "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample(
- tokenizer=tokenizer,
- num_requests=args.num_prompts,
- prefix_len=args.random_prefix_len,
- input_len=args.random_input_len,
- output_len=args.random_output_len,
- range_ratio=args.random_range_ratio,
- request_id_prefix=args.request_id_prefix,
- ),
- }
-
- try:
- input_requests = dataset_mapping[args.dataset_name]()
- except KeyError as err:
- raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
- goodput_config_dict = check_goodput_args(args)
-
- # Collect the sampling parameters.
- sampling_params = {
- k: v
- for k, v in {
- "top_p": args.top_p,
- "top_k": args.top_k,
- "min_p": args.min_p,
- "temperature": args.temperature,
- }.items()
- if v is not None
- }
-
- # Sampling parameters are only supported by openai-compatible backends.
- if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
- raise ValueError(
- "Sampling parameters are only supported by openai-compatible backends."
- )
-
- if "temperature" not in sampling_params:
- sampling_params["temperature"] = 0.0 # Default to greedy decoding.
-
- if args.backend == "llama.cpp":
- # Disable prompt caching in llama.cpp backend
- sampling_params["cache_prompt"] = False
-
- # Avoid GC processing "static" data - reduce pause times.
- gc.collect()
- gc.freeze()
-
- benchmark_result = asyncio.run(
- benchmark(
- backend=backend,
- api_url=api_url,
- base_url=base_url,
- model_id=model_id,
- model_name=model_name,
- tokenizer=tokenizer,
- input_requests=input_requests,
- logprobs=args.logprobs,
- request_rate=args.request_rate,
- burstiness=args.burstiness,
- disable_tqdm=args.disable_tqdm,
- profile=args.profile,
- selected_percentile_metrics=args.percentile_metrics.split(","),
- selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
- ignore_eos=args.ignore_eos,
- goodput_config_dict=goodput_config_dict,
- max_concurrency=args.max_concurrency,
- lora_modules=args.lora_modules,
- extra_body=sampling_params,
- ramp_up_strategy=args.ramp_up_strategy,
- ramp_up_start_rps=args.ramp_up_start_rps,
- ramp_up_end_rps=args.ramp_up_end_rps,
- )
- )
-
- # Save config and results to json
- if args.save_result or args.append_result:
- result_json: dict[str, Any] = {}
-
- # Setup
- current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
- result_json["date"] = current_dt
- result_json["backend"] = backend
- result_json["model_id"] = model_id
- result_json["tokenizer_id"] = tokenizer_id
- result_json["num_prompts"] = args.num_prompts
-
- # Metadata
- if args.metadata:
- for item in args.metadata:
- if "=" in item:
- kvstring = item.split("=")
- result_json[kvstring[0].strip()] = kvstring[1].strip()
- else:
- raise ValueError(
- "Invalid metadata format. Please use KEY=VALUE format."
- )
- # Traffic
- result_json["request_rate"] = (
- args.request_rate if args.request_rate < float("inf") else "inf"
- )
- result_json["burstiness"] = args.burstiness
- result_json["max_concurrency"] = args.max_concurrency
-
- if args.ramp_up_strategy is not None:
- result_json["ramp_up_strategy"] = args.ramp_up_strategy
- result_json["ramp_up_start_rps"] = args.ramp_up_start_rps
- result_json["ramp_up_end_rps"] = args.ramp_up_end_rps
-
- # Merge with benchmark result
- result_json = {**result_json, **benchmark_result}
-
- if not args.save_detailed:
- # Remove fields with too many data points
- for field in [
- "input_lens",
- "output_lens",
- "ttfts",
- "itls",
- "generated_texts",
- "errors",
- ]:
- if field in result_json:
- del result_json[field]
- if field in benchmark_result:
- del benchmark_result[field]
-
- # Save to file
- base_model_id = model_id.split("/")[-1]
- max_concurrency_str = (
- f"-concurrency{args.max_concurrency}"
- if args.max_concurrency is not None
- else ""
- )
- if args.ramp_up_strategy is not None:
- file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
- else:
- file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
- if args.result_filename:
- file_name = args.result_filename
- if args.result_dir:
- os.makedirs(args.result_dir, exist_ok=True)
- file_name = os.path.join(args.result_dir, file_name)
- with open(
- file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
- ) as outfile:
- # Append a newline.
- if args.append_result and outfile.tell() != 0:
- outfile.write("\n")
- json.dump(result_json, outfile)
- save_to_pytorch_benchmark_format(args, result_json, file_name)
-
-
-def create_argument_parser():
- parser = FlexibleArgumentParser(
- description="Benchmark the online serving throughput."
- )
- parser.add_argument(
- "--backend",
- type=str,
- default="vllm",
- choices=list(ASYNC_REQUEST_FUNCS.keys()),
- )
- parser.add_argument(
- "--base-url",
- type=str,
- default=None,
- help="Server or API base url if not using http host and port.",
- )
- # Use 127.0.0.1 here instead of localhost to force the use of ipv4
- parser.add_argument("--host", type=str, default="127.0.0.1")
- parser.add_argument("--port", type=int, default=8000)
- parser.add_argument(
- "--endpoint",
- type=str,
- default="/v1/completions",
- help="API endpoint.",
- )
- parser.add_argument(
- "--dataset-name",
- type=str,
- default="sharegpt",
- choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
- help="Name of the dataset to benchmark on.",
- )
- parser.add_argument(
- "--dataset-path",
- type=str,
- default=None,
- help="Path to the sharegpt/sonnet dataset. "
- "Or the huggingface dataset ID if using HF dataset.",
- )
- parser.add_argument(
- "--no-stream",
- action="store_true",
- help="Do not load the dataset in streaming mode.",
- )
- parser.add_argument(
- "--max-concurrency",
- type=int,
- default=None,
- help="Maximum number of concurrent requests. This can be used "
- "to help simulate an environment where a higher level component "
- "is enforcing a maximum number of concurrent requests. While the "
- "--request-rate argument controls the rate at which requests are "
- "initiated, this argument will control how many are actually allowed "
- "to execute at a time. This means that when used in combination, the "
- "actual request rate may be lower than specified with --request-rate, "
- "if the server is not processing requests fast enough to keep up.",
- )
-
- parser.add_argument(
- "--model",
- type=str,
- required=True,
- help="Name of the model.",
- )
- parser.add_argument(
- "--tokenizer",
- type=str,
- help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
- )
- parser.add_argument("--use-beam-search", action="store_true")
- parser.add_argument(
- "--num-prompts",
- type=int,
- default=1000,
- help="Number of prompts to process.",
- )
- parser.add_argument(
- "--logprobs",
- type=int,
- default=None,
- help=(
- "Number of logprobs-per-token to compute & return as part of "
- "the request. If unspecified, then either (1) if beam search "
- "is disabled, no logprobs are computed & a single dummy "
- "logprob is returned for each token; or (2) if beam search "
- "is enabled 1 logprob per token is computed"
- ),
- )
- parser.add_argument(
- "--request-rate",
- type=float,
- default=float("inf"),
- help="Number of requests per second. If this is inf, "
- "then all the requests are sent at time 0. "
- "Otherwise, we use Poisson process or gamma distribution "
- "to synthesize the request arrival times.",
- )
- parser.add_argument(
- "--burstiness",
- type=float,
- default=1.0,
- help="Burstiness factor of the request generation. "
- "Only take effect when request_rate is not inf. "
- "Default value is 1, which follows Poisson process. "
- "Otherwise, the request intervals follow a gamma distribution. "
- "A lower burstiness value (0 < burstiness < 1) results in more "
- "bursty requests. A higher burstiness value (burstiness > 1) "
- "results in a more uniform arrival of requests.",
- )
- parser.add_argument("--seed", type=int, default=0)
- parser.add_argument(
- "--trust-remote-code",
- action="store_true",
- help="Trust remote code from huggingface",
- )
- parser.add_argument(
- "--disable-tqdm",
- action="store_true",
- help="Specify to disable tqdm progress bar.",
- )
- parser.add_argument(
- "--profile",
- action="store_true",
- help="Use Torch Profiler. The endpoint must be launched with "
- "VLLM_TORCH_PROFILER_DIR to enable profiler.",
- )
- parser.add_argument(
- "--save-result",
- action="store_true",
- help="Specify to save benchmark results to a json file",
- )
- parser.add_argument(
- "--save-detailed",
- action="store_true",
- help="When saving the results, whether to include per request "
- "information such as response, error, ttfs, tpots, etc.",
- )
- parser.add_argument(
- "--append-result",
- action="store_true",
- help="Append the benchmark result to the existing json file.",
- )
- parser.add_argument(
- "--metadata",
- metavar="KEY=VALUE",
- nargs="*",
- help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
- "for metadata of this run to be saved in the result JSON file "
- "for record keeping purposes.",
- )
- parser.add_argument(
- "--result-dir",
- type=str,
- default=None,
- help="Specify directory to save benchmark json results."
- "If not specified, results are saved in the current directory.",
- )
- parser.add_argument(
- "--result-filename",
- type=str,
- default=None,
- help="Specify the filename to save benchmark json results."
- "If not specified, results will be saved in "
- "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
- " format.",
- )
- parser.add_argument(
- "--ignore-eos",
- action="store_true",
- help="Set ignore_eos flag when sending the benchmark request."
- "Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
- )
- parser.add_argument(
- "--percentile-metrics",
- type=str,
- default="ttft,tpot,itl",
- help="Comma-separated list of selected metrics to report percentiles. "
- "This argument specifies the metrics to report percentiles. "
- 'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
- 'Default value is "ttft,tpot,itl".',
- )
- parser.add_argument(
- "--metric-percentiles",
- type=str,
- default="99",
- help="Comma-separated list of percentiles for selected metrics. "
- 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
- 'Default value is "99". '
- 'Use "--percentile-metrics" to select metrics.',
- )
- parser.add_argument(
- "--goodput",
- nargs="+",
- required=False,
- help='Specify service level objectives for goodput as "KEY:VALUE" '
- "pairs, where the key is a metric name, and the value is in "
- 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
- "separated by spaces. Allowed request level metric names are "
- '"ttft", "tpot", "e2el". For more context on the definition of '
- "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
- "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
- )
- parser.add_argument(
- "--request-id-prefix",
- type=str,
- required=False,
- default="benchmark-serving",
- help="Specify the prefix of request id.",
- )
-
- # group for dataset specific arguments
- custom_group = parser.add_argument_group("custom dataset options")
- custom_group.add_argument(
- "--custom-output-len",
- type=int,
- default=256,
- help="Number of output tokens per request, used only for custom dataset.",
- )
- custom_group.add_argument(
- "--custom-skip-chat-template",
- action="store_true",
- help="Skip applying chat template to prompt, used only for custom dataset.",
- )
-
- sonnet_group = parser.add_argument_group("sonnet dataset options")
- sonnet_group.add_argument(
- "--sonnet-input-len",
- type=int,
- default=550,
- help="Number of input tokens per request, used only for sonnet dataset.",
- )
- sonnet_group.add_argument(
- "--sonnet-output-len",
- type=int,
- default=150,
- help="Number of output tokens per request, used only for sonnet dataset.",
- )
- sonnet_group.add_argument(
- "--sonnet-prefix-len",
- type=int,
- default=200,
- help="Number of prefix tokens per request, used only for sonnet dataset.",
- )
-
- sharegpt_group = parser.add_argument_group("sharegpt dataset options")
- sharegpt_group.add_argument(
- "--sharegpt-output-len",
- type=int,
- default=None,
- help="Output length for each request. Overrides the output length "
- "from the ShareGPT dataset.",
- )
-
- random_group = parser.add_argument_group("random dataset options")
- random_group.add_argument(
- "--random-input-len",
- type=int,
- default=1024,
- help="Number of input tokens per request, used only for random sampling.",
- )
- random_group.add_argument(
- "--random-output-len",
- type=int,
- default=128,
- help="Number of output tokens per request, used only for random sampling.",
- )
- random_group.add_argument(
- "--random-range-ratio",
- type=float,
- default=0.0,
- help="Range ratio for sampling input/output length, "
- "used only for random sampling. Must be in the range [0, 1) to define "
- "a symmetric sampling range"
- "[length * (1 - range_ratio), length * (1 + range_ratio)].",
- )
- random_group.add_argument(
- "--random-prefix-len",
- type=int,
- default=0,
- help=(
- "Number of fixed prefix tokens before the random context "
- "in a request. "
- "The total input length is the sum of `random-prefix-len` and "
- "a random "
- "context length sampled from [input_len * (1 - range_ratio), "
- "input_len * (1 + range_ratio)]."
- ),
- )
-
- hf_group = parser.add_argument_group("hf dataset options")
- hf_group.add_argument(
- "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
- )
- hf_group.add_argument(
- "--hf-split", type=str, default=None, help="Split of the HF dataset."
- )
- hf_group.add_argument(
- "--hf-output-len",
- type=int,
- default=None,
- help="Output length for each request. Overrides the output lengths "
- "from the sampled HF dataset.",
- )
-
- sampling_group = parser.add_argument_group("sampling parameters")
- sampling_group.add_argument(
- "--top-p",
- type=float,
- default=None,
- help="Top-p sampling parameter. Only has effect on openai-compatible backends.",
- )
- sampling_group.add_argument(
- "--top-k",
- type=int,
- default=None,
- help="Top-k sampling parameter. Only has effect on openai-compatible backends.",
- )
- sampling_group.add_argument(
- "--min-p",
- type=float,
- default=None,
- help="Min-p sampling parameter. Only has effect on openai-compatible backends.",
- )
- sampling_group.add_argument(
- "--temperature",
- type=float,
- default=None,
- help="Temperature sampling parameter. Only has effect on "
- "openai-compatible backends. If not specified, default to greedy "
- "decoding (i.e. temperature==0.0).",
- )
-
- parser.add_argument(
- "--tokenizer-mode",
- type=str,
- default="auto",
- choices=["auto", "slow", "mistral", "custom"],
- help='The tokenizer mode.\n\n* "auto" will use the '
- 'fast tokenizer if available.\n* "slow" will '
- "always use the slow tokenizer. \n* "
- '"mistral" will always use the `mistral_common` tokenizer. \n*'
- '"custom" will use --tokenizer to select the preregistered tokenizer.',
- )
-
- parser.add_argument(
- "--served-model-name",
- type=str,
- default=None,
- help="The model name used in the API. "
- "If not specified, the model name will be the "
- "same as the ``--model`` argument. ",
- )
-
- parser.add_argument(
- "--lora-modules",
- nargs="+",
- default=None,
- help="A subset of LoRA module names passed in when "
- "launching the server. For each request, the "
- "script chooses a LoRA module at random.",
- )
-
- parser.add_argument(
- "--ramp-up-strategy",
- type=str,
- default=None,
- choices=["linear", "exponential"],
- help="The ramp-up strategy. This would be used to "
- "ramp up the request rate from initial RPS to final "
- "RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps). "
- "over the duration of the benchmark.",
- )
- parser.add_argument(
- "--ramp-up-start-rps",
- type=int,
- default=None,
- help="The starting request rate for ramp-up (RPS). "
- "Needs to be specified when --ramp-up-strategy is used.",
- )
- parser.add_argument(
- "--ramp-up-end-rps",
- type=int,
- default=None,
- help="The ending request rate for ramp-up (RPS). "
- "Needs to be specified when --ramp-up-strategy is used.",
- )
-
- return parser
-
+import sys
if __name__ == "__main__":
- parser = create_argument_parser()
- args = parser.parse_args()
- main(args)
+ print("""DEPRECATED: This script has been moved to the vLLM CLI.
+
+Please use the following command instead:
+ vllm bench serve
+
+For help with the new command, run:
+ vllm bench serve --help
+
+Alternatively, you can run the new command directly with:
+ python -m vllm.entrypoints.cli.main bench serve --help
+""")
+ sys.exit(1)
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 4aae755eb4e44..73b4aa5a87e07 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -696,11 +696,11 @@ def evaluate(ret, args):
return re.match(args.regex, actual) is not None
def _eval_correctness(expected, actual):
- if args.structure_type == "guided_json":
+ if args.structure_type == "json":
return _eval_correctness_json(expected, actual)
- elif args.structure_type == "guided_regex":
+ elif args.structure_type == "regex":
return _eval_correctness_regex(expected, actual)
- elif args.structure_type == "guided_choice":
+ elif args.structure_type == "choice":
return _eval_correctness_choice(expected, actual)
else:
return None
@@ -780,18 +780,18 @@ def main(args: argparse.Namespace):
)
if args.dataset == "grammar":
- args.structure_type = "guided_grammar"
+ args.structure_type = "grammar"
elif args.dataset == "regex":
- args.structure_type = "guided_regex"
+ args.structure_type = "regex"
elif args.dataset == "choice":
- args.structure_type = "guided_choice"
+ args.structure_type = "choice"
else:
- args.structure_type = "guided_json"
+ args.structure_type = "json"
if args.no_structured_output:
args.structured_output_ratio = 0
if args.save_results:
- result_file_name = f"{args.structured_output_ratio}guided"
+ result_file_name = f"{args.structured_output_ratio}so"
result_file_name += f"_{backend}"
result_file_name += f"_{args.request_rate}qps"
result_file_name += f"_{args.model.split('/')[-1]}"
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 34a525f00d910..b6dc0918fd4d1 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -1,741 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Benchmark offline inference throughput."""
-
-import argparse
-import dataclasses
-import json
-import os
-import random
-import time
-import warnings
-from typing import Any, Optional, Union
-
-import torch
-import uvloop
-from tqdm import tqdm
-from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
-from typing_extensions import deprecated
-
-from benchmark_dataset import (
- AIMODataset,
- BurstGPTDataset,
- ConversationDataset,
- InstructCoderDataset,
- RandomDataset,
- SampleRequest,
- ShareGPTDataset,
- SonnetDataset,
- VisionArenaDataset,
-)
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
-from vllm.entrypoints.openai.api_server import (
- build_async_engine_client_from_engine_args,
-)
-from vllm.inputs import TextPrompt, TokensPrompt
-from vllm.lora.request import LoRARequest
-from vllm.outputs import RequestOutput
-from vllm.sampling_params import BeamSearchParams
-from vllm.utils import FlexibleArgumentParser, merge_async_iterators
-
-
-def run_vllm(
- requests: list[SampleRequest],
- n: int,
- engine_args: EngineArgs,
- disable_detokenize: bool = False,
-) -> tuple[float, Optional[list[RequestOutput]]]:
- from vllm import LLM, SamplingParams
-
- llm = LLM(**dataclasses.asdict(engine_args))
- assert all(
- llm.llm_engine.model_config.max_model_len
- >= (request.prompt_len + request.expected_output_len)
- for request in requests
- ), (
- "Please ensure that max_model_len is greater than the sum of"
- " prompt_len and expected_output_len for all requests."
- )
- # Add the requests to the engine.
- prompts: list[Union[TextPrompt, TokensPrompt]] = []
- sampling_params: list[SamplingParams] = []
- for request in requests:
- prompts.append(
- TokensPrompt(
- prompt_token_ids=request.prompt["prompt_token_ids"],
- multi_modal_data=request.multi_modal_data,
- )
- if "prompt_token_ids" in request.prompt
- else TextPrompt(
- prompt=request.prompt, multi_modal_data=request.multi_modal_data
- )
- )
- sampling_params.append(
- SamplingParams(
- n=n,
- temperature=1.0,
- top_p=1.0,
- ignore_eos=True,
- max_tokens=request.expected_output_len,
- detokenize=not disable_detokenize,
- )
- )
- lora_requests: Optional[list[LoRARequest]] = None
- if engine_args.enable_lora:
- lora_requests = [request.lora_request for request in requests]
-
- use_beam_search = False
-
- outputs = None
- if not use_beam_search:
- start = time.perf_counter()
- outputs = llm.generate(
- prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
- )
- end = time.perf_counter()
- else:
- assert lora_requests is None, "BeamSearch API does not support LoRA"
- # output_len should be the same for all requests.
- output_len = requests[0].expected_output_len
- for request in requests:
- assert request.expected_output_len == output_len
- start = time.perf_counter()
- llm.beam_search(
- prompts,
- BeamSearchParams(
- beam_width=n,
- max_tokens=output_len,
- ignore_eos=True,
- ),
- )
- end = time.perf_counter()
- return end - start, outputs
-
-
-def run_vllm_chat(
- requests: list[SampleRequest],
- n: int,
- engine_args: EngineArgs,
- disable_detokenize: bool = False,
-) -> tuple[float, list[RequestOutput]]:
- """
- Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
- multimodal models as it properly handles multimodal inputs and chat
- formatting. For non-multimodal models, use run_vllm() instead.
- """
- from vllm import LLM, SamplingParams
-
- llm = LLM(**dataclasses.asdict(engine_args))
-
- assert all(
- llm.llm_engine.model_config.max_model_len
- >= (request.prompt_len + request.expected_output_len)
- for request in requests
- ), (
- "Please ensure that max_model_len is greater than the sum of "
- "prompt_len and expected_output_len for all requests."
- )
-
- prompts = []
- sampling_params: list[SamplingParams] = []
- for request in requests:
- prompts.append(request.prompt)
- sampling_params.append(
- SamplingParams(
- n=n,
- temperature=1.0,
- top_p=1.0,
- ignore_eos=True,
- max_tokens=request.expected_output_len,
- detokenize=not disable_detokenize,
- )
- )
- start = time.perf_counter()
- outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
- end = time.perf_counter()
- return end - start, outputs
-
-
-async def run_vllm_async(
- requests: list[SampleRequest],
- n: int,
- engine_args: AsyncEngineArgs,
- disable_frontend_multiprocessing: bool = False,
- disable_detokenize: bool = False,
-) -> float:
- from vllm import SamplingParams
-
- async with build_async_engine_client_from_engine_args(
- engine_args,
- disable_frontend_multiprocessing=disable_frontend_multiprocessing,
- ) as llm:
- model_config = await llm.get_model_config()
- assert all(
- model_config.max_model_len
- >= (request.prompt_len + request.expected_output_len)
- for request in requests
- ), (
- "Please ensure that max_model_len is greater than the sum of"
- " prompt_len and expected_output_len for all requests."
- )
-
- # Add the requests to the engine.
- prompts: list[Union[TextPrompt, TokensPrompt]] = []
- sampling_params: list[SamplingParams] = []
- lora_requests: list[Optional[LoRARequest]] = []
- for request in requests:
- prompts.append(
- TokensPrompt(
- prompt_token_ids=request.prompt["prompt_token_ids"],
- multi_modal_data=request.multi_modal_data,
- )
- if "prompt_token_ids" in request.prompt
- else TextPrompt(
- prompt=request.prompt, multi_modal_data=request.multi_modal_data
- )
- )
- sampling_params.append(
- SamplingParams(
- n=n,
- temperature=1.0,
- top_p=1.0,
- ignore_eos=True,
- max_tokens=request.expected_output_len,
- detokenize=not disable_detokenize,
- )
- )
- lora_requests.append(request.lora_request)
-
- generators = []
- start = time.perf_counter()
- for i, (prompt, sp, lr) in enumerate(
- zip(prompts, sampling_params, lora_requests)
- ):
- generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
- generators.append(generator)
- all_gens = merge_async_iterators(*generators)
- async for i, res in all_gens:
- pass
- end = time.perf_counter()
- return end - start
-
-
-def run_hf(
- requests: list[SampleRequest],
- model: str,
- tokenizer: PreTrainedTokenizerBase,
- n: int,
- max_batch_size: int,
- trust_remote_code: bool,
- disable_detokenize: bool = False,
-) -> float:
- llm = AutoModelForCausalLM.from_pretrained(
- model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
- )
- if llm.config.model_type == "llama":
- # To enable padding in the HF backend.
- tokenizer.pad_token = tokenizer.eos_token
- llm = llm.cuda()
-
- pbar = tqdm(total=len(requests))
- start = time.perf_counter()
- batch: list[str] = []
- max_prompt_len = 0
- max_output_len = 0
- for i in range(len(requests)):
- prompt = requests[i].prompt
- prompt_len = requests[i].prompt_len
- output_len = requests[i].expected_output_len
- # Add the prompt to the batch.
- batch.append(prompt)
- max_prompt_len = max(max_prompt_len, prompt_len)
- max_output_len = max(max_output_len, output_len)
- if len(batch) < max_batch_size and i != len(requests) - 1:
- # Check if we can add more requests to the batch.
- next_prompt_len = requests[i + 1].prompt_len
- next_output_len = requests[i + 1].expected_output_len
- if (
- max(max_prompt_len, next_prompt_len)
- + max(max_output_len, next_output_len)
- ) <= 2048:
- # We can add more requests to the batch.
- continue
-
- # Generate the sequences.
- input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
- llm_outputs = llm.generate(
- input_ids=input_ids.cuda(),
- do_sample=True,
- num_return_sequences=n,
- temperature=1.0,
- top_p=1.0,
- use_cache=True,
- max_new_tokens=max_output_len,
- )
- if not disable_detokenize:
- # Include the decoding time.
- tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
- pbar.update(len(batch))
-
- # Clear the batch.
- batch = []
- max_prompt_len = 0
- max_output_len = 0
- end = time.perf_counter()
- return end - start
-
-
-def run_mii(
- requests: list[SampleRequest],
- model: str,
- tensor_parallel_size: int,
- output_len: int,
-) -> float:
- from mii import client, serve
-
- llm = serve(model, tensor_parallel=tensor_parallel_size)
- prompts = [request.prompt for request in requests]
-
- start = time.perf_counter()
- llm.generate(prompts, max_new_tokens=output_len)
- end = time.perf_counter()
- client = client(model)
- client.terminate_server()
- return end - start
-
-
-def save_to_pytorch_benchmark_format(
- args: argparse.Namespace, results: dict[str, Any]
-) -> None:
- pt_records = convert_to_pytorch_benchmark_format(
- args=args,
- metrics={
- "requests_per_second": [results["requests_per_second"]],
- "tokens_per_second": [results["tokens_per_second"]],
- },
- extra_info={
- k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]
- },
- )
- if pt_records:
- # Don't use json suffix here as we don't want CI to pick it up
- pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
- write_to_json(pt_file, pt_records)
-
-
-def get_requests(args, tokenizer):
- # Common parameters for all dataset types.
- common_kwargs = {
- "dataset_path": args.dataset_path,
- "random_seed": args.seed,
- }
- sample_kwargs = {
- "tokenizer": tokenizer,
- "lora_path": args.lora_path,
- "max_loras": args.max_loras,
- "num_requests": args.num_prompts,
- "input_len": args.input_len,
- "output_len": args.output_len,
- }
-
- if args.dataset_path is None or args.dataset_name == "random":
- sample_kwargs["range_ratio"] = args.random_range_ratio
- sample_kwargs["prefix_len"] = args.prefix_len
- dataset_cls = RandomDataset
- elif args.dataset_name == "sharegpt":
- dataset_cls = ShareGPTDataset
- if args.backend == "vllm-chat":
- sample_kwargs["enable_multimodal_chat"] = True
- elif args.dataset_name == "sonnet":
- assert tokenizer.chat_template or tokenizer.default_chat_template, (
- "Tokenizer/model must have chat template for sonnet dataset."
- )
- dataset_cls = SonnetDataset
- sample_kwargs["prefix_len"] = args.prefix_len
- sample_kwargs["return_prompt_formatted"] = True
- elif args.dataset_name == "burstgpt":
- dataset_cls = BurstGPTDataset
- elif args.dataset_name == "hf":
- common_kwargs["no_stream"] = args.no_stream
- if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
- dataset_cls = VisionArenaDataset
- common_kwargs["dataset_subset"] = None
- common_kwargs["dataset_split"] = "train"
- sample_kwargs["enable_multimodal_chat"] = True
- elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
- dataset_cls = InstructCoderDataset
- common_kwargs["dataset_split"] = "train"
- elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
- dataset_cls = ConversationDataset
- common_kwargs["dataset_subset"] = args.hf_subset
- common_kwargs["dataset_split"] = args.hf_split
- sample_kwargs["enable_multimodal_chat"] = True
- elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
- dataset_cls = AIMODataset
- common_kwargs["dataset_subset"] = None
- common_kwargs["dataset_split"] = "train"
- else:
- raise ValueError(f"Unknown dataset name: {args.dataset_name}")
- # Remove None values
- sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
- return dataset_cls(**common_kwargs).sample(**sample_kwargs)
-
-
-@deprecated(
- "benchmark_throughput.py is deprecated and will be removed in a "
- "future version. Please use 'vllm bench throughput' instead.",
-)
-def main(args: argparse.Namespace):
- if args.seed is None:
- args.seed = 0
- print(args)
- random.seed(args.seed)
- # Sample the requests.
- tokenizer = AutoTokenizer.from_pretrained(
- args.tokenizer, trust_remote_code=args.trust_remote_code
- )
- requests = get_requests(args, tokenizer)
- is_multi_modal = any(request.multi_modal_data is not None for request in requests)
- request_outputs: Optional[list[RequestOutput]] = None
- if args.backend == "vllm":
- if args.async_engine:
- elapsed_time = uvloop.run(
- run_vllm_async(
- requests,
- args.n,
- AsyncEngineArgs.from_cli_args(args),
- args.disable_frontend_multiprocessing,
- args.disable_detokenize,
- )
- )
- else:
- elapsed_time, request_outputs = run_vllm(
- requests,
- args.n,
- EngineArgs.from_cli_args(args),
- args.disable_detokenize,
- )
- elif args.backend == "hf":
- assert args.tensor_parallel_size == 1
- elapsed_time = run_hf(
- requests,
- args.model,
- tokenizer,
- args.n,
- args.hf_max_batch_size,
- args.trust_remote_code,
- args.disable_detokenize,
- )
- elif args.backend == "mii":
- elapsed_time = run_mii(
- requests, args.model, args.tensor_parallel_size, args.output_len
- )
- elif args.backend == "vllm-chat":
- elapsed_time, request_outputs = run_vllm_chat(
- requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
- )
- else:
- raise ValueError(f"Unknown backend: {args.backend}")
-
- if request_outputs:
- # Note: with the vllm and vllm-chat backends,
- # we have request_outputs, which we use to count tokens.
- total_prompt_tokens = 0
- total_output_tokens = 0
- for ro in request_outputs:
- if not isinstance(ro, RequestOutput):
- continue
- total_prompt_tokens += (
- len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
- )
- total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
- total_num_tokens = total_prompt_tokens + total_output_tokens
- else:
- total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
- total_output_tokens = sum(r.expected_output_len for r in requests)
- total_prompt_tokens = total_num_tokens - total_output_tokens
-
- if is_multi_modal and args.backend != "vllm-chat":
- print(
- "\033[91mWARNING\033[0m: Multi-modal request with "
- f"{args.backend} backend detected. The "
- "following metrics are not accurate because image tokens are not"
- " counted. See vllm-project/vllm/issues/9778 for details."
- )
- # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
- # vllm-chat backend counts the image tokens now
-
- print(
- f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
- f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
- f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
- )
- print(f"Total num prompt tokens: {total_prompt_tokens}")
- print(f"Total num output tokens: {total_output_tokens}")
-
- # Output JSON results if specified
- if args.output_json:
- results = {
- "elapsed_time": elapsed_time,
- "num_requests": len(requests),
- "total_num_tokens": total_num_tokens,
- "requests_per_second": len(requests) / elapsed_time,
- "tokens_per_second": total_num_tokens / elapsed_time,
- }
- with open(args.output_json, "w") as f:
- json.dump(results, f, indent=4)
- save_to_pytorch_benchmark_format(args, results)
-
-
-def validate_args(args):
- """
- Validate command-line arguments.
- """
-
- # === Deprecation and Defaulting ===
- if args.dataset is not None:
- warnings.warn(
- "The '--dataset' argument will be deprecated in the next release. "
- "Please use '--dataset-name' and '--dataset-path' instead.",
- stacklevel=2,
- )
- args.dataset_path = args.dataset
-
- if not getattr(args, "tokenizer", None):
- args.tokenizer = args.model
-
- # === Backend Validation ===
- valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
- if args.backend not in valid_backends:
- raise ValueError(f"Unsupported backend: {args.backend}")
-
- # === Dataset Configuration ===
- if not args.dataset and not args.dataset_path:
- print("When dataset path is not set, it will default to random dataset")
- args.dataset_name = "random"
- if args.input_len is None:
- raise ValueError("input_len must be provided for a random dataset")
-
- # === Dataset Name Specific Checks ===
- # --hf-subset and --hf-split: only used
- # when dataset_name is 'hf'
- if args.dataset_name != "hf" and (
- getattr(args, "hf_subset", None) is not None
- or getattr(args, "hf_split", None) is not None
- ):
- warnings.warn(
- "--hf-subset and --hf-split will be ignored \
- since --dataset-name is not 'hf'.",
- stacklevel=2,
- )
- elif args.dataset_name == "hf":
- if args.dataset_path in (
- VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
- | ConversationDataset.SUPPORTED_DATASET_PATHS
- ):
- assert args.backend == "vllm-chat", (
- f"{args.dataset_path} needs to use vllm-chat as the backend."
- ) # noqa: E501
- elif args.dataset_path in (
- InstructCoderDataset.SUPPORTED_DATASET_PATHS
- | AIMODataset.SUPPORTED_DATASET_PATHS
- ):
- assert args.backend == "vllm", (
- f"{args.dataset_path} needs to use vllm as the backend."
- ) # noqa: E501
- else:
- raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")
-
- # --random-range-ratio: only used when dataset_name is 'random'
- if args.dataset_name != "random" and args.random_range_ratio is not None:
- warnings.warn(
- "--random-range-ratio will be ignored since \
- --dataset-name is not 'random'.",
- stacklevel=2,
- )
-
- # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
- # set.
- if (
- args.dataset_name not in {"random", "sonnet", None}
- and args.prefix_len is not None
- ):
- warnings.warn(
- "--prefix-len will be ignored since --dataset-name\
- is not 'random', 'sonnet', or not set.",
- stacklevel=2,
- )
-
- # === LoRA Settings ===
- if getattr(args, "enable_lora", False) and args.backend != "vllm":
- raise ValueError("LoRA benchmarking is only supported for vLLM backend")
- if getattr(args, "enable_lora", False) and args.lora_path is None:
- raise ValueError("LoRA path must be provided when enable_lora is True")
-
- # === Backend-specific Validations ===
- if args.backend == "hf" and args.hf_max_batch_size is None:
- raise ValueError("HF max batch size is required for HF backend")
- if args.backend != "hf" and args.hf_max_batch_size is not None:
- raise ValueError("HF max batch size is only for HF backend.")
-
- if (
- args.backend in {"hf", "mii"}
- and getattr(args, "quantization", None) is not None
- ):
- raise ValueError("Quantization is only for vLLM backend.")
-
- if args.backend == "mii" and args.dtype != "auto":
- raise ValueError("dtype must be auto for MII backend.")
- if args.backend == "mii" and args.n != 1:
- raise ValueError("n must be 1 for MII backend.")
- if args.backend == "mii" and args.tokenizer != args.model:
- raise ValueError("Tokenizer must be the same as the model for MII backend.")
-
- # --data-parallel is not supported currently.
- # https://github.com/vllm-project/vllm/issues/16222
- if args.data_parallel_size > 1:
- raise ValueError(
- "Data parallel is not supported in offline benchmark, "
- "please use benchmark serving instead"
- )
-
-
-def create_argument_parser():
- parser = FlexibleArgumentParser(description="Benchmark the throughput.")
- parser.add_argument(
- "--backend",
- type=str,
- choices=["vllm", "hf", "mii", "vllm-chat"],
- default="vllm",
- )
- parser.add_argument(
- "--dataset-name",
- type=str,
- choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
- help="Name of the dataset to benchmark on.",
- default="sharegpt",
- )
- parser.add_argument(
- "--no-stream",
- action="store_true",
- help="Do not load the dataset in streaming mode.",
- )
- parser.add_argument(
- "--dataset",
- type=str,
- default=None,
- help="Path to the ShareGPT dataset, will be deprecated in\
- the next release. The dataset is expected to "
- "be a json in form of list[dict[..., conversations: "
- "list[dict[..., value: ]]]]",
- )
- parser.add_argument(
- "--dataset-path", type=str, default=None, help="Path to the dataset"
- )
- parser.add_argument(
- "--input-len",
- type=int,
- default=None,
- help="Input prompt length for each request",
- )
- parser.add_argument(
- "--output-len",
- type=int,
- default=None,
- help="Output length for each request. Overrides the "
- "output length from the dataset.",
- )
- parser.add_argument(
- "--n", type=int, default=1, help="Number of generated sequences per prompt."
- )
- parser.add_argument(
- "--num-prompts", type=int, default=1000, help="Number of prompts to process."
- )
- parser.add_argument(
- "--hf-max-batch-size",
- type=int,
- default=None,
- help="Maximum batch size for HF backend.",
- )
- parser.add_argument(
- "--output-json",
- type=str,
- default=None,
- help="Path to save the throughput results in JSON format.",
- )
- parser.add_argument(
- "--async-engine",
- action="store_true",
- default=False,
- help="Use vLLM async engine rather than LLM class.",
- )
- parser.add_argument(
- "--disable-frontend-multiprocessing",
- action="store_true",
- default=False,
- help="Disable decoupled async engine frontend.",
- )
- parser.add_argument(
- "--disable-detokenize",
- action="store_true",
- help=(
- "Do not detokenize the response (i.e. do not include "
- "detokenization time in the measurement)"
- ),
- )
- # LoRA
- parser.add_argument(
- "--lora-path",
- type=str,
- default=None,
- help="Path to the LoRA adapters to use. This can be an absolute path, "
- "a relative path, or a Hugging Face model identifier.",
- )
- parser.add_argument(
- "--prefix-len",
- type=int,
- default=None,
- help=f"Number of prefix tokens to be used in RandomDataset "
- "and SonnetDataset. For RandomDataset, the total input "
- "length is the sum of prefix-len (default: "
- f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
- "sampled from [input_len * (1 - range_ratio), "
- "input_len * (1 + range_ratio)]. For SonnetDataset, "
- f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
- "controls how much of the input is fixed lines versus "
- "random lines, but the total input length remains approximately "
- "input_len tokens.",
- )
- # random dataset
- parser.add_argument(
- "--random-range-ratio",
- type=float,
- default=None,
- help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
- "for sampling input/output length, "
- "used only for RandomDataset. Must be in the range [0, 1) to "
- "define a symmetric sampling range "
- "[length * (1 - range_ratio), length * (1 + range_ratio)].",
- )
-
- # hf dataset
- parser.add_argument(
- "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
- )
- parser.add_argument(
- "--hf-split", type=str, default=None, help="Split of the HF dataset."
- )
-
- parser = AsyncEngineArgs.add_cli_args(parser)
-
- return parser
-
+import sys
if __name__ == "__main__":
- parser = create_argument_parser()
- args = parser.parse_args()
- if args.tokenizer is None:
- args.tokenizer = args.model
- validate_args(args)
- main(args)
+ print("""DEPRECATED: This script has been moved to the vLLM CLI.
+
+Please use the following command instead:
+ vllm bench throughput
+
+For help with the new command, run:
+ vllm bench throughput --help
+
+Alternatively, you can run the new command directly with:
+ python -m vllm.entrypoints.cli.main bench throughput --help
+""")
+ sys.exit(1)
diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py
index 9663503e9baa0..f1e504499eaf6 100644
--- a/benchmarks/kernels/bench_block_fp8_gemm.py
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@@ -4,7 +4,10 @@
import torch
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
- w8a8_block_fp8_matmul,
+ apply_w8a8_block_fp8_linear,
+)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+ CUTLASS_BLOCK_FP8_SUPPORTED,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton as vllm_triton
@@ -29,7 +32,7 @@ DEEPSEEK_V3_SHAPES = [
]
-def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
+def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
"""Build runner function for w8a8 block fp8 matmul."""
factor_for_scale = 1e-2
@@ -37,37 +40,54 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
fp8_max, fp8_min = fp8_info.max, fp8_info.min
# Create random FP8 tensors
- A_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
- A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+ A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
- B_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
- B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+ B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
+ B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
# Create scales
block_n, block_k = block_size[0], block_size[1]
n_tiles = (N + block_n - 1) // block_n
k_tiles = (K + block_k - 1) // block_k
- As = torch.rand(M, k_tiles, dtype=torch.float32, device=device) * factor_for_scale
Bs = (
torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
* factor_for_scale
)
+ # SM90 CUTLASS requires row-major format for scales
+ if use_cutlass and current_platform.is_device_capability(90):
+ Bs = Bs.T.contiguous()
+
def run():
- return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, torch.bfloat16)
+ if use_cutlass:
+ return apply_w8a8_block_fp8_linear(
+ A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
+ )
+ else:
+ return apply_w8a8_block_fp8_linear(
+ A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
+ )
return run
+# Determine available providers
+available_providers = ["torch-bf16", "w8a8-block-fp8-triton"]
+plot_title = "BF16 vs W8A8 Block FP8 GEMMs"
+
+if CUTLASS_BLOCK_FP8_SUPPORTED:
+ available_providers.append("w8a8-block-fp8-cutlass")
+
+
@vllm_triton.testing.perf_report(
vllm_triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
x_log=False,
line_arg="provider",
- line_vals=["torch-bf16", "w8a8-block-fp8"],
- line_names=["torch-bf16", "w8a8-block-fp8"],
+ line_vals=available_providers,
+ line_names=available_providers,
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs W8A8 Block FP8 GEMMs",
args={},
@@ -85,11 +105,22 @@ def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
)
- else: # w8a8-block-fp8
- run_w8a8 = build_w8a8_block_fp8_runner(M, N, K, block_size, device)
- ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
- lambda: run_w8a8(), quantiles=quantiles
+ elif provider == "w8a8-block-fp8-triton":
+ run_w8a8_triton = build_w8a8_block_fp8_runner(
+ M, N, K, block_size, device, use_cutlass=False
)
+ ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+ lambda: run_w8a8_triton(), quantiles=quantiles
+ )
+ elif provider == "w8a8-block-fp8-cutlass":
+ run_w8a8_cutlass = build_w8a8_block_fp8_runner(
+ M, N, K, block_size, device, use_cutlass=True
+ )
+ ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+ lambda: run_w8a8_cutlass(), quantiles=quantiles
+ )
+ else:
+ raise ValueError(f"Unknown provider: {provider}")
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
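
The `to_tflops` conversion above turns a measured latency into throughput by counting each multiply-accumulate of the M x N x K GEMM as two floating-point operations. A small worked example of that arithmetic; the shape and latency below are hypothetical, not taken from any run:

    # Hypothetical GEMM shape and latency, used only to illustrate the conversion.
    M, N, K = 1024, 24576, 1536
    t_ms = 0.25                              # latency in ms, as reported by do_bench_cudagraph
    flops = 2 * M * N * K                    # one multiply + one add per accumulated product
    tflops = flops * 1e-12 / (t_ms * 1e-3)   # ops -> TFLOP, ms -> s
    print(f"{tflops:.1f} TFLOP/s")           # ~309.2 TFLOP/s for these numbers
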
diff --git a/benchmarks/kernels/bench_per_token_quant_fp8.py b/benchmarks/kernels/bench_per_token_quant_fp8.py
index 923d678f1f2db..9170361e974b6 100644
--- a/benchmarks/kernels/bench_per_token_quant_fp8.py
+++ b/benchmarks/kernels/bench_per_token_quant_fp8.py
@@ -2,14 +2,25 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from typing import Callable
+from unittest.mock import patch
+import pandas as pd
import torch
-from vllm import _custom_ops as ops
-from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+
+
+def with_triton_mode(fn):
+ """Temporarily force the Triton fallback path"""
+
+ def wrapped(*args, **kwargs):
+ with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
+ return fn(*args, **kwargs)
+
+ return wrapped
# TODO(luka): use standalone_compile utility
@@ -21,78 +32,236 @@ def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int):
return inner
-torch._dynamo.config.recompile_limit = 8888
-compilation_config = CompilationConfig(custom_ops=["none"])
-with set_current_vllm_config(VllmConfig(compilation_config=compilation_config)):
- torch_per_token_quant_fp8 = torch.compile(
- QuantFP8(False, GroupShape.PER_TOKEN),
- fullgraph=True,
- dynamic=False, # recompile for different shapes
- )
+def bench_compile(fn: Callable):
+ # recompile for different shapes
+ fwd = torch.compile(fn, fullgraph=True, dynamic=False)
# First dim is explicitly dynamic to simulate vLLM usage
- torch_per_token_quant_fp8 = with_dyn_arg(torch_per_token_quant_fp8, 0, 0)
+ return with_dyn_arg(fwd, 0, 0)
-def cuda_per_token_quant_fp8(
- input: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor]:
- return ops.scaled_fp8_quant(input)
+torch._dynamo.config.recompile_limit = 8888
-def calculate_diff(batch_size: int, seq_len: int):
- """Calculate difference between Triton and CUDA implementations."""
+def calculate_diff(
+ batch_size: int,
+ hidden_size: int,
+ group_shape: GroupShape,
+ dtype: torch.dtype,
+):
+ """Calculate the difference between Inductor and CUDA implementations."""
device = torch.device("cuda")
- x = torch.rand((batch_size * seq_len, 4096), dtype=torch.float16, device=device)
+ x = torch.rand((batch_size * hidden_size, 4096), dtype=dtype, device=device)
- torch_out, torch_scale = torch_per_token_quant_fp8(x)
- cuda_out, cuda_scale = cuda_per_token_quant_fp8(x)
+ quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)
- if torch.allclose(
- cuda_out.to(torch.float32), torch_out.to(torch.float32), rtol=1e-3, atol=1e-5
- ) and torch.allclose(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5):
+ torch_out, torch_scale = bench_compile(quant_fp8.forward_native)(x)
+ torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
+ cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
+
+ out_allclose = lambda o1, o2: torch.allclose(
+ o1.to(torch.float32),
+ o2.to(torch.float32),
+ rtol=1e-3,
+ atol=1e-5,
+ )
+ scale_allclose = lambda s1, s2: torch.allclose(s1, s2, rtol=1e-3, atol=1e-5)
+
+ if (
+ out_allclose(cuda_out, torch_out)
+ and scale_allclose(cuda_scale, torch_scale)
+ and out_allclose(cuda_out, torch_eager_out)
+ and scale_allclose(cuda_scale, torch_eager_scale)
+ ):
print("✅ All implementations match")
else:
print("❌ Implementations differ")
-batch_size_range = [1, 16, 32, 64, 128]
-seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
-
-configs = list(itertools.product(batch_size_range, seq_len_range))
+configs = []
-@triton.testing.perf_report(
- triton.testing.Benchmark(
- x_names=["batch_size", "seq_len"],
- x_vals=configs,
- line_arg="provider",
- line_vals=["torch", "cuda"],
- line_names=["Torch", "CUDA"],
- styles=[("blue", "-"), ("green", "-")],
- ylabel="us",
- plot_name="per-token-dynamic-quant-fp8-performance",
- args={},
- )
-)
-def benchmark_quantization(batch_size, seq_len, provider):
- dtype = torch.float16
+def benchmark_quantization(
+ batch_size,
+ hidden_size,
+ provider,
+ group_shape: GroupShape,
+ col_major: bool,
+ dtype: torch.dtype,
+):
device = torch.device("cuda")
- x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype)
+ x = torch.randn(batch_size * hidden_size, 4096, device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
+ quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
if provider == "torch":
- fn = lambda: torch_per_token_quant_fp8(x.clone())
+ fn = lambda: bench_compile(quant_fp8.forward_native)(x.clone())
elif provider == "cuda":
- fn = lambda: cuda_per_token_quant_fp8(x.clone())
+ fn = lambda: quant_fp8.forward_cuda(x.clone())
+ elif provider == "triton":
+ if not group_shape.is_per_group():
+ # Triton only supported for per-group
+ return 0, 0, 0
+
+ fn = lambda: with_triton_mode(quant_fp8.forward_cuda)(x.clone())
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+# TODO(luka) extract to utils
+def compute_geomean_speedups(
+ df: pd.DataFrame,
+ baseline_col: str,
+ speedup_cols: list[str],
+ groupby_cols: list[str] | None = None,
+) -> pd.DataFrame:
+ """
+ Compute geometric mean speedups over a baseline column.
+
+ Args:
+ df: Input dataframe
+ baseline_col: Column to use as baseline
+ speedup_cols: Columns to compute speedups for
+ groupby_cols: Columns to group by. If None, compute over entire df.
+
+ Returns:
+ pd.DataFrame with geometric mean speedups
+ """
+ from scipy.stats import gmean
+
+ def geo_speedup(group: pd.DataFrame) -> pd.Series:
+ ratios = {
+ col: (group[baseline_col] / group[col]).values for col in speedup_cols
+ }
+ return pd.Series({col: gmean(vals) for col, vals in ratios.items()})
+
+ if groupby_cols is None:
+ result = geo_speedup(df).to_frame().T
+ else:
+ result = (
+ df.groupby(groupby_cols)
+ .apply(geo_speedup, include_groups=False)
+ .reset_index()
+ )
+
+ return result
+
+
if __name__ == "__main__":
- calculate_diff(batch_size=4, seq_len=4096)
- benchmark_quantization.run(print_data=True)
+ parser = FlexibleArgumentParser(
+ description="Benchmark the various implementations of QuantFP8 (dynamic-only)"
+ )
+ parser.add_argument("-c", "--check", action="store_true")
+ parser.add_argument(
+ "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
+ )
+ parser.add_argument(
+ "--hidden-sizes",
+ type=int,
+ nargs="+",
+ default=None,
+ help="Hidden sizes to benchmark (default: 1,16,64,128,256,512,1024,2048,4096)",
+ )
+ parser.add_argument(
+ "--batch-sizes",
+ type=int,
+ nargs="+",
+ default=None,
+ help="Batch sizes to benchmark (default: 1,16,32,64,128)",
+ )
+ parser.add_argument(
+ "--group-sizes",
+ type=int,
+ nargs="+",
+ default=None,
+ help="Group sizes for GroupShape(1,N) to benchmark. "
+ "Use 0 for PER_TENSOR, -1 for PER_TOKEN (default: 0,-1,64,128)",
+ )
+ parser.add_argument(
+ "--no-column-major",
+ action="store_true",
+ help="Disable column-major scales testing",
+ )
+
+ args = parser.parse_args()
+ assert args
+
+ dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
+
+ hidden_sizes = args.hidden_sizes or [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
+ batch_sizes = args.batch_sizes or [1, 16, 32, 64, 128]
+
+ if args.group_sizes is not None:
+ group_shapes = []
+ for size in args.group_sizes:
+ if size == 0:
+ group_shapes.append(GroupShape.PER_TENSOR)
+ elif size == -1:
+ group_shapes.append(GroupShape.PER_TOKEN)
+ else:
+ group_shapes.append(GroupShape(1, size))
+ else:
+ group_shapes = [
+ GroupShape.PER_TENSOR,
+ GroupShape.PER_TOKEN,
+ GroupShape(1, 64),
+ GroupShape(1, 128),
+ ]
+
+ column_major_scales = [False] if args.no_column_major else [True, False]
+
+ config_gen = itertools.product(
+ group_shapes,
+ column_major_scales,
+ batch_sizes,
+ hidden_sizes,
+ )
+
+ # filter out column-major scales for non-group, reverse order
+ configs.extend(c[::-1] for c in config_gen if (c[0].is_per_group() or not c[1]))
+
+ print(f"Running {len(configs)} configurations:")
+ print(f" Hidden sizes: {hidden_sizes}")
+ print(f" Batch sizes: {batch_sizes}")
+ print(f" Group shapes: {[str(g) for g in group_shapes]}")
+ print(f" Column major scales: {column_major_scales}")
+ print()
+
+ if args.check:
+ for group_shape in group_shapes:
+ group_size = group_shape[1]
+ print(f"{group_size=}")
+ calculate_diff(
+ batch_size=4, hidden_size=4096, group_shape=group_shape, dtype=dtype
+ )
+
+ benchmark = triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["hidden_size", "batch_size", "col_major", "group_shape"],
+ x_vals=configs,
+ line_arg="provider",
+ line_vals=["torch", "cuda", "triton"],
+ line_names=["Torch (Compiled)", "CUDA", "Triton"],
+ styles=[("blue", "-"), ("green", "-"), ("black", "-")],
+ ylabel="us",
+ plot_name="QuantFP8 performance",
+ args={},
+ )
+ )(benchmark_quantization)
+
+ df = benchmark.run(print_data=True, dtype=dtype, return_df=True)
+
+ # Print geomean speedups
+ geo_table_grouped = compute_geomean_speedups(
+ df,
+ baseline_col="Torch (Compiled)",
+ speedup_cols=["CUDA", "Triton"],
+ groupby_cols=["col_major", "group_shape"],
+ )
+
+ print("Speedup over Torch (Compiled)")
+ print(geo_table_grouped.to_string(index=False))
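
The `compute_geomean_speedups` helper added above summarizes per-configuration latencies by taking the geometric mean of baseline/column ratios. A toy, self-contained illustration of the same computation, using the same pandas/scipy calls as the script; the latency values are invented:

    import pandas as pd
    from scipy.stats import gmean

    # Invented per-config latencies in microseconds; lower is better, so the
    # speedup over the compiled Torch baseline is baseline / column.
    df = pd.DataFrame({
        "Torch (Compiled)": [20.0, 40.0, 80.0],
        "CUDA": [10.0, 25.0, 40.0],
        "Triton": [16.0, 32.0, 50.0],
    })
    speedups = {
        col: gmean((df["Torch (Compiled)"] / df[col]).values)
        for col in ["CUDA", "Triton"]
    }
    print(speedups)  # roughly {'CUDA': 1.86, 'Triton': 1.36}
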
diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py
new file mode 100644
index 0000000000000..93edbcc9391fc
--- /dev/null
+++ b/benchmarks/kernels/benchmark_activation.py
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# benchmark custom activation op performance
+import itertools
+
+import torch
+
+import vllm.model_executor.layers.activation # noqa F401
+from vllm.model_executor.custom_op import CustomOp
+from vllm.platforms import current_platform
+from vllm.triton_utils import triton
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+
+batch_size_range = [1, 16, 32, 64, 128]
+seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
+intermediate_size = [3072, 9728, 12288]
+configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
+
+
+def benchmark_activation(
+ batch_size: int,
+ seq_len: int,
+ intermediate_size: int,
+ provider: str,
+ func_name: str,
+ dtype: torch.dtype,
+):
+ device = "cuda"
+ num_tokens = batch_size * seq_len
+ dim = intermediate_size
+ current_platform.seed_everything(42)
+ torch.set_default_device(device)
+
+ if func_name == "gelu_and_mul":
+ layer = CustomOp.op_registry[func_name](approximate="none")
+ elif func_name == "gelu_and_mul_tanh":
+ layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
+ elif func_name == "fatrelu_and_mul":
+ threshold = 0.5
+ layer = CustomOp.op_registry[func_name](threshold)
+ else:
+ layer = CustomOp.op_registry[func_name]()
+
+ x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
+ compiled_layer = torch.compile(layer.forward_native)
+
+ if provider == "custom":
+ fn = lambda: layer(x)
+ elif provider == "compiled":
+ fn = lambda: compiled_layer(x)
+
+ ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+ fn, quantiles=[0.5, 0.2, 0.8]
+ )
+ return ms, max_ms, min_ms
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(description="Benchmark the custom activation op.")
+ parser.add_argument(
+ "--func-name",
+ type=str,
+ choices=[
+ "mul_and_silu",
+ "silu_and_mul",
+ "gelu_and_mul",
+ "gelu_and_mul_tanh",
+ "fatrelu_and_mul",
+ "swigluoai_and_mul",
+ "gelu_new",
+ "gelu_fast",
+ "quick_gelu",
+ ],
+ default="silu_and_mul",
+ )
+ parser.add_argument(
+ "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
+ )
+ args = parser.parse_args()
+ assert args
+
+ func_name = args.func_name
+ dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
+
+ perf_report = triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["batch_size", "seq_len", "intermediate_size"],
+ x_vals=configs,
+ line_arg="provider",
+ line_vals=["custom", "compiled"],
+ line_names=["Custom OP", "Compiled"],
+ styles=[("blue", "-"), ("green", "-")],
+ ylabel="ms",
+ plot_name=f"{func_name}-op-performance",
+ args={},
+ )
+ )
+
+ perf_report(
+ lambda batch_size, seq_len, intermediate_size, provider: benchmark_activation(
+ batch_size, seq_len, intermediate_size, provider, func_name, dtype
+ )
+ ).run(print_data=True)
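
For a single configuration, the two providers compared by this new script amount to calling the registered custom op versus a `torch.compile` of its native implementation. A minimal standalone sketch of that comparison, assuming a CUDA device and using `SiluAndMul`, which the script would look up as the default `silu_and_mul` entry in `CustomOp.op_registry`:

    import torch
    from vllm.model_executor.layers.activation import SiluAndMul

    layer = SiluAndMul()  # custom op; last input dim must be twice the output dim
    x = torch.randn(32, 2 * 3072, dtype=torch.bfloat16, device="cuda")

    out_custom = layer(x)                                  # dispatched path (fused CUDA kernel on CUDA builds)
    out_compiled = torch.compile(layer.forward_native)(x)  # Inductor-compiled reference path
    print(torch.allclose(out_custom, out_compiled, rtol=1e-2, atol=1e-2))
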
diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
index 35c20ee41b9a9..726a2a371d109 100644
--- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
@@ -13,6 +13,10 @@ import torch.utils.benchmark as benchmark
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.config import (
+ fp8_w8a8_moe_quant_config,
+ nvfp4_moe_quant_config,
+)
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.scalar_type import scalar_types
@@ -140,6 +144,12 @@ def bench_run(
a_fp8_scale: torch.Tensor,
num_repeats: int,
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a_fp8_scale,
+ )
+
for _ in range(num_repeats):
fused_experts(
a,
@@ -147,10 +157,7 @@ def bench_run(
w2,
topk_weights,
topk_ids,
- use_fp8_w8a8=True,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a_fp8_scale,
+ quant_config=quant_config,
)
def run_cutlass_moe_fp4(
@@ -172,25 +179,27 @@ def bench_run(
device: torch.device,
num_repeats: int,
):
+ quant_config = nvfp4_moe_quant_config(
+ a1_gscale=a1_gs,
+ a2_gscale=a2_gs,
+ w1_scale=w1_blockscale,
+ w2_scale=w2_blockscale,
+ g1_alphas=w1_gs,
+ g2_alphas=w2_gs,
+ )
for _ in range(num_repeats):
with nvtx.annotate("cutlass_moe_fp4", color="green"):
cutlass_moe_fp4(
a=a,
- a1_gscale=a1_gs,
- a2_gscale=a2_gs,
w1_fp4=w1_fp4,
- w1_blockscale=w1_blockscale,
- w1_alphas=w1_gs,
w2_fp4=w2_fp4,
- w2_blockscale=w2_blockscale,
- w2_alphas=w2_gs,
topk_weights=topk_weights,
topk_ids=topk_ids,
m=m,
n=n,
k=k,
e=num_experts,
- device=device,
+ quant_config=quant_config,
)
def run_cutlass_from_graph(
@@ -211,26 +220,29 @@ def bench_run(
e: int,
device: torch.device,
):
+ quant_config = nvfp4_moe_quant_config(
+ a1_gscale=a1_gs,
+ a2_gscale=a2_gs,
+ w1_scale=w1_blockscale,
+ w2_scale=w2_blockscale,
+ g1_alphas=w1_gs,
+ g2_alphas=w2_gs,
+ )
+
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
return cutlass_moe_fp4(
a=a,
- a1_gscale=a1_gs,
w1_fp4=w1_fp4,
- w1_blockscale=w1_blockscale,
- w1_alphas=w1_alphas,
- a2_gscale=a2_gs,
w2_fp4=w2_fp4,
- w2_blockscale=w2_blockscale,
- w2_alphas=w2_alphas,
topk_weights=topk_weights,
topk_ids=topk_ids,
m=m,
n=n,
k=k,
e=num_experts,
- device=device,
+ quant_config=quant_config,
)
def run_triton_from_graph(
@@ -246,16 +258,18 @@ def bench_run(
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a_fp8_scale,
+ )
return fused_experts(
a,
w1,
w2,
topk_weights,
topk_ids,
- use_fp8_w8a8=True,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a_fp8_scale,
+ quant_config=quant_config,
)
def replay_graph(graph, num_repeats):
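
The change in this file follows one pattern throughout: quantization parameters that used to be passed as loose keyword arguments are now bundled into a quant config object and handed to the MoE kernels via `quant_config=`. A minimal sketch of just the config construction, with placeholder scale tensors; the shapes are illustrative, not what a real checkpoint would use:

    import torch
    from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config

    num_experts = 8
    w1_scale = torch.ones(num_experts, dtype=torch.float32)  # placeholder per-expert scales
    w2_scale = torch.ones(num_experts, dtype=torch.float32)
    a1_scale = torch.ones(1, dtype=torch.float32)             # placeholder activation scale

    quant_config = fp8_w8a8_moe_quant_config(
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        a1_scale=a1_scale,
    )
    # quant_config then replaces use_fp8_w8a8=True plus the individual scale
    # kwargs in calls like fused_experts(..., quant_config=quant_config).
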
diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py
new file mode 100644
index 0000000000000..a61c17edc1e28
--- /dev/null
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -0,0 +1,486 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Benchmark script for device communicators:
+CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
+and SymmMemCommunicator (multimem, two-shot).
+
+Usage:
+ torchrun --nproc_per_node=<num_gpus> benchmark_device_communicators.py [options]
+
+Example:
+ torchrun --nproc_per_node=2 benchmark_device_communicators.py
+ --sequence-lengths 512 1024 2048 --num-warmup 10 --num-trials 100
+"""
+
+import json
+import os
+import time
+from contextlib import nullcontext
+from typing import Callable, Optional
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
+from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
+from vllm.logger import init_logger
+from vllm.utils import FlexibleArgumentParser
+
+logger = init_logger(__name__)
+
+# Default sequence lengths to benchmark
+DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192]
+
+# Fixed hidden size and dtype for all benchmarks
+HIDDEN_SIZE = 8192
+BENCHMARK_DTYPE = torch.bfloat16
+
+# CUDA graph settings
+CUDA_GRAPH_CAPTURE_CYCLES = 10
+
+
+class CommunicatorBenchmark:
+ """Benchmark class for testing device communicators."""
+
+ def __init__(
+ self,
+ rank: int,
+ world_size: int,
+ device: torch.device,
+ cpu_group: ProcessGroup,
+ sequence_lengths: list[int],
+ ):
+ self.rank = rank
+ self.world_size = world_size
+ self.device = device
+ self.cpu_group = cpu_group
+
+ # Calculate max_size_override based on largest sequence length
+ max_seq_len = max(sequence_lengths)
+ max_tensor_elements = max_seq_len * HIDDEN_SIZE
+ self.max_size_override = max_tensor_elements * BENCHMARK_DTYPE.itemsize + 1
+
+ # Initialize communicators
+ self.custom_allreduce = None
+ self.pynccl_comm = None
+ self.symm_mem_comm = None
+ self.symm_mem_comm_multimem = None
+ self.symm_mem_comm_two_shot = None
+
+ self._init_communicators()
+
+ def _init_communicators(self):
+ """Initialize all available communicators."""
+ try:
+ self.custom_allreduce = CustomAllreduce(
+ group=self.cpu_group,
+ device=self.device,
+ max_size=self.max_size_override,
+ )
+ if not self.custom_allreduce.disabled:
+ logger.info("Rank %s: CustomAllreduce initialized", self.rank)
+ else:
+ logger.info("Rank %s: CustomAllreduce disabled", self.rank)
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize CustomAllreduce: %s", self.rank, e
+ )
+ self.custom_allreduce = None
+
+ try:
+ self.pynccl_comm = PyNcclCommunicator(
+ group=self.cpu_group, device=self.device
+ )
+ if not self.pynccl_comm.disabled:
+ logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
+ else:
+ logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
+ self.pynccl_comm = None
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize PyNcclCommunicator: %s", self.rank, e
+ )
+ self.pynccl_comm = None
+
+ # Initialize variants for SymmMemCommunicator
+ try:
+ self.symm_mem_comm_multimem = SymmMemCommunicator(
+ group=self.cpu_group,
+ device=self.device,
+ force_multimem=True,
+ max_size_override=self.max_size_override,
+ )
+ if not self.symm_mem_comm_multimem.disabled:
+ logger.info(
+ "Rank %s: SymmMemCommunicator (multimem) initialized", self.rank
+ )
+ else:
+ self.symm_mem_comm_multimem = None
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize SymmMemCommunicator (multimem): %s",
+ self.rank,
+ e,
+ )
+ self.symm_mem_comm_multimem = None
+
+ try:
+ self.symm_mem_comm_two_shot = SymmMemCommunicator(
+ group=self.cpu_group,
+ device=self.device,
+ force_multimem=False,
+ max_size_override=self.max_size_override,
+ )
+ if not self.symm_mem_comm_two_shot.disabled:
+ logger.info(
+ "Rank %s: SymmMemCommunicator (two_shot) initialized", self.rank
+ )
+ else:
+ self.symm_mem_comm_two_shot = None
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize SymmMemCommunicator (two_shot): %s",
+ self.rank,
+ e,
+ )
+ self.symm_mem_comm_two_shot = None
+
+ def benchmark_allreduce(
+ self, sequence_length: int, num_warmup: int, num_trials: int
+ ) -> dict[str, float]:
+ """Benchmark allreduce operations for all available communicators."""
+
+ results = {}
+
+ # Define communicators with their benchmark functions
+ communicators = []
+
+ if self.custom_allreduce is not None:
+ comm = self.custom_allreduce
+ # CustomAllreduce one-shot
+ communicators.append(
+ (
+ "ca_1stage",
+ lambda t, c=comm: c.custom_all_reduce(t),
+ lambda t, c=comm: c.should_custom_ar(t),
+ comm.capture(),
+ "1stage", # env variable value
+ )
+ )
+ # CustomAllreduce two-shot
+ communicators.append(
+ (
+ "ca_2stage",
+ lambda t, c=comm: c.custom_all_reduce(t),
+ lambda t, c=comm: c.should_custom_ar(t),
+ comm.capture(),
+ "2stage", # env variable value
+ )
+ )
+
+ if self.pynccl_comm is not None:
+ comm = self.pynccl_comm
+ communicators.append(
+ (
+ "pynccl",
+ lambda t, c=comm: c.all_reduce(t),
+ lambda t: True, # Always available if initialized
+ nullcontext(),
+ None, # no env variable needed
+ )
+ )
+
+ if self.symm_mem_comm_multimem is not None:
+ comm = self.symm_mem_comm_multimem
+ communicators.append(
+ (
+ "symm_mem_multimem",
+ lambda t, c=comm: c.all_reduce(t),
+ lambda t, c=comm: c.should_use_symm_mem(t),
+ nullcontext(),
+ None, # no env variable needed
+ )
+ )
+
+ if self.symm_mem_comm_two_shot is not None:
+ comm = self.symm_mem_comm_two_shot
+ communicators.append(
+ (
+ "symm_mem_two_shot",
+ lambda t, c=comm: c.all_reduce(t),
+ lambda t, c=comm: c.should_use_symm_mem(t),
+ nullcontext(),
+ None, # no env variable needed
+ )
+ )
+
+ # Benchmark each communicator
+ for name, allreduce_fn, should_use_fn, context, env_var in communicators:
+ # Set environment variable if needed
+ if env_var is not None:
+ os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var
+ else:
+ # Clear the environment variable to avoid interference
+ os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None)
+
+ latency = self.benchmark_allreduce_single(
+ sequence_length,
+ allreduce_fn,
+ should_use_fn,
+ context,
+ num_warmup,
+ num_trials,
+ )
+ if latency is not None:
+ results[name] = latency
+
+ return results
+
+ def benchmark_allreduce_single(
+ self,
+ sequence_length: int,
+ allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+ should_use_fn: Callable[[torch.Tensor], bool],
+ context,
+ num_warmup: int,
+ num_trials: int,
+ ) -> Optional[float]:
+ """Benchmark method with CUDA graph optimization."""
+ try:
+ # Create test tensor (2D: sequence_length x hidden_size)
+ tensor = torch.randn(
+ sequence_length, HIDDEN_SIZE, dtype=BENCHMARK_DTYPE, device=self.device
+ )
+ if not should_use_fn(tensor):
+ return None
+
+ torch.cuda.synchronize()
+ stream = torch.cuda.Stream()
+ with torch.cuda.stream(stream):
+ graph_input = tensor.clone()
+
+ # Warmup before capture
+ for _ in range(3):
+ allreduce_fn(graph_input)
+
+ # Capture the graph using context manager
+ with context:
+ graph = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(graph):
+ for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
+ allreduce_fn(graph_input)
+
+ torch.cuda.synchronize()
+ for _ in range(num_warmup):
+ graph.replay()
+ torch.cuda.synchronize()
+
+ torch.cuda.synchronize()
+ start_time = time.perf_counter()
+
+ for _ in range(num_trials):
+ graph.replay()
+ torch.cuda.synchronize()
+
+ end_time = time.perf_counter()
+
+ # Convert to ms and divide by CUDA_GRAPH_CAPTURE_CYCLES
+ return (
+ (end_time - start_time) / num_trials / CUDA_GRAPH_CAPTURE_CYCLES * 1000
+ )
+
+ except Exception as e:
+ logger.error("CUDA graph benchmark failed: %s", e)
+ raise RuntimeError(
+ f"CUDA graph benchmark failed for communicator: {e}"
+ ) from e
+
+
+def _calculate_speedup_info(comm_results: dict[str, float]) -> str:
+ """Calculate speedup information for a single tensor size."""
+ if not comm_results:
+ return "N/A"
+
+ # Find the fastest communicator
+ fastest_comm = min(comm_results.keys(), key=lambda k: comm_results[k])
+ fastest_time = comm_results[fastest_comm]
+
+ # Calculate speedup vs PyNccl if available
+ if "pynccl" in comm_results:
+ pynccl_time = comm_results["pynccl"]
+ speedup = pynccl_time / fastest_time
+ return f"{fastest_comm} ({speedup:.2f}x)"
+ else:
+ return f"{fastest_comm} (N/A)"
+
+
+def print_results(
+ results: dict[str, dict[str, float]], sequence_lengths: list[int], world_size: int
+):
+ """Print benchmark results in a formatted table."""
+
+ print(f"\n{'=' * 130}")
+ print("Device Communicator Benchmark Results")
+ print(
+ f"World Size: {world_size}, Data Type: {BENCHMARK_DTYPE}, "
+ f"Hidden Size: {HIDDEN_SIZE}"
+ )
+ print(f"{'=' * 130}")
+
+ # Get all communicator names
+ all_comms = set()
+ for size_results in results.values():
+ all_comms.update(size_results.keys())
+
+ all_comms = sorted(list(all_comms))
+
+ # Print header
+ header = f"{'Tensor Shape':<20}{'Tensor Size':<15}"
+ for comm in all_comms:
+ header += f"{comm:<20}"
+ header += f"{'Best (Speedup vs PyNccl)':<30}"
+ print(header)
+ print("-" * len(header))
+
+ # Print results for each sequence length
+ for seq_len in sequence_lengths:
+ if seq_len in results:
+ # Calculate tensor size in elements and bytes
+ tensor_elements = seq_len * HIDDEN_SIZE
+ tensor_bytes = tensor_elements * BENCHMARK_DTYPE.itemsize
+
+ # Format tensor size (MB)
+ tensor_size_mb = tensor_bytes / (1024 * 1024)
+ tensor_size_str = f"{tensor_size_mb:.2f} MB"
+
+ # Format tensor shape
+ tensor_shape = f"({seq_len}, {HIDDEN_SIZE})"
+
+ row = f"{tensor_shape:<20}{tensor_size_str:<15}"
+ for comm in all_comms:
+ if comm in results[seq_len]:
+ row += f"{results[seq_len][comm]:<20.3f}"
+ else:
+ row += f"{'N/A':<20}"
+
+ # Calculate speedup information
+ speedup_info = _calculate_speedup_info(results[seq_len])
+ row += f"{speedup_info:<30}"
+
+ print(row)
+
+ print(f"{'=' * 130}")
+ print("All times are in milliseconds (ms) per allreduce operation")
+ print("Speedup column shows: fastest_algorithm (speedup_vs_pynccl)")
+
+
+def main():
+ parser = FlexibleArgumentParser(description="Benchmark device communicators")
+
+ parser.add_argument(
+ "--sequence-lengths",
+ type=int,
+ nargs="+",
+ default=DEFAULT_SEQUENCE_LENGTHS,
+ help="Sequence lengths to benchmark (tensor shape: seq_len x hidden_size)",
+ )
+
+ parser.add_argument(
+ "--num-warmup", type=int, default=5, help="Number of warmup iterations"
+ )
+
+ parser.add_argument(
+ "--num-trials", type=int, default=50, help="Number of benchmark trials"
+ )
+
+ parser.add_argument("--output-json", type=str, help="Output results to JSON file")
+
+ args = parser.parse_args()
+
+ # Initialize distributed
+ if not dist.is_initialized():
+ dist.init_process_group(backend="gloo")
+ rank = dist.get_rank()
+ world_size = dist.get_world_size()
+
+ # Set device
+ device = torch.device(f"cuda:{rank}")
+ torch.cuda.set_device(device)
+
+ # Get CPU process group
+ cpu_group = dist.new_group(backend="gloo")
+
+ # Disable USE_SYMM_MEM to avoid affecting the max_sizes
+ # in symm_mem and custom_all_reduce for benchmark
+ os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
+
+ # Initialize benchmark
+ benchmark = CommunicatorBenchmark(
+ rank, world_size, device, cpu_group, args.sequence_lengths
+ )
+
+ # Run benchmarks
+ all_results = {}
+
+ for seq_len in args.sequence_lengths:
+ if rank == 0:
+ logger.info(
+ "Benchmarking sequence length: %s (tensor shape: %s x %s)",
+ seq_len,
+ seq_len,
+ HIDDEN_SIZE,
+ )
+
+ results = benchmark.benchmark_allreduce(
+ sequence_length=seq_len,
+ num_warmup=args.num_warmup,
+ num_trials=args.num_trials,
+ )
+
+ all_results[seq_len] = results
+
+ # Synchronize between ranks
+ dist.barrier()
+
+ # Print results (only rank 0)
+ if rank == 0:
+ print_results(all_results, args.sequence_lengths, world_size)
+
+ # Save to JSON if requested
+ if args.output_json:
+ # Add speedup information to results
+ enhanced_results = {}
+ for seq_len, comm_results in all_results.items():
+ enhanced_results[seq_len] = {
+ "timings": comm_results,
+ "speedup_info": _calculate_speedup_info(comm_results),
+ }
+
+ output_data = {
+ "world_size": world_size,
+ "dtype": str(BENCHMARK_DTYPE),
+ "hidden_size": HIDDEN_SIZE,
+ "sequence_lengths": args.sequence_lengths,
+ "num_warmup": args.num_warmup,
+ "num_trials": args.num_trials,
+ "cuda_graph_capture_cycles": CUDA_GRAPH_CAPTURE_CYCLES,
+ "results": enhanced_results,
+ }
+
+ with open(args.output_json, "w") as f:
+ json.dump(output_data, f, indent=2)
+
+ logger.info("Results saved to %s", args.output_json)
+
+ # Cleanup
+ if cpu_group != dist.group.WORLD:
+ dist.destroy_process_group(cpu_group)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
index a6b42406b5cb0..14330ae6f03c5 100644
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -7,6 +7,7 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
@@ -96,6 +97,11 @@ def bench_run(
a_scale: torch.Tensor,
num_repeats: int,
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a_scale,
+ )
for _ in range(num_repeats):
fused_experts(
a,
@@ -103,10 +109,7 @@ def bench_run(
w2,
topk_weights,
topk_ids,
- use_fp8_w8a8=True,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a_scale,
+ quant_config=quant_config,
)
def run_cutlass_moe(
@@ -125,6 +128,12 @@ def bench_run(
per_act_token: bool,
num_repeats: int,
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ per_act_token_quant=per_act_token,
+ )
+
for _ in range(num_repeats):
cutlass_moe_fp8(
a,
@@ -132,14 +141,11 @@ def bench_run(
w2,
topk_weights,
topk_ids,
- w1_scale,
- w2_scale,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
- per_act_token,
- a1_scale=None,
+ quant_config=quant_config,
)
def run_cutlass_from_graph(
@@ -156,6 +162,12 @@ def bench_run(
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ per_act_token_quant=per_act_token,
+ )
+
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
@@ -165,14 +177,11 @@ def bench_run(
w2_q,
topk_weights,
topk_ids,
- w1_scale,
- w2_scale,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
- per_act_token,
- a1_scale=None,
+ quant_config=quant_config,
)
def run_triton_from_graph(
@@ -185,6 +194,11 @@ def bench_run(
w2_scale: torch.Tensor,
a_scale: torch.Tensor,
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a_scale,
+ )
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
@@ -194,10 +208,7 @@ def bench_run(
w2,
topk_weights,
topk_ids,
- use_fp8_w8a8=True,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a_scale,
+ quant_config=quant_config,
)
def replay_graph(graph, num_repeats):
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index 89309c79f0991..debb29744bfaa 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -464,7 +464,11 @@ class BenchmarkTensors:
for field_name in LoRAKernelMeta.__dataclass_fields__:
field = getattr(self.lora_kernel_meta, field_name)
assert isinstance(field, torch.Tensor)
- setattr(self.lora_kernel_meta, field_name, to_device(field))
+ setattr(
+ self.lora_kernel_meta,
+ field_name,
+ to_device(field) if field_name != "no_lora_flag_cpu" else field,
+ )
def metadata(self) -> tuple[int, int, int]:
"""
@@ -512,6 +516,7 @@ class BenchmarkTensors:
"lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc,
"lora_ids": self.lora_kernel_meta.active_lora_ids,
"scaling": 1.0,
+ "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
}
def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
@@ -552,6 +557,7 @@ class BenchmarkTensors:
"lora_ids": self.lora_kernel_meta.active_lora_ids,
"offset_start": 0,
"add_inputs": add_inputs,
+ "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
}
def bench_fn_kwargs(
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 710d30adfd846..d2beb28f70233 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -14,6 +14,10 @@ import ray
import torch
from ray.experimental.tqdm_ray import tqdm
+from vllm.model_executor.layers.fused_moe.config import (
+ FusedMoEQuantConfig,
+ _get_config_dtype_str,
+)
from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
@@ -134,43 +138,36 @@ def benchmark_config(
def run():
from vllm.model_executor.layers.fused_moe import override_config
+ if use_fp8_w8a8:
+ quant_dtype = torch.float8_e4m3fn
+ elif use_int8_w8a16:
+ quant_dtype = torch.int8
+ else:
+ quant_dtype = None
+
+ quant_config = FusedMoEQuantConfig.make(
+ quant_dtype=quant_dtype,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ block_shape=block_quant_shape,
+ )
+
with override_config(config):
- if use_deep_gemm:
- topk_weights, topk_ids, token_expert_indices = fused_topk(
- x, input_gating, topk, False
- )
- return fused_experts(
- x,
- w1,
- w2,
- topk_weights,
- topk_ids,
- inplace=True,
- use_fp8_w8a8=use_fp8_w8a8,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a1_scale,
- a2_scale=a2_scale,
- block_shape=block_quant_shape,
- allow_deep_gemm=True,
- )
- else:
- fused_moe(
- x,
- w1,
- w2,
- input_gating,
- topk,
- renormalize=True,
- inplace=True,
- use_fp8_w8a8=use_fp8_w8a8,
- use_int8_w8a16=use_int8_w8a16,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a1_scale,
- a2_scale=a2_scale,
- block_shape=block_quant_shape,
- )
+ topk_weights, topk_ids, token_expert_indices = fused_topk(
+ x, input_gating, topk, renormalize=not use_deep_gemm
+ )
+ return fused_experts(
+ x,
+ w1,
+ w2,
+ topk_weights,
+ topk_ids,
+ inplace=True,
+ quant_config=quant_config,
+ allow_deep_gemm=use_deep_gemm,
+ )
# JIT compilation & warmup
run()
@@ -414,7 +411,7 @@ class BenchmarkWorker:
use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed)
- dtype_str = get_config_dtype_str(
+ dtype_str = _get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
)
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
@@ -547,7 +544,7 @@ def save_configs(
block_quant_shape: list[int],
save_dir: str,
) -> None:
- dtype_str = get_config_dtype_str(
+ dtype_str = _get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
)
@@ -560,7 +557,7 @@ def save_configs(
filename = os.path.join(save_dir, filename)
print(f"Writing best config to {filename}...")
with open(filename, "w") as f:
- json.dump(configs, f, indent=4)
+ json.dump({"triton_version": triton.__version__, **configs}, f, indent=4)
f.write("\n")
@@ -594,7 +591,11 @@ def main(args: argparse.Namespace):
E = config.n_routed_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
- elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"):
+ elif config.architectures[0] in (
+ "Qwen2MoeForCausalLM",
+ "Qwen3MoeForCausalLM",
+ "Qwen3NextForCausalLM",
+ ):
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
@@ -678,7 +679,11 @@ def main(args: argparse.Namespace):
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
print(f"Start tuning over {len(search_space)} configurations...")
-
+ if use_deep_gemm:
+ raise ValueError(
+ "Tuning with --use-deep-gemm is not supported as it only tunes Triton "
+ "kernels. Please remove the flag."
+ )
start = time.time()
configs = _distribute(
"tune",
diff --git a/benchmarks/kernels/benchmark_polynorm.py b/benchmarks/kernels/benchmark_polynorm.py
new file mode 100644
index 0000000000000..9ac8f5e6594e4
--- /dev/null
+++ b/benchmarks/kernels/benchmark_polynorm.py
@@ -0,0 +1,155 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import itertools
+
+import torch
+
+from vllm import _custom_ops as vllm_ops
+from vllm.triton_utils import triton
+
+
+def polynorm_naive(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ bias: torch.Tensor,
+ eps: float = 1e-6,
+):
+ orig_shape = x.shape
+ x = x.view(-1, x.shape[-1])
+
+ def norm(x, eps: float):
+ return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
+
+ x = x.float()
+ return (
+ (
+ weight[0] * norm(x**3, eps)
+ + weight[1] * norm(x**2, eps)
+ + weight[2] * norm(x, eps)
+ + bias
+ )
+ .to(weight.dtype)
+ .view(orig_shape)
+ )
+
+
+def polynorm_vllm(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ bias: torch.Tensor,
+ eps: float = 1e-6,
+):
+ orig_shape = x.shape
+ x = x.view(-1, x.shape[-1])
+
+ out = torch.empty_like(x)
+ vllm_ops.poly_norm(out, x, weight, bias, eps)
+ output = out
+
+ output = output.view(orig_shape)
+ return output
+
+
+def calculate_diff(batch_size, seq_len, hidden_dim):
+ dtype = torch.bfloat16
+ x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
+ weight = torch.ones(3, dtype=dtype, device="cuda")
+ bias = torch.ones(1, dtype=dtype, device="cuda")
+
+ output_naive = polynorm_naive(x, weight, bias)
+ output_vllm = polynorm_vllm(x, weight, bias)
+
+ if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
+ print("✅ All implementations match")
+ else:
+ print("❌ Implementations differ")
+
+
+batch_size_range = [2**i for i in range(0, 7, 2)]
+seq_length_range = [2**i for i in range(6, 11, 1)]
+dim_range = [2048, 4096]
+configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
+
+
+def get_benchmark():
+ @triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["dim", "batch_size", "seq_len"],
+ x_vals=[list(_) for _ in configs],
+ line_arg="provider",
+ line_vals=["naive", "vllm"],
+ line_names=["Naive", "vLLM"],
+ styles=[("blue", "-"), ("red", "-")],
+ ylabel="us",
+ plot_name="polynorm-perf",
+ args={},
+ )
+ )
+ def benchmark(dim, batch_size, seq_len, provider):
+ dtype = torch.bfloat16
+ hidden_dim = dim * 4
+
+ x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
+ weight = torch.ones(3, dtype=dtype, device="cuda")
+ bias = torch.ones(1, dtype=dtype, device="cuda")
+
+ quantiles = [0.5, 0.2, 0.8]
+
+ if provider == "naive":
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: polynorm_naive(x, weight, bias),
+ quantiles=quantiles,
+ )
+ else:
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: polynorm_vllm(x, weight, bias),
+ quantiles=quantiles,
+ )
+
+ return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+ return benchmark
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--batch-size",
+ type=int,
+ default=4,
+ help="Batch size",
+ )
+ parser.add_argument(
+ "--seq-len",
+ type=int,
+ default=128,
+ help="Sequence length",
+ )
+ parser.add_argument(
+ "--hidden-dim",
+ type=int,
+ default=8192,
+ help="Intermediate size of MLP",
+ )
+ parser.add_argument(
+ "--save-path",
+ type=str,
+ default="./configs/polynorm/",
+ help="Path to save polynorm benchmark results",
+ )
+
+ args = parser.parse_args()
+
+ # Run correctness test
+ calculate_diff(
+ batch_size=args.batch_size,
+ seq_len=args.seq_len,
+ hidden_dim=args.hidden_dim,
+ )
+
+ benchmark = get_benchmark()
+ # Run performance benchmark
+ benchmark.run(print_data=True, save_path=args.save_path)
diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
index 0650cbf3cc18e..c7a4066b39d70 100644
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -1,77 +1,675 @@
-#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import time
+from collections.abc import Callable
+import matplotlib.pyplot as plt
+import numpy as np
import torch
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
- silu_mul_fp8_quant_deep_gemm,
+ silu_mul_fp8_quant_deep_gemm_cuda,
)
from vllm.platforms import current_platform
+from vllm.triton_utils import tl, triton
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
-def benchmark(E, T, H, G=128, runs=50):
- current_platform.seed_everything(42)
- y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
- tokens_per_expert = torch.randint(
- T // 2, T, size=(E,), dtype=torch.int32, device="cuda"
+@triton.jit
+def _silu_mul_fp8_quant_deep_gemm(
+ # Pointers ------------------------------------------------------------
+ input_ptr, # 16-bit activations (E, T, 2*H)
+ y_q_ptr, # fp8 quantized activations (E, T, H)
+ y_s_ptr, # 16-bit scales (E, T, G)
+ counts_ptr, # int32 num tokens per expert (E)
+ # Sizes ---------------------------------------------------------------
+ H: tl.constexpr, # hidden dimension (per output)
+ GROUP_SIZE: tl.constexpr, # elements per group (usually 128)
+ # Strides for input (elements) ---------------------------------------
+ stride_i_e,
+ stride_i_t,
+ stride_i_h,
+ # Strides for y_q (elements) -----------------------------------------
+ stride_yq_e,
+ stride_yq_t,
+ stride_yq_h,
+ # Strides for y_s (elements) -----------------------------------------
+ stride_ys_e,
+ stride_ys_t,
+ stride_ys_g,
+ # Stride for counts (elements)
+ stride_counts_e,
+ # Numeric params ------------------------------------------------------
+ eps: tl.constexpr,
+ fp8_min: tl.constexpr,
+ fp8_max: tl.constexpr,
+ use_ue8m0: tl.constexpr,
+ # Meta ---------------------------------------------------------------
+ BLOCK: tl.constexpr,
+ NUM_STAGES: tl.constexpr,
+):
+ G = H // GROUP_SIZE
+
+ # map program id -> (e, g)
+ pid = tl.program_id(0)
+ e = pid // G
+ g = pid % G
+
+ e = e.to(tl.int64)
+ g = g.to(tl.int64)
+
+ # number of valid tokens for this expert
+ n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64)
+
+ cols = tl.arange(0, BLOCK).to(tl.int64)
+ mask = cols < BLOCK
+
+ base_input_offset = e * stride_i_e + g * GROUP_SIZE * stride_i_h
+ base_gate_offset = base_input_offset + cols * stride_i_h
+ base_up_offset = base_input_offset + H * stride_i_h + cols * stride_i_h
+ base_yq_offset = e * stride_yq_e + g * GROUP_SIZE * stride_yq_h + cols * stride_yq_h
+ base_ys_offset = e * stride_ys_e + g * stride_ys_g
+
+ for t in tl.range(0, n_tokens, num_stages=NUM_STAGES):
+ gate = tl.load(
+ input_ptr + base_gate_offset + t * stride_i_t, mask=mask, other=0.0
+ ).to(tl.float32)
+ up = tl.load(input_ptr + base_up_offset + t * stride_i_t, mask=mask, other=0.0)
+
+ gate = gate * (1.0 / (1.0 + tl.exp(-gate)))
+ y = gate * up
+
+ y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max
+ if use_ue8m0:
+ y_s = tl.exp2(tl.ceil(tl.log2(y_s)))
+
+ y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+
+ tl.store(y_q_ptr + base_yq_offset + t * stride_yq_t, y_q, mask=mask)
+ tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s)
+
+
+def silu_mul_fp8_quant_deep_gemm_triton(
+ y: torch.Tensor, # (E, T, 2*H)
+ tokens_per_expert: torch.Tensor, # (E,) number of valid tokens per expert
+ num_parallel_tokens,
+ group_size: int = 128,
+ eps: float = 1e-10,
+) -> tuple[torch.Tensor, torch.Tensor]:
+ """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
+
+ y has shape (E, T, 2*H). The first half of the last dimension is
+ silu-activated, multiplied by the second half, then quantized into FP8.
+
+ Returns `(y_q, y_s)` where
+ * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H]
+ * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T)
+ """
+ assert y.ndim == 3, "y must be (E, T, 2*H)"
+ E, T, H2 = y.shape
+ assert H2 % 2 == 0, "last dim of y must be even (2*H)"
+ H = H2 // 2
+ G = (H + group_size - 1) // group_size
+ assert H % group_size == 0, "H must be divisible by group_size"
+ assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, (
+ "tokens_per_expert must be shape (E,)"
+ )
+ tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32)
+
+ # allocate outputs
+ fp8_dtype = torch.float8_e4m3fn
+ y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)
+
+ # strides (elements)
+ stride_i_e, stride_i_t, stride_i_h = y.stride()
+ stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()
+
+ # desired scale strides (elements): (T*G, 1, T)
+ stride_ys_e = T * G
+ stride_ys_t = 1
+ stride_ys_g = T
+ y_s = torch.empty_strided(
+ (E, T, G),
+ (stride_ys_e, stride_ys_t, stride_ys_g),
+ dtype=torch.float32,
+ device=y.device,
)
+ stride_cnt_e = tokens_per_expert.stride()[0]
+
+ # Static grid over experts and H-groups.
+ # A loop inside the kernel handles the token dim
+ grid = (E * G,)
+
+ f_info = torch.finfo(fp8_dtype)
+ fp8_max = f_info.max
+ fp8_min = f_info.min
+
+ _silu_mul_fp8_quant_deep_gemm[grid](
+ y,
+ y_q,
+ y_s,
+ tokens_per_expert,
+ H,
+ group_size,
+ stride_i_e,
+ stride_i_t,
+ stride_i_h,
+ stride_yq_e,
+ stride_yq_t,
+ stride_yq_h,
+ stride_ys_e,
+ stride_ys_t,
+ stride_ys_g,
+ stride_cnt_e,
+ eps,
+ fp8_min,
+ fp8_max,
+ is_deep_gemm_e8m0_used(),
+ BLOCK=group_size,
+ NUM_STAGES=4,
+ num_warps=1,
+ )
+
+ return y_q, y_s
+
+
+# Token-assignment strategies used to generate tokens_per_expert
+strategies = ["uniform", "max_t", "first_t"]
+
+
+def benchmark(
+ kernel: Callable,
+ E: int,
+ T: int,
+ H: int,
+ total_tokens: int,
+ num_parallel_tokens: int = 64,
+ G: int = 128,
+ runs: int = 200,
+ num_warmups: int = 20,
+ gen_strategy: str = "default",
+ iterations_per_run: int = 20,
+):
+ def generate_data(seed_offset=0):
+ """Generate input data with given seed offset"""
+ current_platform.seed_everything(42 + seed_offset)
+ y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
+
+ if gen_strategy == "uniform":
+ r = torch.rand(size=(E,), device="cuda")
+ r /= r.sum()
+ r *= total_tokens
+ tokens_per_expert = r.int()
+ tokens_per_expert = torch.minimum(
+ tokens_per_expert,
+ torch.ones((E,), device=r.device, dtype=torch.int) * T,
+ )
+ elif gen_strategy == "max_t":
+ tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
+ tokens_per_expert.fill_(total_tokens / E)
+ elif gen_strategy == "first_t":
+ tokens_per_expert = torch.zeros(size=(E,), dtype=torch.int32, device="cuda")
+ tokens_per_expert[0] = min(T, total_tokens)
+ else:
+ raise ValueError(f"Unknown generation strategy: {gen_strategy}")
+ return y, tokens_per_expert
+
+ dataset_count = 4
+ # Pre-generate different input matrices for each iteration to avoid cache effects
+ data_sets = [generate_data(i) for i in range(dataset_count)]
+
# Warmup
- for _ in range(10):
- silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
- torch.cuda.synchronize()
+ y, tokens_per_expert = data_sets[0]
+ for _ in range(num_warmups):
+ kernel(
+ y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
+ )
+ torch.cuda.synchronize()
+
+ start_event = torch.cuda.Event(enable_timing=True)
+ end_event = torch.cuda.Event(enable_timing=True)
# Benchmark
- torch.cuda.synchronize()
- start = time.perf_counter()
+ latencies: list[float] = []
for _ in range(runs):
- silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
- torch.cuda.synchronize()
+ torch.cuda.synchronize()
- avg_time = (time.perf_counter() - start) / runs * 1000
+ start_event.record()
+ for i in range(iterations_per_run):
+ y, tokens_per_expert = data_sets[i % dataset_count]
+ kernel(
+ y,
+ tokens_per_expert,
+ num_parallel_tokens=num_parallel_tokens,
+ group_size=G,
+ )
+ end_event.record()
+ end_event.synchronize()
- # Calculate actual work done (only count valid tokens)
+ total_time_ms = start_event.elapsed_time(end_event)
+ per_iter_time_ms = total_time_ms / iterations_per_run
+ latencies.append(per_iter_time_ms)
+
+ # Use median instead of average for better outlier handling
+ median_time_ms = np.median(latencies)
+ median_time_s = median_time_ms / 1000
+
+ # Calculate actual work done (using first dataset for consistency)
+ _, tokens_per_expert = data_sets[0]
actual_tokens = tokens_per_expert.sum().item()
actual_elements = actual_tokens * H
# GFLOPS: operations per element = exp + 3 muls + 1 div + quantization ops ≈ 8 ops
ops_per_element = 8
total_ops = actual_elements * ops_per_element
- gflops = total_ops / (avg_time / 1000) / 1e9
+ gflops = total_ops / median_time_s / 1e9
# Memory bandwidth: bfloat16 inputs (2 bytes), fp8 output (1 byte), scales (4 bytes)
input_bytes = actual_tokens * 2 * H * 2 # 2*H bfloat16 inputs
output_bytes = actual_tokens * H * 1 # H fp8 outputs
scale_bytes = actual_tokens * (H // G) * 4 # scales in float32
total_bytes = input_bytes + output_bytes + scale_bytes
- memory_bw = total_bytes / (avg_time / 1000) / 1e9
+ memory_bw = total_bytes / median_time_s / 1e9
- return avg_time, gflops, memory_bw
+ HOPPER_BANDWIDTH_TBPS = 3.35
+ return (
+ median_time_ms,
+ gflops,
+ memory_bw,
+ (memory_bw / (HOPPER_BANDWIDTH_TBPS * 1024)) * 100,
+ )
+def create_comparison_plot(
+ ratio, cuda_times, baseline_times, config_labels, strategy_name, id
+):
+ """Create a comparison plot for a specific generation strategy"""
+ fig, ax = plt.subplots(1, 1, figsize=(16, 6))
+
+ # Configure x-axis positions
+ x = np.arange(len(config_labels))
+ width = 0.35
+
+ # Execution Time plot (lower is better)
+ ax.bar(
+ x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue"
+ )
+ ax.bar(
+ x + width / 2,
+ baseline_times,
+ width,
+ label="Baseline",
+ alpha=0.8,
+ color="orange",
+ )
+
+ # Add speedup labels over each bar pair
+ for i in range(len(x)):
+ speedup = ratio[i]
+ max_height = max(cuda_times[i], baseline_times[i])
+ ax.text(
+ x[i],
+ max_height + max_height * 0.02,
+ f"{speedup:.2f}x",
+ ha="center",
+ va="bottom",
+ fontweight="bold",
+ fontsize=9,
+ )
+
+ ax.set_xlabel("Configuration")
+ ax.set_ylabel("% Utilization")
+ ax.set_title(
+ f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
+ )
+ ax.set_xticks(x)
+ ax.set_xticklabels(config_labels, rotation=45, ha="right")
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+
+ plt.tight_layout()
+ return fig, ax
+
+
+def create_combined_plot(all_results):
+ """Create a combined plot with all strategies in one PNG"""
+ num_strategies = len(all_results)
+ fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies))
+
+ if num_strategies == 1:
+ axes = [axes]
+
+ for idx, (
+ strategy_name,
+ ratio,
+ cuda_times,
+ baseline_times,
+ config_labels,
+ ) in enumerate(all_results):
+ ax = axes[idx]
+
+ # Configure x-axis positions
+ x = np.arange(len(config_labels))
+ width = 0.35
+
+ # Execution Time plot (lower is better)
+ ax.bar(
+ x - width / 2,
+ cuda_times,
+ width,
+ label="CUDA Kernel",
+ alpha=0.8,
+ color="blue",
+ )
+ ax.bar(
+ x + width / 2,
+ baseline_times,
+ width,
+ label="Baseline",
+ alpha=0.8,
+ color="orange",
+ )
+
+ # Add speedup labels over each bar pair
+ for i in range(len(x)):
+ speedup = ratio[i]
+ max_height = max(cuda_times[i], baseline_times[i])
+ ax.text(
+ x[i],
+ max_height + max_height * 0.02,
+ f"{speedup:.2f}x",
+ ha="center",
+ va="bottom",
+ fontweight="bold",
+ fontsize=9,
+ )
+
+ ax.set_xlabel("Configuration")
+ ax.set_ylabel("% Utilization")
+ ax.set_title(
+ f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
+ )
+ ax.set_xticks(x)
+ ax.set_xticklabels(config_labels, rotation=45, ha="right")
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+
+ plt.tight_layout()
+ filename = "../../silu_bench/silu_benchmark_combined.png"
+ plt.savefig(filename, dpi=300, bbox_inches="tight")
+ plt.show()
+
+ return filename
+
+
+outer_dim = 7168
configs = [
- (8, 32, 1024),
- (16, 64, 2048),
- (32, 128, 4096),
# DeepSeekV3 Configs
- (256, 16, 7168),
- (256, 32, 7168),
- (256, 64, 7168),
- (256, 128, 7168),
- (256, 256, 7168),
- (256, 512, 7168),
+ (8, 1024, 7168),
+ (32, 1024, 7168),
(256, 1024, 7168),
]
-print(f"GPU: {torch.cuda.get_device_name()}")
-print(f"{'Config':<20} {'Time(ms)':<10} {'GFLOPS':<10} {'GB/s':<10}")
-print("-" * 50)
+runs = 100
+num_warmups = 20
-for E, T, H in configs:
- try:
- time_ms, gflops, gbps = benchmark(E, T, H)
- print(f"E={E:3d},T={T:4d},H={H:4d} {time_ms:8.3f} {gflops:8.1f} {gbps:8.1f}")
- except Exception:
- print(f"E={E:3d},T={T:4d},H={H:4d} FAILED")
+strategy_descriptions = {
+ "uniform": "Uniform Random",
+ "max_t": "Even Assignment",
+ "first_t": "experts[0] = T, experts[1:] = 0",
+}
+
+print(f"GPU: {torch.cuda.get_device_name()}")
+print(f"Testing strategies: {', '.join(strategies)}")
+print(f"Configurations: {len(configs)} configs")
+
+all_results = []
+
+# Run benchmarks for each strategy
+for id, strategy in enumerate(strategies):
+ print(f"\n{'=' * 60}")
+ print(f"Testing strategy: {strategy_descriptions[strategy]}")
+ print(f"{'=' * 60}")
+
+ # Collect benchmark data for both algorithms
+ config_labels = []
+ config_x_axis = []
+ all_cuda_results = []
+ all_baseline_results = []
+ all_ratios = []
+
+ for E, T, H in configs:
+ total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E]
+ config_x_axis.append(total_tokens_config)
+
+ cuda_results = []
+ baseline_results = []
+ ratios = []
+
+ for total_tokens in total_tokens_config:
+ config_label = f"E={E},T={T},H={H},TT={total_tokens}"
+ config_labels.append(config_label)
+
+ # CUDA kernel results
+ time_ms_cuda, gflops, gbps, perc = benchmark(
+ silu_mul_fp8_quant_deep_gemm_cuda,
+ E,
+ T,
+ H,
+ total_tokens,
+ runs=runs,
+ num_warmups=num_warmups,
+ gen_strategy=strategy,
+ )
+ cuda_results.append((time_ms_cuda, gflops, gbps, perc))
+
+ # Baseline results
+ time_ms_triton, gflops, gbps, perc = benchmark(
+ silu_mul_fp8_quant_deep_gemm_triton,
+ E,
+ T,
+ H,
+ total_tokens,
+ runs=runs,
+ num_warmups=num_warmups,
+ gen_strategy=strategy,
+ )
+ baseline_results.append((time_ms_triton, gflops, gbps, perc))
+ ratios.append(time_ms_triton / time_ms_cuda)
+
+ print(f"Completed: {config_label}")
+ all_cuda_results.append(cuda_results)
+ all_baseline_results.append(baseline_results)
+ all_ratios.append(ratios)
+
+ # Store results for combined plotting
+ all_results.append(
+ (
+ strategy_descriptions[strategy],
+ all_ratios,
+ all_cuda_results,
+ all_baseline_results,
+ config_labels,
+ config_x_axis,
+ )
+ )
+
+ # Print summary table for this strategy
+ print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
+ print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}")
+ print("-" * 60)
+
+ for i, (E, T, H) in enumerate(configs):
+ speedup = baseline_results[i][0] / cuda_results[i][0]
+ config_label = f"E={E:3d},T={T:4d},H={H:4d}"
+ print(
+ f"{config_label:<20} {cuda_results[i][0]:8.5f} "
+ f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x"
+ )
+
+
+def create_total_tokens_plot(all_results):
+ num_strategies = len(all_results)
+ num_configs = len(configs)
+
+ # Create side-by-side subplots: 2 columns for speedup and bandwidth percentage
+ fig, axs = plt.subplots(
+ num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies)
+ )
+
+ # Add main title to the entire figure
+ fig.suptitle(
+ "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)",
+ fontsize=16,
+ fontweight="bold",
+ y=0.98,
+ )
+
+ # Handle single strategy case
+ if num_strategies == 1:
+ axs = axs.reshape(1, -1)
+
+ # Handle single config case
+ if num_configs == 1:
+ axs = axs.reshape(-1, 2)
+
+ for strategy_idx, result in enumerate(all_results):
+ (
+ strategy_name,
+ all_ratios,
+ all_cuda_results,
+ all_baseline_results,
+ config_labels,
+ config_x_axis,
+ ) = result
+
+ for config_idx in range(num_configs):
+ # Speedup plot (left column)
+ ax_speedup = axs[strategy_idx, config_idx * 2]
+ # Bandwidth plot (right column)
+ ax_bandwidth = axs[strategy_idx, config_idx * 2 + 1]
+
+ E, T, H = configs[config_idx]
+ ratios = all_ratios[config_idx]
+ total_tokens_values = config_x_axis[config_idx]
+
+ # Extract CUDA and Triton bandwidth percentages
+ cuda_bandwidth_percentages = [
+ result[3] for result in all_cuda_results[config_idx]
+ ]
+ triton_bandwidth_percentages = [
+ result[3] for result in all_baseline_results[config_idx]
+ ]
+
+ # Plot speedup ratios vs total tokens (left plot)
+ ax_speedup.plot(
+ total_tokens_values, ratios, "bo-", linewidth=3, markersize=8
+ )
+ ax_speedup.set_title(
+ f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}",
+ fontsize=12,
+ fontweight="bold",
+ )
+ ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
+ ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
+ ax_speedup.grid(True, alpha=0.3)
+
+ ax_bandwidth.plot(
+ total_tokens_values,
+ cuda_bandwidth_percentages,
+ "ro-",
+ linewidth=3,
+ markersize=8,
+ label="CUDA",
+ )
+ ax_bandwidth.plot(
+ total_tokens_values,
+ triton_bandwidth_percentages,
+ "go-",
+ linewidth=3,
+ markersize=8,
+ label="Triton",
+ )
+ ax_bandwidth.set_title(
+ f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
+ fontsize=12,
+ fontweight="bold",
+ )
+ ax_bandwidth.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
+ ax_bandwidth.set_ylabel(
+ "% of Peak Bandwidth", fontweight="bold", fontsize=11
+ )
+ ax_bandwidth.legend(prop={"weight": "bold"})
+ ax_bandwidth.grid(True, alpha=0.3)
+
+ # Format x-axis labels for both plots
+ for ax in [ax_speedup, ax_bandwidth]:
+ ax.set_xticks(total_tokens_values)
+ ax.set_xticklabels(
+ [
+ f"{tt // 1000}K" if tt >= 1000 else str(tt)
+ for tt in total_tokens_values
+ ],
+ fontweight="bold",
+ )
+ # Make tick labels bold
+ for label in ax.get_xticklabels() + ax.get_yticklabels():
+ label.set_fontweight("bold")
+
+ # Add value labels on speedup points
+ for x, y in zip(total_tokens_values, ratios):
+ ax_speedup.annotate(
+ f"{y:.2f}x",
+ (x, y),
+ textcoords="offset points",
+ xytext=(0, 12),
+ ha="center",
+ fontsize=10,
+ fontweight="bold",
+ bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
+ )
+
+ # Add value labels on CUDA bandwidth points
+ for x, y in zip(total_tokens_values, cuda_bandwidth_percentages):
+ ax_bandwidth.annotate(
+ f"{y:.1f}%",
+ (x, y),
+ textcoords="offset points",
+ xytext=(0, 12),
+ ha="center",
+ fontsize=9,
+ fontweight="bold",
+ bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3),
+ )
+
+ # Add value labels on Triton bandwidth points
+ for x, y in zip(total_tokens_values, triton_bandwidth_percentages):
+ ax_bandwidth.annotate(
+ f"{y:.1f}%",
+ (x, y),
+ textcoords="offset points",
+ xytext=(0, -15),
+ ha="center",
+ fontsize=9,
+ fontweight="bold",
+ bbox=dict(boxstyle="round,pad=0.2", facecolor="green", alpha=0.3),
+ )
+
+ plt.tight_layout()
+ plt.subplots_adjust(top=0.93) # Make room for main title
+ filename = "silu_benchmark_total_tokens.png"
+ plt.savefig(filename, dpi=300, bbox_inches="tight")
+ plt.show()
+
+ return filename
+
+
+# Create combined plot with all strategies
+combined_plot_filename = create_total_tokens_plot(all_results)
+
+print(f"\n{'=' * 60}")
+print("Benchmark Complete!")
+print(f"Generated combined plot: {combined_plot_filename}")
+print(f"{'=' * 60}")
diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
index 603ce5ecf0d2c..6ddab46214577 100644
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@@ -259,6 +259,7 @@ if __name__ == "__main__":
# (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
(None, None, None),
(None, FP8_DTYPE, None),
+ (FP8_DTYPE, FP8_DTYPE, None),
(FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
(FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
]
diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
index 40903c6c3444f..131df74c7de1b 100644
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@@ -274,6 +274,7 @@ if __name__ == "__main__":
quant_dtypes = [
# (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
(None, None, None),
+ (FP8_DTYPE, FP8_DTYPE, None),
(FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
(FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
]
diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
index 98bde9d83c82d..c6c8e0b0b936b 100644
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -11,13 +11,13 @@ from datetime import datetime
from typing import Any
import torch
-import triton
from tqdm import tqdm
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_w8a8_block_fp8_matmul,
)
from vllm.platforms import current_platform
+from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser
mp.set_start_method("spawn", force=True)
@@ -56,7 +56,7 @@ def w8a8_block_matmul(
Bs: The per-block quantization scale for `B`.
block_size: The block size for per-block quantization.
It should be 2-dim, e.g., [128, 128].
- output_dytpe: The dtype of the returned tensor.
+ output_dtype: The dtype of the returned tensor.
Returns:
torch.Tensor: The result of matmul.
diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md
index 7adf97bcf5622..f5b5c6c97d484 100644
--- a/benchmarks/multi_turn/README.md
+++ b/benchmarks/multi_turn/README.md
@@ -55,6 +55,107 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
----------------------------------------------------------------------------------------------------
```
+### JSON configuration file for synthetic conversations generation
+
+The `--input-file` flag determines the input conversations for the benchmark.
+When the input is a JSON file with the field `"filetype": "generate_conversations"`, the tool generates synthetic multi-turn conversations (questions and answers).
+
+The file `generate_multi_turn.json` is an example of such a file.
+
+The file must contain the sections `prompt_input` and `prompt_output`.
+
+The `prompt_input` section must contain `num_turns`, `prefix_num_tokens` and `num_tokens`:
+
+* `num_turns` - Number of total turns in the conversation (both user & assistant).
+The final value will always be rounded to an even number so each user turn has a reply.
+* `prefix_num_tokens` - Tokens added at the start of only the **first user turn** in a conversation (unique per conversation).
+* `num_tokens` - Total token length of each **user** message (one turn).
+
+The `prompt_output` section must contain `num_tokens`:
+
+* `num_tokens` - Total token length of each **assistant** message (one turn).
+
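+Putting these together, a minimal sketch of such a file might look like the example below.
+The distribution choices and values are illustrative only; the shipped `generate_multi_turn.json`
+is the authoritative example and may contain additional fields.
+
+```json
+{
+    "filetype": "generate_conversations",
+    "prompt_input": {
+        "num_turns": {"distribution": "uniform", "min": 4, "max": 8},
+        "prefix_num_tokens": {"distribution": "lognormal", "average": 1000, "max": 5000},
+        "num_tokens": {"distribution": "uniform", "min": 120, "max": 160}
+    },
+    "prompt_output": {
+        "num_tokens": {"distribution": "constant", "value": 80}
+    }
+}
+```
+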
+### Random distributions for synthetic conversations generation
+
+When creating an input JSON file (such as `generate_multi_turn.json`),
+every numeric field (such as `num_turns` or `num_tokens`) requires a distribution.
+The distribution determines how to randomly sample values for the field.
+
+The available distributions are listed below.
+
+**Note:** The optional `max` field (for lognormal, zipf, and poisson) caps sampled values at an upper bound.
+It can be used to make sure that the total number of tokens in every request does not exceed `--max-model-len`.
+
+#### constant
+
+```json
+{
+ "distribution": "constant",
+ "value": 500
+}
+```
+
+* `value` - the fixed integer value (always returns the same number).
+
+#### uniform
+
+```json
+{
+ "distribution": "uniform",
+ "min": 12,
+ "max": 18
+}
+```
+
+* `min` - minimum value (inclusive).
+* `max` - maximum value (inclusive); must be greater than or equal to `min`.
+
+#### lognormal
+
+```json
+{
+ "distribution": "lognormal",
+ "average": 1000,
+ "max": 5000
+}
+```
+
+You can parameterize the lognormal distribution in one of two ways:
+
+Using the average and optional median ratio:
+
+* `average` - target average value of the distribution.
+* `median_ratio` - the ratio of the median to the average; controls the skewness. Must be in the range (0, 1).
+
+Using the parameters of the underlying normal distribution:
+
+* `mean` - mean of the underlying normal distribution.
+* `sigma` - standard deviation of the underlying normal distribution.
+
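+If you prefer to control the underlying normal distribution directly, provide `mean` and `sigma`
+instead of `average`. The values below are purely illustrative:
+
+```json
+{
+    "distribution": "lognormal",
+    "mean": 6.9,
+    "sigma": 0.5,
+    "max": 5000
+}
+```
+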
+#### zipf
+
+```json
+{
+ "distribution": "zipf",
+ "alpha": 1.2,
+ "max": 100
+}
+```
+
+* `alpha` - skew parameter (> 1). Larger values produce stronger skew toward smaller integers.
+
+#### poisson
+
+```json
+{
+ "distribution": "poisson",
+ "alpha": 10,
+ "max": 50
+}
+```
+
+* `alpha` - expected value (λ). Also the variance of the distribution.
+
## ShareGPT Conversations
To run with the ShareGPT data, download the following ShareGPT dataset:
diff --git a/benchmarks/multi_turn/bench_dataset.py b/benchmarks/multi_turn/bench_dataset.py
index 411b89dd23dc6..67b937930d58c 100644
--- a/benchmarks/multi_turn/bench_dataset.py
+++ b/benchmarks/multi_turn/bench_dataset.py
@@ -99,21 +99,105 @@ class PoissonDistribution(Distribution):
class LognormalDistribution(Distribution):
def __init__(
- self, mean: float, sigma: float, max_val: Optional[int] = None
+ self,
+ mean: Optional[float] = None,
+ sigma: Optional[float] = None,
+ average: Optional[int] = None,
+ median_ratio: Optional[float] = None,
+ max_val: Optional[int] = None,
) -> None:
+ self.average = average
+ self.median_ratio = median_ratio
+ self.max_val = max_val
+
+ if average is not None:
+ if average < 1:
+ raise ValueError("Lognormal average must be positive")
+
+ if mean or sigma:
+ raise ValueError(
+ "When using lognormal average, you can't provide mean/sigma"
+ )
+
+ if self.median_ratio is None:
+ # Default value that provides relatively wide range of values
+ self.median_ratio = 0.85
+
+ # Calculate mean/sigma of np.random.lognormal based on the average
+ mean, sigma = self._generate_lognormal_by_median(
+ target_average=self.average, median_ratio=self.median_ratio
+ )
+ else:
+ if mean is None or sigma is None:
+ raise ValueError(
+ "Must provide both mean and sigma if average is not used"
+ )
+
+ if mean <= 0 or sigma < 0:
+ raise ValueError(
+ "Lognormal mean must be positive and sigma must be non-negative"
+ )
+
+ # Mean and standard deviation of the underlying normal distribution
+ # Based on numpy.random.lognormal
self.mean = mean
self.sigma = sigma
- self.max_val = max_val
+
+ @staticmethod
+ def _generate_lognormal_by_median(
+ target_average: int, median_ratio: float
+ ) -> tuple[float, float]:
+ """
+ Compute (mu, sigma) for a lognormal distribution given:
+ - a target average (mean of the distribution)
+ - a ratio of median / mean (controls skewness), assume mean > median
+
+ Background:
+ If Z ~ Normal(mu, sigma^2), then X = exp(Z) ~ LogNormal(mu, sigma).
+ * mean(X) = exp(mu + sigma^2 / 2)
+ * median(X) = exp(mu)
+
+ So:
+ median / mean = exp(mu) / exp(mu + sigma^2 / 2)
+ = exp(-sigma^2 / 2)
+
+ Rearranging:
+ sigma^2 = 2 * ln(mean / median)
+ mu = ln(median)
+
+ This gives a unique (mu, sigma) for any valid mean and median.
+ """
+ # Check input validity: median must be smaller than mean
+ if median_ratio <= 0 or median_ratio >= 1:
+ raise ValueError("median_ratio must be in range (0, 1)")
+
+ target_median = target_average * median_ratio
+
+ # Solve sigma^2 = 2 * ln(mean / median)
+ sigma = np.sqrt(2 * np.log(target_average / target_median))
+ mu = np.log(target_median)
+
+ return mu, sigma
def sample(self, size: int = 1) -> np.ndarray:
samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size)
+
+ if self.average is not None:
+ # Scale to average
+ samples *= self.average / samples.mean()
+
if self.max_val:
samples = np.minimum(samples, self.max_val)
return np.round(samples).astype(int)
def __repr__(self) -> str:
- return f"LognormalDistribution[{self.mean}, {self.sigma}]"
+ if self.average:
+ return (
+ f"LognormalDistribution[{self.average}, "
+ f"{self.median_ratio}, {self.max_val}]"
+ )
+ return f"LognormalDistribution[{self.mean}, {self.sigma}, {self.max_val}]"
class GenConvArgs(NamedTuple):
@@ -173,10 +257,21 @@ def get_random_distribution(
return PoissonDistribution(conf["alpha"], max_val=max_val)
elif distribution == "lognormal":
+ max_val = conf.get("max", None)
+
+ if "average" in conf:
+ # Infer lognormal mean/sigma (numpy) from input average
+ median_ratio = conf.get("median_ratio", None)
+ return LognormalDistribution(
+ average=conf["average"], median_ratio=median_ratio, max_val=max_val
+ )
+
+ # Use mean/sigma directly (for full control over the distribution)
verify_field_exists(conf, "mean", section, subsection)
verify_field_exists(conf, "sigma", section, subsection)
- max_val = conf.get("max", None)
- return LognormalDistribution(conf["mean"], conf["sigma"], max_val=max_val)
+ return LognormalDistribution(
+ mean=conf["mean"], sigma=conf["sigma"], max_val=max_val
+ )
elif distribution == "uniform":
verify_field_exists(conf, "min", section, subsection)
diff --git a/benchmarks/multi_turn/generate_multi_turn.json b/benchmarks/multi_turn/generate_multi_turn.json
index 274d03c2bdb2b..03cfc7d63e8aa 100644
--- a/benchmarks/multi_turn/generate_multi_turn.json
+++ b/benchmarks/multi_turn/generate_multi_turn.json
@@ -15,9 +15,8 @@
},
"prefix_num_tokens": {
"distribution": "lognormal",
- "mean": 6,
- "sigma": 4,
- "max": 1500
+ "average": 1000,
+ "max": 5000
},
"num_tokens": {
"distribution": "uniform",
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 9c0ed1d09572e..8558976e2c392 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -480,7 +480,6 @@ function (define_gpu_extension_target GPU_MOD_NAME)
${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
endif()
- set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
target_compile_options(${GPU_MOD_NAME} PRIVATE
$<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
diff --git a/csrc/attention/mla/cutlass_mla_entry.cu b/csrc/attention/mla/cutlass_mla_entry.cu
deleted file mode 100644
index 0319d1daf302f..0000000000000
--- a/csrc/attention/mla/cutlass_mla_entry.cu
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-
-#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
-void cutlass_mla_decode_sm100a(torch::Tensor const& out,
- torch::Tensor const& q_nope,
- torch::Tensor const& q_pe,
- torch::Tensor const& kv_c_and_k_pe_cache,
- torch::Tensor const& seq_lens,
- torch::Tensor const& page_table, double scale);
-#endif
-
-void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
- torch::Tensor const& q_pe,
- torch::Tensor const& kv_c_and_k_pe_cache,
- torch::Tensor const& seq_lens,
- torch::Tensor const& page_table, double scale) {
-#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
- return cutlass_mla_decode_sm100a(out, q_nope, q_pe, kv_c_and_k_pe_cache,
- seq_lens, page_table, scale);
-#endif
- TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled cutlass MLA");
-}
diff --git a/csrc/attention/mla/cutlass_mla_kernels.cu b/csrc/attention/mla/cutlass_mla_kernels.cu
deleted file mode 100644
index 9d05d910dd81f..0000000000000
--- a/csrc/attention/mla/cutlass_mla_kernels.cu
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-
-#include
-#include
-
-#include "cute/tensor.hpp"
-
-#include "cutlass/cutlass.h"
-#include "cutlass/kernel_hardware_info.h"
-
-#include "cutlass_extensions/common.hpp"
-
-#include "device/sm100_mla.hpp"
-#include "kernel/sm100_mla_tile_scheduler.hpp"
-
-using namespace cute;
-using namespace cutlass::fmha::kernel;
-
-template
-struct MlaSm100 {
- using Element = T;
- using ElementAcc = float;
- using ElementOut = T;
-
- using TileShape = Shape<_128, _128, Shape<_512, _64>>;
- using TileShapeH = cute::tuple_element_t<0, TileShape>;
- using TileShapeD = cute::tuple_element_t<2, TileShape>;
-
- // H K (D_latent D_rope) B
- using ProblemShape = cute::tuple;
-
- using StrideQ = cute::tuple; // H D B
- using StrideK = cute::tuple; // K D B
- using StrideO = StrideK; // H D B
- using StrideLSE = cute::tuple<_1, int>; // H B
-
- using TileScheduler =
- std::conditional_t;
-
- using FmhaKernel =
- cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
- TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler,
- /*kIsCpAsync=*/true>;
- using Fmha = cutlass::fmha::device::MLA;
-};
-
-template
-typename T::Fmha::Arguments args_from_options(
- at::Tensor const& out, at::Tensor const& q_nope, at::Tensor const& q_pe,
- at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens,
- at::Tensor const& page_table, double scale) {
- cutlass::KernelHardwareInfo hw_info;
- hw_info.device_id = q_nope.device().index();
- hw_info.sm_count =
- cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
- hw_info.device_id);
-
- int batches = q_nope.sizes()[0];
- int page_count_per_seq = page_table.sizes()[1];
- int page_count_total = kv_c_and_k_pe_cache.sizes()[0];
- int page_size = kv_c_and_k_pe_cache.sizes()[1];
- int max_seq_len = page_size * page_count_per_seq;
- using TileShapeH = typename T::TileShapeH;
- using TileShapeD = typename T::TileShapeD;
- auto problem_shape =
- cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);
-
- auto [H, K, D, B] = problem_shape;
- auto [D_latent, D_rope] = D;
-
- using StrideQ = typename T::StrideQ;
- using StrideK = typename T::StrideK;
- using StrideO = typename T::StrideO;
- using StrideLSE = typename T::StrideLSE;
-
- StrideQ stride_Q_latent = cute::make_tuple(
- static_cast(D_latent), _1{}, static_cast(H * D_latent));
- StrideQ stride_Q_rope = cute::make_tuple(static_cast(D_rope), _1{},
- static_cast(H * D_rope));
- StrideK stride_C =
- cute::make_tuple(static_cast(D_latent + D_rope), _1{},
- static_cast(page_size * (D_latent + D_rope)));
- StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
- StrideLSE stride_LSE = cute::make_tuple(_1{}, static_cast(H));
- StrideO stride_O = cute::make_tuple(static_cast(D_latent), _1{},
- static_cast(H * D_latent));
-
- using Element = typename T::Element;
- using ElementOut = typename T::ElementOut;
- using ElementAcc = typename T::ElementAcc;
- auto Q_latent_ptr = static_cast<Element*>(q_nope.data_ptr());
- auto Q_rope_ptr = static_cast<Element*>(q_pe.data_ptr());
- auto C_ptr = static_cast<Element*>(kv_c_and_k_pe_cache.data_ptr());
- auto scale_f = static_cast<float>(scale);
- typename T::Fmha::Arguments arguments{
- problem_shape,
- {scale_f, Q_latent_ptr, stride_Q_latent, Q_rope_ptr, stride_Q_rope, C_ptr,
- stride_C, C_ptr + D_latent, stride_C,
- static_cast<int*>(seq_lens.data_ptr()),
- static_cast<int*>(page_table.data_ptr()), stride_PT, page_count_total,
- page_size},
- {static_cast<ElementOut*>(out.data_ptr()), stride_O,
- static_cast<ElementAcc*>(nullptr), stride_LSE},
- hw_info,
- 1, // split_kv
- nullptr, // is_var_split_kv
- };
- // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
- // split_kv automatically based on batch size and sequence length to balance
- // workload across available SMs. Consider using var_split_kv for manual
- // control if needed.
- T::Fmha::set_split_kv(arguments);
- return arguments;
-}
-
-template <typename Element>
-void runMla(at::Tensor const& out, at::Tensor const& q_nope,
- at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache,
- at::Tensor const& seq_lens, at::Tensor const& page_table,
- float scale, cudaStream_t stream) {
- using MlaSm100Type = MlaSm100<Element>;
- typename MlaSm100Type::Fmha fmha;
- auto arguments = args_from_options<MlaSm100Type>(
- out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale);
- size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments);
- auto const workspace_options =
- torch::TensorOptions().dtype(torch::kUInt8).device(q_nope.device());
- auto workspace = torch::empty(workspace_size, workspace_options);
-
- CUTLASS_CHECK(fmha.can_implement(arguments));
-
- CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream));
-
- CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream));
-}
-
-void cutlass_mla_decode_sm100a(torch::Tensor const& out,
- torch::Tensor const& q_nope,
- torch::Tensor const& q_pe,
- torch::Tensor const& kv_c_and_k_pe_cache,
- torch::Tensor const& seq_lens,
- torch::Tensor const& page_table, double scale) {
- TORCH_CHECK(q_nope.device().is_cuda(), "q_nope must be on CUDA");
- TORCH_CHECK(q_nope.dim() == 3, "q_nope must be a 3D tensor");
- TORCH_CHECK(q_pe.dim() == 3, "q_pe must be a 3D tensor");
- TORCH_CHECK(kv_c_and_k_pe_cache.dim() == 3,
- "kv_c_and_k_pe_cache must be a 3D tensor");
- TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be a 1D tensor");
- TORCH_CHECK(page_table.dim() == 2, "page_table must be a 2D tensor");
- TORCH_CHECK(out.dim() == 3, "out must be a 3D tensor");
-
- auto B_q_nope = q_nope.size(0);
- auto H_q_nope = q_nope.size(1);
- auto D_q_nope = q_nope.size(2);
- auto B_q_pe = q_pe.size(0);
- auto H_q_pe = q_pe.size(1);
- auto D_q_pe = q_pe.size(2);
- auto B_pt = page_table.size(0);
- auto PAGE_NUM = page_table.size(1);
- auto PAGE_SIZE = kv_c_and_k_pe_cache.size(1);
- auto D_ckv = kv_c_and_k_pe_cache.size(2);
- auto B_o = out.size(0);
- auto H_o = out.size(1);
- auto D_o = out.size(2);
-
- TORCH_CHECK(D_q_nope == 512, "D_q_nope must be equal to 512");
- TORCH_CHECK(D_q_pe == 64, "D_q_pe must be equal to 64");
- TORCH_CHECK(D_ckv == 576, "D_ckv must be equal to 576");
- TORCH_CHECK(H_q_nope == H_q_pe && H_q_nope == H_o && H_o == 128,
- "H_q_nope, H_q_pe, and H_o must be equal to 128");
- TORCH_CHECK(PAGE_SIZE > 0 && (PAGE_SIZE & (PAGE_SIZE - 1)) == 0,
- "PAGE_SIZE must be a power of 2");
- TORCH_CHECK(
- B_q_nope == B_q_pe && B_q_nope == B_pt && B_q_nope == B_o,
- "Batch dims must be same for page_table, q_nope and q_pe, and out");
- TORCH_CHECK(PAGE_NUM % (128 / PAGE_SIZE) == 0,
- "PAGE_NUM must be divisible by 128 / PAGE_SIZE");
- TORCH_CHECK(D_o == 512, "D_o must be equal to 512");
-
- TORCH_CHECK(q_nope.dtype() == at::ScalarType::Half ||
- q_nope.dtype() == at::ScalarType::BFloat16 ||
- q_nope.dtype() == at::ScalarType::Float8_e4m3fn,
- "q_nope must be a half, bfloat16, or float8_e4m3fn tensor");
- TORCH_CHECK(kv_c_and_k_pe_cache.dtype() == q_nope.dtype() &&
- q_nope.dtype() == q_pe.dtype(),
- "kv_c_and_k_pe_cache, q_nope, and q_pe must be the same type");
- TORCH_CHECK(seq_lens.dtype() == torch::kInt32,
- "seq_lens must be a 32-bit integer tensor");
- TORCH_CHECK(page_table.dtype() == torch::kInt32,
- "page_table must be a 32-bit integer tensor");
-
- auto in_dtype = q_nope.dtype();
- const at::cuda::OptionalCUDAGuard device_guard(device_of(q_nope));
- const cudaStream_t stream =
- at::cuda::getCurrentCUDAStream(q_nope.get_device());
- if (in_dtype == at::ScalarType::Half) {
- runMla<cutlass::half_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens,
- page_table, scale, stream);
- } else if (in_dtype == at::ScalarType::BFloat16) {
- runMla<cutlass::bfloat16_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
- seq_lens, page_table, scale, stream);
- } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
- runMla<cutlass::float_e4m3_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
- seq_lens, page_table, scale, stream);
- } else {
- TORCH_CHECK(false, "Unsupported input data type of MLA");
- }
-}
diff --git a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
index 95e32559cd540..fbbc2e588c326 100644
--- a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
@@ -133,6 +133,14 @@ public:
// printf(" sm_count = %d\n", sm_count);
int max_splits = ceil_div(K, 128);
max_splits = min(16, max_splits);
+
+ // TODO: This avoids a hang when the batch size is larger than 1 and
+ // there are more than 4 kv_splits.
+ // Discuss with NVIDIA how this can be fixed.
+ if (B > 1) {
+ max_splits = min(2, max_splits);
+ }
+
// printf(" max_splits = %d\n", max_splits);
int sms_per_batch = max(1, sm_count / B);
// printf(" sms_per_batch = %d\n", sms_per_batch);
diff --git a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
index 820bf81dd1a02..d1874515cc8fd 100644
--- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
+++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
@@ -36,12 +36,14 @@ limitations under the License.
#if !defined(CUDA_VERSION) || CUDA_VERSION < 12040
void sm100_cutlass_mla_decode(
torch::Tensor const& out,
+ torch::Tensor const& lse,
torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
torch::Tensor const& seq_lens,
torch::Tensor const& page_table,
torch::Tensor const& workspace,
+ double sm_scale,
int64_t num_kv_splits) {
TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_decode");
}
@@ -99,6 +101,7 @@ struct MlaSm100 {
template <typename T>
typename T::Fmha::Arguments args_from_options(
at::Tensor const& out,
+ at::Tensor const& lse,
at::Tensor const& q_nope,
at::Tensor const& q_pe,
at::Tensor const& kv_c_and_k_pe_cache,
@@ -162,7 +165,10 @@ typename T::Fmha::Arguments args_from_options(
stride_PT,
page_count_total,
page_size},
- {static_cast<ElementOut*>(out.data_ptr()), stride_O, static_cast<ElementAcc*>(nullptr), stride_LSE},
+ {static_cast<ElementOut*>(out.data_ptr()),
+ stride_O,
+ static_cast<ElementAcc*>(lse.defined() ? lse.data_ptr() : nullptr),
+ stride_LSE},
hw_info,
// TODO(trevor-m): Change split_kv back to -1 when
// https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will
@@ -181,6 +187,7 @@ typename T::Fmha::Arguments args_from_options(
template <typename Element, typename PersistenceOption>
void runMla(
at::Tensor const& out,
+ at::Tensor const& lse,
at::Tensor const& q_nope,
at::Tensor const& q_pe,
at::Tensor const& kv_c_and_k_pe_cache,
@@ -192,7 +199,7 @@ void runMla(
cudaStream_t stream) {
using MlaSm100Type = MlaSm100<Element, PersistenceOption>;
typename MlaSm100Type::Fmha fmha;
- auto arguments = args_from_options<MlaSm100Type>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
+ auto arguments = args_from_options<MlaSm100Type>(out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
CUTLASS_CHECK(fmha.can_implement(arguments));
@@ -214,6 +221,7 @@ void runMla(
void sm100_cutlass_mla_decode(
torch::Tensor const& out,
+ torch::Tensor const& lse,
torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
@@ -234,13 +242,13 @@ void sm100_cutlass_mla_decode(
DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] {
if (in_dtype == at::ScalarType::Half) {
runMla<cutlass::half_t, IsPersistent<NotManualSplitKV>>(
- out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+ out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
} else if (in_dtype == at::ScalarType::BFloat16) {
runMla<cutlass::bfloat16_t, IsPersistent<NotManualSplitKV>>(
- out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+ out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
} else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
runMla<cutlass::float_e4m3_t, IsPersistent<NotManualSplitKV>>(
- out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+ out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
} else {
TORCH_CHECK(false, "Unsupported input data type of MLA");
}
diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp
index 17bbe04eef94a..c3a21796881c9 100644
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@@ -17,4 +17,8 @@
#warning "unsupported vLLM cpu implementation"
#endif
+#ifdef _OPENMP
+ #include <omp.h>
+#endif
+
#endif
\ No newline at end of file
diff --git a/csrc/cpu/cpu_types_vxe.hpp b/csrc/cpu/cpu_types_vxe.hpp
index ab8cbbbf4ec4f..51bca37e699b9 100644
--- a/csrc/cpu/cpu_types_vxe.hpp
+++ b/csrc/cpu/cpu_types_vxe.hpp
@@ -12,7 +12,7 @@ namespace vec_op {
#define vec_sub(a, b) ((a) - (b))
#define vec_mul(a, b) ((a) * (b))
#define vec_div(a, b) ((a) / (b))
-#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebaic
+#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebraic
#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left
// FIXME: FP16 is not fully supported in Torch-CPU
diff --git a/csrc/cpu/dnnl_kernels.cpp b/csrc/cpu/dnnl_kernels.cpp
index 9a3af4ac9d8a6..1c42a75bc2d61 100644
--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@@ -523,7 +523,7 @@ void onednn_mm(torch::Tensor& c, // [M, OC], row-major
CPU_KERNEL_GUARD_IN(onednn_mm)
TORCH_CHECK(a.dim() == 2);
TORCH_CHECK(a.stride(-1) == 1);
- TORCH_CHECK(c.is_contiguous());
+ TORCH_CHECK(c.stride(-1) == 1);
MatMulPrimitiveHandler* ptr =
reinterpret_cast<MatMulPrimitiveHandler*>(handler);
diff --git a/csrc/cpu/sgl-kernels/moe.cpp b/csrc/cpu/sgl-kernels/moe.cpp
index beeccff783ea0..94b24c2f13a06 100644
--- a/csrc/cpu/sgl-kernels/moe.cpp
+++ b/csrc/cpu/sgl-kernels/moe.cpp
@@ -215,7 +215,7 @@ int moe_align_block_size(
offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M);
}
});
- // TODO: do we need to vecterize this ?
+ // TODO: do we need to vectorize this ?
for (int mb = 0; mb < num_token_blocks; ++mb) {
offsets[mb + 1] += offsets[mb];
}
diff --git a/csrc/cub_helpers.h b/csrc/cub_helpers.h
new file mode 100644
index 0000000000000..470a63a22cab0
--- /dev/null
+++ b/csrc/cub_helpers.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#ifndef USE_ROCM
+ #include <cub/cub.cuh>
+ #if CUB_VERSION >= 200800
+ #include <cuda/functional>
+using CubAddOp = cuda::std::plus<>;
+using CubMaxOp = cuda::maximum<>;
+ #else // if CUB_VERSION < 200800
+using CubAddOp = cub::Sum;
+using CubMaxOp = cub::Max;
+ #endif // CUB_VERSION
+#else
+ #include <hipcub/hipcub.hpp>
+using CubAddOp = cub::Sum;
+using CubMaxOp = cub::Max;
+#endif // USE_ROCM
diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh
index 44709b4597765..58926f6429dd3 100644
--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@@ -15,6 +15,8 @@ typedef __hip_bfloat16 nv_bfloat16;
#include