diff --git a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
index 3ea0b7bb5cd66..4ef8b5c3709b3 100644
--- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
@@ -4,8 +4,8 @@ tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
- value: 0.233
+ value: 0.231
- name: "exact_match,flexible-extract"
- value: 0.236
+ value: 0.22
limit: 1000
num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index 96e57dfd06475..4ae23eff62f37 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -13,6 +13,7 @@ from pathlib import Path
import lm_eval
import numpy
+import pytest
import yaml
RTOL = 0.05
@@ -46,6 +47,10 @@ def test_lm_eval_correctness():
eval_config = yaml.safe_load(
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
+ if eval_config[
+ "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform": #noqa: E501
+ pytest.skip("FBGEMM is currently failing on main.")
+
# Launch eval requests.
results = launch_lm_eval(eval_config)
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 096a1c870c6ba..18f582b6e4c94 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -57,8 +57,6 @@ steps:
agents:
queue: tpu_queue_postmerge
commands:
- - "rm -f /var/log/syslog"
- - "rm -f /var/log/kern.log"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly"
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
@@ -84,7 +82,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index 955baa1ff8b3c..0680bae13ddbf 100755
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -101,16 +101,30 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_permute_cols.py"
fi
-#ignore certain Entrypoints tests
+#ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
- --ignore=entrypoints/openai/test_accuracy.py \
--ignore=entrypoints/openai/test_audio.py \
- --ignore=entrypoints/openai/test_encoder_decoder.py \
- --ignore=entrypoints/openai/test_embedding.py \
- --ignore=entrypoints/openai/test_oot_registration.py "}
+ --ignore=entrypoints/openai/test_chat.py \
+ --ignore=entrypoints/openai/test_shutdown.py \
+ --ignore=entrypoints/openai/test_completion.py \
+ --ignore=entrypoints/openai/test_sleep.py \
+ --ignore=entrypoints/openai/test_models.py \
+ --ignore=entrypoints/openai/test_prompt_validation.py "}
fi
+#ignore certain Entrypoints/llm tests
+if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
+ commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
+fi
+
+# --ignore=entrypoints/openai/test_encoder_decoder.py \
+# --ignore=entrypoints/openai/test_embedding.py \
+# --ignore=entrypoints/openai/test_oot_registration.py
+# --ignore=entrypoints/openai/test_accuracy.py \
+# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
+
+
PARALLEL_JOB_COUNT=8
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
index 55c374fcc33de..ad5ae6f415748 100644
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@@ -44,11 +44,11 @@ remove_docker_container() {
trap remove_docker_container EXIT
# Run the image
-docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
+docker run --rm -it --device=/dev/neuron0 --network bridge \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \
${image_name} \
- /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
+ /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh
deleted file mode 100755
index 8ba2e4e386fdb..0000000000000
--- a/.buildkite/run-tpu-test.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# Build the docker image.
-docker build -f Dockerfile.tpu -t vllm-tpu .
-
-# Set up cleanup.
-remove_docker_container() { docker rm -f tpu-test || true; }
-trap remove_docker_container EXIT
-# Remove the container that might not be cleaned up in the previous run.
-remove_docker_container
-
-# For HF_TOKEN.
-source /etc/environment
-# Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it \
- -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
- vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
- && python3 -m pip install pytest \
- && python3 -m pip install lm_eval[api]==0.4.4 \
- && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
- && python3 /workspace/vllm/tests/tpu/test_compilation.py \
- && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
- && python3 /workspace/vllm/examples/offline_inference/tpu.py"
diff --git a/.buildkite/run-tpu-v1-test.sh b/.buildkite/run-tpu-v1-test.sh
new file mode 100755
index 0000000000000..82f40c650f8cf
--- /dev/null
+++ b/.buildkite/run-tpu-v1-test.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+set -e
+
+# Build the docker image.
+docker build -f Dockerfile.tpu -t vllm-tpu .
+
+# Set up cleanup.
+remove_docker_container() { docker rm -f tpu-test || true; }
+trap remove_docker_container EXIT
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# For HF_TOKEN.
+source /etc/environment
+# Run a simple end-to-end example.
+docker run --privileged --net host --shm-size=16G -it \
+ -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
+ vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
+ && python3 -m pip install pytest \
+ && python3 -m pip install lm_eval[api]==0.4.4 \
+ && echo TEST_1 \
+ && VLLM_USE_V1=1 python3 /workspace/vllm/tests/tpu/test_compilation.py \
+ && echo TEST_2 \
+ && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
+ && echo TEST_3 \
+ && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
+ && echo TEST_4 \
+ && VLLM_USE_V1=1 pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+ && echo TEST_5 \
+  && VLLM_USE_V1=1 python3 /workspace/vllm/examples/offline_inference/tpu.py"
+
+
+# TODO: This test fails because it uses RANDOM_SEED sampling
+# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+
diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh
index d48639e5720c5..3a0e6bdb2caaf 100644
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@@ -4,16 +4,28 @@
# It serves a sanity check for compilation and basic model usage.
set -ex
+image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
+container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
# Try building the docker image
-docker build -t xpu-test -f Dockerfile.xpu .
+docker build -t ${image_name} -f Dockerfile.xpu .
# Setup cleanup
-remove_docker_container() { docker rm -f xpu-test || true; }
+remove_docker_container() {
+ docker rm -f "${container_name}" || true;
+ docker image rm -f "${image_name}" || true;
+ docker system prune -f || true;
+}
trap remove_docker_container EXIT
-remove_docker_container
# Run the image and test offline inference/tensor parallel
-docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
- python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
- python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
+docker run \
+ --device /dev/dri \
+ -v /dev/dri/by-path:/dev/dri/by-path \
+ --entrypoint="" \
+ --name "${container_name}" \
+ "${image_name}" \
+ sh -c '
+ VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+ VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
'
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 2af76cb24dd14..730f272b54e7c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -41,7 +41,6 @@ steps:
- grep \"sig sig-object py\" build/html/api/inference_params.html
- label: Async Engine, Inputs, Utils, Worker Test # 24min
- fast_check: true
source_file_dependencies:
- vllm/
- tests/mq_llm_engine
@@ -118,15 +117,14 @@ steps:
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+ - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
- pytest -v -s entrypoints/test_chat_utils.py
- - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+ - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Distributed Tests (4 GPUs) # 10min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
- fast_check: true
source_file_dependencies:
- vllm/distributed/
- vllm/core/
@@ -138,7 +136,11 @@ steps:
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
commands:
- - VLLM_USE_V1=1 python3 ../examples/offline_inference/data_parallel.py
+ # test with tp=2 and external_dp=2
+ - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with internal dp
+ - python3 ../examples/offline_inference/data_parallel.py
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
@@ -152,7 +154,6 @@ steps:
- label: Metrics, Tracing Test # 10min
num_gpus: 2
- fast_check: true
source_file_dependencies:
- vllm/
- tests/metrics
@@ -200,16 +201,18 @@
- tests/v1
commands:
# split the test to avoid interference
- - VLLM_USE_V1=1 pytest -v -s v1/core
- - VLLM_USE_V1=1 pytest -v -s v1/engine
- - VLLM_USE_V1=1 pytest -v -s v1/sample
- - VLLM_USE_V1=1 pytest -v -s v1/worker
- - VLLM_USE_V1=1 pytest -v -s v1/structured_output
- - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
- - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
+ - pytest -v -s v1/core
+ - pytest -v -s v1/entrypoints
+ - pytest -v -s v1/engine
+ - pytest -v -s v1/sample
+ - pytest -v -s v1/sample
+ - pytest -v -s v1/worker
+ - pytest -v -s v1/structured_output
+ - pytest -v -s v1/test_stats.py
+ - pytest -v -s v1/test_utils.py
+ - pytest -v -s v1/test_oracle.py
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- - VLLM_USE_V1=1 pytest -v -s v1/e2e
+ - pytest -v -s v1/e2e
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -227,14 +231,17 @@ steps:
- python3 offline_inference/basic/chat.py
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- - python3 offline_inference/vision_language.py
- - python3 offline_inference/vision_language_multi_image.py
- - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+ - python3 offline_inference/audio_language.py --seed 0
+ - python3 offline_inference/vision_language.py --seed 0
+ - python3 offline_inference/vision_language_embedding.py --seed 0
+ - python3 offline_inference/vision_language_multi_image.py --seed 0
+ - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder.py
+ - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
+ - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 9min
mirror_hardwares: [amd]
@@ -284,7 +291,6 @@ steps:
parallelism: 4
- label: PyTorch Fullgraph Smoke Test # 9min
- fast_check: true
source_file_dependencies:
- vllm/
- tests/compile
@@ -293,6 +299,7 @@ steps:
# these tests need to be separated, cannot combine
- pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py
+ - pytest -v -s compile/test_pass_manager.py
- label: PyTorch Fullgraph Test # 18min
source_file_dependencies:
@@ -379,7 +386,8 @@ steps:
commands:
- pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py
- - pytest -v -s models/test_initialization.py
+ # V1 Test: https://github.com/vllm-project/vllm/issues/14531
+ - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
- label: Language Models Test (Standard) # 32min
#mirror_hardwares: [amd]
@@ -508,8 +516,6 @@ steps:
- entrypoints/llm/test_collective_rpc.py
commands:
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- - VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
- - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -522,13 +528,12 @@ steps:
# this test fails consistently.
# TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+ - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+ - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
- label: Plugin Tests (2 GPUs) # 40min
working_dir: "/vllm-workspace/tests"
num_gpus: 2
- fast_check: true
source_file_dependencies:
- vllm/plugins/
- tests/plugins/
diff --git a/.github/ISSUE_TEMPLATE/800-misc-discussion.yml b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
deleted file mode 100644
index 79e6e9080d51c..0000000000000
--- a/.github/ISSUE_TEMPLATE/800-misc-discussion.yml
+++ /dev/null
@@ -1,28 +0,0 @@
-name: 🎲 Misc/random discussions that do not fit into the above categories.
-description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
-title: "[Misc]: "
-labels: ["misc"]
-
-body:
-- type: markdown
- attributes:
- value: >
- #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
-- type: textarea
- attributes:
- label: Anything you want to discuss about vllm.
- description: >
- Anything you want to discuss about vllm.
- validations:
- required: true
-- type: markdown
- attributes:
- value: >
- Thanks for contributing 🎉!
-- type: checkboxes
- id: askllm
- attributes:
- label: Before submitting a new issue...
- options:
- - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
- required: true
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
index 3ba13e0cec6cb..fa40268d67727 100644
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1 +1,5 @@
blank_issues_enabled: false
+contact_links:
+ - name: Questions
+ url: https://discuss.vllm.ai
+ about: Ask questions and discuss with other vLLM community members
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 074ac9d122bfe..484cd171f5f52 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -53,7 +53,7 @@ repos:
entry: tools/mypy.sh 0 "local"
language: python
types: [python]
- additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
+ additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
stages: [pre-commit] # Don't run in CI
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5baa39b6f9e59..65d1ddbeee0b2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,8 +46,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
-set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
#
# Try to find python package with an executable that exactly matches
@@ -319,7 +319,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Only build AllSpark kernels if we are building for at least some compatible archs.
cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND ALLSPARK_ARCHS)
+ if (ALLSPARK_ARCHS)
set(ALLSPARK_SRCS
"csrc/quantization/gptq_allspark/allspark_repack.cu"
"csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
@@ -330,7 +330,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
else()
message(STATUS "Not building AllSpark kernels as no compatible archs found"
- " in CUDA target architectures, or CUDA not >= 12.0")
+ " in CUDA target architectures")
endif()
diff --git a/Dockerfile b/Dockerfile
index ff4a0839f6e0f..79bca1cf9f8c1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -222,7 +222,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
- uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \
+ uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
fi
COPY examples examples
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index e2d9ab37533e4..f852f3d69759f 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -61,6 +61,7 @@ RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
cd /install \
&& pip install -U -r requirements/rocm.txt \
+ && pip install -U -r requirements/rocm-test.txt \
&& pip uninstall -y vllm \
&& pip install *.whl
diff --git a/Dockerfile.xpu b/Dockerfile.xpu
index 672a494eef99d..ad4abf16b43b6 100644
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@@ -1,11 +1,7 @@
-FROM intel/deep-learning-essentials:2025.0.1-0-devel-ubuntu22.04 AS vllm-base
+# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually.
+FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base
-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
- echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
- chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
- wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
- echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
- chmod 644 /usr/share/keyrings/intel-graphics.gpg
+RUN rm /etc/apt/sources.list.d/intel-graphics.list
RUN apt-get update -y && \
apt-get install -y --no-install-recommends --fix-missing \
@@ -21,8 +17,6 @@ RUN apt-get update -y && \
python3 \
python3-dev \
python3-pip \
- libze-intel-gpu-dev \
- libze-intel-gpu1 \
wget
WORKDIR /workspace/vllm
diff --git a/README.md b/README.md
index 405e3a257f768..573b667ca88e3 100644
--- a/README.md
+++ b/README.md
@@ -10,21 +10,21 @@ Easy, fast, and cheap LLM serving for everyone
-| Documentation | Blog | Paper | Twitter/X | Developer Slack |
+| Documentation | Blog | Paper | Twitter/X | User Forum | Developer Slack |
---
-We're excited to invite you to the first **vLLM China Meetup** on **March 16** in **Beijing**!
+[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
-Join us to connect with the **vLLM team** and explore how vLLM is leveraged in **post-training, fine-tuning, and deployment**, including [verl](https://github.com/volcengine/verl), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), and [vllm-ascend](https://github.com/vllm-project/vllm-ascend).
-
-👉 **[Register Now](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)** to be part of the discussion!
+[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
---
*Latest News* 🔥
+- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
+- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
@@ -151,10 +151,11 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Contact Us
-- For technical questions and feature requests, please use GitHub issues or discussions.
-- For discussing with fellow users and coordinating contributions and development, please use Slack.
-- For security disclosures, please use GitHub's security advisory feature.
-- For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
+- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
+- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
+- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
+- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
+- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
## Media Kit
diff --git a/benchmarks/README.md b/benchmarks/README.md
index edc10d8b43eeb..d41de1caa04c0 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -42,21 +42,27 @@ become available.
| HuggingFace |
- ✅
|
- 🚧 |
+ 🟡 |
+ 🟡 |
Specify your dataset path on HuggingFace |
| VisionArena |
✅
|
- 🚧 |
+ ✅
|
lmarena-ai/vision-arena-bench-v0.1 (a HuggingFace dataset) |
-✅
: supported
+
+✅
: supported
+
🚧: to be supported
+🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
+similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
+If you need support for other dataset formats, please consider contributing.
+
**Note**: VisionArena's `dataset-name` should be set to `hf`
---
@@ -76,10 +82,10 @@ Then run the benchmarking script
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10
-BACKEND="openai-chat"
+BACKEND="vllm"
DATASET_NAME="sharegpt"
DATASET_PATH="/ShareGPT_V3_unfiltered_cleaned_split.json"
-python3 benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/chat/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
+python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
```
If successful, you will see the following output
@@ -123,7 +129,7 @@ DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT='train'
-python3 benchmarks/benchmark_serving.py \
+python3 vllm/benchmarks/benchmark_serving.py \
--backend "${BACKEND}" \
--model "${MODEL_NAME}" \
--endpoint "/v1/chat/completions" \
@@ -133,6 +139,57 @@ python3 benchmarks/benchmark_serving.py \
--num-prompts "${NUM_PROMPTS}"
```
+### HuggingFaceDataset Examples
+
+Currently, HuggingFaceDataset only supports dataset formats
+similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
+formats, please consider contributing.
+
+```bash
+# need a model with vision capability here
+vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
+```
+
+**`lmms-lab/LLaVA-OneVision-Data`**
+
+```bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+BACKEND="openai-chat"
+DATASET_NAME="hf"
+DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
+DATASET_SPLIT='train'
+DATASET_SUBSET='chart2text(cauldron)'
+python3 vllm/benchmarks/benchmark_serving.py \
+ --backend "${BACKEND}" \
+ --model "${MODEL_NAME}" \
+ --endpoint "/v1/chat/completions" \
+ --dataset-name "${DATASET_NAME}" \
+ --dataset-path "${DATASET_PATH}" \
+ --hf-split "${DATASET_SPLIT}" \
+ --num-prompts "${NUM_PROMPTS}" \
+ --hf-subset "${DATASET_SUBSET}"
+```
+
+**`Aeala/ShareGPT_Vicuna_unfiltered`**
+
+```bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+BACKEND="openai-chat"
+DATASET_NAME="hf"
+DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
+DATASET_SPLIT='train'
+python3 vllm/benchmarks/benchmark_serving.py \
+ --backend "${BACKEND}" \
+ --model "${MODEL_NAME}" \
+ --endpoint "/v1/chat/completions" \
+ --dataset-name "${DATASET_NAME}" \
+ --dataset-path "${DATASET_PATH}" \
+ --hf-split "${DATASET_SPLIT}" \
+ --num-prompts "${NUM_PROMPTS}" \
+```
+
---
## Example - Offline Throughput Benchmark
@@ -140,35 +197,65 @@ python3 benchmarks/benchmark_serving.py \
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10
DATASET_NAME="sonnet"
-DATASET_PATH="benchmarks/sonnet.txt"
+DATASET_PATH="vllm/benchmarks/sonnet.txt"
-python3 benchmarks/benchmark_throughput.py \
+python3 vllm/benchmarks/benchmark_throughput.py \
--model "${MODEL_NAME}" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--num-prompts "${NUM_PROMPTS}"
- ```
+```
If successful, you will see the following output
```
-Throughput: 7.35 requests/s, 4789.20 total tokens/s, 1102.83 output tokens/s
+Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
+Total num prompt tokens: 5014
+Total num output tokens: 1500
+```
+
+### VisionArena Benchmark for Vision Language Models
+
+``` bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+DATASET_NAME="hf"
+DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
+DATASET_SPLIT="train"
+
+python3 vllm/benchmarks/benchmark_throughput.py \
+ --model "${MODEL_NAME}" \
+ --backend "vllm-chat" \
+ --dataset-name "${DATASET_NAME}" \
+ --dataset-path "${DATASET_PATH}" \
+ --num-prompts "${NUM_PROMPTS}" \
+ --hf-split "${DATASET_SPLIT}"
+```
+
+The `num prompt tokens` now includes image token counts
+
+```
+Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
+Total num prompt tokens: 14527
+Total num output tokens: 1280
```
### Benchmark with LoRA Adapters
``` bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="meta-llama/Llama-2-7b-hf"
BACKEND="vllm"
DATASET_NAME="sharegpt"
-DATASET_PATH="/home/jovyan/data/vllm_benchmark_datasets/ShareGPT_V3_unfiltered_cleaned_split.json"
+DATASET_PATH="/ShareGPT_V3_unfiltered_cleaned_split.json"
NUM_PROMPTS=10
MAX_LORAS=2
MAX_LORA_RANK=8
ENABLE_LORA="--enable-lora"
LORA_PATH="yard1/llama-2-7b-sql-lora-test"
-python3 benchmarks/benchmark_throughput.py \
+python3 vllm/benchmarks/benchmark_throughput.py \
--model "${MODEL_NAME}" \
--backend "${BACKEND}" \
--dataset_path "${DATASET_PATH}" \
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index d53428d219e7a..0f13c79ae234b 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -14,7 +14,8 @@ from tqdm.asyncio import tqdm
from transformers import (AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast)
-from vllm.model_executor.model_loader.weight_utils import get_lock
+# NOTE(simon): do not import vLLM here so the benchmark script
+# can run without vLLM installed.
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@@ -62,7 +63,7 @@ async def async_request_tgi(
"temperature": 0.01, # TGI does not accept 0.0 temperature.
"top_p": 0.99, # TGI does not accept 1.0 top_p.
"truncate": request_func_input.prompt_len,
- # TGI does not accept ignore_eos flag.
+ "ignore_eos_token": request_func_input.ignore_eos,
}
payload = {
"inputs": request_func_input.prompt,
@@ -70,6 +71,10 @@ async def async_request_tgi(
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
+ if request_func_input.ignore_eos:
+ output.output_tokens = request_func_input.output_len
+ else:
+ output.output_tokens = None
ttft = 0.0
st = time.perf_counter()
@@ -333,7 +338,7 @@ async def async_request_openai_chat_completions(
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(
- "chat/completions"
+ ("chat/completions", "profile")
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
async with aiohttp.ClientSession(trust_env=True,
@@ -427,6 +432,8 @@ def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
+ from vllm.model_executor.model_loader.weight_utils import get_lock
+
# Use file lock to prevent multiple processes from
# downloading the same model weights at the same time.
with get_lock(pretrained_model_name_or_path):
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 30fffdda491d0..0567875f9862f 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -17,6 +17,7 @@ SampleRequest instances, similar to the approach used in ShareGPT.
import base64
import io
import json
+import logging
import random
from abc import ABC, abstractmethod
from collections.abc import Mapping
@@ -35,6 +36,8 @@ from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
+logger = logging.getLogger(__name__)
+
# -----------------------------------------------------------------------------
# Data Classes
# -----------------------------------------------------------------------------
@@ -46,7 +49,7 @@ class SampleRequest:
Represents a single inference request for benchmarking.
"""
- prompt: str
+ prompt: Union[str, Any]
prompt_len: int
expected_output_len: int
multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
@@ -61,9 +64,6 @@ class SampleRequest:
class BenchmarkDataset(ABC):
DEFAULT_SEED = 0
- # num_requests has default 1000 in both the benchmark_serving.py and
- # benchmark_throughput.py
-
def __init__(
self,
dataset_path: Optional[str] = None,
@@ -84,13 +84,27 @@ class BenchmarkDataset(ABC):
if random_seed is not None else self.DEFAULT_SEED)
self.data = None
+ def apply_multimodal_chat_transformation(
+ self,
+ prompt: str,
+ mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
+ """
+ Transform a prompt and optional multimodal content into a chat format.
+ This method is used for chat models that expect a specific conversation
+ format.
+ """
+ content = [{"text": prompt, "type": "text"}]
+ if mm_content is not None:
+ content.append(mm_content)
+ return [{"role": "user", "content": content}]
+
def load_data(self) -> None:
"""
Load data from the dataset path into self.data.
-
+
This method must be overridden by subclasses since the method to load
data will vary depending on the dataset format and source.
-
+
Raises:
NotImplementedError: If a subclass does not implement this method.
"""
@@ -107,18 +121,18 @@ class BenchmarkDataset(ABC):
"""
Optionally select a random LoRA request and return its associated
tokenizer.
-
+
This method is used when LoRA parameters are provided. It randomly
selects a LoRA based on max_loras and retrieves a cached tokenizer for
that LoRA if available. Otherwise, it returns the base tokenizer.
-
+
Args:
tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
LoRA is selected. max_loras (Optional[int]): The maximum number of
LoRAs available. If None, LoRA is not used. lora_path
(Optional[str]): Path to the LoRA parameters on disk. If None, LoRA
is not used.
-
+
Returns:
tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first
element is a LoRARequest (or None if not applicable) and the second
@@ -146,21 +160,39 @@ class BenchmarkDataset(ABC):
num_requests: int) -> list[SampleRequest]:
"""
Abstract method to generate sample requests from the dataset.
-
+
Subclasses must override this method to implement dataset-specific logic
for generating a list of SampleRequest objects.
-
+
Args:
tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
for processing the dataset's text.
num_requests (int): The number of sample requests to generate.
-
+
Returns:
list[SampleRequest]: A list of sample requests generated from the
dataset.
"""
raise NotImplementedError("sample must be implemented in subclasses.")
+ def maybe_oversample_requests(self, requests: list[SampleRequest],
+ num_requests: int) -> None:
+ """
+ Oversamples the list of requests if its size is less than the desired
+ number.
+
+ Args:
+ requests (List[SampleRequest]): The current list of sampled
+ requests. num_requests (int): The target number of requests.
+ """
+ if len(requests) < num_requests:
+ random.seed(self.random_seed)
+ additional = random.choices(requests,
+ k=num_requests - len(requests))
+ requests.extend(additional)
+ logger.info("Oversampled requests to reach %d total samples.",
+ num_requests)
+
# -----------------------------------------------------------------------------
# Utility Functions and Global Caches
@@ -262,15 +294,16 @@ class RandomDataset(BenchmarkDataset):
) -> None:
super().__init__(**kwargs)
- def sample(self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- prefix_len: int = DEFAULT_PREFIX_LEN,
- range_ratio: float = DEFAULT_RANGE_RATIO,
- input_len: int = DEFAULT_INPUT_LEN,
- output_len: int = DEFAULT_OUTPUT_LEN,
- **kwargs) -> list[SampleRequest]:
-
+ def sample(
+ self,
+ tokenizer: PreTrainedTokenizerBase,
+ num_requests: int,
+ prefix_len: int = DEFAULT_PREFIX_LEN,
+ range_ratio: float = DEFAULT_RANGE_RATIO,
+ input_len: int = DEFAULT_INPUT_LEN,
+ output_len: int = DEFAULT_OUTPUT_LEN,
+ **kwargs,
+ ) -> list[SampleRequest]:
vocab_size = tokenizer.vocab_size
prefix_token_ids = (np.random.randint(
@@ -332,19 +365,24 @@ class ShareGPTDataset(BenchmarkDataset):
random.seed(self.random_seed)
random.shuffle(self.data)
- def sample(self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- lora_path: Optional[str] = None,
- max_loras: Optional[int] = None,
- output_len: Optional[int] = None,
- **kwargs) -> list:
+ def sample(
+ self,
+ tokenizer: PreTrainedTokenizerBase,
+ num_requests: int,
+ lora_path: Optional[str] = None,
+ max_loras: Optional[int] = None,
+ output_len: Optional[int] = None,
+ enable_multimodal_chat: bool = False,
+ **kwargs,
+ ) -> list:
samples: list = []
for entry in self.data:
if len(samples) >= num_requests:
break
- prompt, completion = entry["conversations"][0]["value"],\
- entry["conversations"][1]["value"]
+ prompt, completion = (
+ entry["conversations"][0]["value"],
+ entry["conversations"][1]["value"],
+ )
lora_request, tokenizer = self.get_random_lora_request(
tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
@@ -358,6 +396,9 @@ class ShareGPTDataset(BenchmarkDataset):
skip_min_output_len_check=output_len
is not None):
continue
+ if enable_multimodal_chat:
+ prompt = self.apply_multimodal_chat_transformation(
+ prompt, None)
samples.append(
SampleRequest(
prompt=prompt,
@@ -365,6 +406,7 @@ class ShareGPTDataset(BenchmarkDataset):
expected_output_len=new_output_len,
lora_request=lora_request,
))
+ self.maybe_oversample_requests(samples, num_requests)
return samples
@@ -397,19 +439,20 @@ class SonnetDataset(BenchmarkDataset):
with open(self.dataset_path, encoding="utf-8") as f:
self.data = f.readlines()
- def sample(self,
- tokenizer,
- num_requests: int,
- prefix_len: int = DEFAULT_PREFIX_LEN,
- input_len: int = DEFAULT_INPUT_LEN,
- output_len: int = DEFAULT_OUTPUT_LEN,
- return_prompt_formatted: bool = False,
- **kwargs) -> list:
+ def sample(
+ self,
+ tokenizer,
+ num_requests: int,
+ prefix_len: int = DEFAULT_PREFIX_LEN,
+ input_len: int = DEFAULT_INPUT_LEN,
+ output_len: int = DEFAULT_OUTPUT_LEN,
+ return_prompt_formatted: bool = False,
+ **kwargs,
+ ) -> list:
# Calculate average token length for a poem line.
tokenized_lines = [tokenizer(line).input_ids for line in self.data]
avg_len = sum(len(tokens)
- for tokens in \
- tokenized_lines) / len(tokenized_lines)
+ for tokens in tokenized_lines) / len(tokenized_lines)
# Build the base prompt.
base_prompt = "Pick as many lines as you can from these poem lines:\n"
@@ -488,12 +531,14 @@ class BurstGPTDataset(BenchmarkDataset):
# Convert the dataframe to a list of lists.
return data.values.tolist()
- def sample(self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- max_loras: Optional[int] = None,
- lora_path: Optional[str] = None,
- **kwargs) -> list[SampleRequest]:
+ def sample(
+ self,
+ tokenizer: PreTrainedTokenizerBase,
+ num_requests: int,
+ max_loras: Optional[int] = None,
+ lora_path: Optional[str] = None,
+ **kwargs,
+ ) -> list[SampleRequest]:
samples = []
data = self._sample_loaded_data(num_requests=num_requests)
for i in range(num_requests):
@@ -526,7 +571,6 @@ class HuggingFaceDataset(BenchmarkDataset):
Dataset class for processing a HuggingFace dataset with conversation data
and optional images.
"""
- DEFAULT_NUM_REQUESTS = 1000
def __init__(
self,
@@ -550,10 +594,13 @@ class HuggingFaceDataset(BenchmarkDataset):
split=self.dataset_split,
streaming=True,
)
-
- if "conversations" not in self.data.features:
- raise ValueError("HF Dataset must have a 'conversations' column.")
-
+ if self.data.features is None or "conversations" \
+ not in self.data.features:
+ raise ValueError(
+ "HuggingFaceDataset currently only supports datasets with "
+ "a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
+ "Please consider contributing if you would like to add "
+ "support for additional dataset formats.")
# Shuffle and filter examples with at least 2 conversations.
self.data = self.data.shuffle(seed=self.random_seed).filter(
lambda x: len(x["conversations"]) >= 2)
@@ -561,9 +608,8 @@ class HuggingFaceDataset(BenchmarkDataset):
def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
- lora_path: Optional[str] = None,
- max_loras: Optional[int] = None,
output_len: Optional[int] = None,
+ enable_multimodal_chat: bool = False,
**kwargs) -> list:
sampled_requests = []
dynamic_output = output_len is None
@@ -571,13 +617,9 @@ class HuggingFaceDataset(BenchmarkDataset):
for item in self.data:
if len(sampled_requests) >= num_requests:
break
-
conv = item["conversations"]
prompt, completion = conv[0]["value"], conv[1]["value"]
- lora_request, tokenizer = self.get_random_lora_request(
- tokenizer, lora_path=lora_path, max_loras=max_loras)
-
prompt_ids = tokenizer(prompt).input_ids
completion_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_ids)
@@ -587,17 +629,22 @@ class HuggingFaceDataset(BenchmarkDataset):
if dynamic_output and not is_valid_sequence(
prompt_len, completion_len):
continue
-
mm_content = process_image(
item["image"]) if "image" in item else None
+ if enable_multimodal_chat:
+ # Note: when chat is enabled the request prompt_len is no longer
+ # accurate and we will be using request output to count the
+ # actual prompt len and output len
+ prompt = self.apply_multimodal_chat_transformation(
+ prompt, mm_content)
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=mm_content,
- lora_request=lora_request,
))
+ self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
@@ -606,25 +653,19 @@ class HuggingFaceDataset(BenchmarkDataset):
# -----------------------------------------------------------------------------
-class VisionArenaDataset(BenchmarkDataset):
+class VisionArenaDataset(HuggingFaceDataset):
"""
Vision Arena Dataset.
"""
DEFAULT_OUTPUT_LEN = 128
- DEFAULT_NUM_REQUESTS = 1000
VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1"
def __init__(
self,
- dataset_split: str,
- dataset_subset: Optional[str] = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
- self.dataset_split = dataset_split
- self.dataset_subset = dataset_subset
-
if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
raise ValueError(f"Only support Vision Arena dataset.\
This data path {self.dataset_path} is not valid.")
@@ -642,12 +683,14 @@ class VisionArenaDataset(BenchmarkDataset):
)
self.data = dataset.shuffle(seed=self.random_seed)
- def sample(self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: int = DEFAULT_OUTPUT_LEN,
- **kwargs) -> list:
- # TODO (jenniferzhao): Add support for offline benchmark sampling
+ def sample(
+ self,
+ tokenizer: PreTrainedTokenizerBase,
+ num_requests: int,
+ output_len: Optional[int] = None,
+ enable_multimodal_chat: bool = False,
+ **kwargs,
+ ) -> list:
output_len = (output_len
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
sampled_requests = []
@@ -655,8 +698,14 @@ class VisionArenaDataset(BenchmarkDataset):
if len(sampled_requests) >= num_requests:
break
prompt = item["turns"][0][0]["content"]
- prompt_len = len(tokenizer(prompt).input_ids)
mm_content = process_image(item["images"][0])
+ prompt_len = len(tokenizer(prompt).input_ids)
+ if enable_multimodal_chat:
+ # Note: when chat is enabled the request prompt_len is no longer
+ # accurate and we will be using request output to count the
+ # actual prompt len
+ prompt = self.apply_multimodal_chat_transformation(
+ prompt, mm_content)
sampled_requests.append(
SampleRequest(
prompt=prompt,
@@ -664,4 +713,5 @@ class VisionArenaDataset(BenchmarkDataset):
expected_output_len=output_len,
multi_modal_data=mm_content,
))
+ self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 1dd01ca968678..47627126b6688 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -684,6 +684,15 @@ def main(args: argparse.Namespace):
"Invalid metadata format. Please use KEY=VALUE format."
)
+ if not args.save_detailed:
+ # Remove fields with too many data points
+ for field in [
+ "input_lens", "output_lens", "ttfts", "itls",
+ "generated_texts", "errors"
+ ]:
+ if field in result_json:
+ del result_json[field]
+
# Traffic
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
@@ -828,6 +837,12 @@ if __name__ == "__main__":
action="store_true",
help="Specify to save benchmark results to a json file",
)
    parser.add_argument(
        "--save-detailed",
        action="store_true",
        help="When saving the results, whether to include per request "
        "information such as response, error, ttfts, tpots, etc.",
    )
parser.add_argument(
"--metadata",
metavar="KEY=VALUE",
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 3a6e962c115c0..c79a93faff197 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -732,8 +732,11 @@ def main(args: argparse.Namespace):
api_url = f"http://{args.host}:{args.port}{args.endpoint}"
base_url = f"http://{args.host}:{args.port}"
- tokenizer = get_tokenizer(tokenizer_id,
- trust_remote_code=args.trust_remote_code)
+ tokenizer = get_tokenizer(
+ tokenizer_id,
+ trust_remote_code=args.trust_remote_code,
+ tokenizer_mode=args.tokenizer_mode,
+ )
if args.dataset == 'grammar':
args.structure_type = 'guided_grammar'
@@ -876,6 +879,13 @@ if __name__ == "__main__":
help=
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
)
    parser.add_argument(
        "--tokenizer-mode",
        type=str,
        default="auto",
        help=
        "Tokenizer mode: 'auto' uses the fast tokenizer if available; 'slow' always uses the slow tokenizer.",  # noqa: E501
    )
parser.add_argument(
"--num-prompts",
type=int,
@@ -989,11 +999,12 @@ if __name__ == "__main__":
type=float,
default=1.0,
help="Ratio of Structured Outputs requests")
- parser.add_argument("--structured-output-backend",
- type=str,
- choices=["outlines", "lm-format-enforcer", "xgrammar"],
- default="xgrammar",
- help="Backend to use for structured outputs")
+ parser.add_argument(
+ "--structured-output-backend",
+ type=str,
+ choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"],
+ default="xgrammar",
+ help="Backend to use for structured outputs")
args = parser.parse_args()
main(args)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 7e6556733b288..53869db478c51 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -11,8 +11,9 @@ from typing import Any, Optional, Union
import torch
import uvloop
-from benchmark_dataset import (BurstGPTDataset, RandomDataset, SampleRequest,
- ShareGPTDataset, SonnetDataset)
+from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
+ RandomDataset, SampleRequest, ShareGPTDataset,
+ SonnetDataset, VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
@@ -23,6 +24,7 @@ from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest
+from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
@@ -32,7 +34,7 @@ def run_vllm(
n: int,
engine_args: EngineArgs,
disable_detokenize: bool = False,
-) -> float:
+) -> tuple[float, Optional[list[RequestOutput]]]:
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
assert all(
@@ -66,12 +68,13 @@ def run_vllm(
use_beam_search = False
+ outputs = None
if not use_beam_search:
start = time.perf_counter()
- llm.generate(prompts,
- sampling_params,
- lora_request=lora_requests,
- use_tqdm=True)
+ outputs = llm.generate(prompts,
+ sampling_params,
+ lora_request=lora_requests,
+ use_tqdm=True)
end = time.perf_counter()
else:
assert lora_requests is None, "BeamSearch API does not support LoRA"
@@ -89,7 +92,46 @@ def run_vllm(
ignore_eos=True,
))
end = time.perf_counter()
- return end - start
+ return end - start, outputs
+
+
+def run_vllm_chat(
+ requests: list[SampleRequest],
+ n: int,
+ engine_args: EngineArgs,
+ disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
+ """
+ Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
+ multimodal models as it properly handles multimodal inputs and chat
+ formatting. For non-multimodal models, use run_vllm() instead.
+ """
+ from vllm import LLM, SamplingParams
+ llm = LLM(**dataclasses.asdict(engine_args))
+
+ assert all(
+ llm.llm_engine.model_config.max_model_len >= (
+ request.prompt_len + request.expected_output_len)
+ for request in requests), (
+ "Please ensure that max_model_len is greater than the sum of "
+ "prompt_len and expected_output_len for all requests.")
+
+ prompts = []
+ sampling_params: list[SamplingParams] = []
+ for request in requests:
+ prompts.append(request.prompt)
+ sampling_params.append(
+ SamplingParams(
+ n=n,
+ temperature=1.0,
+ top_p=1.0,
+ ignore_eos=True,
+ max_tokens=request.expected_output_len,
+ detokenize=not disable_detokenize,
+ ))
+ start = time.perf_counter()
+ outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
+ end = time.perf_counter()
+ return end - start, outputs
async def run_vllm_async(
@@ -264,6 +306,8 @@ def get_requests(args, tokenizer):
dataset_cls = RandomDataset
elif args.dataset_name == "sharegpt":
dataset_cls = ShareGPTDataset
+ if args.backend == "vllm-chat":
+ sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_name == "sonnet":
assert tokenizer.chat_template or tokenizer.default_chat_template, (
"Tokenizer/model must have chat template for sonnet dataset.")
@@ -272,6 +316,19 @@ def get_requests(args, tokenizer):
sample_kwargs["return_prompt_formatted"] = True
elif args.dataset_name == "burstgpt":
dataset_cls = BurstGPTDataset
+ elif args.dataset_name == "hf":
        if args.backend != "vllm-chat":
            raise ValueError(
                "hf datasets are only supported by the vllm-chat backend")
+ # Choose between VisionArenaDataset and HuggingFaceDataset based on
+ # provided parameters.
+ dataset_cls = (VisionArenaDataset if args.dataset_path
+ == VisionArenaDataset.VISION_ARENA_DATASET_PATH
+ and args.hf_subset is None else HuggingFaceDataset)
+ common_kwargs['dataset_subset'] = args.hf_subset
+ common_kwargs['dataset_split'] = args.hf_split
+ sample_kwargs["enable_multimodal_chat"] = True
+
else:
raise ValueError(f"Unknown dataset name: {args.dataset_name}")
# Remove None values
@@ -290,6 +347,7 @@ def main(args: argparse.Namespace):
requests = get_requests(args, tokenizer)
is_multi_modal = any(request.multi_modal_data is not None
for request in requests)
+ request_outputs: Optional[list[RequestOutput]] = None
if args.backend == "vllm":
if args.async_engine:
elapsed_time = uvloop.run(
@@ -301,9 +359,9 @@ def main(args: argparse.Namespace):
args.disable_detokenize,
))
else:
- elapsed_time = run_vllm(requests, args.n,
- EngineArgs.from_cli_args(args),
- args.disable_detokenize)
+ elapsed_time, request_outputs = run_vllm(
+ requests, args.n, EngineArgs.from_cli_args(args),
+ args.disable_detokenize)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -312,20 +370,45 @@ def main(args: argparse.Namespace):
elif args.backend == "mii":
elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
args.output_len)
+ elif args.backend == "vllm-chat":
+ elapsed_time, request_outputs = run_vllm_chat(
+ requests, args.n, EngineArgs.from_cli_args(args),
+ args.disable_detokenize)
else:
raise ValueError(f"Unknown backend: {args.backend}")
- total_num_tokens = sum(request.prompt_len + request.expected_output_len
- for request in requests)
- total_output_tokens = sum(request.expected_output_len
- for request in requests)
- if is_multi_modal:
- print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
+
+ if request_outputs:
+ # Note: with the vllm and vllm-chat backends,
+ # we have request_outputs, which we use to count tokens.
+ total_prompt_tokens = 0
+ total_output_tokens = 0
+ for ro in request_outputs:
+ if not isinstance(ro, RequestOutput):
+ continue
+ total_prompt_tokens += len(
+ ro.prompt_token_ids) if ro.prompt_token_ids else 0
+ total_output_tokens += sum(
+ len(o.token_ids) for o in ro.outputs if o)
+ total_num_tokens = total_prompt_tokens + total_output_tokens
+ else:
+ total_num_tokens = sum(r.prompt_len + r.expected_output_len
+ for r in requests)
+ total_output_tokens = sum(r.expected_output_len for r in requests)
+ total_prompt_tokens = total_num_tokens - total_output_tokens
+
+ if is_multi_modal and args.backend != "vllm-chat":
+ print("\033[91mWARNING\033[0m: Multi-modal request with "
+ f"{args.backend} backend detected. The "
"following metrics are not accurate because image tokens are not"
" counted. See vllm-project/vllm/issues/9778 for details.")
# TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
+ # vllm-chat backend counts the image tokens now
+
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
+ print(f"Total num prompt tokens: {total_prompt_tokens}")
+ print(f"Total num output tokens: {total_output_tokens}")
# Output JSON results if specified
if args.output_json:
@@ -341,17 +424,100 @@ def main(args: argparse.Namespace):
save_to_pytorch_benchmark_format(args, results)
+def validate_args(args):
+ """
+ Validate command-line arguments.
+ """
+
+ # === Deprecation and Defaulting ===
+ if args.dataset is not None:
+ warnings.warn(
+ "The '--dataset' argument will be deprecated in the next release. "
+ "Please use '--dataset-name' and '--dataset-path' instead.",
+ stacklevel=2)
+ args.dataset_path = args.dataset
+
+ if not getattr(args, "tokenizer", None):
+ args.tokenizer = args.model
+
+ # === Backend Validation ===
+ valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
+ if args.backend not in valid_backends:
+ raise ValueError(f"Unsupported backend: {args.backend}")
+
+ # === Dataset Configuration ===
+ if not args.dataset and not args.dataset_path:
+ print(
+ "When dataset path is not set, it will default to random dataset")
+ args.dataset_name = 'random'
+ if args.input_len is None:
+ raise ValueError("input_len must be provided for a random dataset")
+
+ # === Dataset Name Specific Checks ===
+ # --hf-subset and --hf-split: only used
+ # when dataset_name is 'hf'
+ if args.dataset_name != "hf" and (
+ getattr(args, "hf_subset", None) is not None
+ or getattr(args, "hf_split", None) is not None):
+ warnings.warn("--hf-subset and --hf-split will be ignored \
+ since --dataset-name is not 'hf'.",
+ stacklevel=2)
+ elif args.dataset_name == "hf" and args.backend != "vllm-chat":
+ raise ValueError(
+ "When --dataset-name is 'hf', backend must be 'vllm-chat'")
+
+ # --random-range-ratio: only used when dataset_name is 'random'
+ if args.dataset_name != 'random' and args.random_range_ratio is not None:
+ warnings.warn("--random-range-ratio will be ignored since \
+ --dataset-name is not 'random'.",
+ stacklevel=2)
+
+ # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
+ # set.
+ if args.dataset_name not in {"random", "sonnet", None
+ } and args.prefix_len is not None:
+ warnings.warn("--prefix-len will be ignored since --dataset-name\
+ is not 'random', 'sonnet', or not set.",
+ stacklevel=2)
+
+ # === LoRA Settings ===
+ if getattr(args, "enable_lora", False) and args.backend != "vllm":
+ raise ValueError(
+ "LoRA benchmarking is only supported for vLLM backend")
+ if getattr(args, "enable_lora", False) and args.lora_path is None:
+ raise ValueError("LoRA path must be provided when enable_lora is True")
+
+ # === Backend-specific Validations ===
+ if args.backend == "hf" and args.hf_max_batch_size is None:
+ raise ValueError("HF max batch size is required for HF backend")
+ if args.backend != "hf" and args.hf_max_batch_size is not None:
+ raise ValueError("HF max batch size is only for HF backend.")
+
+ if args.backend in {"hf", "mii"} and getattr(args, "quantization",
+ None) is not None:
+ raise ValueError("Quantization is only for vLLM backend.")
+
+ if args.backend == "mii" and args.dtype != "auto":
+ raise ValueError("dtype must be auto for MII backend.")
+ if args.backend == "mii" and args.n != 1:
+ raise ValueError("n must be 1 for MII backend.")
+ if args.backend == "mii" and args.tokenizer != args.model:
+ raise ValueError(
+ "Tokenizer must be the same as the model for MII backend.")
+
+
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend",
type=str,
- choices=["vllm", "hf", "mii"],
+ choices=["vllm", "hf", "mii", "vllm-chat"],
default="vllm")
- parser.add_argument("--dataset-name",
- type=str,
- choices=["sharegpt", "random", "sonnet", "burstgpt"],
- help="Name of the dataset to benchmark on.",
- default="sharegpt")
+ parser.add_argument(
+ "--dataset-name",
+ type=str,
+ choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
+ help="Name of the dataset to benchmark on.",
+ default="sharegpt")
parser.add_argument(
"--dataset",
type=str,
@@ -419,55 +585,24 @@ if __name__ == "__main__":
parser.add_argument(
"--random-range-ratio",
type=float,
- default=1.0,
+ default=None,
help="Range of sampled ratio of input/output length, "
"used only for RandomDataSet.",
)
    # hf dataset
+ parser.add_argument("--hf-subset",
+ type=str,
+ default=None,
+ help="Subset of the HF dataset.")
+ parser.add_argument("--hf-split",
+ type=str,
+ default=None,
+ help="Split of the HF dataset.")
+
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
- if args.dataset is not None:
- warnings.warn(
- "The '--dataset' argument will be deprecated in the next "
- "release. Please use '--dataset-name' and "
- "'--dataset-path' in the future runs.",
- stacklevel=2)
- args.dataset_path = args.dataset
- if args.dataset is None and args.dataset_path is None:
- # for random dataset, the default sampling setting is in
- # benchmark_dataset.RandomDataset
- print("When dataset is not set, it will default to random dataset")
- else:
- assert args.input_len is None
- if args.enable_lora:
- assert args.lora_path is not None
-
- if args.backend == "vllm":
- if args.hf_max_batch_size is not None:
- raise ValueError("HF max batch size is only for HF backend.")
- elif args.backend == "hf":
- if args.hf_max_batch_size is None:
- raise ValueError("HF max batch size is required for HF backend.")
- if args.quantization is not None:
- raise ValueError("Quantization is only for vLLM backend.")
- if args.enable_lora is not None:
- raise ValueError("LoRA benchmarking is only supported for vLLM"
- " backend")
- elif args.backend == "mii":
- if args.dtype != "auto":
- raise ValueError("dtype must be auto for MII backend.")
- if args.n != 1:
- raise ValueError("n must be 1 for MII backend.")
- if args.quantization is not None:
- raise ValueError("Quantization is only for vLLM backend.")
- if args.hf_max_batch_size is not None:
- raise ValueError("HF max batch size is only for HF backend.")
- if args.tokenizer != args.model:
- raise ValueError("Tokenizer must be the same as the model for MII "
- "backend.")
- if args.enable_lora is not None:
- raise ValueError("LoRA benchmarking is only supported for vLLM"
- " backend")
+ validate_args(args)
main(args)
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index 115b92539f967..b4b91eda28440 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -17,13 +17,8 @@ from torch.utils.benchmark import Measurement as TMeasurement
from utils import ArgPool, Bench, CudaGraphBenchParams
from weight_shapes import WEIGHT_SHAPES
-from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand
-from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice
-from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink
-from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand
-from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink
+from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
-from vllm.lora.ops.triton_ops.v1 import V1KernelMeta, v1_expand, v1_shrink
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
@@ -167,69 +162,25 @@ class OpType(Enum):
"""
LoRA Ops to benchmark and its properties.
"""
- SGMV_SHRINK = auto()
- BGMV_SHRINK = auto()
- SGMV_EXPAND = auto()
- BGMV_EXPAND = auto()
- BGMV_EXPAND_SLICE = auto()
- V1_SHRINK = auto()
- V1_EXPAND = auto()
+ LORA_SHRINK = auto()
+ LORA_EXPAND = auto()
@staticmethod
def from_str(s: str) -> "OpType":
- if s.lower() == 'sgmv_shrink':
- return OpType.SGMV_SHRINK
- if s.lower() == 'sgmv_expand':
- return OpType.SGMV_EXPAND
- if s.lower() == 'bgmv_shrink':
- return OpType.BGMV_SHRINK
- if s.lower() == 'bgmv_expand':
- return OpType.BGMV_EXPAND
- if s.lower() == "bgmv_expand_slice":
- return OpType.BGMV_EXPAND_SLICE
- if s.lower() == "v1_shrink":
- return OpType.V1_SHRINK
- if s.lower() == "v1_expand":
- return OpType.V1_EXPAND
+ if s.lower() == "lora_shrink":
+ return OpType.LORA_SHRINK
+ if s.lower() == "lora_expand":
+ return OpType.LORA_EXPAND
raise ValueError(f"Unrecognized str {s} to convert to OpType")
def is_shrink_fn(self) -> bool:
- return self in [
- OpType.SGMV_SHRINK, OpType.BGMV_SHRINK, OpType.V1_SHRINK
- ]
+ return self in [OpType.LORA_SHRINK]
def is_expand_fn(self) -> bool:
- return self in [
- OpType.SGMV_EXPAND, OpType.BGMV_EXPAND, OpType.V1_EXPAND
- ]
-
- def is_prefill_op(self) -> bool:
- return self in [
- OpType.SGMV_SHRINK, OpType.SGMV_EXPAND, OpType.V1_SHRINK,
- OpType.V1_EXPAND
- ]
-
- def is_decode_op(self) -> bool:
- return self in [
- OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE,
- OpType.V1_SHRINK, OpType.V1_EXPAND
- ]
-
- def is_expand_slice_fn(self) -> bool:
- return self in [OpType.BGMV_EXPAND_SLICE]
+ return self in [OpType.LORA_EXPAND]
def num_slices(self) -> list[int]:
- if self in [
- OpType.SGMV_EXPAND, OpType.SGMV_SHRINK, OpType.V1_SHRINK,
- OpType.V1_EXPAND
- ]:
- # SGMV kernels and v1 kernels supports slices
- return [1, 2, 3]
- if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]:
- return [1]
- if self in [OpType.BGMV_EXPAND_SLICE]:
- return [2, 3]
- raise ValueError(f"Unrecognized OpType {self}")
+ return [1, 2, 3]
def mkn(self, batch_size: int, seq_length: int, hidden_size: int,
lora_rank: int) -> tuple[int, int, int]:
@@ -239,7 +190,7 @@ class OpType(Enum):
k = hidden_size
n = lora_rank
else:
- assert self.is_expand_fn() or self.is_expand_slice_fn()
+ assert self.is_expand_fn()
m = num_tokens
k = lora_rank
n = hidden_size
@@ -254,7 +205,7 @@ class OpType(Enum):
if self.is_shrink_fn():
return op_dtype, op_dtype, torch.float32
else:
- assert self.is_expand_fn() or self.is_expand_slice_fn()
+ assert self.is_expand_fn()
return torch.float32, op_dtype, op_dtype
def matmul_shapes(
@@ -268,43 +219,19 @@ class OpType(Enum):
m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank)
b_shape = (num_loras, n, k) # col-major
- if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
- # SGMV shrink and V1 shrink kernels support num_slices inherently
- # in the kernel.
+ if self in [OpType.LORA_SHRINK]:
+ # LoRA shrink kernels support num_slices inherently in the kernel.
return ((m, k), b_shape, (num_slices, m, n))
- if self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
- # SGMV expand and V1 expand kernels support num_slices inherently
- # in the kernel
+ if self in [OpType.LORA_EXPAND]:
+ # LoRA expand kernels support num_slices inherently in the kernel
return ((num_slices, m, k), b_shape, (m, n * num_slices))
- if self == OpType.BGMV_SHRINK:
- return ((m, k), b_shape, (m, n))
- if self == OpType.BGMV_EXPAND:
- return ((m, k), b_shape, (m, n))
- if self == OpType.BGMV_EXPAND_SLICE:
- return ((num_slices, m, k), b_shape, (m, n * num_slices))
-
raise ValueError(f"Unrecognized op_type {self}")
def bench_fn(self) -> Callable:
-
- def emulate_bgmv_expand_slice(kwargs_list: list[dict[str, Any]]):
- for x in kwargs_list:
- bgmv_expand_slice(**x)
-
- if self == OpType.SGMV_SHRINK:
- return sgmv_shrink
- if self == OpType.SGMV_EXPAND:
- return sgmv_expand
- if self == OpType.BGMV_SHRINK:
- return bgmv_shrink
- if self == OpType.BGMV_EXPAND:
- return bgmv_expand
- if self == OpType.BGMV_EXPAND_SLICE:
- return emulate_bgmv_expand_slice
- if self == OpType.V1_SHRINK:
- return v1_shrink
- if self == OpType.V1_EXPAND:
- return v1_expand
+ if self == OpType.LORA_SHRINK:
+ return lora_shrink
+ if self == OpType.LORA_EXPAND:
+ return lora_expand
raise ValueError(f"Unrecognized optype {self}")
@@ -318,34 +245,13 @@ class OpType(Enum):
"""
w_dtype = lora_weights[0].dtype
num_slices = len(lora_weights)
- if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
+ if self in [OpType.LORA_SHRINK]:
for slice_idx in range(num_slices):
ref_group_gemm(ref_out=output[slice_idx, :],
input=input,
lora_weights=lora_weights[slice_idx],
**kwargs)
- elif self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
- hidden_size = lora_weights[0].shape[1]
- for slice_idx in range(num_slices):
- slice_offset = slice_idx * hidden_size
- ref_group_gemm(
- ref_out=output[:, slice_offset:slice_offset + hidden_size],
- input=input[slice_idx].clone().to(dtype=w_dtype),
- lora_weights=lora_weights[slice_idx],
- **kwargs)
- elif self == OpType.BGMV_SHRINK:
- assert num_slices == 1
- ref_group_gemm(ref_out=output,
- input=input,
- lora_weights=lora_weights[0],
- **kwargs)
- elif self == OpType.BGMV_EXPAND:
- assert num_slices == 1
- ref_group_gemm(ref_out=output,
- input=input.clone().to(dtype=w_dtype),
- lora_weights=lora_weights[0],
- **kwargs)
- elif self == OpType.BGMV_EXPAND_SLICE:
+ elif self in [OpType.LORA_EXPAND]:
hidden_size = lora_weights[0].shape[1]
for slice_idx in range(num_slices):
slice_offset = slice_idx * hidden_size
@@ -411,13 +317,11 @@ class BenchmarkTensors:
input: torch.Tensor
lora_weights_lst: list[torch.Tensor]
output: torch.Tensor
- # metadata tensors
+ # LoRA kernel metadata
+ lora_kernel_meta: LoRAKernelMeta
+    # Metadata tensors used for correctness testing
seq_lens: torch.Tensor
- seq_start_loc: torch.Tensor
prompt_lora_mapping: torch.Tensor
- token_lora_mapping: torch.Tensor
- # v1 kernel metadata
- v1_kernel_meta: Optional[V1KernelMeta] = None
def io_types(self) -> str:
return (f"{dtype_to_str(self.input.dtype)}x"
@@ -444,35 +348,29 @@ class BenchmarkTensors:
assert ctx.num_active_loras <= ctx.num_loras
total_tokens = ctx.batch_size * ctx.seq_length
+ # Make metadata tensors involved in correctness testing.
# Prepare seq lens tensor
seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1,
(ctx.batch_size, ))
- # Prepare seq_start_loc tensor
- seq_start_loc_tensor = torch.cumsum(torch.tensor(
- [0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
- dim=0)
assert total_tokens == seq_len_tensor.sum()
# Prepare prompt lora indices tensor
prompt_lora_indices_tensor = make_prompt_lora_mapping(
ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu")
- # Prepare token lora indices tensor
+
+ # Make LoRAKernelMeta
token_lora_indices_tensor = make_token_lora_mapping(
total_tokens, ctx.batch_size, prompt_lora_indices_tensor,
seq_len_tensor, "cpu")
-
- v1_kernel_meta = None
- if op_type in [OpType.V1_SHRINK, OpType.V1_EXPAND]:
- v1_kernel_meta = V1KernelMeta.make(
- max_loras=ctx.num_loras,
- max_num_tokens=token_lora_indices_tensor.size(0),
- device="cpu")
- v1_kernel_meta.prepare_tensors(
- token_lora_mapping=token_lora_indices_tensor)
+ lora_kernel_meta = LoRAKernelMeta.make(
+ max_loras=ctx.num_loras,
+ max_num_tokens=token_lora_indices_tensor.size(0),
+ device="cpu")
+ lora_kernel_meta.prepare_tensors(
+ token_lora_mapping=token_lora_indices_tensor)
return BenchmarkTensors(input_tensor, lora_weights, output_tensor,
- seq_len_tensor, seq_start_loc_tensor,
- prompt_lora_indices_tensor,
- token_lora_indices_tensor, v1_kernel_meta)
+ lora_kernel_meta, seq_len_tensor,
+ prompt_lora_indices_tensor)
def sanity_check(self) -> None:
"""
@@ -482,9 +380,9 @@ class BenchmarkTensors:
# check metadata tensors
assert torch.sum(self.seq_lens) == num_tokens
num_seqs = self.seq_lens.shape[0]
-        assert self.seq_start_loc.shape[0] == num_seqs
+        # seq_start_loc was removed along with the SGMV kernels; its shape check is obsolete.
assert self.prompt_lora_mapping.shape[0] == num_seqs
- assert self.token_lora_mapping.shape[0] == num_tokens
+ assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens
def to_device(self, device: str):
"""
@@ -499,220 +397,27 @@ class BenchmarkTensors:
self.input = to_device(self.input)
self.output = to_device(self.output)
self.seq_lens = to_device(self.seq_lens)
- self.seq_start_loc = to_device(self.seq_start_loc)
self.prompt_lora_mapping = to_device(self.prompt_lora_mapping)
- self.token_lora_mapping = to_device(self.token_lora_mapping)
for i in range(len(self.lora_weights_lst)):
self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i])
- # v1 meta
- if self.v1_kernel_meta:
- for field_name in V1KernelMeta.__dataclass_fields__:
- field = getattr(self.v1_kernel_meta, field_name)
- assert isinstance(field, torch.Tensor)
- setattr(self.v1_kernel_meta, field_name, to_device(field))
+ # LoRA meta
+ for field_name in LoRAKernelMeta.__dataclass_fields__:
+ field = getattr(self.lora_kernel_meta, field_name)
+ assert isinstance(field, torch.Tensor)
+ setattr(self.lora_kernel_meta, field_name, to_device(field))
def metadata(self) -> tuple[int, int, int]:
"""
Return num_seqs, num_tokens and max_seq_len
"""
num_seqs = self.seq_lens.shape[0]
- num_tokens = self.token_lora_mapping.shape[0]
+ num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0]
max_seq_len = torch.max(self.seq_lens).item()
num_slices = len(self.lora_weights_lst)
return num_seqs, num_tokens, max_seq_len, num_slices
- def convert_to_sgmv_benchmark_tensors(self):
- """
- For sgmv punica kernels, when consecutive sequences have the
- same LoRA ID, we just merge them together.
- This happens in punica.py::compute_metadata
- """
-
- # Collapse seq_lens and seq_start_loc
- _, seq_lens = torch.unique_consecutive(self.token_lora_mapping,
- return_counts=True)
- cum_result = torch.cumsum(seq_lens, dim=0)
- seq_start_loc = torch.zeros_like(seq_lens)
- seq_start_loc[1:].copy_(cum_result[:-1])
-
- # Collapse prompt mapping
- prompt_lora_mapping = torch.unique_consecutive(
- self.prompt_lora_mapping)
-
- assert torch.sum(seq_lens) == torch.sum(self.seq_lens), \
- f"dont match - new {torch.sum(seq_lens)} vs {torch.sum(self.seq_lens)}"
-
- self.prompt_lora_mapping = prompt_lora_mapping.to(
- dtype=self.prompt_lora_mapping.dtype)
- self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype)
- self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype)
-
- def as_sgmv_shrink_kwargs(self) -> dict[str, Any]:
- self.convert_to_sgmv_benchmark_tensors()
- self.sanity_check()
- self.to_device(self.input.device)
-
- num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
-
- # Sanity check matrix shapes.
- i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
- 0].shape, self.output.shape
- # Expected input shape [num_tokens, hidden_size]
- assert len(i_shape) == 2
- assert i_shape[0] == num_tokens
- hidden_size = i_shape[1]
- # Expected lora weight shape [num_loras, lora_rank, hidden_size]
- assert len(lw_shape) == 3
- assert lw_shape[2] == hidden_size
- lora_rank = lw_shape[1]
- # Expected output shape [num_slices, num_tokens, lora_rank]
- assert len(o_shape) == 3
- assert o_shape == (num_slices, num_tokens, lora_rank)
-
- return {
- 'inputs': self.input,
- 'lora_a_weights': self.lora_weights_lst,
- 'output_tensor': self.output,
- 'b_seq_start_loc': self.seq_start_loc,
- 'seq_len_tensor': self.seq_lens,
- 'lora_indices_tensor': self.prompt_lora_mapping,
- 'batches': num_seqs,
- 'max_seq_length': max_seq_len,
- 'token_nums': num_tokens,
- 'scaling': 1.0,
- }
-
- def as_sgmv_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
-
- self.convert_to_sgmv_benchmark_tensors()
- self.sanity_check()
- self.to_device(self.input.device)
-
- num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
-
- # Sanity check matrix shapes.
- i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
- 0].shape, self.output.shape
- # Expected input shape : [num_slices, num_tokens, lora_rank]
- assert len(i_shape) == 3
- assert i_shape[0] == num_slices
- assert i_shape[1] == num_tokens
- lora_rank = i_shape[2]
- # Expected lora weight shape : [num_lora, hidden_size, lora_rank]
- assert len(lw_shape) == 3
- assert lw_shape[2] == lora_rank
- hidden_size = lw_shape[1]
- # Expected output shape : [num_tokens, hidden_size * num_slices]
- assert len(o_shape) == 2
- assert o_shape == (num_tokens, hidden_size * num_slices)
-
- return {
- 'inputs': self.input,
- 'lora_b_weights': self.lora_weights_lst,
- 'output_tensor': self.output,
- 'b_seq_start_loc': self.seq_start_loc,
- 'seq_len_tensor': self.seq_lens,
- 'lora_indices_tensor': self.prompt_lora_mapping,
- 'batches': num_seqs,
- 'max_seq_length': max_seq_len,
- 'token_nums': num_tokens,
- 'offset_start': 0,
- 'add_inputs': add_inputs,
- }
-
- def as_bgmv_shrink_kwargs(self) -> dict[str, Any]:
- assert len(self.lora_weights_lst) == 1
- self.to_device(self.input.device)
-
- _, num_tokens, _, _ = self.metadata()
- # Sanity check shapes
- i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
- 0].shape, self.output.shape
- # Expected input shape [num_tokens, hidden_size]
- assert len(i_shape) == 2
- assert i_shape[0] == num_tokens
- hidden_size = i_shape[1]
- # Expected lora weight shape [num_loras, lora_rank, hidden_size]
- assert len(lw_shape) == 3
- assert lw_shape[2] == hidden_size
- lora_rank = lw_shape[1]
- # Expected output shape [num_tokens, lora_rank]
- assert len(o_shape) == 2
- assert o_shape == (num_tokens, lora_rank)
-
- return {
- 'inputs': self.input,
- 'lora_a_weights': self.lora_weights_lst[0],
- 'output_tensor': self.output,
- 'lora_indices_tensor': self.token_lora_mapping,
- 'scaling': 1.0
- }
-
- def as_bgmv_expand_kwargs(self, add_inputs: bool):
- assert len(self.lora_weights_lst) == 1
- self.to_device(self.input.device)
-
- _, num_tokens, _, _ = self.metadata()
- # Sanity check shapes
- i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
- 0].shape, self.output.shape
- # Expected input shape [num_tokens, lora_rank]
- assert len(i_shape) == 2
- assert i_shape[0] == num_tokens
- lora_rank = i_shape[1]
- # Expected lora weight shape [num_loras, hidden_size, lora_rank]
- assert len(lw_shape) == 3
- assert lw_shape[2] == lora_rank
- hidden_size = lw_shape[1]
- # Expected output shape [num_tokens, hidden_size]
- assert len(o_shape) == 2
- assert o_shape == (num_tokens, hidden_size)
-
- return {
- 'inputs': self.input,
- 'lora_b_weights': self.lora_weights_lst[0],
- 'output_tensor': self.output,
- 'lora_indices_tensor': self.token_lora_mapping,
- 'add_inputs': add_inputs
- }
-
- def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> dict[str, Any]:
-
- _, num_tokens, _, num_slices = self.metadata()
- # Sanity check shapes
- i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
- 0].shape, self.output.shape
- # Expected input shape [num_slices, num_tokens, lora_rank]
- assert len(i_shape) == 3
- assert i_shape[0] == num_slices
- assert i_shape[1] == num_tokens
- lora_rank = i_shape[2]
- # Expected lora weight shape [num_loras, hidden_size, lora_rank]
- assert len(lw_shape) == 3
- assert lw_shape[2] == lora_rank
- hidden_size = lw_shape[1]
- # Expected output shape [num_tokens, hidden_size * num_slices]
- assert len(o_shape) == 2
- assert o_shape == (num_tokens, hidden_size * num_slices)
-
- self.to_device(self.input.device)
-
- kwargs_list = []
- for i in range(num_slices):
- kwargs_list.append({
- 'inputs': self.input[i],
- 'lora_b_weights': self.lora_weights_lst[i],
- 'output_tensor': self.output,
- 'lora_indices_tensor': self.token_lora_mapping,
- 'slice_offset': i * hidden_size,
- 'slice_size': hidden_size,
- 'add_inputs': add_inputs,
- })
- return {'kwargs_list': kwargs_list}
-
- def as_v1_shrink_kwargs(self) -> dict[str, Any]:
- assert self.v1_kernel_meta is not None
+ def as_lora_shrink_kwargs(self) -> dict[str, Any]:
self.sanity_check()
self.to_device(self.input.device)
@@ -737,17 +442,16 @@ class BenchmarkTensors:
'inputs': self.input,
'lora_a_weights': self.lora_weights_lst,
'output_tensor': self.output,
- 'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
+ 'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping,
'token_indices_sorted_by_lora_ids':
- self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
- 'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
- 'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
- 'lora_ids': self.v1_kernel_meta.active_lora_ids,
+ self.lora_kernel_meta.token_indices_sorted_by_lora_ids,
+ 'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora,
+ 'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc,
+ 'lora_ids': self.lora_kernel_meta.active_lora_ids,
'scaling': 1.0,
}
- def as_v1_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
- assert self.v1_kernel_meta is not None
+ def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
self.sanity_check()
self.to_device(self.input.device)
@@ -773,12 +477,12 @@ class BenchmarkTensors:
'inputs': self.input,
'lora_b_weights': self.lora_weights_lst,
'output_tensor': self.output,
- 'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
+ 'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping,
'token_indices_sorted_by_lora_ids':
- self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
- 'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
- 'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
- 'lora_ids': self.v1_kernel_meta.active_lora_ids,
+ self.lora_kernel_meta.token_indices_sorted_by_lora_ids,
+ 'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora,
+ 'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc,
+ 'lora_ids': self.lora_kernel_meta.active_lora_ids,
'offset_start': 0,
'add_inputs': add_inputs,
}
@@ -791,20 +495,10 @@ class BenchmarkTensors:
else:
assert add_inputs is not None
- if op_type == OpType.SGMV_SHRINK:
- return self.as_sgmv_shrink_kwargs()
- if op_type == OpType.SGMV_EXPAND:
- return self.as_sgmv_expand_kwargs(add_inputs)
- if op_type == OpType.BGMV_SHRINK:
- return self.as_bgmv_shrink_kwargs()
- if op_type == OpType.BGMV_EXPAND:
- return self.as_bgmv_expand_kwargs(add_inputs)
- if op_type == OpType.BGMV_EXPAND_SLICE:
- return self.as_bgmv_expand_slice_kwargs(add_inputs)
- if op_type == OpType.V1_SHRINK:
- return self.as_v1_shrink_kwargs()
- if op_type == OpType.V1_EXPAND:
- return self.as_v1_expand_kwargs(add_inputs)
+ if op_type == OpType.LORA_SHRINK:
+ return self.as_lora_shrink_kwargs()
+ if op_type == OpType.LORA_EXPAND:
+ return self.as_lora_expand_kwargs(add_inputs)
raise ValueError(f"Unrecognized optype {self}")
def test_correctness(self, op_type: OpType,
@@ -993,10 +687,6 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):
for bench_ctx in bench_ctxs:
for seq_len in args.seq_lengths:
bench_ops: list[OpType] = args.op_types
- if seq_len > 1:
- # bench only prefill ops
- bench_ops = [op for op in args.op_types if op.is_prefill_op()]
-
seq_len_timers = []
for bench_op in bench_ops:
for num_slices in bench_op.num_slices():
@@ -1206,13 +896,13 @@ Benchmark LoRA kernels:
{use_cuda_graph_recommendation()}
list_bench example:
- python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
+ python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
model_bench example:
- python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
+ python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
range_bench example:
- python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8
+ python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8
""", # noqa: E501
formatter_class=argparse.RawTextHelpFormatter)
diff --git a/benchmarks/run_structured_output_benchmark.sh b/benchmarks/run_structured_output_benchmark.sh
index 8a777320f7354..126dfbc244161 100755
--- a/benchmarks/run_structured_output_benchmark.sh
+++ b/benchmarks/run_structured_output_benchmark.sh
@@ -54,6 +54,7 @@ for qps in "${QPS_VALUES[@]}"; do
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
--request-rate $qps \
--result-filename "$FILENAME" \
+ --tokenizer-mode ${TOKENIZER_MODE:-"auto"} \
--port ${PORT:-8000}
echo "Completed benchmark with QPS: $qps"
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index f2d01099097a5..afd7c47e8ac00 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG 9bfa9869829d8c593527eb34c5271d0090f7ccc9
+ GIT_TAG dc9d410b3e2d6534a4c70724c2515f4def670a22
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index d06eac2b3d4fe..0b3f6fc8c19a8 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -350,8 +350,8 @@ __global__ void concat_and_cache_mla_kernel(
} // namespace vllm
-// KV_T is the stored data type of kv-cache.
-// CACHE_T is the data type of key and value tensors.
+// KV_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \
vllm::reshape_and_cache_kernel \
@@ -393,8 +393,8 @@ void reshape_and_cache(
CALL_RESHAPE_AND_CACHE)
}
-// KV_T is the stored data type of kv-cache.
-// CACHE_T is the data type of key and value tensors.
+// KV_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \
vllm::reshape_and_cache_flash_kernel \
@@ -446,8 +446,8 @@ void reshape_and_cache_flash(
CALL_RESHAPE_AND_CACHE_FLASH);
}
-// KV_T is the stored data type of kv-cache.
-// CACHE_T is the data type of key and value tensors.
+// KV_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
vllm::concat_and_cache_mla_kernel \
diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp
index e3809acad7453..d726ee9307fe0 100644
--- a/csrc/cpu/cache.cpp
+++ b/csrc/cpu/cache.cpp
@@ -3,6 +3,12 @@
#include "cpu_types.hpp"
+#if defined(__x86_64__)
+ #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2
+#else
+ #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES
+#endif
+
namespace {
template
void copy_blocks_cpu_impl(std::vector const& key_caches,
@@ -95,13 +101,12 @@ void copy_blocks(std::vector const& key_caches,
}
const int element_num_per_block = key_caches[0][0].numel();
- VLLM_DISPATCH_FLOATING_TYPES(
- key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
- CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
- copy_blocks_cpu_impl(key_caches, value_caches, block_mapping,
- element_num_per_block, num_layers);
- CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
- });
+ DISPATCH_MACRO(key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
+ CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
+ copy_blocks_cpu_impl(key_caches, value_caches, block_mapping,
+ element_num_per_block, num_layers);
+ CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
+ });
}
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
@@ -118,16 +123,15 @@ void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
int key_stride = key.stride(0);
int value_stride = value.stride(0);
- VLLM_DISPATCH_FLOATING_TYPES(
- key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
- CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl)
- reshape_and_cache_cpu_impl(
- key.data_ptr(), value.data_ptr(),
- key_cache.data_ptr(), value_cache.data_ptr(),
- slot_mapping.data_ptr(), num_tokens, key_stride,
- value_stride, num_heads, head_size, block_size, x);
- CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl)
- });
+ DISPATCH_MACRO(key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
+ CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl)
+ reshape_and_cache_cpu_impl(
+ key.data_ptr(), value.data_ptr(),
+ key_cache.data_ptr(), value_cache.data_ptr(),
+ slot_mapping.data_ptr(), num_tokens, key_stride, value_stride,
+ num_heads, head_size, block_size, x);
+ CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl)
+ });
}
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp
index a4ef2be2a58ca..a9369e1fd1016 100644
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -16,9 +16,18 @@ namespace vec_op {
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
+#define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...) \
+ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
+ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
+ AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)
+
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
+#define VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2(TYPE, NAME, ...) \
+ AT_DISPATCH_SWITCH(TYPE, NAME, \
+ VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(__VA_ARGS__))
+
#ifndef CPU_OP_GUARD
#define CPU_KERNEL_GUARD_IN(NAME)
#define CPU_KERNEL_GUARD_OUT(NAME)
diff --git a/csrc/cpu/pos_encoding.cpp b/csrc/cpu/pos_encoding.cpp
index 96bce7dda0132..8a59e884d6c82 100644
--- a/csrc/cpu/pos_encoding.cpp
+++ b/csrc/cpu/pos_encoding.cpp
@@ -170,7 +170,7 @@ void rotary_embedding_gptj_impl(
void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
torch::Tensor& key, int64_t head_size,
torch::Tensor& cos_sin_cache, bool is_neox) {
- int num_tokens = query.numel() / query.size(-1);
+ int num_tokens = positions.numel();
int rot_dim = cos_sin_cache.size(1);
int num_heads = query.size(-1) / head_size;
int num_kv_heads = key.size(-1) / head_size;
diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu
index c3902f4c2a163..fea4bc2ca0d8f 100644
--- a/csrc/prepare_inputs/advance_step.cu
+++ b/csrc/prepare_inputs/advance_step.cu
@@ -274,7 +274,7 @@ void advance_step_flashinfer(
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
- int block_tables_stride = block_tables.stride(0);
+ [[maybe_unused]] int block_tables_stride = block_tables.stride(0);
TORCH_CHECK((blocks * threads > num_queries),
"multi-step: not enough threads to map to num_queries = ",
num_queries, " block_tables.stride(0) = ", block_tables.stride(0),
diff --git a/csrc/quantization/fp8/amd/quant_utils.cuh b/csrc/quantization/fp8/amd/quant_utils.cuh
index f01427cc3d0ca..c4ed1b4757928 100644
--- a/csrc/quantization/fp8/amd/quant_utils.cuh
+++ b/csrc/quantization/fp8/amd/quant_utils.cuh
@@ -19,12 +19,24 @@ __device__ __forceinline__ fp8_type cvt_c10(float const r) {
return {};
}
+// __hip_fp8_e4m3 only exists starting in ROCm 6.3. The macro
+// HIP_FP8_TYPE_OCP comes from the hip_fp8.h header and also makes
+// its first appearance in ROCm 6.3. Since VLLM_DISPATCH_FP8_TYPES
+// on ROCm instantiates both OCP and FNUZ kernels, we need to replace
+// the new HW cvt with something reasonable that doesn't rely on the
+// ROCm 6.3 feature. This allows compiling on ROCm 6.2 or newer.
template <>
__device__ __forceinline__ c10::Float8_e4m3fn cvt_c10(float const r) {
+ #if HIP_FP8_TYPE_OCP
return c10::Float8_e4m3fn(
__hip_cvt_float_to_fp8(r, __hip_fp8_e4m3::__default_saturation,
__hip_fp8_e4m3::__default_interpret),
c10::Float8_e4m3fn::from_bits());
+ #else
+ // Cast implemented by pytorch. Uses bit manipulation instead of HW cvt.
+ // HW cvt above is faster when it is available (ROCm 6.3 or newer).
+  return static_cast<c10::Float8_e4m3fn>(r);
+ #endif
}
template <>
@@ -434,7 +446,7 @@ scaled_vec_conversion(const uint8_t& a, float scale) {
template <>
__inline__ __device__ uint32_t
scaled_vec_conversion(const uint16_t& a, float scale) {
- __half2_raw h2r =
+ [[maybe_unused]] __half2_raw h2r =
__hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret);
union {
__half2_raw h2r;
diff --git a/csrc/quantization/gptq/q_gemm.cu b/csrc/quantization/gptq/q_gemm.cu
index 785f1a09c1900..538cb5848e21f 100644
--- a/csrc/quantization/gptq/q_gemm.cu
+++ b/csrc/quantization/gptq/q_gemm.cu
@@ -206,8 +206,8 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
int offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE;
- int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
- int end_m = min(offset_m + m_count, size_m);
+ [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
+ [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
int n = offset_n + t * 4;
@@ -344,8 +344,8 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
int offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE;
- int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
- int end_m = min(offset_m + m_count, size_m);
+ [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
+ [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
int n = offset_n + t * 4;
@@ -465,8 +465,8 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
int offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE;
- int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
- int end_m = min(offset_m + m_count, size_m);
+ [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
+ [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
int n = offset_n + t * 4;
@@ -593,8 +593,8 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
int offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE;
- int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
- int end_m = min(offset_m + m_count, size_m);
+ [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
+ [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
int n = offset_n + t * 4;
diff --git a/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu b/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu
index c4ed98ca64f8b..b520f8c32b95b 100644
--- a/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu
+++ b/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu
@@ -437,9 +437,10 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) {
#pragma unroll
for (int k_idx = 0; k_idx < 2; ++k_idx) {
- FType low16 = static_cast(C_frag[m_idx][n_idx][k_idx * 2]);
+ FType low16 =
+ ScalarType::float2num(C_frag[m_idx][n_idx][k_idx * 2]);
FType high16 =
- static_cast(C_frag[m_idx][n_idx][k_idx * 2 + 1]);
+ ScalarType::float2num(C_frag[m_idx][n_idx][k_idx * 2 + 1]);
uint32_t tmp = (reinterpret_cast(low16) & 0xffff) |
(reinterpret_cast(high16) << 16);
int sts_offset =
@@ -793,7 +794,7 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
FT scale_reg[4];
*(reinterpret_cast(scale_reg)) =
*(reinterpret_cast(scales + params_nidx));
- FT zero_reg[4] = {0};
+ FT zero_reg[4];
if (zeros != nullptr) {
*(reinterpret_cast(zero_reg)) =
*(reinterpret_cast(zeros + params_nidx));
@@ -809,8 +810,10 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
reinterpret_cast::T2*>(&(fval_reg[ni * 4])));
#pragma unroll
for (int ki = 0; ki < 4; ++ki) {
- fval_reg[ni * 4 + ki] =
- (fval_reg[ni * 4 + ki] - zero_reg[ni]) * scale_reg[ni];
+ if (zeros != nullptr) {
+ fval_reg[ni * 4 + ki] = __hsub(fval_reg[ni * 4 + ki], zero_reg[ni]);
+ }
+ fval_reg[ni * 4 + ki] = __hmul(fval_reg[ni * 4 + ki], scale_reg[ni]);
int sts_offset = sts_base_offset + ((ki / 2) * 8 + (ki % 2)) * 32 +
((ni + lane_id % 4) % 4) * 8;
smem[sts_offset] = fval_reg[ni * 4 + ki];
diff --git a/csrc/quantization/gptq_allspark/allspark_utils.cuh b/csrc/quantization/gptq_allspark/allspark_utils.cuh
index 7aded9a17280d..80456c25590d0 100644
--- a/csrc/quantization/gptq_allspark/allspark_utils.cuh
+++ b/csrc/quantization/gptq_allspark/allspark_utils.cuh
@@ -7,6 +7,8 @@
#include
#include
#include
+#include "../gptq_marlin/marlin_dtypes.cuh"
+using marlin::ScalarType;
namespace allspark {
@@ -66,14 +68,14 @@ __global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C,
return;
}
- FType sum(0);
+ float sum = 0.f;
int n_mat = N_MATRIX > 0 ? N_MATRIX : (int)n_matrix;
for (int i = 0; i < n_mat; ++i) {
- sum += C_split[idx + i * matrix_size];
+ sum += ScalarType::num2float(C_split[idx + i * matrix_size]);
}
- C[idx] = sum;
+ C[idx] = ScalarType::float2num(sum);
}
template
diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu
index 86029da141b36..c500d00ea528e 100644
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@@ -127,7 +127,7 @@ __device__ __forceinline__ T from_float(const float& inp) {
template
__device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
- union tmpcvt {
+ [[maybe_unused]] union tmpcvt {
uint16_t u;
_Float16 f;
__hip_bfloat16 b;
@@ -160,7 +160,7 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
template
__device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1,
const _B16x4& inp2) {
- union tmpcvt {
+ [[maybe_unused]] union tmpcvt {
uint16_t u;
_Float16 f;
__hip_bfloat16 b;
@@ -308,8 +308,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
constexpr int GQA_RATIO4 = DIVIDE_ROUND_UP(GQA_RATIO, 4);
- __shared__ float shared_qk_max[NWARPS][16 + 1];
- __shared__ float shared_exp_sum[NWARPS][16 + 1];
+ [[maybe_unused]] __shared__ float shared_qk_max[NWARPS][16 + 1];
+ [[maybe_unused]] __shared__ float shared_exp_sum[NWARPS][16 + 1];
// shared_logits is used for multiple purposes
__shared__ _B16x4 shared_logits[NWARPS][4][16][4];
@@ -426,7 +426,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
const int klocal_token_idx =
TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
- const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
+ [[maybe_unused]] const int kglobal_token_idx =
+ partition_start_token_idx + klocal_token_idx;
const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;
@@ -1272,9 +1273,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
const int seq_idx = blockIdx.y;
const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
- constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+ [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const int warpid = threadIdx.x / WARP_SIZE;
- const int laneid = threadIdx.x % WARP_SIZE;
+ [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;
__shared__ float shared_global_exp_sum;
// max num partitions supported is warp_size * NPAR_LOOPS
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index d3bcb86adbc80..eb3a2c911d55e 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -370,7 +370,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> "
"bool");
ops.impl("cutlass_scaled_mm_supports_block_fp8",
- &cutlass_scaled_mm_supports_fp8);
+ &cutlass_scaled_mm_supports_block_fp8);
// Check if cutlass sparse scaled_mm is supported for CUDA devices of the
// given capability
diff --git a/docs/source/community/meetups.md b/docs/source/community/meetups.md
index c57f27b49b88a..efb4f692972b5 100644
--- a/docs/source/community/meetups.md
+++ b/docs/source/community/meetups.md
@@ -4,6 +4,7 @@
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
+- [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0)
- [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing)
- [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing)
- [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing)
diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md
index f55a62ef01b4f..9cbfc32991f09 100644
--- a/docs/source/contributing/model/multimodal.md
+++ b/docs/source/contributing/model/multimodal.md
@@ -34,7 +34,8 @@ Further update the model as follows:
image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features)
- def get_multimodal_embeddings(self, **kwargs: object) -> Optional[NestedTensors]:
+ def get_multimodal_embeddings(
+ self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
# Validate the multimodal input keyword arguments
image_input = self._parse_and_validate_image_input(**kwargs)
@@ -61,7 +62,7 @@ Further update the model as follows:
def get_input_embeddings(
self,
input_ids: torch.Tensor,
- multimodal_embeddings: Optional[NestedTensors] = None,
+ multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
# `get_input_embeddings` should already be implemented for the language
diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md
index 9e52a2182cfbb..1f60faf40879e 100644
--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@@ -34,11 +34,11 @@ If you need to use those dependencies (having accepted the license terms),
create a custom Dockerfile on top of the base image with an extra layer that installs them:
```Dockerfile
-FROM vllm/vllm-openai:v0.7.3
+FROM vllm/vllm-openai:v0.8.0
# e.g. install the `audio` and `video` optional dependencies
# NOTE: Make sure the version of vLLM matches the base image!
-RUN uv pip install --system vllm[audio,video]==0.7.3
+RUN uv pip install vllm[audio,video]==0.8.0
```
:::
@@ -52,7 +52,7 @@ with an extra layer that installs their code from source:
```Dockerfile
FROM vllm/vllm-openai:latest
-RUN uv pip install --system git+https://github.com/huggingface/transformers.git
+RUN uv pip install git+https://github.com/huggingface/transformers.git
```
:::
diff --git a/docs/source/deployment/frameworks/helm.md b/docs/source/deployment/frameworks/helm.md
index e4fc5e1313079..7320d727fbaa4 100644
--- a/docs/source/deployment/frameworks/helm.md
+++ b/docs/source/deployment/frameworks/helm.md
@@ -4,9 +4,9 @@
A Helm chart to deploy vLLM for Kubernetes
-Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLMm Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variables values.
+Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values.
-This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm install and documentation on architecture and values file.
+This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm installation and documentation on architecture and values file.
## Prerequisites
diff --git a/docs/source/deployment/k8s.md b/docs/source/deployment/k8s.md
index 64071ba042d0b..b31344b199663 100644
--- a/docs/source/deployment/k8s.md
+++ b/docs/source/deployment/k8s.md
@@ -4,17 +4,19 @@
Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes.
---------
-
-Alternatively, you can also deploy Kubernetes using [helm chart](https://docs.vllm.ai/en/latest/deployment/frameworks/helm.html). There are also open-source projects available to make your deployment even smoother.
-
-* [vLLM production-stack](https://github.com/vllm-project/production-stack): Born out of a Berkeley-UChicago collaboration, vLLM production stack is a project that contains latest research and community effort, while still delivering production-level stability and performance. Checkout the [documentation page](https://docs.vllm.ai/en/latest/deployment/integrations/production-stack.html) for more details and examples.
-
---------
+Alternatively, you can deploy vLLM to Kubernetes using any of the following:
+* [Helm](frameworks/helm.md)
+* [InftyAI/llmaz](integrations/llmaz.md)
+* [KServe](integrations/kserve.md)
+* [kubernetes-sigs/lws](frameworks/lws.md)
+* [meta-llama/llama-stack](integrations/llamastack.md)
+* [substratusai/kubeai](integrations/kubeai.md)
+* [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
+* [vllm-project/production-stack](integrations/production-stack.md)
## Pre-requisite
-Ensure that you have a running Kubernetes environment with GPU (you can follow [this tutorial](https://github.com/vllm-project/production-stack/blob/main/tutorials/00-install-kubernetes-env.md) to install a Kubernetes environment on a bare-medal GPU machine).
+Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).
## Deployment using native K8s
diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md
index 5f2582877260a..e1770c8226435 100644
--- a/docs/source/design/kernel/paged_attention.md
+++ b/docs/source/design/kernel/paged_attention.md
@@ -419,7 +419,7 @@ List of `v_vec` for one thread
which is also `V_VEC_SIZE` elements from `logits`. Overall, with
multiple inner iterations, each warp will process one block of value
tokens. And with multiple outer iterations, the whole context value
- tokens are processd
+ tokens are processed
```cpp
float accs[NUM_ROWS_PER_THREAD];
diff --git a/docs/source/design/v1/metrics.md b/docs/source/design/v1/metrics.md
index bed40516ca46a..b3981b2dc24a7 100644
--- a/docs/source/design/v1/metrics.md
+++ b/docs/source/design/v1/metrics.md
@@ -13,7 +13,7 @@ Ensure the v1 LLM Engine exposes a superset of the metrics available in v0.
Metrics in vLLM can be categorized as follows:
1. Server-level metrics: these are global metrics that track the state and performance of the LLM engine. These are typically exposed as Gauges or Counters in Prometheus.
-2. Request-level metrics: these are metrics that track the characteristics - e.g. size and timing - of individual requests. These are typically exposed as Histrograms in Prometheus, and are often the SLO that an SRE monitoring vLLM will be tracking.
+2. Request-level metrics: these are metrics that track the characteristics - e.g. size and timing - of individual requests. These are typically exposed as Histograms in Prometheus, and are often the SLO that an SRE monitoring vLLM will be tracking.
The mental model is that the "Server-level Metrics" explain why the "Request-level Metrics" are what they are.
@@ -47,7 +47,7 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics`
- `vllm:tokens_total` (Counter)
- `vllm:iteration_tokens_total` (Histogram)
- `vllm:time_in_queue_requests` (Histogram)
-- `vllm:model_forward_time_milliseconds` (Histogram
+- `vllm:model_forward_time_milliseconds` (Histogram)
- `vllm:model_execute_time_milliseconds` (Histogram)
- `vllm:request_params_n` (Histogram)
- `vllm:request_params_max_tokens` (Histogram)
diff --git a/docs/source/design/v1/prefix_caching.md b/docs/source/design/v1/prefix_caching.md
index 2fae22cc264e5..3d14a76840d45 100644
--- a/docs/source/design/v1/prefix_caching.md
+++ b/docs/source/design/v1/prefix_caching.md
@@ -191,7 +191,7 @@ When the head block (least recently used block) of the free queue is cached, we
In this example, we assume the block size is 4 (each block can cache 4 tokens), and we have 10 blocks in the KV-cache manager in total.
-**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 2 of 4 tokens.
+**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens.
:::{image} /assets/design/v1/prefix_caching/example-time-1.png
:alt: Example Time 1
@@ -203,7 +203,7 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),
:alt: Example Time 3
:::
-**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 11 tokens are the same as request 0.** We can see that only 2 blocks (11 tokens) hit the cache, because the 3rd block only matches 3 of 4 tokens.
+**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.
:::{image} /assets/design/v1/prefix_caching/example-time-4.png
:alt: Example Time 4
diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md
index dff7e916fb460..a71da72e4360a 100644
--- a/docs/source/features/lora.md
+++ b/docs/source/features/lora.md
@@ -110,7 +110,7 @@ In addition to serving LoRA adapters at server startup, the vLLM server now supp
LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility
to change models on-the-fly is needed.
-Note: Enabling this feature in production environments is risky as user may participate model adapter management.
+Note: Enabling this feature in production environments is risky as users may participate in model adapter management.
To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING`
is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active.
diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md
index cc8d6fceb7d66..852248e418ca0 100644
--- a/docs/source/features/spec_decode.md
+++ b/docs/source/features/spec_decode.md
@@ -162,7 +162,7 @@ A variety of speculative models of this type are available on HF hub:
## Speculating using EAGLE based draft models
The following code configures vLLM to use speculative decoding where proposals are generated by
-an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model.
+an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here]().
```python
from vllm import LLM, SamplingParams
diff --git a/docs/source/getting_started/faq.md b/docs/source/getting_started/faq.md
index 4751b325e6fc4..c1bb28937c144 100644
--- a/docs/source/getting_started/faq.md
+++ b/docs/source/getting_started/faq.md
@@ -15,7 +15,7 @@ more are listed [here](#supported-models).
By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B),
[Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models,
-but they are expected be inferior to models that are specifically trained on embedding tasks.
+but they are expected to be inferior to models that are specifically trained on embedding tasks.
______________________________________________________________________
diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
index 7e52f6048909c..e91ed6fbd7a88 100644
--- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
@@ -119,7 +119,7 @@ If you're observing the following error: `docker: Error response from daemon: Un
## Supported configurations
-The following configurations have been validated to be function with
+The following configurations have been validated to function with
Gaudi2 devices. Configurations that are not listed may or may not work.
- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b)
diff --git a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
index 5641c1563656c..ab0db4795da77 100644
--- a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
@@ -19,7 +19,7 @@ Currently, there are no pre-built OpenVINO wheels.
### Build wheel from source
-First, install Python and ensure you lave the latest pip. For example, on Ubuntu 22.04, you can run:
+First, install Python and ensure you have the latest pip. For example, on Ubuntu 22.04, you can run:
```console
sudo apt-get update -y
diff --git a/docs/source/getting_started/installation/cpu.md b/docs/source/getting_started/installation/cpu.md
index 9ca25e4709e86..65af7b50bdc15 100644
--- a/docs/source/getting_started/installation/cpu.md
+++ b/docs/source/getting_started/installation/cpu.md
@@ -189,12 +189,13 @@ vLLM CPU backend supports the following vLLM features:
- Model Quantization (`INT8 W8A8, AWQ, GPTQ`)
- Chunked-prefill
- Prefix-caching
-- FP8-E5M2 KV-Caching (TODO)
+- FP8-E5M2 KV cache
## Related runtime environment variables
- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
+- `VLLM_CPU_MOE_PREPACK`: whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).
## Performance tips
diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md
index 7e3b884c2ab1e..d3e375aec10cb 100644
--- a/docs/source/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/source/getting_started/installation/gpu/cuda.inc.md
@@ -131,6 +131,8 @@ Building from source requires a lot of compilation. If you are building from sou
For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` .
As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
+When using `ccache` with `pip install -e .`, you should run `CCACHE_NOHASHDIR="true" pip install --no-build-isolation -e .`. This is because `pip` creates a new folder with a random name for each build, preventing `ccache` from recognizing that the same files are being built.
+
[sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments.
The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.
:::
diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md
index 5a47b16f77661..84a9b387789c7 100644
--- a/docs/source/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/source/getting_started/installation/gpu/xpu.inc.md
@@ -1,6 +1,6 @@
# Installation
-vLLM initially supports basic model inferencing and serving on Intel GPU platform.
+vLLM initially supports basic model inference and serving on Intel GPU platform.
:::{attention}
There are no pre-built wheels or images for this device, so you must build vLLM from source.
@@ -65,7 +65,7 @@ $ docker run -it \
## Supported features
-XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following:
+XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution like following:
```console
python -m vllm.entrypoints.openai.api_server \
@@ -78,6 +78,6 @@ python -m vllm.entrypoints.openai.api_server \
-tp=8
```
-By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script.
+By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script.
-There are some new features coming with ipex-xpu 2.6, eg: **chunked prefill**, **V1 engine support**, **lora**, **MoE**, etc.
+There are some new features coming with ipex-xpu 2.6, e.g. **chunked prefill**, **V1 engine support**, **lora**, **MoE**, etc.
diff --git a/docs/source/getting_started/v1_user_guide.md b/docs/source/getting_started/v1_user_guide.md
new file mode 100644
index 0000000000000..3e54022eebb21
--- /dev/null
+++ b/docs/source/getting_started/v1_user_guide.md
@@ -0,0 +1,161 @@
+# vLLM V1 User Guide
+
+V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
+
+To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason!
+
+## Why vLLM V1?
+
+vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.
+
+Building on V0's success, vLLM V1 retains the stable and proven components from V0
+(such as the models, GPU kernels, and utilities). At the same time, it significantly
+re-architects the core systems, covering the scheduler, KV cache manager, worker,
+sampler, and API server, to provide a cohesive, maintainable framework that better
+accommodates continued growth and innovation.
+
+Specifically, V1 aims to:
+
+- Provide a **simple, modular, and easy-to-hack codebase**.
+- Ensure **high performance** with near-zero CPU overhead.
+- **Combine key optimizations** into a unified architecture.
+- Require **zero configs** by enabling features/optimizations by default.
+
+We see significant performance improvements from upgrading to V1 core engine, in
+particular for long context scenarios. Please see performance benchmark (To be
+added).
+
+For more details, check out the vLLM V1 blog post [vLLM V1: A Major
+Upgrade to vLLM's Core Architecture](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html) (published Jan 27, 2025).
+
+This living user guide outlines a few known **important changes and limitations** introduced by vLLM V1. The team has been working actively to bring V1 as the default engine, therefore this guide will be updated constantly as more features get supported on vLLM V1.
+
+### Supports Overview
+#### Hardware
+
+| Hardware | Status |
+|----------|------------------------------------------|
+| **NVIDIA** | 🚀 Natively Supported |
+| **AMD** | 🚧 WIP |
+| **TPU** | 🚧 WIP |
+#### Feature / Model
+
+| Feature / Model | Status |
+|-----------------|-----------------------------------------------------------------------------------|
+| **Prefix Caching** | 🚀 Optimized |
+| **Chunked Prefill** | 🚀 Optimized |
+| **Logprobs Calculation** | 🟢 Functional |
+| **LoRA** | 🟢 Functional ([PR #13096](https://github.com/vllm-project/vllm/pull/13096))|
+| **Multimodal Models** | 🟢 Functional |
+| **Spec Decode** | 🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))|
+| **Prompt Logprobs with Prefix Caching** | 🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))|
+| **FP8 KV Cache** | 🟡 Planned |
+| **Structured Output Alternative Backends** | 🟡 Planned |
+| **Embedding Models** | 🟡 Planned ([RFC #12249](https://github.com/vllm-project/vllm/issues/12249)) |
+| **Mamba Models** | 🟡 Planned |
+| **Encoder-Decoder Models** | 🟡 Planned |
+| **Request-level Structured Output Backend** | 🔴 Deprecated |
+| **best_of** | 🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))|
+| **Per-Request Logits Processors** | 🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360)) |
+| **GPU <> CPU KV Cache Swapping** | 🔴 Deprecated |
+
+- **🚀 Optimized**: Nearly fully optimized, with no further work currently planned.
+- **🟢 Functional**: Fully operational, with ongoing optimizations.
+- **🚧 WIP**: Under active development.
+- **🟡 Planned**: Scheduled for future implementation (some may have open PRs/RFCs).
+- **🔴 Deprecated**: Not planned for v1 unless there is strong demand.
+
+**Note**: vLLM V1's unified scheduler treats both prompt and output tokens the same
+way by using a simple dictionary (e.g., {request_id: num_tokens}) to dynamically
+allocate a fixed token budget per request, enabling features like chunked prefills,
+prefix caching, and speculative decoding without a strict separation between prefill
+and decode phases.
+
+### Semantic Changes and Deprecated Features
+
+#### Logprobs
+
+vLLM V1 supports logprobs and prompt logprobs. However, there are some important semantic
+differences compared to V0:
+
+**Logprobs Calculation**
+
+Logprobs in V1 are now returned immediately once computed from the model's raw output (i.e.
+before applying any logits post-processing such as temperature scaling or penalty
+adjustments). As a result, the returned logprobs do not reflect the final adjusted
+probabilities used during sampling.
+
+Support for logprobs with post-sampling adjustments is in progress and will be added in future updates.
+
+**Prompt Logprobs with Prefix Caching**
+
+Currently prompt logprobs are only supported when prefix caching is turned off via `--no-enable-prefix-caching`. In a future release, prompt logprobs will be compatible with prefix caching, but a recomputation will be triggered to recover the full prompt logprobs even upon a prefix cache hit. See details in [RFC #13414](https://github.com/vllm-project/vllm/issues/13414).
+
+#### Deprecated Features
+
+As part of the major architectural rework in vLLM V1, several legacy features have been deprecated.
+
+**Sampling features**
+
+- **best_of**: This feature has been deprecated due to limited usage. See details at [RFC #13361](https://github.com/vllm-project/vllm/issues/13361).
+- **Per-Request Logits Processors**: In V0, users could pass custom
+ processing functions to adjust logits on a per-request basis. In vLLM V1, this
+ feature has been deprecated. Instead, the design is moving toward supporting **global logits
+ processors**, a feature the team is actively working on for future releases. See details at [RFC #13360](https://github.com/vllm-project/vllm/pull/13360).
+
+**KV Cache features**
+
+- **GPU <> CPU KV Cache Swapping**: with the new simplified core architecture, vLLM V1 no longer requires KV cache swapping
+to handle request preemptions.
+
+**Structured Output features**
+
+- **Request-level Structured Output Backend**: Deprecated; alternative backends
+  (outlines, guidance) with fallbacks are WIP.
+### Feature & Model Support in Progress
+
+Although we have re-implemented and partially optimized many features and models from V0 in vLLM V1, optimization work is still ongoing for some, and others remain unsupported.
+
+#### Features to Be Optimized
+
+These features are already supported in vLLM V1, but their optimization is still
+in progress.
+
+- **LoRA**: LoRA is functionally working on vLLM V1 but its performance is
+ inferior to that of V0. The team is actively working on improving its
+ performance
+(e.g., see [PR #13096](https://github.com/vllm-project/vllm/pull/13096)).
+
+- **Spec Decode**: Currently, only ngram-based spec decode is supported in V1. There
+ will be follow-up work to support other types of spec decode (e.g., see [PR #13933](https://github.com/vllm-project/vllm/pull/13933)). We will prioritize the support for Eagle, MTP compared to draft model based spec decode.
+
+#### Features to Be Supported
+
+- **FP8 KV Cache**: While vLLM V1 introduces new FP8 kernels for model weight quantization, support for an FP8 key-value cache is not yet available. Users must continue using FP16 (or other supported precisions) for the KV cache.
+
+- **Structured Output Alternative Backends**: Structured output alternative backends (outlines, guidance) support is planned. V1 currently
+ supports only the `xgrammar:no_fallback` mode, meaning that it will error out if the output schema is unsupported by xgrammar.
+ Details about the structured outputs can be found
+ [here](https://docs.vllm.ai/en/latest/features/structured_outputs.html).
+
+#### Models to Be Supported
+
+vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol,
+and the majority fall into the following categories. V1 support for these models will be added eventually.
+
+**Embedding Models**
+Instead of having a separate model runner, hidden states processor [RFC #12249](https://github.com/vllm-project/vllm/issues/12249), which is based on global logits processor [RFC #13360](https://github.com/vllm-project/vllm/pull/13360), has been proposed to enable simultaneous generation and embedding using the same engine instance in V1. It is still in the planning stage.
+
+**Mamba Models**
+Models using selective state-space mechanisms (instead of standard transformer attention)
+are not yet supported (e.g., `MambaForCausalLM`, `JambaForCausalLM`).
+
+**Encoder-Decoder Models**
+vLLM V1 is currently optimized for decoder-only transformers. Models requiring
+ cross-attention between separate encoder and decoder are not yet supported (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`).
+
+For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html).
+
+## FAQ
+
+TODO
diff --git a/docs/source/index.md b/docs/source/index.md
index 52c4622d3e5a3..1624d5cf5aae7 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -67,6 +67,8 @@ getting_started/quickstart
getting_started/examples/examples_index
getting_started/troubleshooting
getting_started/faq
+getting_started/v1_user_guide
+
:::
% What does vLLM support?
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index bcbd7bf9600c5..5e5e7287f39eb 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -101,7 +101,7 @@ class MyAttention(nn.Module):
def forward(self, hidden_states, **kwargs): # <- kwargs are required
...
- attention_interface = attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
query_states,
@@ -477,6 +477,11 @@ See [this page](#generative-models) for more information on how to use generativ
* `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
* ✅︎
* ✅︎
+- * `Zamba2ForCausalLM`
+ * Zamba2
+ * `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc.
+ *
+ *
:::
:::{note}
@@ -879,7 +884,7 @@ See [this page](#generative-models) for more information on how to use generativ
- * `PixtralForConditionalGeneration`
* Pixtral
* T + I+
- * `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b`, etc.
+ * `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc.
*
* ✅︎
* ✅︎
@@ -946,7 +951,7 @@ V0 correctly implements the model's attention pattern:
V1 currently uses a simplified attention pattern:
- Uses causal attention for all tokens, including image tokens
-- Generates reasonable outputs but does not match the original model's attention for text + image inputs
+- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": True}`
- Will be updated in the future to support the correct behavior
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md
index e6be644b73932..591acc2c9b753 100644
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@@ -20,7 +20,7 @@ There is one edge case: if the model fits in a single node with multiple GPUs, b
## Running vLLM on a single node
-vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.
+vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or Python native multiprocessing. Multiprocessing can be used when deploying on a single node, while multi-node inference currently requires Ray.
Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured `tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the `LLM` class `distributed_executor_backend` argument or `--distributed-executor-backend` API server argument. Set it to `mp` for multiprocessing or `ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
@@ -29,7 +29,7 @@ To run multi-GPU inference with the `LLM` class, set the `tensor_parallel_size`
```python
from vllm import LLM
llm = LLM("facebook/opt-13b", tensor_parallel_size=4)
-output = llm.generate("San Franciso is a")
+output = llm.generate("San Francisco is a")
```
To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
@@ -83,7 +83,7 @@ Since this is a ray cluster of **containers**, all the following commands should
Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` and `ray list nodes` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.
-After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
+After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as if you had all the GPUs on one node: vLLM will be able to leverage the GPU resources of all nodes in the Ray cluster, so you only need to run the `vllm` command on this node rather than on the other nodes. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
```console
vllm serve /path/to/the/model/in/the/container \
diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md
index 1d55f201503ce..647ece3f85f06 100644
--- a/docs/source/serving/metrics.md
+++ b/docs/source/serving/metrics.md
@@ -39,7 +39,16 @@ The following metrics are exposed:
The following metrics are deprecated and due to be removed in a future version:
-- *(No metrics are currently deprecated)*
+- `vllm:num_requests_swapped`, `vllm:cpu_cache_usage_perc`, and
+ `vllm:cpu_prefix_cache_hit_rate` because KV cache offloading is not
+ used in V1.
+- `vllm:gpu_prefix_cache_hit_rate` is replaced by queries+hits
+ counters in V1.
+- `vllm:time_in_queue_requests` because it duplicates
+ `vllm:request_queue_time_seconds`.
+- `vllm:model_forward_time_milliseconds` and
+ `vllm:model_execute_time_milliseconds` because
+ prefill/decode/inference time metrics should be used instead.
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
diff --git a/docs/source/training/rlhf.md b/docs/source/training/rlhf.md
index 00822aefe11e6..72e89c0c7478c 100644
--- a/docs/source/training/rlhf.md
+++ b/docs/source/training/rlhf.md
@@ -1,6 +1,6 @@
# Reinforcement Learning from Human Feedback
-Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviours.
+Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors.
vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl).
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index 293b9fddac89e..840892ea07010 100644
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -7,11 +7,13 @@ For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
import os
+from dataclasses import asdict
+from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser
@@ -23,21 +25,31 @@ question_per_audio_count = {
2: "What sport and what nursery rhyme are referenced?"
}
+
+class ModelRequestData(NamedTuple):
+ engine_args: EngineArgs
+ prompt: str
+ stop_token_ids: Optional[list[int]] = None
+ lora_requests: Optional[list[LoRARequest]] = None
+
+
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
# MiniCPM-O
-def run_minicpmo(question: str, audio_count: int):
+def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
model_name = "openbmb/MiniCPM-o-2_6"
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
- llm = LLM(model=model_name,
- trust_remote_code=True,
- max_model_len=4096,
- max_num_seqs=5,
- limit_mm_per_prompt={"audio": audio_count})
+ engine_args = EngineArgs(
+ model=model_name,
+ trust_remote_code=True,
+ max_model_len=4096,
+ max_num_seqs=5,
+ limit_mm_per_prompt={"audio": audio_count},
+ )
stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
@@ -52,11 +64,16 @@ def run_minicpmo(question: str, audio_count: int):
tokenize=False,
add_generation_prompt=True,
chat_template=audio_chat_template)
- return llm, prompt, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompt,
+ stop_token_ids=stop_token_ids,
+ )
# Phi-4-multimodal-instruct
-def run_phi4mm(questions: str, audio_count: int):
+def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
"""
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process audio inputs.
@@ -67,36 +84,35 @@ def run_phi4mm(questions: str, audio_count: int):
speech_lora_path = os.path.join(model_path, "speech-lora")
placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)])
- prompts = f"<|user|>{placeholders}{questions}<|end|><|assistant|>"
+ prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
- llm = LLM(
+ engine_args = EngineArgs(
model=model_path,
trust_remote_code=True,
max_model_len=4096,
max_num_seqs=2,
enable_lora=True,
max_lora_rank=320,
- lora_extra_vocab_size=0,
limit_mm_per_prompt={"audio": audio_count},
)
- lora_request = LoRARequest("speech", 1, speech_lora_path)
- # To maintain code compatibility in this script, we add LoRA here.
- llm.llm_engine.add_lora(lora_request=lora_request)
- # You can also add LoRA using:
- # llm.generate(prompts, lora_request=lora_request,...)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompts,
+ lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
+ )
# Qwen2-Audio
-def run_qwen2_audio(question: str, audio_count: int):
+def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
model_name = "Qwen/Qwen2-Audio-7B-Instruct"
- llm = LLM(model=model_name,
- max_model_len=4096,
- max_num_seqs=5,
- limit_mm_per_prompt={"audio": audio_count})
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=4096,
+ max_num_seqs=5,
+ limit_mm_per_prompt={"audio": audio_count},
+ )
audio_in_prompt = "".join([
f"Audio {idx+1}: "
@@ -107,12 +123,15 @@ def run_qwen2_audio(question: str, audio_count: int):
"<|im_start|>user\n"
f"{audio_in_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n")
- stop_token_ids = None
- return llm, prompt, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompt,
+ )
# Ultravox 0.5-1B
-def run_ultravox(question: str, audio_count: int):
+def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -124,29 +143,39 @@ def run_ultravox(question: str, audio_count: int):
tokenize=False,
add_generation_prompt=True)
- llm = LLM(model=model_name,
- max_model_len=4096,
- max_num_seqs=5,
- trust_remote_code=True,
- limit_mm_per_prompt={"audio": audio_count})
- stop_token_ids = None
- return llm, prompt, stop_token_ids
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=4096,
+ max_num_seqs=5,
+ trust_remote_code=True,
+ limit_mm_per_prompt={"audio": audio_count},
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompt,
+ )
# Whisper
-def run_whisper(question: str, audio_count: int):
+def run_whisper(question: str, audio_count: int) -> ModelRequestData:
assert audio_count == 1, (
"Whisper only support single audio input per prompt")
model_name = "openai/whisper-large-v3-turbo"
prompt = "<|startoftranscript|>"
- llm = LLM(model=model_name,
- max_model_len=448,
- max_num_seqs=5,
- limit_mm_per_prompt={"audio": audio_count})
- stop_token_ids = None
- return llm, prompt, stop_token_ids
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=448,
+ max_num_seqs=5,
+ limit_mm_per_prompt={"audio": audio_count},
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompt,
+ )
model_example_map = {
@@ -164,14 +193,24 @@ def main(args):
raise ValueError(f"Model type {model} is not supported.")
audio_count = args.num_audios
- llm, prompt, stop_token_ids = model_example_map[model](
- question_per_audio_count[audio_count], audio_count)
+ req_data = model_example_map[model](question_per_audio_count[audio_count],
+ audio_count)
+
+ engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+ llm = LLM(**engine_args)
+
+ # To maintain code compatibility in this script, we add LoRA here.
+ # You can also add LoRA using:
+ # llm.generate(prompts, lora_request=lora_request,...)
+ if req_data.lora_requests:
+ for lora_request in req_data.lora_requests:
+ llm.llm_engine.add_lora(lora_request=lora_request)
# We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference.
sampling_params = SamplingParams(temperature=0.2,
max_tokens=64,
- stop_token_ids=stop_token_ids)
+ stop_token_ids=req_data.stop_token_ids)
mm_data = {}
if audio_count > 0:
@@ -183,7 +222,7 @@ def main(args):
}
assert args.num_prompts > 0
- inputs = {"prompt": prompt, "multi_modal_data": mm_data}
+ inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
if args.num_prompts > 1:
# Batch inference
inputs = [inputs] * args.num_prompts
@@ -214,6 +253,10 @@ if __name__ == "__main__":
default=1,
choices=[0, 1, 2],
help="Number of audio items per prompt.")
+ parser.add_argument("--seed",
+ type=int,
+ default=None,
+ help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args()
main(args)
diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
index b00519314d8bd..b73770ce382cf 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -76,5 +76,10 @@ if __name__ == "__main__":
GPUs_per_dp_rank))
proc.start()
procs.append(proc)
+ exit_code = 0
for proc in procs:
proc.join()
+ if proc.exitcode:
+ exit_code = proc.exitcode
+
+ exit(exit_code)
diff --git a/examples/offline_inference/eagle.py b/examples/offline_inference/eagle.py
new file mode 100644
index 0000000000000..baa91b2d0364d
--- /dev/null
+++ b/examples/offline_inference/eagle.py
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+import json
+import os
+
+from transformers import AutoTokenizer
+
+from vllm import LLM, SamplingParams
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+ "--dataset",
+ type=str,
+ default="./examples/data/gsm8k.jsonl",
+ help="downloaded from the eagle repo " \
+ "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/"
+)
+parser.add_argument("--max_num_seqs", type=int, default=8)
+parser.add_argument("--num_prompts", type=int, default=80)
+parser.add_argument("--num_spec_tokens", type=int, default=2)
+parser.add_argument("--tp", type=int, default=1)
+parser.add_argument("--draft_tp", type=int, default=1)
+parser.add_argument("--enforce_eager", action='store_true')
+parser.add_argument("--enable_chunked_prefill", action='store_true')
+parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
+parser.add_argument("--temp", type=float, default=0)
+
+args = parser.parse_args()
+
+print(args)
+
+model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
+eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"
+
+max_model_len = 2048
+
+tokenizer = AutoTokenizer.from_pretrained(model_dir)
+
+if os.path.exists(args.dataset):
+ prompts = []
+ num_prompts = args.num_prompts
+ with open(args.dataset) as f:
+ for line in f:
+ data = json.loads(line)
+ prompts.append(data["turns"][0])
+else:
+ prompts = ["The future of AI is", "The president of the United States is"]
+
+prompts = prompts[:args.num_prompts]
+num_prompts = len(prompts)
+
+prompt_ids = [
+ tokenizer.apply_chat_template([{
+ "role": "user",
+ "content": prompt
+ }],
+ add_generation_prompt=True)
+ for prompt in prompts
+]
+
+llm = LLM(
+ model=model_dir,
+ trust_remote_code=True,
+ tensor_parallel_size=args.tp,
+ enable_chunked_prefill=args.enable_chunked_prefill,
+ max_num_batched_tokens=args.max_num_batched_tokens,
+ enforce_eager=args.enforce_eager,
+ max_model_len=max_model_len,
+ max_num_seqs=args.max_num_seqs,
+ gpu_memory_utilization=0.8,
+ speculative_model=eagle_dir,
+ num_speculative_tokens=args.num_spec_tokens,
+ speculative_draft_tensor_parallel_size=args.draft_tp,
+ speculative_max_model_len=max_model_len,
+ disable_log_stats=False,
+)
+
+sampling_params = SamplingParams(temperature=args.temp, max_tokens=256)
+
+outputs = llm.generate(prompt_token_ids=prompt_ids,
+ sampling_params=sampling_params)
+
+# calculate the average number of accepted tokens per forward pass, +1 is
+# to account for the token from the target model that's always going to be
+# accepted
+acceptance_counts = [0] * (args.num_spec_tokens + 1)
+for output in outputs:
+ for step, count in enumerate(output.metrics.spec_token_acceptance_counts):
+ acceptance_counts[step] += count
+
+print(f"mean acceptance length: \
+ {sum(acceptance_counts) / acceptance_counts[0]:.2f}")
diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py
index f44bc423658ec..6d0c3ac1ee09a 100644
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@@ -4,16 +4,23 @@ This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation.
"""
import time
+from collections.abc import Sequence
+from dataclasses import asdict
+from typing import NamedTuple
-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, PromptType, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.utils import FlexibleArgumentParser
+class ModelRequestData(NamedTuple):
+ engine_args: EngineArgs
+ prompts: Sequence[PromptType]
+
+
def run_florence2():
- # Create a Florence-2 encoder/decoder model instance
- llm = LLM(
+ engine_args = EngineArgs(
model="microsoft/Florence-2-large",
tokenizer="facebook/bart-large",
max_num_seqs=8,
@@ -39,12 +46,15 @@ def run_florence2():
"decoder_prompt": "",
},
]
- return llm, prompts
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
def run_mllama():
- # Create a Mllama encoder/decoder model instance
- llm = LLM(
+ engine_args = EngineArgs(
model="meta-llama/Llama-3.2-11B-Vision-Instruct",
max_model_len=4096,
max_num_seqs=2,
@@ -69,12 +79,15 @@ def run_mllama():
"decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501
},
]
- return llm, prompts
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
def run_whisper():
- # Create a Whisper encoder/decoder model instance
- llm = LLM(
+ engine_args = EngineArgs(
model="openai/whisper-large-v3-turbo",
max_model_len=448,
max_num_seqs=16,
@@ -99,7 +112,11 @@ def run_whisper():
"decoder_prompt": "<|startoftranscript|>",
}
]
- return llm, prompts
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
model_example_map = {
@@ -114,7 +131,12 @@ def main(args):
if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.")
- llm, prompts = model_example_map[model]()
+ req_data = model_example_map[model]()
+
+ engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+ llm = LLM(**engine_args)
+
+ prompts = req_data.prompts
# Create a sampling params object.
sampling_params = SamplingParams(
@@ -153,6 +175,10 @@ if __name__ == "__main__":
default="mllama",
choices=model_example_map.keys(),
help='Huggingface "model_type".')
+ parser.add_argument("--seed",
+ type=int,
+ default=None,
+ help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args()
main(args)
diff --git a/examples/offline_inference/pixtral.py b/examples/offline_inference/mistral-small.py
similarity index 76%
rename from examples/offline_inference/pixtral.py
rename to examples/offline_inference/mistral-small.py
index 760de114508cd..43be2aa80773f 100644
--- a/examples/offline_inference/pixtral.py
+++ b/examples/offline_inference/mistral-small.py
@@ -6,14 +6,16 @@ import argparse
from vllm import LLM
from vllm.sampling_params import SamplingParams
-# This script is an offline demo for running Pixtral.
+# This script is an offline demo for running Mistral-Small-3.1
#
# If you want to run a server/client setup, please follow this code:
#
# - Server:
#
# ```bash
-# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
+# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
+# --tokenizer-mode mistral --config-format mistral --load-format mistral \
+# --limit-mm-per-prompt 'image=4' --max-model-len 16384
# ```
#
# - Client:
@@ -23,7 +25,7 @@ from vllm.sampling_params import SamplingParams
# --header 'Content-Type: application/json' \
# --header 'Authorization: Bearer token' \
# --data '{
-# "model": "mistralai/Pixtral-12B-2409",
+# "model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
# "messages": [
# {
# "role": "user",
@@ -43,12 +45,20 @@ from vllm.sampling_params import SamplingParams
# python demo.py advanced
-def run_simple_demo():
- model_name = "mistralai/Pixtral-12B-2409"
+def run_simple_demo(args: argparse.Namespace):
+ model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
sampling_params = SamplingParams(max_tokens=8192)
- # Lower max_num_seqs or max_model_len on low-VRAM GPUs.
- llm = LLM(model=model_name, tokenizer_mode="mistral")
+ # Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
+ llm = LLM(
+ model=model_name,
+ tokenizer_mode="mistral",
+ config_format="mistral",
+ load_format="mistral",
+ max_model_len=4096,
+ max_num_seqs=2,
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
prompt = "Describe this image in one sentence."
image_url = "https://picsum.photos/id/237/200/300"
@@ -76,8 +86,8 @@ def run_simple_demo():
print(outputs[0].outputs[0].text)
-def run_advanced_demo():
- model_name = "mistralai/Pixtral-12B-2409"
+def run_advanced_demo(args: argparse.Namespace):
+ model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
max_img_per_msg = 5
max_tokens_per_img = 4096
@@ -85,8 +95,11 @@ def run_advanced_demo():
llm = LLM(
model=model_name,
tokenizer_mode="mistral",
+ config_format="mistral",
+ load_format="mistral",
limit_mm_per_prompt={"image": max_img_per_msg},
max_model_len=max_img_per_msg * max_tokens_per_img,
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
prompt = "Describe the following image."
@@ -153,14 +166,19 @@ def main():
help="Specify the demo mode: 'simple' or 'advanced'",
)
+ parser.add_argument(
+ '--disable-mm-preprocessor-cache',
+ action='store_true',
+ help='If True, disables caching of multi-modal preprocessor/mapper.')
+
args = parser.parse_args()
if args.mode == "simple":
print("Running simple demo...")
- run_simple_demo()
+ run_simple_demo(args)
elif args.mode == "advanced":
print("Running advanced demo...")
- run_advanced_demo()
+ run_advanced_demo(args)
if __name__ == "__main__":
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 432cda5e24396..1cc2562759d47 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -8,126 +8,167 @@ on HuggingFace model repository.
"""
import os
import random
+from dataclasses import asdict
+from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser
+
+class ModelRequestData(NamedTuple):
+ engine_args: EngineArgs
+ prompts: list[str]
+ stop_token_ids: Optional[list[int]] = None
+ lora_requests: Optional[list[LoRARequest]] = None
+
+
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
# Aria
-def run_aria(questions: list[str], modality: str):
+def run_aria(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "rhymes-ai/Aria"
# NOTE: Need L40 (or equivalent) to avoid OOM
- llm = LLM(model=model_name,
- max_model_len=4096,
- max_num_seqs=2,
- dtype="bfloat16",
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=4096,
+ max_num_seqs=2,
+ dtype="bfloat16",
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
prompts = [(f"<|im_start|>user\n<|img|>{question}"
"<|im_end|>\n<|im_start|>assistant\n")
for question in questions]
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ stop_token_ids=stop_token_ids,
+ )
# BLIP-2
-def run_blip2(questions: list[str], modality: str):
+def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
# BLIP-2 prompt format is inaccurate on HuggingFace model repository.
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
prompts = [f"Question: {question} Answer:" for question in questions]
- llm = LLM(model="Salesforce/blip2-opt-2.7b",
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ engine_args = EngineArgs(
+ model="Salesforce/blip2-opt-2.7b",
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# Chameleon
-def run_chameleon(questions: list[str], modality: str):
+def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
prompts = [f"{question}" for question in questions]
- llm = LLM(model="facebook/chameleon-7b",
- max_model_len=4096,
- max_num_seqs=2,
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ engine_args = EngineArgs(
+ model="facebook/chameleon-7b",
+ max_model_len=4096,
+ max_num_seqs=2,
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# Deepseek-VL2
-def run_deepseek_vl2(questions: list[str], modality: str):
+def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "deepseek-ai/deepseek-vl2-tiny"
- llm = LLM(model=model_name,
- max_model_len=4096,
- max_num_seqs=2,
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
- hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=4096,
+ max_num_seqs=2,
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
+ )
prompts = [
f"<|User|>: \n{question}\n\n<|Assistant|>:"
for question in questions
]
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# Florence2
-def run_florence2(question: str, modality: str):
+def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
- llm = LLM(model="microsoft/Florence-2-large",
- tokenizer="facebook/bart-large",
- max_num_seqs=8,
- trust_remote_code=True,
- dtype="bfloat16",
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+ engine_args = EngineArgs(
+ model="microsoft/Florence-2-large",
+ tokenizer="facebook/bart-large",
+ max_num_seqs=8,
+ trust_remote_code=True,
+ dtype="bfloat16",
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
- prompt = ""
- stop_token_ids = None
- return llm, prompt, stop_token_ids
+ prompts = ["" for _ in questions]
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# Fuyu
-def run_fuyu(questions: list[str], modality: str):
+def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
prompts = [f"{question}\n" for question in questions]
- llm = LLM(model="adept/fuyu-8b",
- max_model_len=2048,
- max_num_seqs=2,
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ engine_args = EngineArgs(
+ model="adept/fuyu-8b",
+ max_model_len=2048,
+ max_num_seqs=2,
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# Gemma 3
-def run_gemma3(questions: list[str], modality: str):
+def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "google/gemma-3-4b-it"
- llm = LLM(
+ engine_args = EngineArgs(
model=model_name,
max_model_len=2048,
max_num_seqs=2,
- # Default is False; setting it to True is not supported in V1 yet
mm_processor_kwargs={"do_pan_and_scan": True},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
@@ -135,22 +176,27 @@ def run_gemma3(questions: list[str], modality: str):
prompts = [("user\n"
f"{question}\n"
"model\n") for question in questions]
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# GLM-4v
-def run_glm4v(questions: list[str], modality: str):
+def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "THUDM/glm-4v-9b"
- llm = LLM(model=model_name,
- max_model_len=2048,
- max_num_seqs=2,
- trust_remote_code=True,
- enforce_eager=True,
- hf_overrides={"architectures": ["GLM4VForCausalLM"]},
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=2048,
+ max_num_seqs=2,
+ trust_remote_code=True,
+ enforce_eager=True,
+ hf_overrides={"architectures": ["GLM4VForCausalLM"]},
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
prompts = [
f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
@@ -158,16 +204,21 @@ def run_glm4v(questions: list[str], modality: str):
]
stop_token_ids = [151329, 151336, 151338]
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ stop_token_ids=stop_token_ids,
+ )
# H2OVL-Mississippi
-def run_h2ovl(questions: list[str], modality: str):
+def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "h2oai/h2ovl-mississippi-800m"
- llm = LLM(
+ engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
@@ -187,15 +238,20 @@ def run_h2ovl(questions: list[str], modality: str):
# Stop tokens for H2OVL-Mississippi
# https://huggingface.co/h2oai/h2ovl-mississippi-800m
stop_token_ids = [tokenizer.eos_token_id]
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ stop_token_ids=stop_token_ids,
+ )
# Idefics3-8B-Llama3
-def run_idefics3(questions: list[str], modality: str):
+def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
- llm = LLM(
+ engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=2,
@@ -212,17 +268,20 @@ def run_idefics3(questions: list[str], modality: str):
prompts = [(
f"<|begin_of_text|>User:{question}\nAssistant:"
) for question in questions]
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# InternVL
-def run_internvl(questions: list[str], modality: str):
+def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "OpenGVLab/InternVL2-2B"
- llm = LLM(
+ engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
@@ -245,53 +304,75 @@ def run_internvl(questions: list[str], modality: str):
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ stop_token_ids=stop_token_ids,
+ )
# LLaVA-1.5
-def run_llava(questions: list[str], modality: str):
+def run_llava(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
prompts = [
f"USER: \n{question}\nASSISTANT:" for question in questions
]
- llm = LLM(model="llava-hf/llava-1.5-7b-hf",
- max_model_len=4096,
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ engine_args = EngineArgs(
+ model="llava-hf/llava-1.5-7b-hf",
+ max_model_len=4096,
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# LLaVA-1.6/LLaVA-NeXT
-def run_llava_next(questions: list[str], modality: str):
+def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
prompts = [f"[INST] \n{question} [/INST]" for question in questions]
- llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
- max_model_len=8192,
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ engine_args = EngineArgs(
+ model="llava-hf/llava-v1.6-mistral-7b-hf",
+ max_model_len=8192,
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# LlaVA-NeXT-Video
# Currently only support for video input
-def run_llava_next_video(questions: list[str], modality: str):
+def run_llava_next_video(questions: list[str],
+ modality: str) -> ModelRequestData:
assert modality == "video"
prompts = [
f"USER: