diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index b39f9899a8f28..e6f5c8b60f459 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -141,7 +141,7 @@ When run, benchmark script generates results under `benchmark/results` folder, a
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
-Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output lenght, max concurrency and qps.
+Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index f96c38bf57db7..86aae426c258c 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -7,7 +7,7 @@ steps:
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@@ -62,23 +62,49 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- - block: "Build release image"
+ - block: "Build release image (x86)"
depends_on: ~
key: block-release-image-build
- - label: "Build release image"
+ - label: "Build release image (x86)"
depends_on: block-release-image-build
- id: build-release-image
+ id: build-release-image-x86
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+ # re-tag to default image tag and push, just in case arm64 build fails
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+ - label: "Build release image (arm64)"
+ depends_on: block-release-image-build
+ id: build-release-image-arm64
+ agents:
+ queue: arm64_cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+
+ # Add job to create multi-arch manifest
+ - label: "Create multi-arch manifest"
+ depends_on:
+ - build-release-image-x86
+ - build-release-image-arm64
+ id: create-multi-arch-manifest
+ agents:
+ queue: cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
+ - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
- label: "Annotate release workflow"
depends_on:
- - build-release-image
+ - create-multi-arch-manifest
- build-wheel-cuda-12-8
- build-wheel-cuda-12-6
- build-wheel-cuda-11-8
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
index b571618f48c2b..1073a4ee30afa 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
- && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
+ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index d55a786e41e8b..505664f3aecd0 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
- && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
+ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 445cd2735c190..73f3e63fbf5f6 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -31,6 +31,7 @@ docker run \
set -e
echo $ZE_AFFINITY_MASK
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+ VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
cd tests
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 20f3ce1adb46d..0d3b7a294d963 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -390,6 +390,7 @@ steps:
- csrc/moe/
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
+ - vllm/distributed/device_communicators/
commands:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
@@ -654,6 +655,7 @@ steps:
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index ce9590f02ce71..c087fd555c661 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -79,4 +79,10 @@ mkdocs.yaml @hmellor
/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
/vllm/attention/ops/triton_unified_attention.py @tdoublep
+# ROCm related: specify owner with write access to notify AMD folks for careful code review
+/docker/Dockerfile.rocm* @gshtras
+/vllm/v1/attention/backends/rocm*.py @gshtras
+/vllm/v1/attention/backends/mla/rocm*.py @gshtras
+/vllm/attention/ops/rocm*.py @gshtras
+/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 1b30c1292df85..8043df65d5585 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -7,8 +7,6 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT
## Test Result
-## (Optional) Documentation Update
-
---
Essential Elements of an Effective PR Description Checklist
@@ -17,6 +15,7 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
+- [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0).
**BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions)
diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml
new file mode 100644
index 0000000000000..6401d6586cc3d
--- /dev/null
+++ b/.github/workflows/issue_autolabel.yml
@@ -0,0 +1,305 @@
+name: Label issues based on keywords
+on:
+ issues:
+ types: [opened, edited, reopened]
+permissions:
+ issues: write # needed so the workflow can add labels
+ contents: read
+concurrency:
+ group: issue-labeler-${{ github.event.issue.number }}
+ cancel-in-progress: true
+jobs:
+ add-labels:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Label issues based on keywords
+ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ with:
+ script: |
+ // Configuration: Add new labels and keywords here
+ const labelConfig = {
+ rocm: {
+ // Keyword search - matches whole words only (with word boundaries)
+ keywords: [
+ {
+ term: "composable kernel",
+ searchIn: "both"
+ },
+ {
+ term: "rccl",
+ searchIn: "body" // only search in body
+ },
+ {
+ term: "migraphx",
+ searchIn: "title" // only search in title
+ },
+ {
+ term: "hipgraph",
+ searchIn: "both"
+ },
+ {
+ term: "ROCm System Management Interface",
+ searchIn: "body"
+ },
+ ],
+
+ // Substring search - matches anywhere in text (partial matches)
+ substrings: [
+ {
+ term: "VLLM_ROCM_",
+ searchIn: "both"
+ },
+ {
+ term: "rocm",
+ searchIn: "title"
+ },
+ {
+ term: "amd",
+ searchIn: "title"
+ },
+ {
+ term: "hip-",
+ searchIn: "both"
+ },
+ {
+ term: "gfx",
+ searchIn: "both"
+ },
+ {
+ term: "cdna",
+ searchIn: "both"
+ },
+ {
+ term: "rdna",
+ searchIn: "both"
+ },
+ {
+ term: "torch_hip",
+ searchIn: "body" // only in body
+ },
+ {
+ term: "_hip",
+ searchIn: "both"
+ },
+ {
+ term: "hip_",
+ searchIn: "both"
+ },
+
+ // ROCm tools and libraries
+ {
+ term: "hipify",
+ searchIn: "both"
+ },
+ ],
+
+ // Regex patterns - for complex pattern matching
+ regexPatterns: [
+ {
+ pattern: "\\bmi\\d{3}[a-z]*\\b",
+ description: "AMD GPU names (mi + 3 digits + optional letters)",
+ flags: "gi",
+ searchIn: "both" // "title", "body", or "both"
+ }
+ ],
+ },
+ };
+
+ // Helper function to create regex based on search type
+ function createSearchRegex(term, type) {
+ // Escape special regex characters in the term
+ const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+
+ switch (type) {
+ case 'keyword':
+ // Word boundary search - matches whole words only
+ return new RegExp(`\\b${escapedTerm}\\b`, "gi");
+ case 'substring':
+ // Substring search - matches anywhere in the text
+ return new RegExp(escapedTerm, "gi");
+ default:
+ throw new Error(`Unknown search type: ${type}`);
+ }
+ }
+
+ // Helper function to find matching terms in text with line information
+ function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
+ const matches = [];
+ const lines = text.split('\n');
+
+ for (const termConfig of searchTerms) {
+ let regex;
+ let term, searchIn, pattern, description, flags;
+
+ // Handle different input formats (string or object)
+ if (typeof termConfig === 'string') {
+ term = termConfig;
+ searchIn = 'both'; // default
+ } else {
+ term = termConfig.term;
+ searchIn = termConfig.searchIn || 'both';
+ pattern = termConfig.pattern;
+ description = termConfig.description;
+ flags = termConfig.flags;
+ }
+
+ // Skip if this term shouldn't be searched in the current location
+ if (searchIn !== 'both' && searchIn !== searchLocation) {
+ continue;
+ }
+
+ // Create appropriate regex
+ if (searchType === 'regex') {
+ regex = new RegExp(pattern, flags || "gi");
+ } else {
+ regex = createSearchRegex(term, searchType);
+ }
+
+ const termMatches = [];
+
+ // Check each line for matches
+ lines.forEach((line, lineIndex) => {
+ const lineMatches = line.match(regex);
+ if (lineMatches) {
+ lineMatches.forEach(match => {
+ termMatches.push({
+ match: match,
+ lineNumber: lineIndex + 1,
+ lineContent: line.trim(),
+ searchType: searchType,
+ searchLocation: searchLocation,
+ originalTerm: term || pattern,
+ description: description,
+ // Show context around the match in the line
+ context: line.length > 100 ?
+ line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
+ line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
+ : line.trim()
+ });
+ });
+ }
+ });
+
+ if (termMatches.length > 0) {
+ matches.push({
+ term: term || (description || pattern),
+ searchType: searchType,
+ searchLocation: searchLocation,
+ searchIn: searchIn,
+ pattern: pattern,
+ matches: termMatches,
+ count: termMatches.length
+ });
+ }
+ }
+
+ return matches;
+ }
+
+ // Helper function to check if label should be added
+ async function processLabel(labelName, config) {
+ const body = context.payload.issue.body || "";
+ const title = context.payload.issue.title || "";
+
+ core.notice(`Processing label: ${labelName}`);
+ core.notice(`Issue Title: "${title}"`);
+ core.notice(`Issue Body length: ${body.length} characters`);
+
+ let shouldAddLabel = false;
+ let allMatches = [];
+ let reason = '';
+
+ const keywords = config.keywords || [];
+ const substrings = config.substrings || [];
+ const regexPatterns = config.regexPatterns || [];
+
+ core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
+
+ // Search in title
+ if (title.trim()) {
+ core.notice(`Searching in title: "${title}"`);
+
+ const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
+ const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
+ const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
+
+ allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
+ }
+
+ // Search in body
+ if (body.trim()) {
+ core.notice(`Searching in body (${body.length} characters)`);
+
+ const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
+ const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
+ const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
+
+ allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
+ }
+
+ if (allMatches.length > 0) {
+ core.notice(`Found ${allMatches.length} matching term(s):`);
+
+ for (const termMatch of allMatches) {
+ const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
+ const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
+
+ if (termMatch.searchType === 'regex') {
+ core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
+ } else {
+ core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
+ }
+
+ // Show details for each match
+ termMatch.matches.forEach((match, index) => {
+ core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
+ if (match.description) {
+ core.notice(` Description: ${match.description}`);
+ }
+ core.notice(` Context: ${match.context}`);
+ if (match.lineContent !== match.context) {
+ core.notice(` Full line: ${match.lineContent}`);
+ }
+ });
+ }
+
+ shouldAddLabel = true;
+ const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
+ const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
+ const bodyMatches = allMatches.filter(t => t.searchLocation === 'body').reduce((sum, t) => sum + t.count, 0);
+ const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
+ const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
+ const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
+
+ reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
+ }
+
+ core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
+ core.notice(`Reason: ${reason || 'No matching terms found'}`);
+
+ if (shouldAddLabel) {
+ const existingLabels = context.payload.issue.labels.map(l => l.name);
+ if (!existingLabels.includes(labelName)) {
+ await github.rest.issues.addLabels({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ labels: [labelName],
+ });
+ core.notice(`Label "${labelName}" added. ${reason}`);
+ return true;
+ }
+ core.notice(`Label "${labelName}" already present.`);
+ return false;
+ }
+
+ core.notice(`No matching terms found for label "${labelName}".`);
+ return false;
+ }
+
+ // Process all configured labels
+ const processLabels = Object.entries(labelConfig)
+ .map(([labelName, config]) => processLabel(labelName, config));
+ const labelsAdded = await Promise.all(processLabels);
+ const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
+ core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 612b290e88d46..c16bdeeecd07a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,7 +21,7 @@ repos:
- id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos
- rev: v1.34.0
+ rev: v1.35.5
hooks:
- id: typos
- repo: https://github.com/PyCQA/isort
diff --git a/CMakeLists.txt b/CMakeLists.txt
index aca42c3fe5553..b0eb0f32e03a5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,7 +30,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12", "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
@@ -817,7 +817,9 @@ set(VLLM_MOE_EXT_SRC
"csrc/moe/topk_softmax_kernels.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA")
- list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
+ list(APPEND VLLM_MOE_EXT_SRC
+ "csrc/moe/moe_wna16.cu"
+ "csrc/moe/grouped_topk_kernels.cu")
endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
diff --git a/README.md b/README.md
index ef5b43588953c..8812aac4ea266 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@ Easy, fast, and cheap LLM serving for everyone
*Latest News* 🔥
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
+- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
diff --git a/SECURITY.md b/SECURITY.md
index 414669fb3712e..d6319cdb1ac27 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -42,4 +42,9 @@ For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we ma
* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis.
+* Organizations and vendors who either ship or use vLLM are eligible to join the prenotification group if they meet at least one of the following qualifications:
+ * Substantial internal deployment leveraging the upstream vLLM project.
+ * Established internal security teams and comprehensive compliance measures.
+ * Active and consistent contributions to the upstream vLLM project.
+
* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included.
diff --git a/benchmarks/README.md b/benchmarks/README.md
index a2dd5bb58325c..38072152b653b 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -749,7 +749,7 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
Benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `random-output-len`.
-Ex.1: Fixed number of items and a single image resolutionm, enforcing generation of approx 40 tokens:
+Ex.1: Fixed number of items and a single image resolution, enforcing generation of approx 40 tokens:
```bash
vllm bench serve \
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index c7f290e1eb88e..6b24b8c8f3c67 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -96,7 +96,6 @@ def run_vllm(
end = time.perf_counter()
else:
assert lora_requests is None, "BeamSearch API does not support LoRA"
- prompts = [request.prompt for request in requests]
# output_len should be the same for all requests.
output_len = requests[0].expected_output_len
for request in requests:
diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py
new file mode 100644
index 0000000000000..883f0cf7e55f1
--- /dev/null
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@@ -0,0 +1,113 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+ w8a8_block_fp8_matmul,
+)
+from vllm.platforms import current_platform
+from vllm.triton_utils import triton as vllm_triton
+
+assert current_platform.is_cuda(), (
+ "Only support benchmarking w8a8 block fp8 kernel on CUDA device."
+)
+
+# DeepSeek-V3 weight shapes
+DEEPSEEK_V3_SHAPES = [
+ (512 + 64, 7168),
+ ((128 + 64) * 128, 7168),
+ (128 * (128 + 128), 512),
+ (7168, 16384),
+ (7168, 18432),
+ (18432 * 2, 7168),
+ (24576, 1536),
+ (12288, 7168),
+ (4096, 7168),
+ (7168, 2048),
+]
+
+
+def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
+ """Build runner function for w8a8 block fp8 matmul."""
+ factor_for_scale = 1e-2
+
+ fp8_info = torch.finfo(torch.float8_e4m3fn)
+ fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+ # Create random FP8 tensors
+ A_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
+ A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+ B_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
+ B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+ # Create scales
+ block_n, block_k = block_size[0], block_size[1]
+ n_tiles = (N + block_n - 1) // block_n
+ k_tiles = (K + block_k - 1) // block_k
+
+ As = torch.rand(M, k_tiles, dtype=torch.float32, device=device) * factor_for_scale
+ Bs = (
+ torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
+ * factor_for_scale
+ )
+
+ def run():
+ return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, torch.bfloat16)
+
+ return run
+
+
+@vllm_triton.testing.perf_report(
+ vllm_triton.testing.Benchmark(
+ x_names=["batch_size"],
+ x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
+ x_log=False,
+ line_arg="provider",
+ line_vals=["torch-bf16", "w8a8-block-fp8"],
+ line_names=["torch-bf16", "w8a8-block-fp8"],
+ ylabel="TFLOP/s (larger is better)",
+ plot_name="BF16 vs W8A8 Block FP8 GEMMs",
+ args={},
+ )
+)
+def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
+ M = batch_size
+ device = "cuda"
+
+ quantiles = [0.5, 0.2, 0.8]
+
+ if provider == "torch-bf16":
+ a = torch.randn((M, K), device=device, dtype=torch.bfloat16)
+ b = torch.randn((N, K), device=device, dtype=torch.bfloat16)
+ ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+ lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
+ )
+ else: # w8a8-block-fp8
+ run_w8a8 = build_w8a8_block_fp8_runner(M, N, K, block_size, device)
+ ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+ lambda: run_w8a8(), quantiles=quantiles
+ )
+
+ to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+ return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+if __name__ == "__main__":
+ block_size = (128, 128)
+
+ for N, K in DEEPSEEK_V3_SHAPES:
+ print(f"\nBenchmarking DeepSeek-V3, N={N} K={K}")
+
+ print(f"TFLOP/s comparison (block_size={block_size}):")
+ benchmark_tflops.run(
+ print_data=True,
+ # show_plots=False,
+ # save_path=f"bench_w8a8_block_fp8_tflops_n{N}_k{K}",
+ N=N,
+ K=K,
+ block_size=block_size,
+ )
+
+ print("\nBenchmark finished!")
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index cc38cd41a5b24..52bfd82c7fcfe 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -1,6 +1,7 @@
include(FetchContent)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu
new file mode 100644
index 0000000000000..78f7b3cc1aa25
--- /dev/null
+++ b/csrc/moe/grouped_topk_kernels.cu
@@ -0,0 +1,757 @@
+/*
+ * Adapted from
+ * https://github.com/NVIDIA/TensorRT-LLM/blob/v0.21.0/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu
+ * Copyright (c) 2025, The vLLM team.
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <torch/all.h>
+#include <c10/cuda/CUDAStream.h>
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+#include <cuda/std/limits>
+#include <cuda_bf16.h>
+namespace cg = cooperative_groups;
+
+namespace vllm {
+namespace moe {
+
+constexpr unsigned FULL_WARP_MASK = 0xffffffff;
+constexpr int32_t WARP_SIZE = 32;
+constexpr int32_t BLOCK_SIZE = 512;
+constexpr int32_t NUM_WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;
+
+namespace warp_topk {
+
+template <int size, typename T>
+__host__ __device__ constexpr T round_up_to_multiple_of(T len) {
+ if (len == 0) {
+ return 0;
+ }
+ return ((len - 1) / size + 1) * size;
+}
+
+template <typename T>
+constexpr __host__ __device__ bool isPowerOf2(T v) {
+ return (v && !(v & (v - 1)));
+}
+
+template <bool greater, typename T>
+__forceinline__ __device__ bool is_better_than(T val, T baseline) {
+ return (val > baseline && greater) || (val < baseline && !greater);
+}
+
+template <bool greater, typename T, typename idxT>
+__forceinline__ __device__ bool is_better_than(T val, T baseline, idxT index,
+ idxT baseline_index) {
+ bool res = (val > baseline && greater) || (val < baseline && !greater);
+ if (val == baseline) {
+ res = (index < baseline_index && greater) ||
+ (index < baseline_index && !greater);
+ }
+ return res;
+}
+
+template <typename T, typename idxT>
+int calc_smem_size_for_block_wide(int num_of_warp, int64_t k) {
+ int64_t cache_topk = (sizeof(T) + sizeof(idxT)) * num_of_warp * k;
+ int64_t n = std::max(num_of_warp / 2 * k, num_of_warp * WARP_SIZE);
+ return max(cache_topk,
+ round_up_to_multiple_of<256>(n * sizeof(T)) + n * sizeof(idxT));
+}
+
+template <int size, bool ascending, bool reverse, typename T, typename idxT, bool is_stable>
+struct BitonicMerge {
+ // input should be a bitonic sequence, and sort it to be a monotonic sequence
+ __device__ static void merge(T* __restrict__ val_arr,
+ idxT* __restrict__ idx_arr) {
+ static_assert(isPowerOf2(size));
+ static_assert(size >= 2 * WARP_SIZE);
+ constexpr int arr_len = size / WARP_SIZE;
+
+ constexpr int stride = arr_len / 2;
+ for (int i = 0; i < stride; ++i) {
+ int const other_i = i + stride;
+ T& val = val_arr[i];
+ T& other_val = val_arr[other_i];
+ bool is_better;
+ if constexpr (is_stable) {
+ is_better = is_better_than(val, other_val, idx_arr[i],
+ idx_arr[other_i]);
+ } else {
+ is_better = is_better_than(val, other_val);
+ }
+
+ if (is_better) {
+ T tmp = val;
+ val = other_val;
+ other_val = tmp;
+
+ idxT tmp2 = idx_arr[i];
+ idx_arr[i] = idx_arr[other_i];
+ idx_arr[other_i] = tmp2;
+ }
+ }
+
+ BitonicMerge::merge(
+ val_arr, idx_arr);
+ BitonicMerge::merge(
+ val_arr + arr_len / 2, idx_arr + arr_len / 2);
+ }
+};
+
+template <int size, bool ascending, typename T, typename idxT, bool is_stable>
+struct BitonicSort {
+ __device__ static void sort(T* __restrict__ val_arr,
+ idxT* __restrict__ idx_arr) {
+ static_assert(isPowerOf2(size));
+ static_assert(size >= 2 * WARP_SIZE);
+ constexpr int arr_len = size / WARP_SIZE;
+
+ BitonicSort::sort(val_arr, idx_arr);
+ BitonicSort::sort(
+ val_arr + arr_len / 2, idx_arr + arr_len / 2);
+ BitonicMerge::merge(
+ val_arr, idx_arr);
+ }
+};
+
+template <bool ascending, typename T, typename idxT, bool is_stable>
+struct BitonicSort<32, ascending, T, idxT, is_stable> {
+ __device__ static void sort(T* __restrict__ val_arr,
+ idxT* __restrict__ idx_arr) {
+ int const lane = threadIdx.x % WARP_SIZE;
+
+ // ascending doesn't matter before merging since all we need is a bitonic
+ // sequence
+ for (int stage = 0; stage < 4; ++stage) {
+ for (int stride = (1 << stage); stride > 0; stride /= 2) {
+ bool reverse = (lane >> stage) & 2;
+ bool is_second = lane & stride;
+
+ T other = __shfl_xor_sync(FULL_WARP_MASK, *val_arr, stride);
+ idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, *idx_arr, stride);
+
+ bool is_better;
+ if constexpr (is_stable) {
+ if constexpr (ascending) {
+ is_better = ((*val_arr > other) ||
+ ((*val_arr == other) && (*idx_arr < other_idx))) !=
+ (reverse != is_second);
+ } else {
+ is_better = ((*val_arr > other) ||
+ ((*val_arr == other) && (*idx_arr > other_idx))) !=
+ (reverse != is_second);
+ }
+ } else {
+ is_better = (*val_arr != other &&
+ (*val_arr > other) != (reverse != is_second));
+ }
+ if (is_better) {
+ *val_arr = other;
+ *idx_arr = other_idx;
+ }
+ }
+ }
+
+ BitonicMerge<32, ascending, ascending, T, idxT, is_stable>::merge(val_arr,
+ idx_arr);
+ }
+};
+
+template <bool ascending, bool reverse, typename T, typename idxT, bool is_stable>
+struct BitonicMerge<32, ascending, reverse, T, idxT, is_stable> {
+ __device__ static void merge(T* __restrict__ val_arr,
+ idxT* __restrict__ idx_arr) {
+ int const lane = threadIdx.x % WARP_SIZE;
+ for (int stride = WARP_SIZE / 2; stride > 0; stride /= 2) {
+ bool is_second = lane & stride;
+ T& val = *val_arr;
+ T other = __shfl_xor_sync(FULL_WARP_MASK, val, stride);
+ idxT& idx = *idx_arr;
+ idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, idx, stride);
+
+ bool is_better;
+ if constexpr (is_stable) {
+ if constexpr (ascending) {
+ is_better = ((*val_arr > other) ||
+ ((*val_arr == other) && (*idx_arr < other_idx))) ==
+ (reverse != is_second); // for min
+ } else {
+ is_better = ((*val_arr > other) ||
+ ((*val_arr == other) && (*idx_arr > other_idx))) ==
+ (reverse != is_second); // for max
+ }
+ } else {
+ is_better =
+ (val != other && ((val > other) == (ascending != is_second)));
+ }
+
+ if (is_better) {
+ val = other;
+ idx = other_idx;
+ }
+ }
+ }
+};
+
+template <int capacity, bool greater, typename T, typename idxT, bool is_stable>
+class WarpSort {
+ public:
+ __device__ WarpSort(idxT k, T dummy)
+ : lane_(threadIdx.x % WARP_SIZE), k_(k), dummy_(dummy) {
+ static_assert(capacity >= WARP_SIZE && isPowerOf2(capacity));
+
+ for (int i = 0; i < max_arr_len_; ++i) {
+ val_arr_[i] = dummy_;
+ idx_arr_[i] = 0;
+ }
+ }
+
+ // load and merge k sorted values
+ __device__ void load_sorted(T const* __restrict__ in,
+ idxT const* __restrict__ in_idx, idxT start) {
+ idxT idx = start + WARP_SIZE - 1 - lane_;
+ for (int i = max_arr_len_ - 1; i >= 0; --i, idx += WARP_SIZE) {
+ if (idx < start + k_) {
+ T t = in[idx];
+ bool is_better;
+ if constexpr (is_stable) {
+ is_better =
+ is_better_than<greater>(t, val_arr_[i], in_idx[idx], idx_arr_[i]);
+ } else {
+ is_better = is_better_than<greater>(t, val_arr_[i]);
+ }
+ if (is_better) {
+ val_arr_[i] = t;
+ idx_arr_[i] = in_idx[idx];
+ }
+ }
+ }
+
+ BitonicMerge::merge(
+ val_arr_, idx_arr_);
+ }
+
+ __device__ void dump(T* __restrict__ out, idxT* __restrict__ out_idx) const {
+ for (int i = 0; i < max_arr_len_; ++i) {
+ idxT out_i = i * WARP_SIZE + lane_;
+ if (out_i < k_) {
+ out[out_i] = val_arr_[i];
+ out_idx[out_i] = idx_arr_[i];
+ }
+ }
+ }
+
+ __device__ void dumpIdx(idxT* __restrict__ out_idx) const {
+ for (int i = 0; i < max_arr_len_; ++i) {
+ idxT out_i = i * WARP_SIZE + lane_;
+ if (out_i < k_) {
+ out_idx[out_i] = idx_arr_[i];
+ }
+ }
+ }
+
+ protected:
+ static constexpr int max_arr_len_ = capacity / WARP_SIZE;
+
+ T val_arr_[max_arr_len_];
+ idxT idx_arr_[max_arr_len_];
+
+ int const lane_;
+ idxT const k_;
+ T const dummy_;
+
+}; // end class WarpSort
+
+template <int capacity, bool greater, typename T, typename idxT, bool is_stable>
+class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
+ public:
+ __device__ WarpSelect(idxT k, T dummy)
+ : WarpSort<capacity, greater, T, idxT, is_stable>(k, dummy),
+ k_th_(dummy),
+ k_th_lane_((k - 1) % WARP_SIZE) {
+ extern __shared__ char smem_buf[]; // extern __shared__ T smem_buf[];
+
+ int const num_of_warp = blockDim.x / WARP_SIZE;
+ int const warp_id = threadIdx.x / WARP_SIZE;
+ val_smem_ = reinterpret_cast<T*>(smem_buf);
+ val_smem_ += warp_id * WARP_SIZE;
+ idx_smem_ = reinterpret_cast<idxT*>(
+ smem_buf +
+ round_up_to_multiple_of<256>(num_of_warp * sizeof(T) * WARP_SIZE));
+ idx_smem_ += warp_id * WARP_SIZE;
+ }
+
+ __device__ void add(T const* in, idxT start, idxT end) {
+ idxT const end_for_fullwarp =
+ round_up_to_multiple_of<WARP_SIZE>(end - start) + start;
+ for (idxT i = start + lane_; i < end_for_fullwarp; i += WARP_SIZE) {
+ T val = (i < end) ? in[i] : dummy_;
+ add(val, i);
+ }
+ }
+
+ __device__ void add(T val, idxT idx) {
+ bool do_add;
+ if constexpr (is_stable) {
+ do_add = is_better_than<greater>(val, k_th_, idx, k_th_idx_);
+ } else {
+ do_add = is_better_than<greater>(val, k_th_);
+ }
+
+ uint32_t mask = __ballot_sync(FULL_WARP_MASK, do_add);
+ if (mask == 0) {
+ return;
+ }
+
+ int pos = smem_buf_len_ + __popc(mask & ((0x1u << lane_) - 1));
+ if (do_add && pos < WARP_SIZE) {
+ val_smem_[pos] = val;
+ idx_smem_[pos] = idx;
+ do_add = false;
+ }
+ smem_buf_len_ += __popc(mask);
+ if (smem_buf_len_ >= WARP_SIZE) {
+ __syncwarp();
+ merge_buf_(val_smem_[lane_], idx_smem_[lane_]);
+ smem_buf_len_ -= WARP_SIZE;
+ }
+ if (do_add) {
+ pos -= WARP_SIZE;
+ val_smem_[pos] = val;
+ idx_smem_[pos] = idx;
+ }
+ __syncwarp();
+ }
+
+ __device__ void done() {
+ if (smem_buf_len_) {
+ T val = (lane_ < smem_buf_len_) ? val_smem_[lane_] : dummy_;
+ idxT idx = (lane_ < smem_buf_len_) ? idx_smem_[lane_] : 0;
+ merge_buf_(val, idx);
+ }
+
+ // after done(), smem is used for merging results among warps
+ __syncthreads();
+ }
+
+ private:
+ __device__ void set_k_th_() {
+ k_th_ = __shfl_sync(FULL_WARP_MASK, val_arr_[max_arr_len_ - 1], k_th_lane_);
+ if constexpr (is_stable) {
+ k_th_idx_ =
+ __shfl_sync(FULL_WARP_MASK, idx_arr_[max_arr_len_ - 1], k_th_lane_);
+ }
+ }
+
+ __device__ void merge_buf_(T val, idxT idx) {
+ BitonicSort::sort(&val, &idx);
+
+ T& old = val_arr_[max_arr_len_ - 1];
+
+ bool is_better;
+ if constexpr (is_stable) {
+ is_better =
+ is_better_than<greater>(val, old, idx, idx_arr_[max_arr_len_ - 1]);
+ } else {
+ is_better = is_better_than<greater>(val, old);
+ }
+
+ if (is_better) {
+ old = val;
+ idx_arr_[max_arr_len_ - 1] = idx;
+ }
+
+ BitonicMerge::merge(
+ val_arr_, idx_arr_);
+
+ set_k_th_();
+ }
+
+ using WarpSort<capacity, greater, T, idxT, is_stable>::max_arr_len_;
+ using WarpSort<capacity, greater, T, idxT, is_stable>::val_arr_;
+ using WarpSort<capacity, greater, T, idxT, is_stable>::idx_arr_;
+ using WarpSort<capacity, greater, T, idxT, is_stable>::lane_;
+ using WarpSort<capacity, greater, T, idxT, is_stable>::k_;
+ using WarpSort<capacity, greater, T, idxT, is_stable>::dummy_;
+
+ T* val_smem_;
+ idxT* idx_smem_;
+ int smem_buf_len_ = 0;
+
+ T k_th_;
+ idxT k_th_idx_;
+ int const k_th_lane_;
+}; // end class WarpSelect
+} // namespace warp_topk
+
+template <typename T_OUT, typename T_IN>
+__device__ inline T_OUT cuda_cast(T_IN val) {
+ return val;
+}
+
+template <>
+__device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
+ return __bfloat162float(val);
+}
+
+template <typename T>
+__device__ void topk_with_k2(T* output, T const* input,
+ cg::thread_block_tile<32> const& tile,
+ int32_t const lane_id,
+ int const num_experts_per_group) {
+ // Get the top2 per thread
+ T largest = -INFINITY;
+ T second_largest = -INFINITY;
+
+ if (num_experts_per_group > WARP_SIZE) {
+ for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
+ T value = input[i];
+ if (value > largest) {
+ second_largest = largest;
+ largest = value;
+ } else if (value > second_largest) {
+ second_largest = value;
+ }
+ }
+ } else {
+ for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
+ largest = input[i];
+ }
+ }
+
+ __syncwarp(); // Ensure all threads have valid data before reduction
+ // Get the top2 warpwise
+ T max1 = cg::reduce(tile, largest, cg::greater<T>());
+
+ T max2 = max1;
+ bool equal_to_max1 = (max1 == largest);
+
+ int count_max1 = __popc(__ballot_sync(FULL_WARP_MASK, equal_to_max1));
+
+ if (count_max1 == 1) {
+ largest = (largest == max1) ? second_largest : largest;
+ max2 = cg::reduce(tile, largest, cg::greater<T>());
+ }
+
+ if (lane_id == 0) {
+ *output = max1 + max2;
+ }
+}
+
+template <typename T>
+__global__ void topk_with_k2_kernel(T* output, T* input,
+ int64_t const num_tokens,
+ int64_t const num_cases,
+ int64_t const n_group,
+ int64_t const num_experts_per_group) {
+ int32_t warp_id = threadIdx.x / WARP_SIZE;
+ int32_t lane_id = threadIdx.x % WARP_SIZE;
+
+ int32_t case_id = blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;
+ if (case_id < num_cases) {
+ input += case_id * num_experts_per_group;
+ output += case_id;
+
+ cg::thread_block block = cg::this_thread_block();
+ cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+ asm volatile("griddepcontrol.wait;");
+#endif
+ topk_with_k2(output, input, tile, lane_id, num_experts_per_group);
+ }
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+ asm volatile("griddepcontrol.launch_dependents;");
+#endif
+}
+
+template <typename T, typename IdxT>
+__global__ void group_idx_and_topk_idx_kernel(
+ T* scores, T const* group_scores, T* topk_values, IdxT* topk_indices,
+ T* scores_with_bias, int64_t const num_tokens, int64_t const n_group,
+ int64_t const topk_group, int64_t const topk, int64_t const num_experts,
+ int64_t const num_experts_per_group, bool renormalize,
+ double routed_scaling_factor) {
+ int32_t warp_id = threadIdx.x / WARP_SIZE;
+ int32_t lane_id = threadIdx.x % WARP_SIZE;
+ int32_t case_id =
+ blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id; // one per token
+ scores_with_bias += case_id * num_experts;
+ scores += case_id * num_experts;
+ group_scores += case_id * n_group;
+ topk_values += case_id * topk;
+ topk_indices += case_id * topk;
+
+ int32_t align_num_experts_per_group =
+ warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
+
+ cg::thread_block block = cg::this_thread_block();
+ cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
+
+ extern __shared__ char smem_buf[]; // NOTE: reuse the shared memory here to
+ // store the target topk idx
+ int32_t* s_topk_idx = reinterpret_cast<int32_t*>(smem_buf);
+ T* s_topk_value =
+ reinterpret_cast<T*>(s_topk_idx + NUM_WARPS_PER_BLOCK * topk) +
+ warp_id * topk;
+ s_topk_idx += warp_id * topk;
+
+ T value = cuda::std::numeric_limits<T>::min();
+ T topk_group_value = cuda::std::numeric_limits<T>::min();
+ int32_t num_equalto_topkth_group;
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+ asm volatile("griddepcontrol.wait;"); // I think all prolog can be put before
+ // acqbulk because it's ptr arithmetic
+#endif
+
+ if (case_id < num_tokens) {
+ // calculate group_idx
+ int32_t target_num_min = WARP_SIZE - n_group + topk_group;
+ if (lane_id < n_group &&
+ (isfinite(cuda_cast<float, T>(
+ group_scores[lane_id])))) // The check is necessary to avoid
+ // abnormal input
+ {
+ value = group_scores[lane_id];
+ }
+
+ int count_equal_to_top_value = WARP_SIZE - n_group;
+ int pre_count_equal_to_top_value = 0;
+ // Use loop to find the largest top_group
+ while (count_equal_to_top_value < target_num_min) {
+ __syncwarp(); // Ensure all threads have valid data before reduction
+ topk_group_value = cg::reduce(tile, value, cg::greater<T>());
+ if (value == topk_group_value) {
+ value = cuda::std::numeric_limits<T>::min();
+ }
+ pre_count_equal_to_top_value = count_equal_to_top_value;
+ count_equal_to_top_value = __popc(__ballot_sync(
+ FULL_WARP_MASK, (value == cuda::std::numeric_limits<T>::min())));
+ }
+ num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
+ }
+ __syncthreads();
+
+ warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
+ /* is_stable */ true>
+ queue((int32_t)topk, -INFINITY);
+
+ int count_equalto_topkth_group = 0;
+ bool if_proceed_next_topk =
+ (topk_group_value != cuda::std::numeric_limits<T>::min());
+ if (case_id < num_tokens && if_proceed_next_topk) {
+ for (int i_group = 0; i_group < n_group; i_group++) {
+ if ((group_scores[i_group] > topk_group_value) ||
+ ((group_scores[i_group] == topk_group_value) &&
+ (count_equalto_topkth_group < num_equalto_topkth_group))) {
+ int32_t offset = i_group * num_experts_per_group;
+ for (int32_t i = lane_id; i < align_num_experts_per_group;
+ i += WARP_SIZE) {
+ T candidates =
+ (i < num_experts_per_group) && isfinite(cuda_cast<float, T>(
+ scores_with_bias[offset + i]))
+ ? scores_with_bias[offset + i]
+ : cuda::std::numeric_limits<T>::min();
+ queue.add(candidates, offset + i);
+ }
+ if (group_scores[i_group] == topk_group_value) {
+ count_equalto_topkth_group++;
+ }
+ }
+ }
+ queue.done();
+ __syncwarp();
+ // Get the topk_idx
+ queue.dumpIdx(s_topk_idx);
+ __syncwarp();
+ }
+
+ // Load the valid score value
+ // Calculate the summation
+ float topk_sum = 1e-20;
+ if (case_id < num_tokens && if_proceed_next_topk) {
+ for (int i = lane_id;
+ i < warp_topk::round_up_to_multiple_of<WARP_SIZE>(topk);
+ i += WARP_SIZE) {
+ T value =
+ i < topk
+ ? scores[s_topk_idx[i]]
+ : cuda_cast<T, float>(0.0f); // Load the valid value of expert
+ if (i < topk) {
+ s_topk_value[i] = value;
+ }
+ topk_sum += reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
+ }
+ }
+
+ __syncthreads();
+
+ if (case_id < num_tokens) {
+ if (if_proceed_next_topk) {
+ for (int i = lane_id; i < topk; i += WARP_SIZE) {
+ float value;
+ if (renormalize) {
+ value = cuda_cast<float, T>(s_topk_value[i]) / topk_sum *
+ routed_scaling_factor;
+ } else {
+ value = cuda_cast<float, T>(s_topk_value[i]) * routed_scaling_factor;
+ }
+ topk_indices[i] = s_topk_idx[i];
+ topk_values[i] = cuda_cast<T, float>(value);
+ }
+ } else {
+ for (int i = lane_id; i < topk; i += WARP_SIZE) {
+ topk_indices[i] = i;
+ topk_values[i] = cuda_cast<T, float>(1.0f / topk);
+ }
+ }
+ // Note: when if_proceed_next_topk==false, choose the first 8 experts as the
+ // default result.
+ }
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+ asm volatile("griddepcontrol.launch_dependents;");
+#endif
+}
+
+template <typename T, typename IdxT>
+void invokeNoAuxTc(T* scores, T* group_scores, T* topk_values,
+ IdxT* topk_indices, T* scores_with_bias,
+ int64_t const num_tokens, int64_t const num_experts,
+ int64_t const n_group, int64_t const topk_group,
+ int64_t const topk, bool const renormalize,
+ double const routed_scaling_factor, bool enable_pdl = false,
+ cudaStream_t const stream = 0) {
+ int64_t num_cases = num_tokens * n_group;
+ int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
+ auto* kernel_instance1 = &topk_with_k2_kernel<T>;
+ cudaLaunchConfig_t config;
+ config.gridDim = topk_with_k2_num_blocks;
+ config.blockDim = BLOCK_SIZE;
+ config.dynamicSmemBytes = 0;
+ config.stream = stream;
+ cudaLaunchAttribute attrs[1];
+ attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+ attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
+ config.numAttrs = 1;
+ config.attrs = attrs;
+ cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores_with_bias,
+ num_tokens, num_cases, n_group, num_experts / n_group);
+
+ int64_t topk_with_k_group_num_blocks =
+ (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
+ size_t dynamic_smem_in_bytes =
+ warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
+ topk);
+ auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
+ config.gridDim = topk_with_k_group_num_blocks;
+ config.blockDim = BLOCK_SIZE;
+ config.dynamicSmemBytes = dynamic_smem_in_bytes;
+ config.stream = stream;
+ attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+ attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
+ config.numAttrs = 1;
+ config.attrs = attrs;
+ cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
+ topk_values, topk_indices, scores_with_bias, num_tokens,
+ n_group, topk_group, topk, num_experts,
+ num_experts / n_group, renormalize, routed_scaling_factor);
+}
+
+#define INSTANTIATE_NOAUX_TC(T, IdxT) \
+ template void invokeNoAuxTc<T, IdxT>( \
+ T * scores, T * group_scores, T * topk_values, IdxT * topk_indices, \
+ T * scores_with_bias, int64_t const num_tokens, \
+ int64_t const num_experts, int64_t const n_group, \
+ int64_t const topk_group, int64_t const topk, bool const renormalize, \
+ double const routed_scaling_factor, bool enable_pdl, \
+ cudaStream_t const stream);
+
+INSTANTIATE_NOAUX_TC(float, int32_t);
+INSTANTIATE_NOAUX_TC(half, int32_t);
+INSTANTIATE_NOAUX_TC(__nv_bfloat16, int32_t);
+} // end namespace moe
+} // namespace vllm
+
+std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
+ torch::Tensor const& scores, torch::Tensor const& scores_with_bias,
+ int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize,
+ double routed_scaling_factor) {
+ auto data_type = scores_with_bias.scalar_type();
+ auto input_size = scores_with_bias.sizes();
+ int64_t num_tokens = input_size[0];
+ int64_t num_experts = input_size[1];
+ TORCH_CHECK(input_size.size() == 2, "scores_with_bias must be a 2D Tensor");
+ TORCH_CHECK(num_experts % n_group == 0,
+ "num_experts should be divisible by n_group");
+ TORCH_CHECK(n_group <= 32,
+ "n_group should be smaller than or equal to 32 for now");
+ TORCH_CHECK(topk <= 32, "topk should be smaller than or equal to 32 for now");
+
+ torch::Tensor group_scores = torch::empty(
+ {num_tokens, n_group}, torch::dtype(data_type).device(torch::kCUDA));
+ torch::Tensor topk_values = torch::empty(
+ {num_tokens, topk}, torch::dtype(data_type).device(torch::kCUDA));
+ torch::Tensor topk_indices = torch::empty(
+ {num_tokens, topk}, torch::dtype(torch::kInt32).device(torch::kCUDA));
+
+ auto stream = c10::cuda::getCurrentCUDAStream(scores_with_bias.get_device());
+
+ switch (data_type) {
+ case torch::kFloat16:
+ // Handle Float16
+ vllm::moe::invokeNoAuxTc<half, int32_t>(
+ reinterpret_cast<half*>(scores.mutable_data_ptr()),
+ reinterpret_cast<half*>(group_scores.mutable_data_ptr()),
+ reinterpret_cast<half*>(topk_values.mutable_data_ptr()),
+ reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
+ reinterpret_cast<half*>(scores_with_bias.data_ptr()), num_tokens,
+ num_experts, n_group, topk_group, topk, renormalize,
+ routed_scaling_factor, false, stream);
+ break;
+ case torch::kFloat32:
+ // Handle Float32
+ vllm::moe::invokeNoAuxTc<float, int32_t>(
+ reinterpret_cast<float*>(scores.mutable_data_ptr()),
+ reinterpret_cast<float*>(group_scores.mutable_data_ptr()),
+ reinterpret_cast<float*>(topk_values.mutable_data_ptr()),
+ reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
+ reinterpret_cast<float*>(scores_with_bias.data_ptr()), num_tokens,
+ num_experts, n_group, topk_group, topk, renormalize,
+ routed_scaling_factor, false, stream);
+ break;
+ case torch::kBFloat16:
+ // Handle BFloat16
+ vllm::moe::invokeNoAuxTc<__nv_bfloat16, int32_t>(
+ reinterpret_cast<__nv_bfloat16*>(scores.mutable_data_ptr()),
+ reinterpret_cast<__nv_bfloat16*>(group_scores.mutable_data_ptr()),
+ reinterpret_cast<__nv_bfloat16*>(topk_values.mutable_data_ptr()),
+ reinterpret_cast<int32_t*>(topk_indices.mutable_data_ptr()),
+ reinterpret_cast<__nv_bfloat16*>(scores_with_bias.data_ptr()),
+ num_tokens, num_experts, n_group, topk_group, topk, renormalize,
+ routed_scaling_factor, false, stream);
+ break;
+ default:
+ // Handle other data types
+ throw std::invalid_argument(
+ "Invalid dtype, only supports float16, float32, and bfloat16");
+ break;
+ }
+ return {topk_values, topk_indices};
+}
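For reference, a minimal sketch of how the newly bound op could be exercised from Python once the compiled MoE extension is available. The import path `vllm._moe_C`, the op name under `torch.ops._moe_C`, and the DeepSeek-V3-style shape parameters are assumptions for illustration, not part of the patch:

```python
# Hypothetical smoke test for the new grouped_topk custom op.
# Assumes the compiled extension registers it as torch.ops._moe_C.grouped_topk.
import torch

import vllm._moe_C  # noqa: F401  # assumed import path that registers the op

num_tokens, num_experts, n_group, topk_group, topk = 8, 256, 8, 4, 8
scores = torch.rand(num_tokens, num_experts, dtype=torch.float32, device="cuda")
scores_with_bias = scores + 0.01  # e.g. routing logits plus a bias term

topk_values, topk_indices = torch.ops._moe_C.grouped_topk(
    scores,
    scores_with_bias,
    n_group,
    topk_group,
    topk,
    True,  # renormalize
    2.5,   # routed_scaling_factor
)
assert topk_values.shape == (num_tokens, topk)
assert topk_indices.dtype == torch.int32
```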
diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h
index 661730c96867e..92fc280b362b9 100644
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -22,6 +22,11 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
torch::Tensor num_tokens_post_pad, int64_t top_k,
int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
int64_t BLOCK_SIZE_K, int64_t bit);
+
+std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
+ torch::Tensor const& scores, torch::Tensor const& scores_with_bias,
+ int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize,
+ double routed_scaling_factor);
#endif
bool moe_permute_unpermute_supported();
diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu
index 99c52ef17d08b..cd80bfda7dfde 100644
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@@ -573,7 +573,7 @@ void topk_softmax(
stream);
}
else {
- assert(topk_indices.scalar_type() == at::ScalarType::Int64);
+ TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr(),
topk_weights.data_ptr(),
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index 7e49f68f62438..8f33d6cd666fa 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -78,6 +78,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
"output_tensor) -> ()");
m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows);
+ // Apply grouped topk routing to select experts.
+ m.def(
+ "grouped_topk(Tensor scores, Tensor scores_with_bias, int n_group, int "
+ "topk_group, int topk, bool renormalize, float "
+ "routed_scaling_factor) -> (Tensor, Tensor)");
+ m.impl("grouped_topk", torch::kCUDA, &grouped_topk);
#endif
}
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 839ac501dbaf0..2e272cbca8417 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -373,7 +373,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
# Install FlashInfer from source
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.2.12"
+ARG FLASHINFER_GIT_REF="v0.2.14.post1"
# Flag to control whether to compile FlashInfer AOT kernels
# Set to "true" to enable AOT compilation:
# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
diff --git a/docs/assets/design/hybrid_kv_cache_manager/basic_grouping_example.png b/docs/assets/design/hybrid_kv_cache_manager/basic_grouping_example.png
new file mode 100644
index 0000000000000..185f61e6a3ede
Binary files /dev/null and b/docs/assets/design/hybrid_kv_cache_manager/basic_grouping_example.png differ
diff --git a/docs/assets/design/hybrid_kv_cache_manager/full_attn.png b/docs/assets/design/hybrid_kv_cache_manager/full_attn.png
new file mode 100644
index 0000000000000..30eade5c7051c
Binary files /dev/null and b/docs/assets/design/hybrid_kv_cache_manager/full_attn.png differ
diff --git a/docs/assets/design/hybrid_kv_cache_manager/memory_layout.png b/docs/assets/design/hybrid_kv_cache_manager/memory_layout.png
new file mode 100644
index 0000000000000..bcffc27a71649
Binary files /dev/null and b/docs/assets/design/hybrid_kv_cache_manager/memory_layout.png differ
diff --git a/docs/assets/design/hybrid_kv_cache_manager/overview.png b/docs/assets/design/hybrid_kv_cache_manager/overview.png
new file mode 100644
index 0000000000000..ac80581f491da
Binary files /dev/null and b/docs/assets/design/hybrid_kv_cache_manager/overview.png differ
diff --git a/docs/assets/design/hybrid_kv_cache_manager/sw_attn.png b/docs/assets/design/hybrid_kv_cache_manager/sw_attn.png
new file mode 100644
index 0000000000000..10aa6146dc7ab
Binary files /dev/null and b/docs/assets/design/hybrid_kv_cache_manager/sw_attn.png differ
diff --git a/docs/community/meetups.md b/docs/community/meetups.md
index 61ea44220ad2e..221a7bd96213f 100644
--- a/docs/community/meetups.md
+++ b/docs/community/meetups.md
@@ -3,6 +3,7 @@
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
- [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH)
+- [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
- [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA), August 2nd 2025. [[Slides]](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) [[Recording]](https://www.chaspark.com/#/live/1166916873711665152).
- [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing)
- [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md
index 058eba5fe0b1e..efda9c8e019eb 100644
--- a/docs/configuration/conserving_memory.md
+++ b/docs/configuration/conserving_memory.md
@@ -86,7 +86,7 @@ llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
If you run out of CPU RAM, try the following options:
-- (Multi-modal models only) you can set the size of multi-modal processor cache by setting `mm_processor_cache_gb` engine argument (default 4 GiB per API process + 4 GiB per engine core process)
+- (Multi-modal models only) you can set the size of multi-modal cache by setting `mm_processor_cache_gb` engine argument (default 4 GiB).
- (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB).
## Multi-modal input limits
diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md
index 6c7c31f503c15..b11ccb5c00273 100644
--- a/docs/configuration/optimization.md
+++ b/docs/configuration/optimization.md
@@ -164,12 +164,15 @@ llm = LLM(
)
```
-!! important
+!!! important
Batch-level DP is not to be confused with API request-level DP
(which is instead controlled by `data_parallel_size`).
-The availablilty of batch-level DP is based on model implementation.
-Currently, the following models support `mm_encoder_tp_mode="data"`:
+Batch-level DP needs to be implemented on a per-model basis,
+and enabled by setting `supports_encoder_tp_data = True` in the model class.
+In addition, you need to set `mm_encoder_tp_mode="data"` in the engine arguments to use this feature.
+
+Known supported models:
- Llama4 ()
- MiniCPM-V-4 ()
@@ -204,20 +207,33 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2
to avoid CPU resource exhaustion.
!!! note
- [Multi-modal processor cache](#processor-cache) is disabled when API server scale-out is enabled
+ API server scale-out disables [multi-modal IPC caching](#ipc-caching)
because it requires a one-to-one correspondance between API and engine core processes.
+ This does not impact [multi-modal processor caching](#processor-caching).
+
## Multi-Modal Caching
-### Processor Cache
-
-By default, the multi-modal processor cache is enabled to avoid repeatedly processing
-the same multi-modal inputs via Hugging Face `AutoProcessor`,
+Multi-modal caching avoids repeated transfer or processing of the same multi-modal data,
which commonly occurs in multi-turn conversations.
-You can adjust the size of the cache by setting the value of `mm_processor_cache_gb`
-(default 4 GiB per API process + 4 GiB per engine core process).
-If you do not benefit much from the cache, you can disable it completely via `mm_processor_cache_gb=0`.
+### Processor Caching
+
+Multi-modal processor caching is automatically enabled
+to avoid repeatedly processing the same multi-modal inputs in `BaseMultiModalProcessor`.
+
+### IPC Caching
+
+Multi-modal IPC caching is automatically enabled when
+there is a one-to-one correspondence between API (`P0`) and engine core (`P1`) processes,
+to avoid repeatedly transferring the same multi-modal inputs between them.
+
+### Configuration
+
+You can adjust the size of the cache by setting the value of `mm_processor_cache_gb` (default 4 GiB).
+
+If you do not benefit much from the cache, you can disable both IPC
+and processor caching completely via `mm_processor_cache_gb=0`.
Examples:
@@ -230,3 +246,16 @@ llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
mm_processor_cache_gb=0)
```
+
+### Cache Placement
+
+Based on the configuration, the contents of the multi-modal caches on `P0` and `P1` are as follows:
+
+| Processor Caching | IPC Caching | `P0` Cache | `P1` Cache | Max. Memory |
+|-------------------|-------------|------------|------------|-------------|
+| ✅ | ✅ | K | K + V | `mm_processor_cache_gb * data_parallel_size` |
+| ✅ | ❌ | K + V | N/A | `mm_processor_cache_gb * api_server_count` |
+| ❌ | ❌ | N/A | N/A | `0` |
+
+K: Stores the hashes of multi-modal items
+V: Stores the processed tensor data of multi-modal items
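+
+For illustration (the numbers are only an example, not additional defaults): with the default `mm_processor_cache_gb=4`, IPC caching enabled, and `data_parallel_size=2`, the caches can take up to `4 * 2 = 8` GiB in total; with IPC caching disabled and `api_server_count=4`, up to `4 * 4 = 16` GiB.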
diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md
index a93435ed71b50..e456077e04958 100644
--- a/docs/configuration/tpu.md
+++ b/docs/configuration/tpu.md
@@ -45,32 +45,32 @@ This initial compilation time ranges significantly and is impacted by many of th
### Optimize based on your data
-#### max model len vs. most model len
+#### max-model-len vs. most-model-len

-If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most model len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable.
+If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most-model-len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable.
For example, 1% requests are 32k length and 99% requests are 2k length. You can pass 32k into `--max-model-len 32768` and use `VLLM_TPU_MOST_MODEL_LEN=2048`.
-The requests get subdivided into max-model-len and most-model-len categories, for the latter category, we can gain better performance since the server can process more requests at a time.
+The requests get subdivided into max-model-len and most-model-len categories; for the latter category, you can gain better performance since the server can process more requests at a time.
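+
+As a minimal sketch of how this might look from Python (the model name and request mix are hypothetical; only `VLLM_TPU_MOST_MODEL_LEN` and `max_model_len` come from the description above):
+
+```python
+import os
+
+# Fast path for the common ~2k-token requests; set before constructing the engine.
+os.environ["VLLM_TPU_MOST_MODEL_LEN"] = "2048"
+
+from vllm import LLM
+
+# Still accept the occasional long request of up to 32k tokens.
+llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct", max_model_len=32768)
+```
+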
#### Padding
-For online serving with latency requirements, consider switching to bucket padding by setting the `VLLM_TPU_BUCKET_PADDING_GAP` environment variable. Because of the layout of the TPU, try using increments of 128: 128, 256, etc.
+For online serving with latency requirements, consider switching to bucket padding by setting the `VLLM_TPU_BUCKET_PADDING_GAP` environment variable. Because of the layout of the TPU, try using increments of 128 (e.g., 128, 256, etc.).
-The server pads the requests into fixed lengths before sending them to the model to avoid recompilation. To read more about tpu padding, see [here](https://cloud.google.com/tpu/docs/performance-guide#xla-efficiencies). Currently, there are 2 ways to pad the requests:
+The server pads the requests into fixed lengths before sending them to the model to avoid recompilation. To read more about TPU padding, see [here](https://cloud.google.com/tpu/docs/performance-guide#xla-efficiencies). Currently, there are 2 ways to pad the requests:
-1) the default exponential padding (pad to the nearest power of 2)
-2) bucket padding (pad to the nearest linearly increasing bucket).
+1. the default exponential padding (pad to the nearest power of 2)
+2. bucket padding (pad to the nearest linearly increasing bucket).
When using bucket padding, the buckets start from 16, end at max_model_len, and increment by `VLLM_TPU_BUCKET_PADDING_GAP`.
For example, max_model_len=512, padding_gap=64, the buckets will be [16, 32, 64, 128, 192, 256, 320, 384, 448, 512].
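+
+As a minimal sketch (one plausible reading of the example list above; the helper name is illustrative, not a vLLM API):
+
+```python
+def bucket_lengths(max_model_len: int, padding_gap: int) -> list[int]:
+    """Exponential growth from 16 up to the gap, then linear steps of the gap."""
+    buckets, size = [], 16
+    while size < padding_gap:       # 16, 32, ... until the gap is reached
+        buckets.append(size)
+        size *= 2
+    while size <= max_model_len:    # then fixed increments of padding_gap
+        buckets.append(size)
+        size += padding_gap
+    return buckets
+
+assert bucket_lengths(512, 64) == [16, 32, 64, 128, 192, 256, 320, 384, 448, 512]
+```
+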
-The fewer tokens we pad, the less unnecessary computation TPU does, the better performance we can get. For example, if num_tokens=300, with exponential padding, we pad to 512, with the bucket_padding above, we pad to 320.
+The fewer tokens you pad, the less unnecessary computation the TPU does and the better the performance you can get. For example, if num_tokens=300, exponential padding pads to 512, while the bucket padding above pads to 320.
-However, you need to be careful to choose the padding gap. If the gap is too small, it means the number of buckets is large, leading to increased warmup (precompile) time and higher memory to store the compiled graph. Too many compilaed graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding.
+However, you need to be careful to choose the padding gap. If the gap is too small, it means the number of buckets is large, leading to increased warmup (precompile) time and higher memory to store the compiled graph. Too many compiled graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding.
#### Quantization
diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md
index 3c4c7d2102170..202e9c1caf113 100644
--- a/docs/design/fused_moe_modular_kernel.md
+++ b/docs/design/fused_moe_modular_kernel.md
@@ -133,7 +133,7 @@ class FusedMoEModularKernel:
Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & Combine implementation / kernel. For example,
* PplxPrepareAndFinalize type is backed by Pplx All2All kernels,
-* DeepEPHTPrepareAndFinalize type is backed by DeepEP High-Throughtput All2All kernels, and
+* DeepEPHTPrepareAndFinalize type is backed by DeepEP High-Throughput All2All kernels, and
* DeepEPLLPrepareAndFinalize type is backed by DeepEP Low-Latency All2All kernels.
#### Step 1: Add an All2All manager
@@ -183,7 +183,7 @@ implementations that input `FusedMoEActivationFormat.Standard` support chunking
#### maybe_make_prepare_finalize
-The `maybe_make_prepare_finalize` method is responsbile for constructing an instance of `FusedMoEPrepareAndFinalize` when appropriate based on the current all2all backend, e.g. when EP + DP is enabled. The base class method currently constructs all the `FusedMoEPrepareAndFinalize` objects for the EP+DP case. Derived classes can override this method to construct prepare/finalize objects for different scenarios, e.g. `ModelOptNvFp4FusedMoE` can construct a `FlashInferCutlassMoEPrepareAndFinalize` for the EP+TP case.
+The `maybe_make_prepare_finalize` method is responsible for constructing an instance of `FusedMoEPrepareAndFinalize` when appropriate based on the current all2all backend, e.g. when EP + DP is enabled. The base class method currently constructs all the `FusedMoEPrepareAndFinalize` objects for the EP+DP case. Derived classes can override this method to construct prepare/finalize objects for different scenarios, e.g. `ModelOptNvFp4FusedMoE` can construct a `FlashInferCutlassMoEPrepareAndFinalize` for the EP+TP case.
Please refer to the implementations in,
* `ModelOptNvFp4FusedMoE`
@@ -198,7 +198,7 @@ Please refer to the implementations in,
* `CompressedTensorsW8A8Fp8MoECutlassMethod`
* `Fp8MoEMethod`
* `ModelOptNvFp4FusedMoE`
-dervied classes.
+derived classes.
#### init_prepare_finalize
diff --git a/docs/design/hybrid_kv_cache_manager.md b/docs/design/hybrid_kv_cache_manager.md
new file mode 100644
index 0000000000000..8f17b473adc08
--- /dev/null
+++ b/docs/design/hybrid_kv_cache_manager.md
@@ -0,0 +1,245 @@
+# Hybrid KV Cache Manager
+
+!!! warning
+ This document was written based on commit [458e74](https://github.com/vllm-project/vllm/commit/458e74eb907f96069e6d8a4f3c9f457001fef2ea). This feature is still in its early stage and things may change.
+
+## What is a hybrid model?
+
+Many recent "hybrid" LLMs combine multiple attention types within one model. For example:
+
+1. Sliding window attention (sw) + full attention (full): gpt-oss, Gemma 2/3, Ministral, cohere, etc.
+2. Mamba + full: Bamba, Jamba, Minimax, etc.
+3. Local chunked attention + full: Llama4
+
+To serve these models efficiently, our [KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager] must:
+
+1. Allocate different slots to different layer types, for example:
+ - Full attention layers: reserve slots for **all** tokens.
+ - Sliding window layers: reserve slots only for the most recent **`sliding_window_size`** tokens.
+2. Support layer-specific prefix-cache rules, for example:
+ - Full attention: a cache hit prefix requires **all** tokens remain in the KV cache.
+ - Sliding window: a cache hit prefix only requires the last **`sliding_window_size`** tokens remain in the KV cache.
+
+## Definitions
+
+1. **kv hidden size**: The number of bytes to store one token's KV cache for a single layer.
+2. **block**: the memory reserved for KV cache is divided into multiple *blocks* with the same *page size* (defined below)
+3. **block size**: the number of tokens inside a block
+4. **page size**: the physical memory size of a block, defined as:
+
+ $$
+ \text{num_layers} \times \text{block_size} \times \text{kv_hidden_size}
+ $$
+
+ `num_layers` doesn't mean the total number of layers in the model. The exact number depends on the context in this doc.
+
+ !!! note
+ This is different from `KVCacheSpec.page_size_bytes` in the code, which is defined as:
+
+ $$
+ \text{block_size} \times \text{kv_hidden_size}
+ $$
+
+## Allocation
+
+### High level idea
+
+We use a single memory pool for all layer types. The memory pool is split into multiple blocks with the same page size. [KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager] allocates different numbers of blocks to different layers according to their attention types.
+
+The core challenge is ensuring every layer type uses the same **page size**. For full-attention-only models, the page size is straightforward, defined as:
+
+$$
+\text{page_size} = \text{block_size} \times \text{num_hidden_layers} \times \text{kv_hidden_size}
+$$
+
+However, in hybrid models, `num_hidden_layers` varies by attention type, which would normally produce mismatched page sizes. The cases below show how we unify them.
+
+### Case 1: toy model
+
+Let's start with a toy example: a model has 1 full attention layer and 3 sliding window attention layers. All layers have the same `kv_hidden_size`.
+
+We let each block hold `block_size` tokens for one layer, so:
+
+$$
+\text{page_size} = \text{kv_hidden_size} \times \text{block_size}
+$$
+
+[KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager] allocates a different number of blocks to each layer.
+
+This case is only a toy example. For real models, please refer to the following cases.
+
+### Case 2: same `kv_hidden_size` and a regular pattern
+
+When the model has more layers, e.g., 20 sliding window attention layers and 10 full attention layers with the same `kv_hidden_size`, calling the allocator once per layer (30 calls) still works but is inefficient. As a solution, we group the allocation of layers that need the same number of blocks to reduce the number of calls.
+
+The grouping is feasible because there is usually a beautiful ratio between the numbers of layers of the different types. For example:
+
+- Gemma-2: 1 sw : 1 full
+- Llama 4: 3 local : 1 full
+
+Our example can be regarded as 2 sw : 1 full. We can allocate blocks as if there were 2 sw and 1 full in the model, and repeat the result 10 times to generate the `block_ids` for the 30 layers. The page size becomes:
+
+$$
+10 \times \text{kv_hidden_size} \times \text{block_size}
+$$
+
+Assume `block_size` is 16, the sliding window size is 32, and the request length is 112. Then, for the above example model, we need to allocate 11 blocks (0-6 for full, 7-8 for sw group 1, 9-10 for sw group 2).
+
+
+
+Here, "/" denotes no block needed (sliding‑window layers don't need slots for early tokens).
+
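+A minimal sketch that reproduces these numbers (the helper names are illustrative only, not the actual vLLM API):
+
+```python
+import math
+
+def blocks_for_full_attn(num_tokens: int, block_size: int) -> int:
+    # Full attention keeps slots for all tokens.
+    return math.ceil(num_tokens / block_size)
+
+def blocks_for_sliding_window(num_tokens: int, block_size: int, window: int) -> int:
+    # Only the blocks covering the last `window` tokens are needed.
+    first_needed_block = max(num_tokens - window, 0) // block_size
+    return math.ceil(num_tokens / block_size) - first_needed_block
+
+full = blocks_for_full_attn(112, 16)         # 7 blocks (ids 0-6)
+sw = blocks_for_sliding_window(112, 16, 32)  # 2 blocks per sliding window group
+print(full + 2 * sw)                         # 11 blocks in total
+```
+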
+See the formal definition below. The layers are divided into multiple *KV Cache Groups* such that:
+
+1. **Identical attention type inside each group**: Each group only contains layers with the same attention type, which therefore need the same number of blocks for a given request. This enables layers in the same group to share the same block ids without wasting memory.
+2. **Identical page size across groups**: Because our memory pool only has one page size.
+
+Our example model is divided into 3 KV cache groups:
+
+- Group 0: 10 full attention layers (full.0 - full.9)
+- Group 1: 10 sliding window attention layers (sw.0 - sw.9)
+- Group 2: 10 sliding window attention layers (sw.10 - sw.19)
+
+Obviously, it satisfies rule 1. For rule 2, all 3 groups have
+
+$$
+10 \times \text{kv_hidden_size} \times \text{block_size}
+$$
+
+as their page size.
+
+### Case 3: same `kv_hidden_size` and no regular pattern
+
+Unfortunately, not all models have such a beautiful ratio, and the approach in Case 2 can produce too many small groups. For example, Gemma-3-27b has 52 sliding window attention layers and 10 full attention layers. With the constraints in Case 2, it would be split into 26 sliding window groups and 5 full attention groups, each containing 2 layers, so allocation would still be inefficient. To reduce the number of KV cache groups, we instead group layers using the smallest layer count among all attention types, e.g., min(52, 10) = 10 layers per group in Gemma-3-27b. The grouping result is then:
+
+- Group 0: 10 full attention layers (full.0 - full.9)
+- Group 1: 10 sliding window attention layers (sw.0 - sw.9)
+- Group 2: 10 sliding window attention layers (sw.10 - sw.19)
+- ...
+- Group 5: 10 sliding window attention layers (sw.40 - sw.49)
+- Group 6: 2 sliding window attention layers (sw.50 - sw.51) and 8 padding layers
+
+We will update this algorithm if this heuristic leads to a bad result when a new model comes out (e.g., for 20 full + 30 sw, the group size should be 10 instead of 20).
+
+This case happens in the Gemma-3 series, and in models from Case 2 that use EAGLE speculative decoding, which introduces one extra full attention layer. The solution has some memory waste and is not perfect. Please report any cases where the padding overhead becomes unacceptable so we can refine the algorithm.
+
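+A minimal sketch of this heuristic (the function and the layer-count dict are illustrative only, not the actual vLLM API):
+
+```python
+import math
+
+def make_groups(layers_by_type: dict[str, int]) -> list[tuple[str, int, int]]:
+    # Group size = the smallest layer count among all attention types.
+    group_size = min(layers_by_type.values())
+    groups = []
+    for attn_type, count in layers_by_type.items():
+        for _ in range(math.ceil(count / group_size)):
+            real = min(group_size, count)
+            groups.append((attn_type, real, group_size - real))  # (type, real layers, padding)
+            count -= real
+    return groups
+
+print(make_groups({"full": 10, "sw": 52}))
+# [('full', 10, 0), ('sw', 10, 0), ..., ('sw', 2, 8)] -> 7 groups in total
+```
+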
+### Case 4: different `kv_hidden_size` (mainly hybrid mamba models)
+
+Some architectures (e.g., Bamba, Jamba, Minimax) interleave standard attention layers with Mamba layers, where each Mamba layer's state size per token can be much larger than the attention layers' `kv_hidden_size`. Because we only support a single page size across all groups, we must reconcile these differing hidden sizes.
+
+The current algorithm is:
+
+1. Increase the `block_size` of attention layers until
+ $$
+ \text{block_size} \times \text{kv_hidden_size}_{\text{att}} \ge \text{state_size}_{\text{mamba}}
+ $$
+2. Pad the mamba state per layer to
+ $$
+ \text{block_size} \times \text{kv_hidden_size}_{\text{att}}
+ $$
+3. Apply the grouping strategy in case 3.
+
+!!! note
+ This can lead to more than 400 `block_size` for attention layers, which is too large. Another padding strategy is to increase `block_size` until
+
+ $$
+ \text{block_size} \times \text{kv_hidden_size}_{\text{att}} \times \text{num_attn_layers} \ge \text{state_size}_{\text{mamba}}
+ $$
+
+ This padding strategy is still a work in progress.
+
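+For illustration only (the sizes here are hypothetical): if a Mamba layer's per-token state is 200 KiB while the attention layers' `kv_hidden_size` is 512 bytes, step 1 raises the attention `block_size` to at least `204800 / 512 = 400` tokens, which is the situation the note above refers to.
+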
+### Case 5: KV sharing
+
+KV sharing refers to a layer reusing the KV cache of another layer, e.g., gemma-3n.
+In these models, [KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager] ignores all KV-sharing layers and only allocates KV cache for the layers that need it; some patches in the model runner then apply the allocation result to the KV-sharing layers.
+
+## Prefix caching
+
+For simplicity, we assume `block_size=1` in this section.
+
+### High level idea
+
+The block pool uses a dict similar to `tuple(block_hash, group_id) -> block` to cache the full blocks. That means the same tokens in different groups are cached and evicted independently.
+
+When a new request comes in, we check the cache hit prefix of each group and return the intersection across groups as the cached prefix of the request. See below for the detailed algorithm for checking the cache hit of one group and performing the intersection.
+
+### Case 0: full attention only models
+
+For full attention layers, blocks are allocated for all tokens in the request. For details on the underlying design, see [Prefix Caching](prefix_caching.md).
+
+To find the longest cache hit prefix of a request, we enumerate from left (the first block) to right (the last block), checking whether each block is cached, and exit on the first cache miss. For example, we return the first 7 tokens (0-6) as the cache hit prefix in the example below (blue blocks are cached):
+
+
+
+### Case 1: sliding window attention only models
+
+For sliding window attention layers, a naive memory allocation scheme is to allocate `sliding_window_size` blocks and fill them in a round-robin way. However, this naive implementation is not compatible with prefix caching, so we didn't pick this design. In vLLM, we allocate different blocks for different tokens and free the blocks that fall outside the sliding window.
+
+For a new request, the cache hit prefix only requires the last `sliding_window_size - 1` tokens to be cached.
+Let's say `sliding_window_size = 4` and `block_size = 1`, and the request is a 15-token prompt (blue blocks are cached):
+
+
+
+There are 3 possible cache hit prefixes:
+
+- cache hit length 5, compute prefill with [2, 3, 4] → [5, 6, …, 14]
+- cache hit length 6, compute prefill with [3, 4, 5] → [6, 7, …, 14]
+- cache hit length 14, compute prefill with [11, 12, 13] → [14] (most efficient)
+
+We can check the cache hit from right to left and exit early when we find a match. This is the opposite of full attention, where we check from left to right and exit early when the match fails. One potential downside (compared to full attention) is that we end up iterating over the entire list of tokens when there is no match, which is a common case. This could cause non-negligible overheads, but it is fine for full + swa, as discussed below.
+
+### Case 2: sliding window attention + full attention models
+
+The first problem is how to find the cache hit prefix. We need to "intersect" the cache hits of the full attention and sliding window attention layers by:
+
+1. Get the longest cache hit for full attention (scanning from left to right)
+2. Get the longest cache hit for sliding window attention that is within that length. Implemented by checking cache hits from right to left starting from the cache hit length of full attention.
+
+It can be ensured that the resulting cache hit of sliding window attention layers is also a cache hit of full attention layers. This is more efficient than finding all possible prefixes of each group and doing the intersection, because our approach can exit early if there is no cache hit.
+
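+A minimal sketch of this two-step intersection, assuming `block_size = 1` as in this section (the cached-position sets and the function name are illustrative only, not the actual vLLM API):
+
+```python
+def longest_cache_hit(cached_full: set[int], cached_sw: set[int],
+                      num_tokens: int, sliding_window_size: int) -> int:
+    # Step 1: full attention, scan left to right and stop at the first miss.
+    full_hit = 0
+    while full_hit < num_tokens and full_hit in cached_full:
+        full_hit += 1
+
+    # Step 2: sliding window, scan right to left starting from `full_hit`, and
+    # accept the first prefix whose last (sliding_window_size - 1) tokens are cached.
+    for hit_len in range(full_hit, 0, -1):
+        window = range(max(hit_len - (sliding_window_size - 1), 0), hit_len)
+        if all(pos in cached_sw for pos in window):
+            return hit_len
+    return 0
+```
+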
+The algorithm applies to models with exactly two attention types, full attention + X, where X can be an arbitrary efficient attention algorithm such as sliding window, Llama 4 local attention, or Mamba. It doesn't support models without full attention layers, or models with more than 2 types of attention. This is enough for most hybrid models at the time of writing this doc.
+
+The second question is the cache eviction policy. For now, we use one LRU queue for all kv cache groups. The blocks are added to the LRU queue when freed, either because the request is finished or the block is out of the sliding window.
+
+### Case 3: mamba models
+
+Prefix caching support for Mamba models is a work in progress. Once implemented, models with Mamba layers + full attention layers can be supported via the full attention + X algorithm in Case 2.
+
+## Implementation
+
+### Overview
+
+
+
+The `KVCacheManager` is organized into 3 layers:
+
+- **[KVCacheManager][vllm.v1.core.kv_cache_manager.KVCacheManager]**: The interface between the scheduler and kv cache management system.
+- **[KVCacheCoordinator][vllm.v1.core.kv_cache_coordinator.KVCacheCoordinator]**: Coordinates the per-group SingleTypeKVCacheManagers to generate the allocation result of a request. Depending on the model's configuration, one of these coordinators is chosen:
+ - **[KVCacheCoordinatorNoPrefixCache][vllm.v1.core.kv_cache_coordinator.KVCacheCoordinatorNoPrefixCache]**: Used when prefix caching is disabled.
+ - **[UnitaryKVCacheCoordinator][vllm.v1.core.kv_cache_coordinator.UnitaryKVCacheCoordinator]**: If only one KV cache group. The prefix caching logic is simplified as no intersection is needed.
+ - **[HybridKVCacheCoordinator][vllm.v1.core.kv_cache_coordinator.HybridKVCacheCoordinator]**: Handles exactly two KV cache groups (must include one full‑attention group plus one other efficient‑attention group). Other cases are not implemented. You can disable prefix caching to use the KVCacheCoordinatorNoPrefixCache.
+- **[SingleTypeKVCacheManager][vllm.v1.core.single_type_kv_cache_manager.SingleTypeKVCacheManager]**: Each instance manages allocation and prefix caching for one KV cache group, implementing the attention‑type–specific logic (e.g., full attention, sliding window, Mamba).
+
+The blue box in the above figure shows the case with 10 full attention layers and 20 sliding window attention layers, thus:
+
+- use `HybridKVCacheCoordinator`
+- use 1 `FullAttentionManager` and 2 `SlidingWindowManager` for the 3 `KVCacheGroup`s.
+
+### Memory Layout
+
+For a model with n `KVCacheGroup`s, each with m layers, we allocate m buffers. Each buffer is shared by n layers, one from each group.
+
+The following figure is for a model with 10 full attention layers (full.0 - full.9) and 20 sliding window attention layers (sw.0 - sw.19). It follows "Case 2" in the "Allocation" section and is divided into 3 groups:
+
+- Group 0: 10 full attention layers (full.0 - full.9)
+- Group 1: 10 sliding window attention layers (sw.0 - sw.9)
+- Group 2: 10 sliding window attention layers (sw.10 - sw.19)
+
+And for a request, we allocate 11 blocks with `block_id` 0-6 to group 0, 7-8 to group 1, and 9-10 to group 2.
+
+With such an example, the physical memory is divided into 10 buffers (`KVCacheTensor` 0 - `KVCacheTensor` 9). Each buffer is shared by 3 layers (e.g., `KVCacheTensor` 0 is shared by full.0 from group 0, sw.0 from group 1, and sw.10 from group 2) and is divided into pieces of size `block_size * kv_hidden_size`. The KV caches of these 3 attention layers are saved to different pieces of the buffer based on the allocated `block_ids`:
+
+
+
+!!! note
+ One logic "block" is mapped to 10 pieces in the 10 buffers of the physical memory.
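+
+A minimal sketch of how a block id maps to an offset inside each shared buffer (the function name is illustrative only, not the actual vLLM API):
+
+```python
+def block_offset_bytes(block_id: int, block_size: int, kv_hidden_size: int) -> int:
+    # Each buffer is divided into pieces of block_size * kv_hidden_size bytes;
+    # logical block `block_id` occupies the piece at this offset in every buffer.
+    return block_id * block_size * kv_hidden_size
+
+# For the request above, group 0 (full.*) writes into pieces 0-6 of each buffer,
+# group 1 (sw.0-9) into pieces 7-8, and group 2 (sw.10-19) into pieces 9-10.
+```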
diff --git a/docs/examples/README.md b/docs/examples/README.md
index 34e4dfd408a20..3cf93027f4209 100644
--- a/docs/examples/README.md
+++ b/docs/examples/README.md
@@ -2,6 +2,6 @@
vLLM's examples are split into three categories:
-- If you are using vLLM from within Python code, see [Offline Inference](./offline_inference/)
-- If you are using vLLM from an HTTP application or client, see [Online Serving](./online_serving/)
-- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see [Others](./others/)
+- If you are using vLLM from within Python code, see [Offline Inference](./offline_inference)
+- If you are using vLLM from an HTTP application or client, see [Online Serving](./online_serving)
+- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see [Others](./others)
diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md
index e18c128f30fc9..4605ba7781ed4 100644
--- a/docs/features/quantization/README.md
+++ b/docs/features/quantization/README.md
@@ -4,7 +4,6 @@ Quantization trades off model precision for smaller memory footprint, allowing l
Contents:
-- [Supported Hardware](supported_hardware.md)
- [AutoAWQ](auto_awq.md)
- [AutoRound](auto_round.md)
- [BitsAndBytes](bnb.md)
@@ -19,3 +18,50 @@ Contents:
- [AMD Quark](quark.md)
- [Quantized KV Cache](quantized_kvcache.md)
- [TorchAO](torchao.md)
+
+## Supported Hardware
+
+The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
+
+
+
+| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | AWS Neuron | Google TPU |
+|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------|
+| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ |
+| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ |
+| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ |
+| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ |
+| BitBLAS | ✅︎ | ✅ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+| BitBLAS (GPTQ) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
+| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | ❌ |
+
+- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
+- ✅︎ indicates that the quantization method is supported on the specified hardware.
+- ❌ indicates that the quantization method is not supported on the specified hardware.
+
+!!! note
+ This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
+
+ For the most up-to-date information on hardware support and quantization methods, please refer to or consult with the vLLM development team.
diff --git a/docs/features/quantization/bitblas.md b/docs/features/quantization/bitblas.md
index 6f53a448ee364..53b689ad53ff6 100644
--- a/docs/features/quantization/bitblas.md
+++ b/docs/features/quantization/bitblas.md
@@ -5,7 +5,7 @@ vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more effic
!!! note
Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`).
Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper.
- For details see [supported hardware](supported_hardware.md).
+ For details see [supported hardware](README.md#supported-hardware).
Below are the steps to utilize BitBLAS with vLLM.
diff --git a/docs/features/quantization/supported_hardware.md b/docs/features/quantization/supported_hardware.md
deleted file mode 100644
index 06264d08b56aa..0000000000000
--- a/docs/features/quantization/supported_hardware.md
+++ /dev/null
@@ -1,32 +0,0 @@
-# Supported Hardware
-
-The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
-
-
-
-| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | AWS Neuron | Google TPU |
-|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------|
-| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ |
-| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ |
-| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ |
-| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ |
-| BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
-| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | ❌ |
-
-- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
-- ✅︎ indicates that the quantization method is supported on the specified hardware.
-- ❌ indicates that the quantization method is not supported on the specified hardware.
-
-!!! note
- This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
-
- For the most up-to-date information on hardware support and quantization methods, please refer to or consult with the vLLM development team.
diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md
index 0ee680f5c688c..8a658b7a9103f 100644
--- a/docs/getting_started/installation/README.md
+++ b/docs/getting_started/installation/README.md
@@ -12,7 +12,6 @@ vLLM supports the following hardware platforms:
- [Apple silicon](cpu.md#apple-silicon)
- [IBM Z (S390X)](cpu.md#ibm-z-s390x)
- [Google TPU](google_tpu.md)
-- [Intel Gaudi](intel_gaudi.md)
- [AWS Neuron](aws_neuron.md)
## Hardware Plugins
diff --git a/docs/getting_started/installation/intel_gaudi.md b/docs/getting_started/installation/intel_gaudi.md
deleted file mode 100644
index ff912efec9ca8..0000000000000
--- a/docs/getting_started/installation/intel_gaudi.md
+++ /dev/null
@@ -1,388 +0,0 @@
-# Intel Gaudi
-
-This page provides instructions on running vLLM with Intel Gaudi devices.
-
-!!! warning
- There are no pre-built wheels or images for this device, so you must build vLLM from source.
-
-## Requirements
-
-- OS: Ubuntu 22.04 LTS
-- Python: 3.10
-- Intel Gaudi accelerator
-- Intel Gaudi software version 1.18.0
-
-Please follow the instructions provided in the
-[Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html)
-to set up the execution environment. To achieve the best performance,
-please follow the methods outlined in the
-[Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html).
-
-## Configure a new environment
-
-### Environment verification
-
-To verify that the Intel Gaudi software was correctly installed, run:
-
-```bash
-hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
-apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
-pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
-pip list | grep neural # verify that neural_compressor_pt is installed
-```
-
-Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade)
-for more details.
-
-### Run Docker Image
-
-It is highly recommended to use the latest Docker image from Intel Gaudi
-vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers)
-for more details.
-
-Use the following commands to run a Docker image:
-
-```bash
-docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
-docker run \
- -it \
- --runtime=habana \
- -e HABANA_VISIBLE_DEVICES=all \
- -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
- --cap-add=sys_nice \
- --net=host \
- --ipc=host \
- vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
-```
-
-## Set up using Python
-
-### Pre-built wheels
-
-Currently, there are no pre-built Intel Gaudi wheels.
-
-### Build wheel from source
-
-To build and install vLLM from source, run:
-
-```bash
-git clone https://github.com/vllm-project/vllm.git
-cd vllm
-pip install -r requirements/hpu.txt
-python setup.py develop
-```
-
-Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following:
-
-```bash
-git clone https://github.com/HabanaAI/vllm-fork.git
-cd vllm-fork
-git checkout habana_main
-pip install -r requirements/hpu.txt
-python setup.py develop
-```
-
-## Set up using Docker
-
-### Pre-built images
-
-Currently, there are no pre-built Intel Gaudi images.
-
-### Build image from source
-
-```bash
-docker build -f docker/Dockerfile.hpu -t vllm-hpu-env .
-docker run \
- -it \
- --runtime=habana \
- -e HABANA_VISIBLE_DEVICES=all \
- -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
- --cap-add=sys_nice \
- --net=host \
- --rm vllm-hpu-env
-```
-
-!!! tip
- If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered.
-
-## Extra information
-
-### Supported features
-
-- [Offline inference](../../serving/offline_inference.md)
-- Online serving via [OpenAI-Compatible Server](../../serving/openai_compatible_server.md)
-- HPU autodetection - no need to manually select device within vLLM
-- Paged KV cache with algorithms enabled for Intel Gaudi accelerators
-- Custom Intel Gaudi implementations of Paged Attention, KV cache ops,
- prefill attention, Root Mean Square Layer Normalization, Rotary
- Positional Encoding
-- Tensor parallelism support for multi-card inference
-- Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html)
- for accelerating low-batch latency and throughput
-- Attention with Linear Biases (ALiBi)
-- INC quantization
-
-### Unsupported features
-
-- Beam search
-- LoRA adapters
-- AWQ quantization
-- Prefill chunking (mixed-batch inferencing)
-
-### Supported configurations
-
-The following configurations have been validated to function with
-Gaudi2 devices. Configurations that are not listed may or may not work.
-
-| Model | TP Size| dtype | Sampling |
-|-------|--------|--------|----------|
-| [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) | 1, 2, 8 | BF16 | Random / Greedy |
-| [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1, 2, 8 | BF16 | Random / Greedy |
-| [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 1, 2, 8 | BF16 | Random / Greedy |
-| [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | 1, 2, 8 | BF16 | Random / Greedy |
-| [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) | 1, 2, 8 | BF16 | Random / Greedy |
-| [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | 1, 2, 8 | BF16 | Random / Greedy |
-| [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) | 8 | BF16 | Random / Greedy |
-| [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 8 | BF16 | Random / Greedy |
-| [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 8 | BF16 | Random / Greedy |
-| [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 8 | BF16 | Random / Greedy |
-| [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) | 8 | BF16 | Random / Greedy |
-| [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 8 | BF16 | Random / Greedy |
-
-## Performance tuning
-
-### Execution modes
-
-Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag.
-
-| `PT_HPU_LAZY_MODE` | `enforce_eager` | execution mode |
-|----------------------|-------------------|--------------------|
-| 0 | 0 | torch.compile |
-| 0 | 1 | PyTorch eager mode |
-| 1 | 0 | HPU Graphs |
-
-!!! warning
- In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
-
-[](){ #gaudi-bucketing-mechanism }
-
-### Bucketing mechanism
-
-Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution.
-In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`.
-
-!!! note
- Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase.
-
-Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup:
-
-```text
-INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
-INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
-INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
-INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
-```
-
-| Parameter | Description |
-|----------------|-----------------------------------------------------------------------------|
-| `min` | Determines the lowest value of the bucket. |
-| `step` | Determines the interval between buckets. |
-| `max` | Determines the upper bound of the bucket. |
-| Ramp-up phase | A special handling phase applied between `min` and `step`:
- `min` is multiplied by consecutive powers of two until `step` is reached.
- Minimizes resource wastage for small batch sizes.
- Allows larger padding for larger batches. |
-
-Example (with ramp-up):
-
-```text
-min = 2, step = 32, max = 64
-=> ramp_up = (2, 4, 8, 16)
-=> stable = (32, 64)
-=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64)
-```
-
-Example (without ramp-up):
-
-```text
-min = 128, step = 128, max = 512
-=> ramp_up = ()
-=> stable = (128, 256, 384, 512)
-=> buckets = ramp_up + stable => (128, 256, 384, 512)
-```
-
-In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket.
-
-!!! warning
- If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario.
-
-As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket.
-
-!!! note
- Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
-
-### Warmup
-
-Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup:
-
-??? console "Logs"
-
- ```text
- INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB
- INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB
- INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB
- ...
- INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
- INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB
- INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB
- INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB
- ...
- INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB
- INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
- ```
-
-This example uses the same buckets as in the [Bucketing Mechanism][gaudi-bucketing-mechanism] section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
-
-!!! tip
- Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment.
-
-### HPU Graph capture
-
-[HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management.
-
-When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by `gpu_memory_utilization` flag (`0.9` by default).
-Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage.
-Only after that, `gpu_memory_utilization` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable.
-Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured.
-Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of memory reserved for HPU Graphs capture.
-With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache.
-Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints.
-Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs.
-
-!!! note
- `gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory.
-
-User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented:
-
-- `max_bs` - graph capture queue will be sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode
-- `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt
-
-When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy.
-
-!!! note
- `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt to do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below.
-
-Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
-
-??? console "Logs"
-
- ```text
- INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
- INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
- INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
- INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
- INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
- INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used)
- INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
- INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used)
- INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache
- INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0
- INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used)
- INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB
- ...
- INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
- INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3)
- INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
- ...
- INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB
- INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB
- ...
- INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB
- INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB
- INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB
- INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB
- INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB
- INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)]
- INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
- INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory
- INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used)
- ```
-
-### Recommended vLLM Parameters
-
-- We recommend running inference on Gaudi 2 with a `block_size` of 128
-  for the BF16 data type. Using the default values (16, 32) might lead to
-  sub-optimal performance due to Matrix Multiplication Engine
-  under-utilization (see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)).
-- For max throughput on Llama 7B, we recommend running with a batch size
-  of 128 or 256 and a max context length of 2048 with HPU Graphs enabled;
-  a launch sketch follows this list. If you encounter out-of-memory
-  issues, see the troubleshooting section.
-
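
As a concrete illustration of the recommendations above (an editorial sketch, not part of the original guide; the model name is only an example), an offline run with these settings could look like this:

```python
from vllm import LLM, SamplingParams

# Illustrative offline launch using the recommended Gaudi 2 settings:
# block_size=128 for BF16, batch size 128, max context length 2048.
llm = LLM(
    model="meta-llama/Llama-2-7b-hf",  # example model only
    dtype="bfloat16",
    block_size=128,
    max_num_seqs=128,
    max_model_len=2048,
)

outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```
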
-### Environment variables
-
-**Diagnostic and profiling knobs:**
-
-- `VLLM_PROFILER_ENABLED`: If `true`, enable the high-level profiler. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). `false` by default.
-- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: If `true`, log graph compilations for each vLLM engine step when any occurs. Highly recommended to use with `PT_HPU_METRICS_GC_DETAILS=1`. `false` by default.
-- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: If `true`, always log graph compilations for each vLLM engine step even if none occurred. `false` by default.
-- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: If `true`, log CPU fallbacks for each vLLM engine step when any occurs. `false` by default.
-- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: If `true`, always log CPU fallbacks for each vLLM engine step even if none occurred. `false` by default.
-
-**Performance tuning knobs:**
-
-- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped; `false` by default
-
-- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated to HPUGraph capture, `0.1` by default
-
-- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory dedicated to prompt graphs, `0.3` by default
-
-- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining the order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default
-
-- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining the order of decode graph capture, `min_tokens` or `max_bs`, `max_bs` by default
-
-- `VLLM_{phase}_{dim}_BUCKET_{param}`: a collection of 12 environment variables that configure the ranges of the bucketing mechanism (an override sketch follows the table below)
-
- - `{phase}` is either `PROMPT` or `DECODE`
-
- - `{dim}` is either `BS`, `SEQ` or `BLOCK`
-
- - `{param}` is either `MIN`, `STEP` or `MAX`
-
- - Default values:
-
-| `{phase}` | Parameter | Env Variable | Value Expression |
-|-----------|-----------|--------------|------------------|
-| Prompt | Batch size min | `VLLM_PROMPT_BS_BUCKET_MIN` | `1` |
-| Prompt | Batch size step | `VLLM_PROMPT_BS_BUCKET_STEP` | `min(max_num_seqs, 32)` |
-| Prompt | Batch size max | `VLLM_PROMPT_BS_BUCKET_MAX` | `min(max_num_seqs, 64)` |
-| Prompt | Sequence length min | `VLLM_PROMPT_SEQ_BUCKET_MIN` | `block_size` |
-| Prompt | Sequence length step | `VLLM_PROMPT_SEQ_BUCKET_STEP` | `block_size` |
-| Prompt | Sequence length max | `VLLM_PROMPT_SEQ_BUCKET_MAX` | `max_model_len` |
-| Decode | Batch size min | `VLLM_DECODE_BS_BUCKET_MIN` | `1` |
-| Decode | Batch size step | `VLLM_DECODE_BS_BUCKET_STEP` | `min(max_num_seqs, 32)` |
-| Decode | Batch size max | `VLLM_DECODE_BS_BUCKET_MAX` | `max_num_seqs` |
-| Decode | Sequence length min | `VLLM_DECODE_BLOCK_BUCKET_MIN` | `block_size` |
-| Decode | Sequence length step | `VLLM_DECODE_BLOCK_BUCKET_STEP` | `block_size` |
-| Decode | Sequence length max | `VLLM_DECODE_BLOCK_BUCKET_MAX` | `max(128, (max_num_seqs*max_model_len)/block_size)` |
-
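
As an illustration (not part of the original table), these variables can be overridden in the environment before the engine is created; the values below are examples only:

```python
import os

# Example overrides of the bucketing ranges; values are illustrative only
# and must be set before the vLLM engine is initialized.
os.environ["VLLM_PROMPT_BS_BUCKET_MAX"] = "16"       # cap prompt batch-size buckets at 16
os.environ["VLLM_PROMPT_SEQ_BUCKET_MAX"] = "2048"    # prompt sequence buckets up to 2048 tokens
os.environ["VLLM_DECODE_BLOCK_BUCKET_STEP"] = "256"  # coarser decode block-count buckets

from vllm import LLM  # imported after the environment is set

llm = LLM(model="meta-llama/Llama-2-7b-hf", block_size=128)  # example model only
```
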
-Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:
-
-- `PT_HPU_LAZY_MODE`: if `0`, the PyTorch Eager backend for Gaudi will be used; if `1`, the PyTorch Lazy backend for Gaudi will be used. `1` is the default.
-- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs.
-
-## Troubleshooting: tweaking HPU graphs
-
-If you experience device out-of-memory issues or want to attempt
-inference at higher batch sizes, try tweaking HPU Graphs as follows:
-
-- Tweak the `gpu_memory_utilization` knob. Lowering it decreases the
-  KV cache allocation, leaving some headroom for capturing graphs with
-  a larger batch size. By default `gpu_memory_utilization` is set to
-  0.9, which attempts to allocate ~90% of the HBM left after the short
-  profiling run for the KV cache. Note that decreasing it reduces the
-  number of KV cache blocks available, and therefore the effective
-  maximum number of tokens you can handle at a given time.
-- If this method is not sufficient, you can disable `HPUGraph`
-  completely. With HPU Graphs disabled, you trade latency and
-  throughput at lower batch sizes for potentially higher throughput at
-  higher batch sizes. You can do that by adding the `--enforce-eager`
-  flag to the server (for online serving), or by passing the
-  `enforce_eager=True` argument to the LLM constructor (for offline
-  inference), as sketched below.
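
A minimal sketch of both options (an editorial example, not from the original guide; the model name is a placeholder):

```python
from vllm import LLM

# Option 1: keep HPU Graphs but leave more headroom for capturing them
# by lowering gpu_memory_utilization from its 0.9 default.
llm = LLM(
    model="meta-llama/Llama-2-7b-hf",  # example model only
    gpu_memory_utilization=0.7,
)

# Option 2: disable graphs entirely and run eagerly instead
# (uncomment to use; equivalent to passing --enforce-eager to the server).
# llm = LLM(model="meta-llama/Llama-2-7b-hf", enforce_eager=True)
```
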
diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py
index ed5d3b0092ae7..051a2d904406d 100644
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import importlib
import logging
import sys
from argparse import SUPPRESS, HelpFormatter
@@ -7,25 +8,52 @@ from pathlib import Path
from typing import Literal
from unittest.mock import MagicMock, patch
+from pydantic_core import core_schema
+
+logger = logging.getLogger("mkdocs")
+
ROOT_DIR = Path(__file__).parent.parent.parent.parent
ARGPARSE_DOC_DIR = ROOT_DIR / "docs/argparse"
sys.path.insert(0, str(ROOT_DIR))
-sys.modules["aiohttp"] = MagicMock()
-sys.modules["blake3"] = MagicMock()
sys.modules["vllm._C"] = MagicMock()
-from vllm.benchmarks import latency # noqa: E402
-from vllm.benchmarks import serve # noqa: E402
-from vllm.benchmarks import throughput # noqa: E402
-from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402
-from vllm.entrypoints.cli.openai import ChatCommand # noqa: E402
-from vllm.entrypoints.cli.openai import CompleteCommand # noqa: E402
-from vllm.entrypoints.openai import cli_args # noqa: E402
-from vllm.entrypoints.openai import run_batch # noqa: E402
-from vllm.utils import FlexibleArgumentParser # noqa: E402
-logger = logging.getLogger("mkdocs")
+class PydanticMagicMock(MagicMock):
+ """`MagicMock` that's able to generate pydantic-core schemas."""
+
+ def __get_pydantic_core_schema__(self, source_type, handler):
+ return core_schema.any_schema()
+
+
+def auto_mock(module, attr, max_mocks=50):
+ """Function that automatically mocks missing modules during imports."""
+ logger.info("Importing %s from %s", attr, module)
+ for _ in range(max_mocks):
+ try:
+ # First treat attr as an attr, then as a submodule
+ return getattr(importlib.import_module(module), attr,
+ importlib.import_module(f"{module}.{attr}"))
+ except importlib.metadata.PackageNotFoundError as e:
+ raise e
+ except ModuleNotFoundError as e:
+ logger.info("Mocking %s for argparse doc generation", e.name)
+ sys.modules[e.name] = PydanticMagicMock()
+
+ raise ImportError(
+ f"Failed to import {module}.{attr} after mocking {max_mocks} imports")
+
+
+latency = auto_mock("vllm.benchmarks", "latency")
+serve = auto_mock("vllm.benchmarks", "serve")
+throughput = auto_mock("vllm.benchmarks", "throughput")
+AsyncEngineArgs = auto_mock("vllm.engine.arg_utils", "AsyncEngineArgs")
+EngineArgs = auto_mock("vllm.engine.arg_utils", "EngineArgs")
+ChatCommand = auto_mock("vllm.entrypoints.cli.openai", "ChatCommand")
+CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand")
+cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
+run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
+FlexibleArgumentParser = auto_mock("vllm.utils", "FlexibleArgumentParser")
class MarkdownFormatter(HelpFormatter):
diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py
index 1e8b848db46d8..881df791698e2 100644
--- a/docs/mkdocs/hooks/generate_examples.py
+++ b/docs/mkdocs/hooks/generate_examples.py
@@ -70,6 +70,10 @@ class Example:
self.other_files = self.determine_other_files()
self.title = self.determine_title()
+ @property
+ def is_code(self) -> bool:
+ return self.main_file.suffix != ".md"
+
def determine_main_file(self) -> Path:
"""
Determines the main file in the given path.
@@ -101,6 +105,12 @@ class Example:
return [file for file in self.path.rglob("*") if is_other_file(file)]
def determine_title(self) -> str:
+ if not self.is_code:
+ with open(self.main_file) as f:
+ first_line = f.readline().strip()
+ match = re.match(r'^#\s+(?P<title>.+)$', first_line)
+ if match:
+ return match.group('title')
return fix_case(self.path.stem.replace("_", " ").title())
def generate(self) -> str:
@@ -110,11 +120,13 @@ class Example:
# Use long code fence to avoid issues with
# included files containing code fences too
code_fence = "``````"
- is_code = self.main_file.suffix != ".md"
- if is_code:
+ # Skip the title from md snippets as it's been included above
+ start_line = 2
+ if self.is_code:
content += f"{code_fence}{self.main_file.suffix[1:]}\n"
- content += f'--8<-- "{self.main_file}"\n'
- if is_code:
+ start_line = 1
+ content += f'--8<-- "{self.main_file}:{start_line}"\n'
+ if self.is_code:
content += f"{code_fence}\n"
content += "\n"
diff --git a/docs/mkdocs/javascript/mathjax.js b/docs/mkdocs/javascript/mathjax.js
new file mode 100644
index 0000000000000..5da0d443578c4
--- /dev/null
+++ b/docs/mkdocs/javascript/mathjax.js
@@ -0,0 +1,20 @@
+// Enables MathJax rendering
+window.MathJax = {
+ tex: {
+ inlineMath: [["\\(", "\\)"]],
+ displayMath: [["\\[", "\\]"]],
+ processEscapes: true,
+ processEnvironments: true
+ },
+ options: {
+ ignoreHtmlClass: ".*|",
+ processHtmlClass: "arithmatex"
+ }
+};
+
+document$.subscribe(() => {
+ MathJax.startup.output.clearCache()
+ MathJax.typesetClear()
+ MathJax.texReset()
+ MathJax.typesetPromise()
+})
diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md
index a64ecd31ebaef..d02522a6657de 100644
--- a/docs/models/generative_models.md
+++ b/docs/models/generative_models.md
@@ -19,7 +19,7 @@ Run a model in generation mode via the option `--runner generate`.
## Offline Inference
The [LLM][vllm.LLM] class provides various methods for offline inference.
-See [configuration](../api/summary.md#configuration) for a list of options when initializing the model.
+See [configuration](../api/README.md#configuration) for a list of options when initializing the model.
### `LLM.generate`
diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md
index 39f209d0eb7ed..fbb5f6f6dd171 100644
--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -81,7 +81,7 @@ which takes priority over both the model's and Sentence Transformers's defaults.
## Offline Inference
The [LLM][vllm.LLM] class provides various methods for offline inference.
-See [configuration](../api/summary.md#configuration) for a list of options when initializing the model.
+See [configuration](../api/README.md#configuration) for a list of options when initializing the model.
### `LLM.embed`
@@ -205,12 +205,12 @@ Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides
There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json,` it is allowed to change the output to arbitrary dimensions. Using `matryoshka_dimensions` can control the allowed output dimensions.
-For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": []}` (offline) or `--hf_overrides '{"is_matryoshka": true}'`, `--hf_overrides '{"matryoshka_dimensions": []}'`(online).
+For models that support Matryoshka Embeddings but are not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}` or `hf_overrides={"matryoshka_dimensions": []}` (offline), or `--hf-overrides '{"is_matryoshka": true}'` or `--hf-overrides '{"matryoshka_dimensions": []}'` (online).
Here is an example to serve a model with Matryoshka Embeddings enabled.
```text
-vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"matryoshka_dimensions":[256]}'
+vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}'
```
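
For offline inference, the same override can be passed directly to the `LLM` constructor; a minimal sketch (illustrative only):

```python
from vllm import LLM

# Offline counterpart of the serving command above: override the config so
# the allowed Matryoshka output dimensions are restricted to 256.
llm = LLM(
    model="Snowflake/snowflake-arctic-embed-m-v1.5",
    hf_overrides={"matryoshka_dimensions": [256]},
)

outputs = llm.embed(["Follow the white rabbit."])
print(len(outputs[0].outputs.embedding))
```
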
### Offline Inference
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 8fb1019f2bdfb..20cf75873af76 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -332,7 +332,7 @@ th {
| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | |
| `MBartForConditionalGeneration` | mBART | `facebook/mbart-large-en-ro`, `facebook/mbart-large-50`, etc. | | | |
| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ |
| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | | ✅︎ | ✅︎ |
@@ -616,6 +616,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I+ | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ |
| `DeepseekVLV2ForCausalLM`^ | DeepSeek-VL2 | T + I+ | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ |
| `DonutForConditionalGeneration`^ | Donut | T + I | `ByteDance/Dolphin`, `naver-clova-ix/donut-base-finetuned-docvqa`, etc. | | | |
+| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I+/ V+ | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | ✅︎ |
| `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | |
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ |
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
@@ -627,7 +628,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
| `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + IE+ + VE+ | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | ✅︎ |
| `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
@@ -637,7 +639,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ |
| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ |
| `MiniCPMO` | MiniCPM-O | T + IE+ + VE+ + AE+ | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, etc. | ✅︎ | | ✅︎ |
+| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ |
| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ |
| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `MllamaForConditionalGeneration` | Llama 3.2 | T + I+ | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | |
@@ -701,7 +703,7 @@ Some models are supported only via the [Transformers backend](#transformers). Th
- There's no PLE caching or out-of-memory swapping support, as described in [Google's blog](https://developers.googleblog.com/en/introducing-gemma-3n/). These features might be too model-specific for vLLM, and swapping in particular may be better suited for constrained setups.
!!! note
- Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently.
+ For `InternVLChatModel`, only InternVL2.5 with a Qwen2.5 text backbone (`OpenGVLab/InternVL2.5-1B`, etc.), InternVL3, and InternVL3.5 currently support video inputs.
!!! note
To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md
index 7fc615d4c042f..20234e7611333 100644
--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@@ -107,15 +107,16 @@ to enable simultaneous generation and embedding using the same engine instance i
#### Mamba Models
Models using selective state-space mechanisms instead of standard transformer attention are supported.
-Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1.
+Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported.
+Please note that prefix caching is not yet supported for these models.
Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
-`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). Please note that
-these models currently require disabling prefix caching and using the FlashInfer attention backend in V1.
+`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`).
+Please note that prefix caching is not yet supported for these models.
Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`).
-Please note that these models currently require disabling prefix caching, enforcing eager mode, and using the FlashInfer
-attention backend in V1.
+Please note that prefix caching is not yet supported for these models.
+It is also necessary to enforce eager mode for these models in V1.
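
As a rough sketch of these constraints (an editorial example; the checkpoint name is a placeholder), such a model could be started like this in V1:

```python
from vllm import LLM, SamplingParams

# Hybrid model with non-Mamba mechanisms: run eagerly and without prefix
# caching, matching the limitations described above. Model name is a placeholder.
llm = LLM(
    model="my-org/hybrid-mamba-attention-model",
    enforce_eager=True,
    enable_prefix_caching=False,
)

print(llm.generate(["Hello"], SamplingParams(max_tokens=8))[0].outputs[0].text)
```
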
#### Encoder-Decoder Models
diff --git a/examples/offline_inference/logits_processor.py b/examples/offline_inference/logits_processor.py
index 7ef20efa7d28c..3e122319169eb 100644
--- a/examples/offline_inference/logits_processor.py
+++ b/examples/offline_inference/logits_processor.py
@@ -42,8 +42,8 @@ from vllm.config import VllmConfig
from vllm.v1.sample.logits_processor import (
BatchUpdate,
LogitsProcessor,
- MoveDirectionality,
)
+from vllm.v1.sample.logits_processor.builtin import process_dict_updates
# Hypothetical custom logits processor
@@ -53,38 +53,22 @@ class DummyLogitsProcessor(LogitsProcessor):
def __init__(
self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool
):
- self.req_info: dict[int, SamplingParams] = {}
+ self.req_info: dict[int, int] = {}
def is_argmax_invariant(self) -> bool:
"""Never impacts greedy sampling"""
return False
def update_state(self, batch_update: Optional[BatchUpdate]):
- if not batch_update:
- return
-
- # Process added requests.
- for index, params, _, _ in batch_update.added:
- assert params is not None
- if params.extra_args and (
- target_token := params.extra_args.get("target_token")
- ):
- self.req_info[index] = target_token
-
- if self.req_info:
- # Process removed requests.
- for index in batch_update.removed:
- self.req_info.pop(index, None)
-
- # Process moved requests, unidirectional move (a->b) and swap
- # (a<->b)
- for adx, bdx, direct in batch_update.moved:
- a_val = self.req_info.pop(adx, None)
- b_val = self.req_info.pop(bdx, None)
- if a_val is not None:
- self.req_info[bdx] = a_val
- if direct == MoveDirectionality.SWAP and b_val is not None:
- self.req_info[adx] = b_val
+ process_dict_updates(
+ self.req_info,
+ batch_update,
+ # This function returns the LP's per-request state based on the
+ # request details, or None if this LP does not apply to the
+ # request.
+ lambda params, _, __: params.extra_args
+ and (params.extra_args.get("target_token")),
+ )
def apply(self, logits: torch.Tensor) -> torch.Tensor:
if not self.req_info:
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 8d97ba2668263..4e879666f61d7 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -173,6 +173,37 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
)
+# Ernie4.5-VL
+def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
+ model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT"
+
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=4096,
+ max_num_seqs=5,
+ limit_mm_per_prompt={modality: 1},
+ trust_remote_code=True,
+ )
+
+ if modality == "image":
+ placeholder = "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>"
+ elif modality == "video":
+ placeholder = "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>"
+
+ prompts = [
+ (
+ f"<|begin_of_sentence|>User: {question}{placeholder}\n"
+ "Assistant: "
+ )
+ for question in questions
+ ]
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
+
+
# Florence2
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@@ -1602,6 +1633,7 @@ model_example_map = {
"chameleon": run_chameleon,
"command_a_vision": run_command_a_vision,
"deepseek_vl_v2": run_deepseek_vl2,
+ "ernie45_vl": run_ernie45_vl,
"florence2": run_florence2,
"fuyu": run_fuyu,
"gemma3": run_gemma3,
diff --git a/examples/tool_chat_template_qwen3coder.jinja b/examples/tool_chat_template_qwen3coder.jinja
new file mode 100644
index 0000000000000..49b0e8d0ee7e6
--- /dev/null
+++ b/examples/tool_chat_template_qwen3coder.jinja
@@ -0,0 +1,117 @@
+{% macro render_extra_keys(json_dict, handled_keys) %}
+ {%- if json_dict is mapping %}
+ {%- for json_key in json_dict if json_key not in handled_keys %}
+ {%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %}
+ {{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '</' ~ json_key ~ '>' }}
+ {%- else %}
+ {{-'\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '</' ~ json_key ~ '>' }}
+ {%- endif %}
+ {%- endfor %}
+ {%- endif %}
+{% endmacro %}
+
+{%- if messages[0]["role"] == "system" %}
+ {%- set system_message = messages[0]["content"] %}
+ {%- set loop_messages = messages[1:] %}
+{%- else %}
+ {%- set loop_messages = messages %}
+{%- endif %}
+
+{%- if not tools is defined %}
+ {%- set tools = [] %}
+{%- endif %}
+
+{%- if system_message is defined %}
+ {{- "<|im_start|>system\n" + system_message }}
+{%- else %}
+ {%- if tools is iterable and tools | length > 0 %}
+ {{- "<|im_start|>system\nYou are Qwen, a helpful AI assistant that can interact with a computer to solve tasks." }}
+ {%- endif %}
+{%- endif %}
+{%- if tools is iterable and tools | length > 0 %}
+ {{- "\n\n# Tools\n\nYou have access to the following functions:\n\n" }}
+ {{- "" }}
+ {%- for tool in tools %}
+ {%- if tool.function is defined %}
+ {%- set tool = tool.function %}
+ {%- endif %}
+ {{- "\n\n" ~ tool.name ~ "" }}
+ {%- if tool.description is defined %}
+ {{- '\n<description>' ~ (tool.description | trim) ~ '</description>' }}
+ {%- endif %}
+ {{- '\n<parameters>' }}
+ {%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %}
+ {%- for param_name, param_fields in tool.parameters.properties|items %}
+ {{- '\n<parameter>' }}
+ {{- '\n<name>' ~ param_name ~ '</name>' }}
+ {%- if param_fields.type is defined %}
+ {{- '\n<type>' ~ (param_fields.type | string) ~ '</type>' }}
+ {%- endif %}
+ {%- if param_fields.description is defined %}
+ {{- '\n<description>' ~ (param_fields.description | trim) ~ '</description>' }}
+ {%- endif %}
+ {%- set handled_keys = ['name', 'type', 'description'] %}
+ {{- render_extra_keys(param_fields, handled_keys) }}
+ {{- '\n</parameter>' }}
+ {%- endfor %}
+ {%- endif %}
+ {% set handled_keys = ['type', 'properties'] %}
+ {{- render_extra_keys(tool.parameters, handled_keys) }}
+ {{- '\n</parameters>' }}
+ {%- set handled_keys = ['type', 'name', 'description', 'parameters'] %}
+ {{- render_extra_keys(tool, handled_keys) }}
+ {{- '\n</function>' }}
+ {%- endfor %}
+ {{- "\n" }}
+ {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' }}
+{%- endif %}
+{%- if system_message is defined %}
+ {{- '<|im_end|>\n' }}
+{%- else %}
+ {%- if tools is iterable and tools | length > 0 %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- for message in loop_messages %}
+ {%- if message.role == "assistant" and message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %}
+ {{- '<|im_start|>' + message.role }}
+ {%- if message.content is defined and message.content is string and message.content | trim | length > 0 %}
+ {{- '\n' + message.content | trim + '\n' }}
+ {%- endif %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if tool_call.function is defined %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n<tool_call>\n<function=' ~ tool_call.name ~ '>\n' }}
+ {%- if tool_call.arguments is defined %}
+ {%- for args_name, args_value in tool_call.arguments|items %}
+ {{- '<parameter=' ~ args_name ~ '>\n' }}
+ {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
+ {{- args_value }}
+ {{- '\n</parameter>\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '</function>\n</tool_call>' }}
+ {%- endfor %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "user" or message.role == "system" or message.role == "assistant" %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.previtem and loop.previtem.role != "tool" %}
+ {{- '<|im_start|>user\n' }}
+ {%- endif %}
+ {{- '<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>\n' }}
+ {%- if not loop.last and loop.nextitem.role != "tool" %}
+ {{- '<|im_end|>\n' }}
+ {%- elif loop.last %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+{%- endif %}
diff --git a/mkdocs.yaml b/mkdocs.yaml
index 47fe1ebce9712..507a80c41e8b4 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -129,15 +129,16 @@ markdown_extensions:
- toc:
permalink: true
# For math rendering
- - mdx_math:
- enable_dollar_delimiter: true
+ - pymdownx.arithmatex:
+ generic: true
extra_css:
- mkdocs/stylesheets/extra.css
extra_javascript:
- mkdocs/javascript/run_llm_widget.js
- - https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML
+ - mkdocs/javascript/mathjax.js
+ - https://unpkg.com/mathjax@3.2.2/es5/tex-mml-chtml.js
- mkdocs/javascript/edit_and_feedback.js
- mkdocs/javascript/slack_and_forum.js
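
For reference, with `pymdownx.arithmatex` in generic mode plus the `mathjax.js` snippet above, docs pages can carry math using the configured `\( ... \)` and `\[ ... \]` delimiters; an illustrative fragment (not taken from any existing page):

```latex
The attention weights are \( \mathrm{softmax}\bigl(QK^{\top}/\sqrt{d_k}\bigr) \),
so the full operator renders as display math:
\[
  \operatorname{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right) V .
\]
```
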
diff --git a/requirements/docs.txt b/requirements/docs.txt
index a24b9c7e924bf..d1c546398780a 100644
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -7,27 +7,12 @@ mkdocs-awesome-nav
mkdocs-glightbox
mkdocs-git-revision-date-localized-plugin
mkdocs-minify-plugin
-python-markdown-math
regex
ruff
# Required for argparse hook only
-f https://download.pytorch.org/whl/cpu
cachetools
-cbor2
-cloudpickle
-fastapi
msgspec
-openai
-openai-harmony
-partial-json-parser
-pillow
-psutil
-pybase64
pydantic
-setproctitle
torch
-transformers
-zmq
-uvloop
-prometheus-client
diff --git a/requirements/test.in b/requirements/test.in
index 098a9242bc3af..92c577c501632 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -54,3 +54,4 @@ runai-model-streamer-s3==0.11.0
fastsafetensors>=0.1.10
pydantic>=2.10 # 2.9 leads to error on python 3.10
terratorch==1.1rc2 # required for PrithviMAE test
+decord==0.6.0
diff --git a/requirements/test.txt b/requirements/test.txt
index 8b872752d875c..0c27c9bb67e82 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -156,6 +156,8 @@ datasets==3.0.2
# mteb
decorator==5.1.1
# via librosa
+decord==0.6.0
+ # via -r requirements/test.in
dill==0.3.8
# via
# datasets
@@ -493,6 +495,7 @@ numpy==1.26.4
# contourpy
# cupy-cuda12x
# datasets
+ # decord
# einx
# encodec
# evaluate
diff --git a/setup.py b/setup.py
index ca6e0a8592cc2..ffe8ec4e79af7 100644
--- a/setup.py
+++ b/setup.py
@@ -694,7 +694,7 @@ setup(
"mistral_common[audio]"], # Required for audio processing
"video": [], # Kept for backwards compatibility
# FlashInfer should be updated together with the Dockerfile
- "flashinfer": ["flashinfer-python==0.2.12"],
+ "flashinfer": ["flashinfer-python==0.2.14.post1"],
# Optional deps for AMD FP4 quantization support
"petit-kernel": ["petit-kernel"],
},
diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
index 34f9389c82a9b..f3ad680b72b55 100644
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -177,3 +177,34 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
# cmp output
assert output[0].outputs[0].text == output3[0].outputs[0].text
+
+
+@create_new_process_for_each_test()
+def test_deep_sleep():
+ model = "Qwen/Qwen3-0.6B"
+ free, total = torch.cuda.mem_get_info()
+ used_bytes_baseline = total - free # in case other process is running
+ llm = LLM(model, enable_sleep_mode=True)
+ prompt = "How are you?"
+ sampling_params = SamplingParams(temperature=0, max_tokens=10)
+ output = llm.generate(prompt, sampling_params)
+
+ # Put the engine to deep sleep
+ llm.sleep(level=2)
+
+ free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
+ used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
+ assert used_bytes < 3 * GiB_bytes
+
+ llm.wake_up(tags=["weights"])
+ llm.collective_rpc("reload_weights")
+ free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
+ used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
+ assert used_bytes < 4 * GiB_bytes
+
+ # now allocate kv cache and cuda graph memory
+ llm.wake_up(tags=["kv_cache"])
+ output2 = llm.generate(prompt, sampling_params)
+
+ # cmp output
+ assert output[0].outputs[0].text == output2[0].outputs[0].text
diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py
index 5cfad935a0fb1..c4229f93464ac 100644
--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -15,7 +15,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape, QuantKey, ScaleDesc)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
- CUTLASS_FP8_SUPPORTED, Fp8LinearOp, maybe_create_device_identity)
+ Fp8LinearOp, maybe_create_device_identity)
from vllm.platforms import current_platform
from .backend import TestBackend
@@ -26,9 +26,9 @@ FP8_DTYPE = current_platform.fp8_dtype()
class TestModel(torch.nn.Module):
def __init__(self, hidden_size: int, eps: float, static: bool,
- cutlass_fp8_enabled: bool, *args, **kwargs):
+ force_fp8_e4m3fnuz: bool, *args, **kwargs):
super().__init__(*args, **kwargs)
- self.cutlass_fp8_enabled = cutlass_fp8_enabled
+ self.force_fp8_e4m3fnuz = force_fp8_e4m3fnuz
self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
group_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN
@@ -43,7 +43,7 @@ class TestModel(torch.nn.Module):
for _ in range(2)
]
self.fp8_linear = Fp8LinearOp(
- cutlass_fp8_supported=cutlass_fp8_enabled,
+ force_fp8_e4m3fnuz=force_fp8_e4m3fnuz,
act_quant_static=static,
act_quant_group_shape=group_shape,
)
@@ -81,12 +81,11 @@ class TestModel(torch.nn.Module):
@pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049])
@pytest.mark.parametrize("eps", [1e-5, 1e-6])
@pytest.mark.parametrize("static", [True, False])
-@pytest.mark.parametrize("cutlass_fp8_enabled",
- [True, False] if CUTLASS_FP8_SUPPORTED else [False])
+@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
reason="Only test on CUDA and ROCm")
def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
- cutlass_fp8_enabled):
+ force_fp8_e4m3fnuz):
torch.set_default_device("cuda")
torch.set_default_dtype(dtype)
torch.manual_seed(1)
@@ -103,7 +102,7 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
fusion_pass = FusionPass.instance(vllm_config)
backend = TestBackend(noop_pass, fusion_pass)
- model = TestModel(hidden_size, eps, static, cutlass_fp8_enabled)
+ model = TestModel(hidden_size, eps, static, force_fp8_e4m3fnuz)
# First dimension dynamic
x = torch.rand(num_tokens, hidden_size)
diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py
index a6baa97fe6990..fb9f9dde22799 100644
--- a/tests/compile/test_sequence_parallelism.py
+++ b/tests/compile/test_sequence_parallelism.py
@@ -104,8 +104,7 @@ class TestQuantModel(torch.nn.Module):
# Initialize weights
torch.nn.init.normal_(self.gate_proj, std=0.02)
- self.fp8_linear = Fp8LinearOp(cutlass_fp8_supported=True,
- use_per_token_if_dynamic=False)
+ self.fp8_linear = Fp8LinearOp(use_per_token_if_dynamic=False)
self.scale = torch.rand(1, dtype=torch.float32)
# Create a weight that is compatible with torch._scaled_mm,
diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py
index 5351a3cf35ba5..0e1059e654479 100644
--- a/tests/compile/test_silu_mul_quant_fusion.py
+++ b/tests/compile/test_silu_mul_quant_fusion.py
@@ -12,7 +12,7 @@ from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
- CUTLASS_FP8_SUPPORTED, Fp8LinearOp)
+ Fp8LinearOp)
from vllm.platforms import current_platform
from .backend import TestBackend
@@ -20,7 +20,7 @@ from .backend import TestBackend
class TestModel(torch.nn.Module):
- def __init__(self, hidden_size: int, cutlass_fp8_enabled: bool, *args,
+ def __init__(self, hidden_size: int, force_fp8_e4m3fnuz: bool, *args,
**kwargs):
super().__init__(*args, **kwargs)
self.silu_and_mul = SiluAndMul()
@@ -32,7 +32,7 @@ class TestModel(torch.nn.Module):
hidden_size).to(dtype=current_platform.fp8_dtype()).t())
self.fp8_linear = Fp8LinearOp(
- cutlass_fp8_supported=cutlass_fp8_enabled,
+ force_fp8_e4m3fnuz=force_fp8_e4m3fnuz,
act_quant_static=True,
act_quant_group_shape=GroupShape.PER_TENSOR,
)
@@ -48,12 +48,11 @@ class TestModel(torch.nn.Module):
@pytest.mark.parametrize("num_tokens", [256])
@pytest.mark.parametrize("hidden_size", [64])
-@pytest.mark.parametrize("cutlass_fp8_enabled",
- [True, False] if CUTLASS_FP8_SUPPORTED else [False])
+@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
reason="Only test on CUDA and ROCm")
def test_fusion_silu_and_mul_quant(num_tokens, hidden_size,
- cutlass_fp8_enabled):
+ force_fp8_e4m3fnuz):
torch.set_default_device("cuda")
torch.set_default_dtype(torch.float16)
@@ -64,7 +63,7 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size,
fusion_pass = ActivationQuantFusionPass(config)
backend = TestBackend(NoOpEliminationPass(config), fusion_pass)
- model = TestModel(hidden_size, cutlass_fp8_enabled)
+ model = TestModel(hidden_size, force_fp8_e4m3fnuz)
# First dimension dynamic
x = torch.rand(num_tokens, hidden_size * 2)
diff --git a/tests/conftest.py b/tests/conftest.py
index 2bf88abb0f6c2..f8bfdfc8e6259 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1022,15 +1022,17 @@ class VllmRunner:
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
+ concurrency_limit: Optional[int] = None,
) -> list[tuple[list[list[int]], list[str]]]:
inputs = self.get_inputs(prompts,
images=images,
videos=videos,
audios=audios)
- outputs = self.llm.beam_search(
- inputs,
- BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
+ outputs = self.llm.beam_search(inputs,
+ BeamSearchParams(beam_width=beam_width,
+ max_tokens=max_tokens),
+ concurrency_limit=concurrency_limit)
returned_outputs = []
for output in outputs:
token_ids = [x.tokens for x in output.sequences]
diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py
index e2cb579e22dc4..8d84cc2d0ffe6 100644
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -18,7 +18,8 @@ from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
tensor_model_parallel_all_reduce,
tensor_model_parallel_reduce_scatter)
-from ..utils import init_test_distributed_environment, multi_process_parallel
+from ..utils import (init_test_distributed_environment, multi_gpu_test,
+ multi_process_parallel)
@ray.remote(num_gpus=1, max_calls=1)
@@ -226,8 +227,7 @@ def send_recv_test_worker(
torch.testing.assert_close(test_tensor, recv_tensor)
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
- reason="Need at least 2 GPUs to run the test.")
+@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("test_target", [
all_reduce_test_worker, all_gather_test_worker,
@@ -241,8 +241,7 @@ def test_multi_process_tensor_parallel(
multi_process_parallel(monkeypatch, tp_size, 1, test_target)
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
- reason="Need at least 2 GPUs to run the test.")
+@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize(
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
@@ -254,8 +253,7 @@ def test_multi_process_pipeline_parallel(
multi_process_parallel(monkeypatch, 1, pp_size, test_target)
-@pytest.mark.skipif(torch.cuda.device_count() < 4,
- reason="Need at least 4 GPUs to run the test.")
+@multi_gpu_test(num_gpus=4)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize("test_target", [
diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py
index 1ca52599c519d..72d468db08f65 100644
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -11,18 +11,25 @@ from openai import BadRequestError, NotFoundError, OpenAI
from ...utils import RemoteOpenAIServer
-pytest.skip(allow_module_level=True, reason="gpt-oss can't run on CI yet.")
-
MODEL_NAME = "openai/gpt-oss-20b"
-DTYPE = "bfloat16"
@pytest.fixture(scope="module")
-def server():
+def monkeypatch_module():
+ from _pytest.monkeypatch import MonkeyPatch
+ mpatch = MonkeyPatch()
+ yield mpatch
+ mpatch.undo()
+
+
+@pytest.fixture(scope="module")
+def server(monkeypatch_module: pytest.MonkeyPatch):
args = ["--enforce-eager", "--tool-server", "demo"]
- with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
- yield remote_server
+ with monkeypatch_module.context() as m:
+ m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
+ with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+ yield remote_server
@pytest_asyncio.fixture
@@ -269,10 +276,11 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming(client: OpenAI, model_name: str):
+ # TODO: Add back when web search and code interpreter are available in CI
prompts = [
"tell me a story about a cat in 20 words",
- "What is 13 * 24? Use python to calculate the result.",
- "When did Jensen found NVIDIA? Search it and answer the year only.",
+ # "What is 13 * 24? Use python to calculate the result.",
+ # "When did Jensen found NVIDIA? Search it and answer the year only.",
]
for prompt in prompts:
@@ -281,15 +289,15 @@ async def test_streaming(client: OpenAI, model_name: str):
input=prompt,
reasoning={"effort": "low"},
tools=[
- {
- "type": "web_search_preview"
- },
- {
- "type": "code_interpreter",
- "container": {
- "type": "auto"
- }
- },
+ # {
+ # "type": "web_search_preview"
+ # },
+ # {
+ # "type": "code_interpreter",
+ # "container": {
+ # "type": "auto"
+ # }
+ # },
],
stream=True,
)
@@ -317,6 +325,7 @@ async def test_streaming(client: OpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.skip(reason="Web search tool is not available in CI yet.")
async def test_web_search(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
@@ -331,6 +340,7 @@ async def test_web_search(client: OpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.")
async def test_code_interpreter(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
@@ -436,6 +446,7 @@ async def test_function_calling(client: OpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.flaky(reruns=5)
async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
tools = [
{
diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py
index 93239f41a4aeb..6009d9aeec935 100644
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -69,8 +69,11 @@ async def test_basic_audio(mary_had_lamb, model_name):
language="en",
response_format="text",
temperature=0.0)
- out = json.loads(transcription)['text']
- assert "Mary had a little lamb," in out
+ out = json.loads(transcription)
+ out_text = out['text']
+ out_usage = out['usage']
+ assert "Mary had a little lamb," in out_text
+ assert out_usage["seconds"] == 16, out_usage["seconds"]
@pytest.mark.asyncio
@@ -116,9 +119,12 @@ async def test_long_audio_request(mary_had_lamb, client):
language="en",
response_format="text",
temperature=0.0)
- out = json.loads(transcription)['text']
- counts = out.count("Mary had a little lamb")
+ out = json.loads(transcription)
+ out_text = out['text']
+ out_usage = out['usage']
+ counts = out_text.count("Mary had a little lamb")
assert counts == 10, counts
+ assert out_usage["seconds"] == 161, out_usage["seconds"]
@pytest.mark.asyncio
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index 8259a81d7b6a1..106ec121a422e 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -6,8 +6,6 @@ import json
import openai
import pytest
import pytest_asyncio
-import requests
-from PIL import Image
from transformers import AutoProcessor
from vllm.multimodal.utils import encode_image_base64, fetch_image
@@ -88,7 +86,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
"role": "user",
"content": f"{placeholder}{content}",
}]
- images = [Image.open(requests.get(image_url, stream=True).raw)]
+ images = [fetch_image(image_url)]
prompt = processor.tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True)
diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py
index 4e6a21058658b..d3cc2fac6af57 100644
--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -5,7 +5,6 @@ import json
import pytest
import requests
-from PIL import Image
from transformers import AutoProcessor
from vllm.entrypoints.openai.protocol import EmbeddingResponse
@@ -64,7 +63,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
placeholder = "<|image_1|> "
prompt = f"{placeholder}{content}"
- images = [Image.open(requests.get(image_url, stream=True).raw)]
+ images = [fetch_image(image_url)]
inputs = processor(prompt, images, return_tensors="pt")
return inputs.input_ids.shape[1]
diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py
index 9e4eaf221f245..ecc57acc67963 100644
--- a/tests/kernels/moe/test_block_fp8.py
+++ b/tests/kernels/moe/test_block_fp8.py
@@ -16,7 +16,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_topk, modular_triton_fused_moe)
from vllm.platforms import current_platform
from vllm.utils import has_deep_gemm
-from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
dg_available = has_deep_gemm()
@@ -226,8 +226,7 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed,
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.")
-@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(),
- reason="Not E8M0 scale MOE")
+@pytest.mark.skipif(is_deep_gemm_e8m0_used(), reason="Not E8M0 scale MOE")
@torch.inference_mode()
def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed,
monkeypatch):
diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py
index 6f95581a5e60d..36a98522a6588 100644
--- a/tests/kernels/moe/test_deepep_deepgemm_moe.py
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -20,9 +20,9 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEModularKernel)
from vllm.platforms import current_platform
from vllm.utils import has_deep_ep, has_deep_gemm
-from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used,
- is_deep_gemm_supported)
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used, is_deep_gemm_supported
+from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch
from .utils import make_test_weights
@@ -370,9 +370,10 @@ NUM_EXPERTS = [32]
@pytest.mark.parametrize("num_experts", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOPKS)
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
+@multi_gpu_test(num_gpus=2)
@requires_deep_ep
@requires_deep_gemm
-@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(),
+@pytest.mark.skipif(is_deep_gemm_e8m0_used(),
reason="Skipping test for Blackwell DeepGEMM")
def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int,
topk: int, world_dp_size: tuple[int, int]):
@@ -427,9 +428,10 @@ USE_FP8_DISPATCH = [False]
@pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH)
@pytest.mark.parametrize("block_size", [[128, 128]])
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
+@multi_gpu_test(num_gpus=2)
@requires_deep_ep
@requires_deep_gemm
-@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(),
+@pytest.mark.skipif(is_deep_gemm_e8m0_used(),
reason="Skipping test for Blackwell DeepGEMM")
def test_ll_deepep_deepgemm_moe(
mnk: tuple[int, int, int],
diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py
index 43804c410b6c2..6a53af68cd53a 100644
--- a/tests/kernels/moe/test_deepep_moe.py
+++ b/tests/kernels/moe/test_deepep_moe.py
@@ -24,6 +24,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
from vllm.platforms import current_platform
from vllm.utils import has_deep_ep
+from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch
if has_deep_ep():
@@ -411,6 +412,7 @@ DTYPES = [torch.bfloat16, torch.float8_e4m3fn]
@pytest.mark.parametrize("topk", [6])
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
@pytest.mark.parametrize("per_act_token_quant", [False, True])
+@multi_gpu_test(num_gpus=2)
@requires_deep_ep
def test_deep_ep_moe(
dtype: torch.dtype,
@@ -459,6 +461,7 @@ USE_FP8_DISPATCH = [True, False]
@pytest.mark.parametrize("topk", [6])
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
@pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH)
+@multi_gpu_test(num_gpus=2)
@requires_deep_ep
def test_low_latency_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int],
num_experts: int, topk: int,
diff --git a/tests/kernels/moe/test_grouped_topk.py b/tests/kernels/moe/test_grouped_topk.py
new file mode 100644
index 0000000000000..646e763194fd6
--- /dev/null
+++ b/tests/kernels/moe/test_grouped_topk.py
@@ -0,0 +1,76 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for the MoE grouped topk kernel
+
+Run `pytest tests/kernels/moe/test_grouped_topk.py`.
+"""
+import pytest
+import torch
+
+from vllm.model_executor.layers.fused_moe.fused_moe import (fused_grouped_topk,
+ grouped_topk)
+from vllm.platforms import current_platform
+
+
+@pytest.mark.skipif(not current_platform.is_cuda(),
+ reason="This test is skipped on non-CUDA platform.")
+@pytest.mark.parametrize("n_token", [1, 33, 64])
+@pytest.mark.parametrize("n_hidden", [1024, 2048])
+@pytest.mark.parametrize("n_expert", [16])
+@pytest.mark.parametrize("topk", [2])
+@pytest.mark.parametrize("renormalize", [True, False])
+@pytest.mark.parametrize("num_expert_group", [8])
+@pytest.mark.parametrize("topk_group", [2])
+@pytest.mark.parametrize("scoring_func", ["softmax", "sigmoid"])
+@pytest.mark.parametrize("routed_scaling_factor", [1.0, 2.5])
+@pytest.mark.parametrize("dtype",
+ [torch.float16, torch.bfloat16, torch.float32])
+def test_grouped_topk(monkeypatch: pytest.MonkeyPatch, n_token: int,
+ n_hidden: int, n_expert: int, topk: int,
+ renormalize: bool, num_expert_group: int,
+ topk_group: int, scoring_func: str,
+ routed_scaling_factor: float, dtype: torch.dtype):
+ current_platform.seed_everything(0)
+ hidden_states = torch.randn((n_token, n_hidden),
+ dtype=dtype,
+ device="cuda")
+ gating_output = torch.randn((n_token, n_expert),
+ dtype=dtype,
+ device="cuda")
+ e_score_correction_bias = torch.randn((n_expert, ),
+ dtype=torch.float32,
+ device="cuda")
+
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "0")
+ baseline_topk_weights, baseline_topk_ids = grouped_topk(
+ hidden_states=hidden_states,
+ gating_output=gating_output,
+ topk=topk,
+ renormalize=renormalize,
+ num_expert_group=num_expert_group,
+ topk_group=topk_group,
+ scoring_func=scoring_func,
+ routed_scaling_factor=routed_scaling_factor,
+ e_score_correction_bias=e_score_correction_bias)
+
+ test_topk_weights, test_topk_ids = fused_grouped_topk(
+ hidden_states=hidden_states,
+ gating_output=gating_output,
+ topk=topk,
+ renormalize=renormalize,
+ num_expert_group=num_expert_group,
+ topk_group=topk_group,
+ scoring_func=scoring_func,
+ routed_scaling_factor=routed_scaling_factor,
+ e_score_correction_bias=e_score_correction_bias)
+
+ if renormalize:
+ torch.testing.assert_close(baseline_topk_weights,
+ test_topk_weights,
+ atol=2e-2,
+ rtol=0)
+ torch.testing.assert_close(baseline_topk_ids,
+ test_topk_ids,
+ atol=0,
+ rtol=0)
diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py
index d45982384eb3b..6112183be5475 100644
--- a/tests/kernels/moe/test_modular_kernel_combinations.py
+++ b/tests/kernels/moe/test_modular_kernel_combinations.py
@@ -16,6 +16,7 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
+from ...utils import multi_gpu_test
from .modular_kernel_tools.common import (Config, RankTensors, WeightTensors,
reference_moe_impl,
run_modular_kernel)
@@ -162,6 +163,7 @@ def is_nyi_config(config: Config) -> bool:
product(MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES))
@pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs)
@pytest.mark.parametrize("world_size", [2])
+@multi_gpu_test(num_gpus=2)
@meets_multi_gpu_requirements
def test_modular_kernel_combinations_multigpu(
k: int, n: int, e: int, dtype: torch.dtype,
diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py
index 98908f2714707..9e78f4d6e4da0 100644
--- a/tests/kernels/moe/test_pplx_cutlass_moe.py
+++ b/tests/kernels/moe/test_pplx_cutlass_moe.py
@@ -17,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
from vllm.platforms import current_platform
from vllm.utils import cdiv
+from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch
try:
@@ -247,6 +248,7 @@ def _pplx_moe(
@pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("world_dp_size", [[2, 1]]) #, [4, 2]])
@pytest.mark.parametrize("use_internode", [False])
+@multi_gpu_test(num_gpus=2)
@pytest.mark.skipif(
(lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
current_platform.get_device_capability()),
diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py
index c2064de97358f..3f36d7ada2e94 100644
--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@@ -37,6 +37,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
from vllm.platforms import current_platform
from vllm.utils import round_up
+from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch
requires_pplx = pytest.mark.skipif(
@@ -452,6 +453,7 @@ def _pplx_prepare_finalize(
@pytest.mark.parametrize("use_internode", [False])
@pytest.mark.optional
@requires_pplx
+@multi_gpu_test(num_gpus=2)
def test_pplx_prepare_finalize_slow(
mnk: tuple[int, int, int],
e: int,
@@ -740,6 +742,7 @@ def _pplx_moe(
@pytest.mark.parametrize("use_internode", [False])
@pytest.mark.optional
@requires_pplx
+@multi_gpu_test(num_gpus=2)
def test_pplx_moe_slow(
mnk: tuple[int, int, int],
e: int,
@@ -880,6 +883,7 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("use_internode", [False])
@requires_pplx
+@multi_gpu_test(num_gpus=2)
def test_pplx_prepare_finalize(
world_dp_size: tuple[int, int],
use_internode: bool,
@@ -893,6 +897,7 @@ def test_pplx_prepare_finalize(
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("use_internode", [False])
@requires_pplx
+@multi_gpu_test(num_gpus=2)
def test_pplx_moe(
world_dp_size: tuple[int, int],
use_internode: bool,
diff --git a/tests/kernels/quantization/test_awq_triton.py b/tests/kernels/quantization/test_awq_triton.py
index 96797e85bd125..9354495642b28 100644
--- a/tests/kernels/quantization/test_awq_triton.py
+++ b/tests/kernels/quantization/test_awq_triton.py
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the AWQ Triton kernel.
-Run `pytest tests/kernels/test_awq_triton.py`.
+Run `pytest tests/kernels/quantization/test_awq_triton.py`.
"""
import pytest
import torch
diff --git a/tests/kernels/quantization/test_cutlass_2of4_sparse.py b/tests/kernels/quantization/test_cutlass_2of4_sparse.py
index 878f66647e19e..ae61b3b3a28a8 100644
--- a/tests/kernels/quantization/test_cutlass_2of4_sparse.py
+++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for sparse cutlass kernels
-Run `pytest tests/kernels/test_semi_structured.py`.
+Run `pytest tests/kernels/quantization/test_cutlass_2of4_sparse.py`.
"""
import pytest
diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py
index a15decdf6f827..65320509e173f 100644
--- a/tests/kernels/quantization/test_cutlass_scaled_mm.py
+++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for cutlass kernels
-Run `pytest tests/kernels/test_cutlass.py`.
+Run `pytest tests/kernels/quantization/test_cutlass_scaled_mm.py`.
"""
import random
diff --git a/tests/kernels/quantization/test_cutlass_w4a8.py b/tests/kernels/quantization/test_cutlass_w4a8.py
index 7832f8179d0ec..f659408efe8c6 100644
--- a/tests/kernels/quantization/test_cutlass_w4a8.py
+++ b/tests/kernels/quantization/test_cutlass_w4a8.py
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the CUTLASS W4A8 kernel.
-Run `pytest tests/kernels/test_cutlass_w4a8.py`.
+Run `pytest tests/kernels/quantization/test_cutlass_w4a8.py`.
"""
from dataclasses import dataclass
diff --git a/tests/kernels/quantization/test_flashinfer_scaled_mm.py b/tests/kernels/quantization/test_flashinfer_scaled_mm.py
new file mode 100644
index 0000000000000..9f669c6df8bd5
--- /dev/null
+++ b/tests/kernels/quantization/test_flashinfer_scaled_mm.py
@@ -0,0 +1,73 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
+from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm
+
+if not current_platform.has_device_capability(100):
+ pytest.skip(
+ reason=
+ "Flashinfer FP8 gemms requires compute capability of 10.0 or above.",
+ allow_module_level=True,
+ )
+
+DTYPES = [torch.float16, torch.bfloat16]
+# m, n, k
+SHAPES = [(128, 128, 64), (128, 128, 128), (256, 128, 64), (128, 256, 128)]
+PAD_SHAPES = [(150, 128, 64), (128, 128, 96)]
+SHAPES.extend(PAD_SHAPES)
+
+SEEDS = [42]
+CUDA_DEVICES = ["cuda:0"]
+
+
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("shape", SHAPES)
+@pytest.mark.parametrize("use_bias", [True, False])
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("autotune", [False, True])
+@torch.inference_mode()
+def test_flashinfer_fp8_gemm(
+ dtype: torch.dtype,
+ shape: tuple[int, int, int],
+ use_bias: bool,
+ seed: int,
+ device: str,
+ autotune: bool,
+) -> None:
+ current_platform.seed_everything(seed)
+ m, n, k = shape
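+ # Random half-precision operands; b is scaled by 1/k so the quantized product stays well in range.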
+ a = torch.randn((m, k), dtype=dtype, device=device)
+ b = torch.randn((n, k), dtype=dtype, device=device) / k
+
+ a_fp8, a_scale = ops.scaled_fp8_quant(a)
+ b_fp8, b_scale = ops.scaled_fp8_quant(b)
+
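+ # Reference output: dequantize both operands back to fp32 and run a plain matmul.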
+ expected_out = torch.mm(
+ a_scale * a_fp8.to(dtype=torch.float32),
+ b_scale * b_fp8.to(dtype=torch.float32).t(),
+ ).to(dtype=dtype)
+
+ if use_bias:
+ bias = torch.randn((n, ), dtype=dtype, device=device)
+ expected_out = expected_out + bias
+ else:
+ bias = None
+
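+ # Run the FlashInfer fp8 GEMM, with its kernel autotuning context enabled or disabled per the parametrization.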
+ import flashinfer
+
+ with flashinfer.autotune(autotune):
+ out = flashinfer_scaled_fp8_mm(
+ a_fp8,
+ b_fp8.t(),
+ a_scale,
+ b_scale,
+ dtype,
+ bias=bias,
+ )
+
+ torch.testing.assert_close(out, expected_out, atol=1e-2, rtol=1e-2)
diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py
index 0e09661c955e4..50584f3f82d4c 100644
--- a/tests/kernels/quantization/test_machete_mm.py
+++ b/tests/kernels/quantization/test_machete_mm.py
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the machete kernel.
-Run `pytest tests/kernels/test_machete_mm.py`.
+Run `pytest tests/kernels/quantization/test_machete_mm.py`.
"""
import math
diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py
index ad077e0b94732..0be020085bfa4 100644
--- a/tests/kernels/quantization/test_marlin_gemm.py
+++ b/tests/kernels/quantization/test_marlin_gemm.py
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the marlin kernel.
-Run `pytest tests/kernels/marlin/test_marlin_gemm.py`.
+Run `pytest tests/kernels/quantization/test_marlin_gemm.py`.
"""
import pytest
import torch
diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py
index 24245663fb1d6..d8cfb5710dbad 100644
--- a/tests/kernels/quantization/test_triton_scaled_mm.py
+++ b/tests/kernels/quantization/test_triton_scaled_mm.py
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the triton_scaled_mm kernel
-Run `pytest tests/kernels/test_triton_scaled_mm.py`.
+Run `pytest tests/kernels/quantization/test_triton_scaled_mm.py`.
"""
import importlib
from typing import Optional
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index cba573b63c045..3475993ff8f07 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -216,11 +216,6 @@ def tinyllama_lora_files():
return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
-@pytest.fixture(scope="session")
-def phi2_lora_files():
- return snapshot_download(repo_id="isotr0py/phi-2-test-sql-lora")
-
-
@pytest.fixture
def reset_default_device():
"""
diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py
deleted file mode 100644
index 774ebb9db2106..0000000000000
--- a/tests/lora/test_baichuan.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-
-import vllm
-from vllm.distributed import cleanup_dist_env_and_memory
-from vllm.lora.request import LoRARequest
-
-MODEL_PATH = "baichuan-inc/Baichuan-7B"
-
-PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
-
-
-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
- prompts = [
- PROMPT_TEMPLATE.format(query="How many singers do we have?"),
- PROMPT_TEMPLATE.format(
- query=
- "What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
- ),
- PROMPT_TEMPLATE.format(
- query=
- "Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501
- ),
- ]
- print(prompts)
- sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
- outputs = llm.generate(
- prompts,
- sampling_params,
- lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
- if lora_id else None)
- # Print the outputs.
- generated_texts: list[str] = []
- for output in outputs:
- prompt = output.prompt
- generated_text = output.outputs[0].text.strip()
- generated_texts.append(generated_text)
- print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
- return generated_texts
-
-
-def test_baichuan_lora(baichuan_lora_files):
- llm = vllm.LLM(MODEL_PATH,
- max_model_len=1024,
- enable_lora=True,
- max_loras=4,
- max_lora_rank=64,
- trust_remote_code=True)
-
- expected_lora_output = [
- "SELECT count(*) FROM singer",
- "SELECT avg(age) , min(age) , max(age) FROM singer WHERE Country = 'France'", # noqa: E501
- "SELECT name , country , age FROM singer ORDER BY age ASC",
- ]
-
- output1 = do_sample(llm, baichuan_lora_files, lora_id=1)
- for i in range(len(expected_lora_output)):
- assert output1[i] == expected_lora_output[i]
- output2 = do_sample(llm, baichuan_lora_files, lora_id=2)
- for i in range(len(expected_lora_output)):
- assert output2[i] == expected_lora_output[i]
-
-
-@pytest.mark.parametrize("fully_sharded", [True, False])
-def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
- num_gpus_available, fully_sharded):
- if num_gpus_available < 4:
- pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
-
- llm_tp1 = vllm.LLM(MODEL_PATH,
- enable_lora=True,
- max_num_seqs=16,
- max_loras=4,
- max_lora_rank=64,
- trust_remote_code=True,
- fully_sharded_loras=fully_sharded)
- output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)
-
- del llm_tp1
- cleanup_dist_env_and_memory()
-
- llm_tp2 = vllm.LLM(MODEL_PATH,
- enable_lora=True,
- max_num_seqs=16,
- max_loras=4,
- max_lora_rank=64,
- tensor_parallel_size=2,
- trust_remote_code=True,
- fully_sharded_loras=fully_sharded)
- output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2)
-
- del llm_tp2
- cleanup_dist_env_and_memory()
-
- assert output_tp1 == output_tp2
-
- llm_tp4 = vllm.LLM(MODEL_PATH,
- enable_lora=True,
- max_num_seqs=16,
- max_loras=4,
- max_lora_rank=64,
- tensor_parallel_size=4,
- trust_remote_code=True,
- fully_sharded_loras=fully_sharded)
- output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2)
-
- del llm_tp4
- cleanup_dist_env_and_memory()
-
- assert output_tp1 == output_tp4
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 92db023babc28..6e2dda464d8eb 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -243,7 +243,7 @@ def check_punica_wrapper(punica_wrapper) -> bool:
@torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
@pytest.mark.parametrize("stage", STAGES)
@@ -347,7 +347,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
@torch.inference_mode()
# @pytest.mark.skip(
# reason="Fails when loras are in any slot other than the first.")
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
@pytest.mark.parametrize("stage", STAGES)
@@ -486,7 +486,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
@torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512])
@pytest.mark.parametrize("stage", STAGES)
@@ -620,12 +620,15 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
@torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
-@pytest.mark.parametrize("bias_enabled", [True, False])
-def test_linear_replicated(dist_init, num_loras, device, stage,
- bias_enabled) -> None:
+def test_linear_replicated(
+ dist_init,
+ num_loras,
+ device,
+ stage,
+) -> None:
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
@@ -634,10 +637,11 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
- lora_config = LoRAConfig(max_loras=max_loras,
- max_lora_rank=8,
- lora_dtype=torch.float16,
- bias_enabled=bias_enabled)
+ lora_config = LoRAConfig(
+ max_loras=max_loras,
+ max_lora_rank=8,
+ lora_dtype=torch.float16,
+ )
def create_random_linear_replicated_layer():
@@ -651,10 +655,6 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
lora_linear.create_lora_weights(max_loras, lora_config)
assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
lora_linear.lora_b_stacked) == 1)
- if bias_enabled:
- assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
- else:
- assert lora_linear.lora_bias_stacked is None
return linear, lora_linear
for i in range(NUM_RANDOM_SEEDS):
@@ -734,14 +734,13 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
@torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("orientation", ["row", "column"])
@pytest.mark.parametrize("fully_shard", [True, False])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
-@pytest.mark.parametrize("bias_enabled", [True, False])
def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
- device, stage, bias_enabled) -> None:
+ device, stage) -> None:
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
@@ -750,11 +749,12 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
- lora_config = LoRAConfig(max_loras=max_loras,
- max_lora_rank=8,
- fully_sharded_loras=fully_shard,
- lora_dtype=torch.float16,
- bias_enabled=bias_enabled)
+ lora_config = LoRAConfig(
+ max_loras=max_loras,
+ max_lora_rank=8,
+ fully_sharded_loras=fully_shard,
+ lora_dtype=torch.float16,
+ )
def create_random_linear_parallel_layer():
if orientation == "row":
@@ -777,10 +777,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
lora_linear.create_lora_weights(max_loras, lora_config)
assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
lora_linear.lora_b_stacked) == 1)
- if bias_enabled:
- assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
- else:
- assert lora_linear.lora_bias_stacked is None
+
return linear, lora_linear
for i in range(NUM_RANDOM_SEEDS):
@@ -860,14 +857,13 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
@torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("repeats", [1, 2, 3])
@pytest.mark.parametrize("fully_shard", [True, False])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
-@pytest.mark.parametrize("bias_enabled", [True, False])
def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
- device, stage, bias_enabled) -> None:
+ device, stage) -> None:
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
@@ -876,11 +872,12 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
- lora_config = LoRAConfig(max_loras=max_loras,
- max_lora_rank=8,
- fully_sharded_loras=fully_shard,
- lora_dtype=torch.float16,
- bias_enabled=bias_enabled)
+ lora_config = LoRAConfig(
+ max_loras=max_loras,
+ max_lora_rank=8,
+ fully_sharded_loras=fully_shard,
+ lora_dtype=torch.float16,
+ )
def create_column_parallel_packed_layer():
if repeats == 2:
@@ -924,10 +921,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
model_config=FakeConfig())
assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
lora_linear.lora_b_stacked) == n_slices)
- if bias_enabled:
- assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
- else:
- assert lora_linear.lora_bias_stacked is None
+
return linear, lora_linear
for i in range(NUM_RANDOM_SEEDS):
diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py
deleted file mode 100644
index 3090941e63679..0000000000000
--- a/tests/lora/test_phi.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import vllm
-from vllm.lora.request import LoRARequest
-
-MODEL_PATH = "microsoft/phi-2"
-
-PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501
-
-
-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
- prompts = [
- PROMPT_TEMPLATE.format(
- sql_prompt=
- "Which catalog publisher has published the most catalogs?",
- context="CREATE TABLE catalogs (catalog_publisher VARCHAR);"),
- PROMPT_TEMPLATE.format(
- sql_prompt=
- "Which trip started from the station with the largest dock count? Give me the trip id.", # noqa: E501
- context=
- "CREATE TABLE trip (id VARCHAR, start_station_id VARCHAR); CREATE TABLE station (id VARCHAR, dock_count VARCHAR);" # noqa: E501
- ),
- PROMPT_TEMPLATE.format(
- sql_prompt=
- "How many marine species are found in the Southern Ocean?", # noqa: E501
- context=
- "CREATE TABLE marine_species (name VARCHAR(50), common_name VARCHAR(50), location VARCHAR(50));" # noqa: E501
- ),
- ]
- sampling_params = vllm.SamplingParams(temperature=0,
- max_tokens=64,
- stop="### End")
- outputs = llm.generate(
- prompts,
- sampling_params,
- lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
- if lora_id else None,
- )
- # Print the outputs.
- generated_texts: list[str] = []
- for output in outputs:
- prompt = output.prompt
- generated_text = output.outputs[0].text.strip()
- generated_texts.append(generated_text)
- print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
- return generated_texts
-
-
-def test_phi2_lora(phi2_lora_files):
- # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
- # Otherwise, the lora-test will fail due to CUDA OOM.
- llm = vllm.LLM(MODEL_PATH,
- max_model_len=1024,
- enable_lora=True,
- max_loras=2,
- enforce_eager=True,
- enable_chunked_prefill=True)
-
- expected_lora_output = [
- "SELECT catalog_publisher, COUNT(*) as num_catalogs FROM catalogs GROUP BY catalog_publisher ORDER BY num_catalogs DESC LIMIT 1;", # noqa: E501
- "SELECT trip.id FROM trip JOIN station ON trip.start_station_id = station.id WHERE station.dock_count = (SELECT MAX(dock_count) FROM station);", # noqa: E501
- "SELECT COUNT(*) FROM marine_species WHERE location = 'Southern Ocean';", # noqa: E501
- ]
-
- output1 = do_sample(llm, phi2_lora_files, lora_id=1)
- for i in range(len(expected_lora_output)):
- assert output1[i].startswith(expected_lora_output[i])
- output2 = do_sample(llm, phi2_lora_files, lora_id=2)
- for i in range(len(expected_lora_output)):
- assert output2[i].startswith(expected_lora_output[i])
diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py
index 2055c44c83cda..31ca3a6f0f985 100644
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -100,24 +100,19 @@ def test_models(
else:
hf_outputs = None
- if model not in V0_UNSUPPORTED_MODELS:
- with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
- vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
- example_prompts, max_tokens, num_logprobs)
- else:
- vllm_v0_outputs = None
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_V1", "0")
+ if model not in V0_UNSUPPORTED_MODELS:
+ with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
+ vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
+ example_prompts, max_tokens, num_logprobs)
+ else:
+ vllm_v0_outputs = None
if model in V1_SUPPORTED_MODELS:
- with monkeypatch.context() as m:
- m.setenv("VLLM_USE_V1", "1")
- if model in HYBRID_MODELS:
- # required due to reorder_batch behaviour
- m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
- with vllm_runner(model,
- max_num_seqs=MAX_NUM_SEQS,
- enable_prefix_caching=False) as vllm_model:
- vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
- example_prompts, max_tokens, num_logprobs)
+ with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
+ vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
+ example_prompts, max_tokens, num_logprobs)
else:
vllm_v1_outputs = None
@@ -140,7 +135,7 @@ def test_models(
)
-@pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS)
+@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_batching(
@@ -150,10 +145,6 @@ def test_batching(
max_tokens: int,
num_logprobs: int,
) -> None:
- if model in V0_UNSUPPORTED_MODELS:
- pytest.skip(
- f"Unsupported V0 Engine. Skipping `test_batching` on {model}.")
-
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
@@ -191,29 +182,32 @@ def test_chunked_prefill(
max_tokens: int,
num_logprobs: int,
chunked_prefill_token_size: int,
+ monkeypatch,
) -> None:
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
- with vllm_runner(model,
- enable_chunked_prefill=True,
- max_num_batched_tokens=max_num_batched_tokens,
- max_num_seqs=max_num_seqs) as vllm_model:
- chunked = vllm_model.generate_greedy_logprobs(example_prompts,
- max_tokens, num_logprobs)
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_V1", "0")
+ with vllm_runner(model,
+ enable_chunked_prefill=True,
+ max_num_batched_tokens=max_num_batched_tokens,
+ max_num_seqs=max_num_seqs) as vllm_model:
+ chunked = vllm_model.generate_greedy_logprobs(
+ example_prompts, max_tokens, num_logprobs)
- with vllm_runner(model,
- enable_chunked_prefill=False,
- max_num_seqs=max_num_seqs) as vllm_model:
- non_chunked = vllm_model.generate_greedy_logprobs(
- example_prompts, max_tokens, num_logprobs)
+ with vllm_runner(model,
+ enable_chunked_prefill=False,
+ max_num_seqs=max_num_seqs) as vllm_model:
+ non_chunked = vllm_model.generate_greedy_logprobs(
+ example_prompts, max_tokens, num_logprobs)
- check_logprobs_close(
- outputs_0_lst=chunked,
- outputs_1_lst=non_chunked,
- name_0="chunked",
- name_1="non_chunked",
- )
+ check_logprobs_close(
+ outputs_0_lst=chunked,
+ outputs_1_lst=non_chunked,
+ name_0="chunked",
+ name_1="non_chunked",
+ )
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -284,25 +278,29 @@ def test_models_preemption_recompute(
example_prompts,
model: str,
max_tokens: int,
+ monkeypatch,
) -> None:
"""
Tests that outputs are identical with and w/o preemptions (recompute).
"""
- with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
- scheduler = vllm_model.llm.llm_engine.scheduler[0]
- scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
- preempt_vllm_outputs = vllm_model.generate_greedy(
- example_prompts, max_tokens)
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_V1", "0")
+ with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
+ scheduler = vllm_model.llm.llm_engine.scheduler[0]
+ scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
+ preempt_vllm_outputs = vllm_model.generate_greedy(
+ example_prompts, max_tokens)
- scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
- vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+ scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
+ vllm_outputs = vllm_model.generate_greedy(example_prompts,
+ max_tokens)
- check_outputs_equal(
- outputs_0_lst=preempt_vllm_outputs,
- outputs_1_lst=vllm_outputs,
- name_0="vllm_preepmtions",
- name_1="vllm",
- )
+ check_outputs_equal(
+ outputs_0_lst=preempt_vllm_outputs,
+ outputs_1_lst=vllm_outputs,
+ name_0="vllm_preepmtions",
+ name_1="vllm",
+ )
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -405,24 +403,18 @@ def test_full_cuda_graph(
else:
hf_outputs = None
- if model not in V0_UNSUPPORTED_MODELS:
- with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
- vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
- example_prompts, max_tokens, num_logprobs)
- else:
- vllm_v0_outputs = None
-
with monkeypatch.context() as m:
- m.setenv("VLLM_USE_V1", "1")
- if model in HYBRID_MODELS:
- # required due to reorder_batch behaviour
- m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
- with vllm_runner(model,
- max_num_seqs=MAX_NUM_SEQS,
- compilation_config={'full_cuda_graph': True},
- enable_prefix_caching=False) as vllm_model:
- vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
- example_prompts, max_tokens, num_logprobs)
+ m.setenv("VLLM_USE_V1", "0")
+ if model not in V0_UNSUPPORTED_MODELS:
+ with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
+ vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
+ example_prompts, max_tokens, num_logprobs)
+ else:
+ vllm_v0_outputs = None
+
+ with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
+ vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
+ example_prompts, max_tokens, num_logprobs)
if hf_outputs is not None and vllm_v0_outputs is not None:
check_logprobs_close(
@@ -469,24 +461,20 @@ def test_fp32_state(
else:
hf_outputs = None
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_V1", "0")
+ with vllm_runner(model,
+ max_num_seqs=MAX_NUM_SEQS,
+ mamba_ssm_cache_dtype="float32") as vllm_model:
+ vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
+ example_prompts, max_tokens, num_logprobs)
+
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
mamba_ssm_cache_dtype="float32") as vllm_model:
- vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
+ vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
- with monkeypatch.context() as m:
- m.setenv("VLLM_USE_V1", "1")
- if model in HYBRID_MODELS:
- # required due to reorder_batch behaviour
- m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
- with vllm_runner(model,
- max_num_seqs=MAX_NUM_SEQS,
- mamba_ssm_cache_dtype="float32",
- enable_prefix_caching=False) as vllm_model:
- vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
- example_prompts, max_tokens, num_logprobs)
-
if hf_outputs is not None:
check_logprobs_close(
outputs_0_lst=hf_outputs,
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index 96208f8eda628..2b60faae8ec0b 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -222,21 +222,6 @@ VLM_TEST_SETTINGS = {
},
marks=[large_gpu_mark(min_gb=32)],
),
- # Check "auto" with fallback to transformers
- "internvl-transformers": VLMTestInfo(
- models=["OpenGVLab/InternVL3-1B-hf"],
- test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
- prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
- img_idx_to_prompt=lambda idx: "",
- max_model_len=4096,
- use_tokenizer_eos=True,
- image_size_factors=[(0.25, 0.5, 1.0)],
- vllm_runner_kwargs={
- "model_impl": "auto",
- },
- auto_cls=AutoModelForImageTextToText,
- marks=[pytest.mark.core_model],
- ),
#### Extended model tests
"aria": VLMTestInfo(
models=["rhymes-ai/Aria"],
@@ -461,6 +446,20 @@ VLM_TEST_SETTINGS = {
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
),
+ "intern_vl-hf": VLMTestInfo(
+ models=["OpenGVLab/InternVL3-1B-hf"],
+ test_type=(
+ VLMTestType.IMAGE,
+ VLMTestType.MULTI_IMAGE,
+ VLMTestType.VIDEO,
+ ),
+ prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
+ img_idx_to_prompt=lambda idx: "",
+ video_idx_to_prompt=lambda idx: "