Merge branch 'main' into mlm-full-lora-support

This commit is contained in:
B-201 2025-12-05 20:17:40 +08:00 committed by GitHub
commit 1fbd7287b8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
122 changed files with 2640 additions and 2700 deletions

View File

@ -8,3 +8,4 @@ tasks:
value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5
rtol: 0.05

View File

@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml

View File

@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \
--tp-size=1
"""
import os
from contextlib import contextmanager
import lm_eval
import numpy as np
import yaml
RTOL = 0.08
DEFAULT_RTOL = 0.08
@contextmanager
def scoped_env_vars(new_env: dict[str, str]):
    """Temporarily overlay *new_env* onto ``os.environ`` for the with-block.

    Each value is coerced with ``str()`` before being set. On exit,
    variables that existed beforehand are restored to their previous
    values, and variables we introduced are removed again.
    """
    if not new_env:
        # Nothing to apply -- skip the bookkeeping entirely.
        yield
        return

    saved: dict[str, str] = {}
    added: list[str] = []
    try:
        for name, value in new_env.items():
            previous = os.environ.get(name)
            if previous is None:
                added.append(name)
            else:
                saved[name] = previous
            os.environ[name] = str(value)
        yield
    finally:
        # Undo our changes; the two key sets are disjoint by construction.
        for name, previous in saved.items():
            os.environ[name] = previous
        for name in added:
            os.environ.pop(name, None)
def launch_lm_eval(eval_config, tp_size):
@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
f"trust_remote_code={trust_remote_code},"
f"max_model_len={max_model_len},"
)
results = lm_eval.simple_evaluate(
model=backend,
model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]],
num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"],
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
# text models. however, this is regressing measured strict-match for
# existing text models in CI, so only apply it for mm, or explicitly set
apply_chat_template=eval_config.get(
"apply_chat_template", backend == "vllm-vlm"
),
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
gen_kwargs=eval_config.get("gen_kwargs"),
batch_size=batch_size,
)
env_vars = eval_config.get("env_vars", None)
with scoped_env_vars(env_vars):
results = lm_eval.simple_evaluate(
model=backend,
model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]],
num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"],
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
# text models. however, this is regressing measured strict-match for
# existing text models in CI, so only apply it for mm, or explicitly set
apply_chat_template=eval_config.get(
"apply_chat_template", backend == "vllm-vlm"
),
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
gen_kwargs=eval_config.get("gen_kwargs"),
batch_size=batch_size,
)
return results
@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
results = launch_lm_eval(eval_config, tp_size)
rtol = eval_config.get("rtol", DEFAULT_RTOL)
success = True
for task in eval_config["tasks"]:
for metric in task["metrics"]:
@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
measured_value = results["results"][task["name"]][metric["name"]]
print(
f"{task['name']} | {metric['name']}: "
f"ground_truth={ground_truth} | measured={measured_value}"
f"ground_truth={ground_truth:.3f} | "
f"measured={measured_value:.3f} | rtol={rtol}"
)
success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
assert success

View File

@ -9,6 +9,7 @@ import argparse
import json
import sys
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from typing import Any
from urllib.parse import quote
@ -20,6 +21,7 @@ if not sys.version_info >= (3, 12):
INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
<!-- {comment} -->
<meta name="pypi:repository-version" content="1.0">
<body>
{items}
@ -90,7 +92,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
)
def generate_project_list(subdir_names: list[str]) -> str:
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
"""
Generate project list HTML content linking to each project & variant sub-directory.
"""
@ -98,11 +100,14 @@ def generate_project_list(subdir_names: list[str]) -> str:
for name in sorted(subdir_names):
name = name.strip("/").strip(".")
href_tags.append(f' <a href="{name}/">{name}/</a><br/>')
return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
def generate_package_index_and_metadata(
wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
wheel_files: list[WheelFileInfo],
wheel_base_dir: Path,
index_base_dir: Path,
comment: str = "",
) -> tuple[str, str]:
"""
Generate package index HTML content for a specific package, linking to actual wheel files.
@ -120,7 +125,7 @@ def generate_package_index_and_metadata(
file_meta = asdict(file)
file_meta["path"] = file_path_quoted
metadata.append(file_meta)
index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
metadata_str = json.dumps(metadata, indent=2)
return index_str, metadata_str
@ -131,6 +136,7 @@ def generate_index_and_metadata(
index_base_dir: Path,
default_variant: str | None = None,
alias_to_default: str | None = None,
comment: str = "",
):
"""
Generate index for all wheel files.
@ -141,6 +147,7 @@ def generate_index_and_metadata(
index_base_dir (Path): Base directory to store index files.
default_variant (str | None): The default variant name, if any.
alias_to_default (str | None): Alias variant name for the default variant, if any.
comment (str | None): Optional comment to include in the generated HTML files.
First, parse all wheel files to extract metadata.
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
@ -234,6 +241,10 @@ def generate_index_and_metadata(
variant_to_files[alias_to_default] = variant_to_files["default"].copy()
print(f"Alias variant '{alias_to_default}' created for default variant.")
# Generate comment in HTML header
comment_str = f" ({comment})" if comment else ""
comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"
# Generate index for each variant
subdir_names = set()
for variant, files in variant_to_files.items():
@ -253,7 +264,7 @@ def generate_index_and_metadata(
subdir_names = subdir_names.union(packages)
else:
# generate project list for this variant directly
project_list_str = generate_project_list(sorted(packages))
project_list_str = generate_project_list(sorted(packages), comment_tmpl)
with open(variant_dir / "index.html", "w") as f:
f.write(project_list_str)
@ -263,7 +274,7 @@ def generate_index_and_metadata(
package_dir = variant_dir / package
package_dir.mkdir(parents=True, exist_ok=True)
index_str, metadata_str = generate_package_index_and_metadata(
package_files, wheel_base_dir, package_dir
package_files, wheel_base_dir, package_dir, comment
)
with open(package_dir / "index.html", "w") as f:
f.write(index_str)
@ -271,7 +282,7 @@ def generate_index_and_metadata(
f.write(metadata_str)
# Generate top-level project list index
project_list_str = generate_project_list(sorted(subdir_names))
project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
with open(index_base_dir / "index.html", "w") as f:
f.write(project_list_str)
@ -283,6 +294,7 @@ if __name__ == "__main__":
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
--output-dir <output_directory> : directory to store generated index files
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
--comment <comment_string> : (optional) comment string to include in generated HTML files
"""
parser = argparse.ArgumentParser(
@ -312,6 +324,12 @@ if __name__ == "__main__":
default=None,
help="Alias variant name for the default variant",
)
parser.add_argument(
"--comment",
type=str,
default="",
help="Optional comment string to include in generated HTML files",
)
args = parser.parse_args()
@ -366,5 +384,6 @@ if __name__ == "__main__":
index_base_dir=index_base_dir,
default_variant=None,
alias_to_default=args.alias_to_default,
comment=args.comment.strip(),
)
print(f"Successfully generated index and metadata in {output_dir}")

View File

@ -81,7 +81,10 @@ else
alias_arg=""
fi
$PYTHON pip install regex && .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
# HACK: we do not need regex module here, but it is required by pre-commit hook
# To avoid any external dependency, we simply replace it back to the stdlib re module
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
# copy indices to /<commit>/ unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX"

View File

@ -718,17 +718,6 @@ steps:
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 15min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
- label: OpenAI API correctness # 10min
timeout_in_minutes: 15
mirror_hardwares: [amdexperimental, amdproduction]
@ -974,19 +963,6 @@ steps:
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
timeout_in_minutes: 70
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
- vllm/inputs/
- vllm/v1/core/
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models Test (Extended) 1 # 60min
timeout_in_minutes: 120
mirror_hardwares: [amdexperimental]
@ -1162,21 +1138,6 @@ steps:
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
- label: ROCm GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
agent_pool: mi325_1
mirror_hardwares: [amdexperimental, amdproduction]
optional: true # run on nightlies
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Blackwell Quantized MoE Test
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
@ -1194,16 +1155,6 @@ steps:
commands:
- pytest -s -v tests/quantization/test_blackwell_moe.py
- label: Blackwell LM Eval Small Models
timeout_in_minutes: 120
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
##### 1 GPU test #####
##### multi gpus test #####
@ -1380,7 +1331,7 @@ steps:
- pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py
# Disabled for now because MXFP4 backend on non-cuda platform
# Disabled for now because MXFP4 backend on non-cuda platform
# doesn't support LoRA yet
#- pytest -v -s -x lora/test_gptoss_tp.py
@ -1446,37 +1397,6 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
gpu: a100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
##### H100 test #####
- label: LM Eval Large Models (H100) # optional
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
##### H200 test #####
- label: Distributed Tests (H200) # optional
mirror_hardwares: [amdexperimental]
@ -1508,20 +1428,94 @@ steps:
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
- pytest -v -s tests/v1/distributed/test_dbo.py
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
mirror_hardwares: [amdexperimental]
agent_pool: mi325_2
##### E2E Eval Tests #####
- label: LM Eval Small Models (1 Card) # 15min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
timeout_in_minutes: 30
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
- csrc/
- vllm/model_executor/layers/quantization
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
- label: Blackwell LM Eval Small Models
timeout_in_minutes: 120
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
timeout_in_minutes: 70
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
- vllm/inputs/
- vllm/v1/core/
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: LM Eval Large Models (4 Card)
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
gpu: a100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: LM Eval Large Models (H100) # optional
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
- label: ROCm LM Eval Large Models (8 Card)
mirror_hardwares: [amdproduction]
agent_pool: mi325_8
num_gpus: 8
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
- label: ROCm GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
agent_pool: mi325_1
mirror_hardwares: [amdexperimental, amdproduction]
optional: true # run on nightlies
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: DeepSeek V2-Lite Accuracy
mirror_hardwares: [amdexperimental, amdproduction]
@ -1554,4 +1548,19 @@ steps:
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
mirror_hardwares: [amdexperimental]
agent_pool: mi325_2
# grade: Blocking
timeout_in_minutes: 30
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh

View File

@ -350,7 +350,8 @@ steps:
timeout_in_minutes: 25
gpu: h100
source_file_dependencies:
- vllm/
- vllm/v1/attention
- vllm/model_executor/layers
- tests/v1/determinism/
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn

View File

@ -0,0 +1,160 @@
# Nightly Builds of vLLM Wheels
vLLM maintains a per-commit wheel repository (commonly referred to as "nightly") at `https://wheels.vllm.ai` that provides pre-built wheels for every commit on the `main` branch since `v0.5.3`. This document explains how the nightly wheel index mechanism works.
## Build and Upload Process on CI
### Wheel Building
Wheels are built in the `Release` pipeline (`.buildkite/release-pipeline.yaml`) after a PR is merged into the main branch, with multiple variants:
- **Backend variants**: `cpu` and `cuXXX` (e.g., `cu129`, `cu130`).
- **Architecture variants**: `x86_64` and `aarch64`.
Each build step:
1. Builds the wheel in a Docker container.
2. Renames the wheel filename to use the correct manylinux tag (currently `manylinux_2_31`) for PEP 600 compliance.
3. Uploads the wheel to S3 bucket `vllm-wheels` under `/{commit_hash}/`.
### Index Generation
After uploading each wheel, the `.buildkite/scripts/upload-wheels.sh` script:
1. **Lists all existing wheels** in the commit directory from S3
2. **Generates indices** using `.buildkite/scripts/generate-nightly-index.py`:
- Parses wheel filenames to extract metadata (version, variant, platform tags).
- Creates HTML index files (`index.html`) for PyPI compatibility.
- Generates machine-readable `metadata.json` files.
3. **Uploads indices** to multiple locations (overriding existing ones):
- `/{commit_hash}/` - Always uploaded for commit-specific access.
- `/nightly/` - Only for commits on `main` branch (not PRs).
- `/{version}/` - Only for release wheels (no `dev` in its version).
!!! tip "Handling Concurrent Builds"
The index generation script can handle multiple variants being built concurrently by always listing all wheels in the commit directory before generating indices, avoiding race conditions.
## Directory Structure
The S3 bucket structure follows this pattern:
```text
s3://vllm-wheels/
├── {commit_hash}/ # Commit-specific wheels and indices
│ ├── vllm-*.whl # All wheel files
│ ├── index.html # Project list (default variant)
│ ├── vllm/
│ │ ├── index.html # Package index (default variant)
│ │ └── metadata.json # Metadata (default variant)
│ ├── cu129/ # Variant subdirectory
│ │ ├── index.html # Project list (cu129 variant)
│ │ └── vllm/
│ │ ├── index.html # Package index (cu129 variant)
│ │ └── metadata.json # Metadata (cu129 variant)
│ ├── cu130/ # Variant subdirectory
│ ├── cpu/ # Variant subdirectory
│ └── .../ # More variant subdirectories
├── nightly/ # Latest main branch wheels (mirror of latest commit)
└── {version}/ # Release version indices (e.g., 0.11.2)
```
All built wheels are stored in `/{commit_hash}/`, while different indices are generated and reference them.
This avoids duplication of wheel files.
For example, you can specify the following URLs to use different indices:
- `https://wheels.vllm.ai/nightly/cu130` for the latest main branch wheels built with CUDA 13.0.
- `https://wheels.vllm.ai/{commit_hash}` for wheels built at a specific commit (default variant).
- `https://wheels.vllm.ai/0.12.0/cpu` for 0.12.0 release wheels built for CPU variant.
Please note that not all variants are present on every commit. The available variants are subject to change over time, e.g., changing cu130 to cu131.
### Variant Organization
Indices are organized by variant:
- **Default variant**: Wheels without variant suffix (i.e., built with the current `VLLM_MAIN_CUDA_VERSION`) are placed in the root.
- **Variant subdirectories**: Wheels with variant suffixes (e.g., `+cu130`, `.cpu`) are organized in subdirectories.
- **Alias to default**: The default variant can have an alias (e.g., `cu129` for now) for consistency and convenience.
The variant is extracted from the wheel filename (as described in the [file name convention](https://packaging.python.org/en/latest/specifications/binary-distribution-format/#file-name-convention)):
- The variant is encoded in the local version identifier (e.g. `+cu129` or `dev<N>+g<hash>.cu130`).
- Examples:
- `vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl` → default variant
- `vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl` → `cu129` variant
- `vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl` → `cu130` variant
## Index Generation Details
The `generate-nightly-index.py` script performs the following:
1. **Parses wheel filenames** using regex to extract:
- Package name
- Version (with variant extracted)
- Python tag, ABI tag, platform tag
- Build tag (if present)
2. **Groups wheels by variant**, then by package name:
- Currently only `vllm` is built, but the structure supports multiple packages in the future.
3. **Generates HTML indices** (compliant with the [Simple repository API](https://packaging.python.org/en/latest/specifications/simple-repository-api/#simple-repository-api)):
- Top-level `index.html`: Lists all packages and variant subdirectories
- Package-level `index.html`: Lists all wheel files for that package
- Uses relative paths to wheel files for portability
4. **Generates metadata.json**:
- Machine-readable JSON containing all wheel metadata
- Includes `path` field with URL-encoded relative path to wheel file
- Used by `setup.py` to locate compatible pre-compiled wheels during Python-only builds
### Special Handling for AWS Services
The wheels and indices are directly stored on AWS S3, and we use AWS CloudFront as a CDN in front of the S3 bucket.
Since S3 does not provide proper directory listing, to support PyPI-compatible simple repository API behavior, we deploy a CloudFront Function that:
- redirects any URL that does not end with `/` and does not look like a file (i.e., does not contain a dot `.` in the last path segment) to the same URL with a trailing `/`
- appends `/index.html` to any URL that ends with `/`
For example, the following requests would be handled as:
- `/nightly` -> `/nightly/index.html`
- `/nightly/cu130/` -> `/nightly/cu130/index.html`
- `/nightly/index.html` or `/nightly/vllm.whl` -> unchanged
!!! note "AWS S3 Filename Escaping"
S3 will automatically escape filenames upon upload according to its [naming rule](https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html). The direct impact on vllm is that `+` in filenames will be converted to `%2B`. We take special care in the index generation script to escape filenames properly when generating the HTML indices and JSON metadata, to ensure the URLs are correct and can be directly used.
## Usage of precompiled wheels in `setup.py` {#precompiled-wheels-usage}
When installing vLLM with `VLLM_USE_PRECOMPILED=1`, the `setup.py` script:
1. **Determines wheel location** via `precompiled_wheel_utils.determine_wheel_url()`:
- Env var `VLLM_PRECOMPILED_WHEEL_LOCATION` (user-specified URL/path) always takes precedence and skips all other steps.
- Determines the variant from `VLLM_MAIN_CUDA_VERSION` (can be overridden with env var `VLLM_PRECOMPILED_WHEEL_VARIANT`); the default variant will also be tried as a fallback.
- Determines the _base commit_ (explained later) of this branch (can be overridden with env var `VLLM_PRECOMPILED_WHEEL_COMMIT`).
2. **Fetches metadata** from `https://wheels.vllm.ai/{commit}/vllm/metadata.json` (for the default variant) or `https://wheels.vllm.ai/{commit}/{variant}/vllm/metadata.json` (for a specific variant).
3. **Selects compatible wheel** based on:
- Package name (`vllm`)
- Platform tag (architecture match)
4. **Downloads and extracts** precompiled binaries from the wheel:
- C++ extension modules (`.so` files)
- Flash Attention Python modules
- Triton kernel Python files
5. **Patches package_data** to include extracted files in the installation
!!! note "What is the base commit?"
The base commit is determined by finding the merge-base
between the current branch and upstream `main`, ensuring
compatibility between source code and precompiled binaries.
_Note: it's users' responsibility to ensure there is no native code (e.g., C++ or CUDA) changes before using precompiled wheels._
## Implementation Files
Key files involved in the nightly wheel mechanism:
- **`.buildkite/release-pipeline.yaml`**: CI pipeline that builds wheels
- **`.buildkite/scripts/upload-wheels.sh`**: Script that uploads wheels and generates indices
- **`.buildkite/scripts/generate-nightly-index.py`**: Python script that generates PyPI-compatible indices
- **`setup.py`**: Contains `precompiled_wheel_utils` class for fetching and using precompiled wheels

View File

@ -0,0 +1,333 @@
# Kthena
[**Kthena**](https://github.com/volcano-sh/kthena) is a Kubernetes-native LLM inference platform that transforms how organizations deploy and manage Large Language Models in production. Built with declarative model lifecycle management and intelligent request routing, it provides high performance and enterprise-grade scalability for LLM inference workloads.
This guide shows how to deploy a production-grade, **multi-node vLLM** service on Kubernetes.
We'll:
- Install the required components (Kthena + Volcano).
- Deploy a multi-node vLLM model via Kthena's `ModelServing` CR.
- Validate the deployment.
---
## 1. Prerequisites
You need:
- A Kubernetes cluster with **GPU nodes**.
- `kubectl` access with cluster-admin or equivalent permissions.
- **Volcano** installed for gang scheduling.
- **Kthena** installed with the `ModelServing` CRD available.
- A valid **Hugging Face token** if loading models from Hugging Face Hub.
### 1.1 Install Volcano
```bash
helm repo add volcano-sh https://volcano-sh.github.io/helm-charts
helm repo update
helm install volcano volcano-sh/volcano -n volcano-system --create-namespace
```
This provides the gang-scheduling and network topology features used by Kthena.
### 1.2 Install Kthena
```bash
helm install kthena oci://ghcr.io/volcano-sh/charts/kthena --version v0.1.0 --namespace kthena-system --create-namespace
```
- The `kthena-system` namespace is created.
- Kthena controllers and CRDs, including `ModelServing`, are installed and healthy.
Validate:
```bash
kubectl get crd | grep modelserving
```
You should see:
```text
modelservings.workload.serving.volcano.sh ...
```
---
## 2. The Multi-Node vLLM `ModelServing` Example
Kthena provides an example manifest to deploy a **multi-node vLLM cluster running Llama**. Conceptually this is equivalent to the vLLM production stack Helm deployment, but expressed with `ModelServing`.
A simplified version of the example (`llama-multinode`) looks like:
- `spec.replicas: 1` one `ServingGroup` (one logical model deployment).
- `roles`:
- `entryTemplate` defines **leader** pods that run:
- vLLM's **multi-node cluster bootstrap script** (Ray cluster).
- vLLM **OpenAI-compatible API server**.
- `workerTemplate` defines **worker** pods that join the leader's Ray cluster.
Key points from the example YAML:
- **Image**: `vllm/vllm-openai:latest` (matches upstream vLLM images).
- **Command** (leader):
```yaml
command:
- sh
- -c
- >
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=2;
python3 -m vllm.entrypoints.openai.api_server
--port 8080
--model meta-llama/Llama-3.1-405B-Instruct
--tensor-parallel-size 8
--pipeline-parallel-size 2
```
- **Command** (worker):
```yaml
command:
- sh
- -c
- >
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)
```
---
## 3. Deploying Multi-Node llama vLLM via Kthena
### 3.1 Prepare the Manifest
**Recommended**: use a Secret instead of a raw env var:
```bash
kubectl create secret generic hf-token \
-n default \
--from-literal=HUGGING_FACE_HUB_TOKEN='<your-token>'
```
### 3.2 Apply the `ModelServing`
```bash
cat <<EOF | kubectl apply -f -
apiVersion: workload.serving.volcano.sh/v1alpha1
kind: ModelServing
metadata:
name: llama-multinode
namespace: default
spec:
schedulerName: volcano
replicas: 1 # group replicas
template:
restartGracePeriodSeconds: 60
gangPolicy:
minRoleReplicas:
405b: 1
roles:
- name: 405b
replicas: 2
entryTemplate:
spec:
containers:
- name: leader
image: vllm/vllm-openai:latest
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token
key: HUGGING_FACE_HUB_TOKEN
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=2;
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline-parallel-size 2"
resources:
limits:
nvidia.com/gpu: "8"
memory: 1124Gi
ephemeral-storage: 800Gi
requests:
ephemeral-storage: 800Gi
cpu: 125
ports:
- containerPort: 8080
readinessProbe:
tcpSocket:
port: 8080
initialDelaySeconds: 15
periodSeconds: 10
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
workerReplicas: 1
workerTemplate:
spec:
containers:
- name: worker
image: vllm/vllm-openai:latest
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)"
resources:
limits:
nvidia.com/gpu: "8"
memory: 1124Gi
ephemeral-storage: 800Gi
requests:
ephemeral-storage: 800Gi
cpu: 125
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token
key: HUGGING_FACE_HUB_TOKEN
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
EOF
```
Kthena will:
- Create a `ModelServing` object.
- Derive a `PodGroup` for Volcano gang scheduling.
- Create the leader and worker pods for each `ServingGroup` and `Role`.
---
## 4. Verifying the Deployment
### 4.1 Check ModelServing Status
Use the snippet from the Kthena docs:
```bash
kubectl get modelserving -oyaml | grep status -A 10
```
You should see something like:
```yaml
status:
availableReplicas: 1
conditions:
- type: Available
status: "True"
reason: AllGroupsReady
message: All Serving groups are ready
- type: Progressing
status: "False"
...
replicas: 1
updatedReplicas: 1
```
### 4.2 Check Pods
List pods for your deployment:
```bash
kubectl get pod -owide -l modelserving.volcano.sh/name=llama-multinode
```
Example output (from docs):
```text
NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE ...
default llama-multinode-0-405b-0-0 1/1 Running 0 15m 10.244.0.56 192.168.5.12 ...
default llama-multinode-0-405b-0-1 1/1 Running 0 15m 10.244.0.58 192.168.5.43 ...
default llama-multinode-0-405b-1-0 1/1 Running 0 15m 10.244.0.57 192.168.5.58 ...
default llama-multinode-0-405b-1-1 1/1 Running 0 15m 10.244.0.53 192.168.5.36 ...
```
Pod name pattern:
- `llama-multinode-<group-idx>-<role-name>-<replica-idx>-<ordinal>`.
The first number indicates `ServingGroup`. The second (`405b`) is the `Role`. The remaining indices identify the pod within the role.
---
## 6. Accessing the vLLM OpenAI-Compatible API
Expose the entry via a Service:
```yaml
apiVersion: v1
kind: Service
metadata:
name: llama-multinode-openai
namespace: default
spec:
selector:
modelserving.volcano.sh/name: llama-multinode
modelserving.volcano.sh/entry: "true"
# optionally further narrow to leader role if you label it
ports:
- name: http
port: 80
targetPort: 8080
type: ClusterIP
```
Port-forward from your local machine:
```bash
kubectl port-forward svc/llama-multinode-openai 30080:80 -n default
```
Then:
- List models:
```bash
curl -s http://localhost:30080/v1/models
```
- Send a completion request (mirroring vLLM production stack docs):
```bash
curl -X POST http://localhost:30080/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Llama-3.1-405B-Instruct",
"prompt": "Once upon a time,",
"max_tokens": 10
}'
```
You should see an OpenAI-style response from vLLM.
---
## 7. Clean Up
To remove the deployment and its resources:
```bash
kubectl delete modelserving llama-multinode -n default
```
If you're done with the entire stack:
```bash
helm uninstall kthena -n kthena-system # or your Kthena release name
helm uninstall volcano -n volcano-system
```

View File

@ -14,6 +14,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
- [InftyAI/llmaz](integrations/llmaz.md)
- [KAITO](integrations/kaito.md)
- [KServe](integrations/kserve.md)
- [Kthena](integrations/kthena.md)
- [KubeRay](integrations/kuberay.md)
- [kubernetes-sigs/lws](frameworks/lws.md)
- [meta-llama/llama-stack](integrations/llamastack.md)

View File

@ -86,7 +86,7 @@ LLM(model, enforce_eager=True)
```
To turn off just torch.compile, pass `mode = NONE` to the compilation config.
(`-cc` is short for `--compilation_config`; `-O.*` dotted syntax is deprecated):
(`-cc` is short for `--compilation_config`):
```sh
# Online

View File

@ -62,7 +62,7 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
- `vllm:time_per_output_token_seconds` - Inter-token latency (Time Per Output Token, TPOT) in seconds.
- `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds.
- `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in the RUNNING, WAITING, and SWAPPED states.
- `vllm:gpu_cache_usage_perc` - Percentage of used cache blocks by vLLM.
- `vllm:kv_cache_usage_perc` - Percentage of used cache blocks by vLLM.
- `vllm:request_prompt_tokens` - Request prompt length.
- `vllm:request_generation_tokens` - Request generation length.
- `vllm:request_success` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.

View File

@ -443,6 +443,8 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
print(generated_text)
```
For Qwen3-VL, the `image_embeds` should contain both the base image embedding and deepstack features.
#### Audio Embeddings
You can pass pre-computed audio embeddings similar to image embeddings:

View File

@ -18,6 +18,7 @@ vLLM currently supports the following reasoning models:
| [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
| [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
| [Holo2 series](https://huggingface.co/collections/Hcompany/holo2) | `holo2` | `json`, `regex` | ✅ |
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ |
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
| [MiniMax-M2](https://huggingface.co/MiniMaxAI/MiniMax-M2) | `minimax_m2_append_think` | `json`, `regex` | ✅ |
@ -28,6 +29,7 @@ vLLM currently supports the following reasoning models:
IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
DeepSeek-V3.1 tool calling is supported in non-thinking mode.
Holo2 reasoning is enabled by default. To disable it, you must also pass `thinking=False` in your `chat_template_kwargs`.
## Quickstart

View File

@ -58,10 +58,14 @@ schemathesis==3.39.15
# Evaluation and benchmarking
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
jiwer==4.0.0
# Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
multiprocess==0.70.16
# Required for v1/metrics/test_engine_logger_apis.py
ray[cgraph,default]>=2.48.0
# Plugins test
terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
torchgeo==0.7.0

View File

@ -260,13 +260,18 @@ def test_deep_sleep_fp8_kvcache():
llm.sleep(level=2)
used_bytes = current_platform.get_current_memory_usage() - used_bytes_baseline
assert used_bytes < 3 * GiB_bytes
# Rocm uses more memory for CudaGraphs, so we add 2 GiB more for the threshold
rocm_extra_mem_bytes = 2 * GiB_bytes if current_platform.is_rocm() else 0
mem_threshold_after_sleep = 3 * GiB_bytes + rocm_extra_mem_bytes
assert used_bytes < mem_threshold_after_sleep
llm.wake_up(tags=["weights"])
llm.collective_rpc("reload_weights")
used_bytes = current_platform.get_current_memory_usage() - used_bytes_baseline
assert used_bytes < 4 * GiB_bytes
mem_threshold_after_wake_up = 4 * GiB_bytes + rocm_extra_mem_bytes
assert used_bytes < mem_threshold_after_wake_up
# now allocate kv cache and cuda graph memory
llm.wake_up(tags=["kv_cache"])

View File

@ -1,6 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import functools
import multiprocessing
import tempfile
from contextlib import contextmanager
@ -137,3 +139,67 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch):
artifacts = compiled_mod.aot_compiled_fn._artifacts
guards_string = artifacts.compiled_fn.shape_env.format_guards()
assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)"
@pytest.mark.skipif(
    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
)
@use_vllm_config(make_vllm_config())
def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
    """
    Test that compiling gpt2 twice results in a cache hit and
    capture torch dynamic symbol creations to ensure make_symbol
    not called on cache hit.

    Flow: compile once with VLLM_USE_AOT_COMPILE=1 (expects exactly 2
    dynamic symbols created), reset the counter, then build the same model
    again with VLLM_FORCE_AOT_LOAD=1 and assert no new symbols are created,
    which proves the AOT artifact was loaded from the cache.
    """
    import torch.fx.experimental.symbolic_shapes as symbolic_shapes_module
    from torch.utils._sympy.symbol import make_symbol
    from vllm import LLM
    # Shared counter; presumably multiprocessing.Value is used so the count
    # survives if symbol creation happens in a spawned worker — TODO confirm.
    create_symbol_counter = multiprocessing.Value("i", 0)
    original_make_symbol = make_symbol
    @functools.wraps(original_make_symbol)
    def counting_make_symbol(prefix, idx, **kwargs):
        with create_symbol_counter.get_lock():
            create_symbol_counter.value += 1
        return original_make_symbol(prefix, idx, **kwargs)
    # NOTE(review): this patches the attribute on symbolic_shapes_module, so
    # it only intercepts call sites that resolve make_symbol through that
    # module; direct imports elsewhere would bypass the counter.
    symbolic_shapes_module.make_symbol = counting_make_symbol
    try:
        with monkeypatch.context() as m, tempfile.TemporaryDirectory() as tmpdirname:
            # Isolate the compile cache in a temp dir so prior runs cannot
            # cause a spurious cache hit.
            m.setenv("VLLM_CACHE_ROOT", tmpdirname)
            m.setenv("VLLM_USE_AOT_COMPILE", "1")
            # First compilation - initialize model and generate
            llm_model = LLM(
                model="gpt2",
                compilation_config=CompilationConfig(
                    mode=CompilationMode.VLLM_COMPILE,
                ),
                max_model_len=256,
            )
            llm_model.generate("Hello, my name is")
            assert create_symbol_counter.value == 2
            create_symbol_counter.value = 0
            # Clean up first model
            del llm_model
            # Second compilation - should hit cache
            m.setenv("VLLM_FORCE_AOT_LOAD", "1")
            llm_model = LLM(
                model="gpt2",
                compilation_config=CompilationConfig(
                    mode=CompilationMode.VLLM_COMPILE,
                ),
                max_model_len=256,
            )
            llm_model.generate("Hello, my name is")
            # Cache hit: no dynamic symbols should have been created.
            assert create_symbol_counter.value == 0
    finally:
        # Restore original method
        symbolic_shapes_module.make_symbol = original_make_symbol

View File

@ -27,7 +27,7 @@ import threading
from collections.abc import Generator
from contextlib import nullcontext
from enum import Enum
from typing import Any, Callable, TypedDict, TypeVar, cast
from typing import Any, Callable, TypedDict, TypeVar, cast, TYPE_CHECKING
import numpy as np
import pytest
@ -67,6 +67,11 @@ from vllm.transformers_utils.utils import maybe_model_redirect
from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_num_threads
if TYPE_CHECKING:
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers.generation.utils import GenerateOutput
logger = init_logger(__name__)
_TEST_DIR = os.path.dirname(__file__)
@ -202,10 +207,7 @@ def dynamo_reset():
@pytest.fixture
def example_prompts() -> list[str]:
prompts = []
for filename in _TEST_PROMPTS:
prompts += _read_prompts(filename)
return prompts
return [prompt for filename in _TEST_PROMPTS for prompt in _read_prompts(filename)]
@pytest.fixture
@ -224,10 +226,7 @@ class DecoderPromptType(Enum):
@pytest.fixture
def example_long_prompts() -> list[str]:
prompts = []
for filename in _LONG_PROMPTS:
prompts += _read_prompts(filename)
return prompts
return [prompt for filename in _LONG_PROMPTS for prompt in _read_prompts(filename)]
@pytest.fixture(scope="session")
@ -353,10 +352,13 @@ class HfRunner:
trust_remote_code=trust_remote_code,
)
else:
model = auto_cls.from_pretrained(
model_name,
trust_remote_code=trust_remote_code,
**model_kwargs,
model = cast(
nn.Module,
auto_cls.from_pretrained(
model_name,
trust_remote_code=trust_remote_code,
**model_kwargs,
),
)
# in case some unquantized custom models are not in same dtype
@ -374,10 +376,12 @@ class HfRunner:
self.model = model
if not skip_tokenizer_init:
self.tokenizer = AutoTokenizer.from_pretrained(
model_name,
dtype=dtype,
trust_remote_code=trust_remote_code,
self.tokenizer: "PreTrainedTokenizer | PreTrainedTokenizerFast" = (
AutoTokenizer.from_pretrained(
model_name,
dtype=dtype,
trust_remote_code=trust_remote_code,
)
)
# don't put this import at the top level
@ -495,7 +499,7 @@ class HfRunner:
outputs: list[tuple[list[list[int]], list[str]]] = []
for inputs in all_inputs:
output_ids = self.model.generate(
output_ids: torch.Tensor = self.model.generate(
**self.wrap_device(inputs),
use_cache=True,
**kwargs,
@ -505,8 +509,7 @@ class HfRunner:
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)
output_ids = output_ids.cpu().tolist()
outputs.append((output_ids, output_str))
outputs.append((output_ids.cpu().tolist(), output_str))
return outputs
def generate_greedy(
@ -574,7 +577,7 @@ class HfRunner:
all_logprobs: list[list[torch.Tensor]] = []
for inputs in all_inputs:
output = self.model.generate(
output: "GenerateOutput" = self.model.generate(
**self.wrap_device(inputs),
use_cache=True,
do_sample=False,
@ -656,7 +659,7 @@ class HfRunner:
all_output_strs: list[str] = []
for inputs in all_inputs:
output = self.model.generate(
output: "GenerateOutput" = self.model.generate(
**self.wrap_device(inputs),
use_cache=True,
do_sample=False,

View File

@ -16,16 +16,35 @@ from typing import Literal, NamedTuple
import pytest
import torch
from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k
from tests.utils import RemoteOpenAIServer, create_new_process_for_each_test
from vllm.config.model import RunnerOption
from vllm.logger import init_logger
from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import compare_two_settings, create_new_process_for_each_test
logger = init_logger("test_context_parallel")
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
CP_TEST_MODELS = [
# TODO support other models
# [LANGUAGE GENERATION]
"deepseek-ai/DeepSeek-V2-Lite-Chat",
"Qwen/Qwen2.5-1.5B-Instruct",
]
# GSM8K eval configuration
NUM_QUESTIONS = 256 # Fast eval for CI
NUM_SHOTS = 5 # Few-shot examples
# tp accuracy with 2% buffer
MIN_ACCURACY = {
# .buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
"deepseek-ai/DeepSeek-V2-Lite-Chat": 0.64,
# .buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
"Qwen/Qwen2.5-1.5B-Instruct": 0.52,
}
class ParallelSetup(NamedTuple):
tp_size: int
@ -38,7 +57,6 @@ class ParallelSetup(NamedTuple):
class CPTestOptions(NamedTuple):
multi_node_only: bool
load_format: str | None = None
attn_backend: str | None = None
@ -54,17 +72,20 @@ class CPTestSettings:
*,
tp_base: int = 4,
pp_base: int = 1,
dcp_base: int = 1,
dcp_multipliers: list[float] | None = None,
cp_kv_cache_interleave_size: int = 1,
multi_node_only: bool = False,
runner: RunnerOption = "auto",
load_format: str | None = None,
attn_backend: str | None = None,
):
parallel_setups = []
if dcp_multipliers is None:
dcp_multipliers = [
0.5,
]
for eager_mode_val in [False]:
for pp_multiplier in [1]:
for dcp_multiplier in [0.5, 1]:
for dcp_multiplier in dcp_multipliers:
for chunked_prefill_val in [True]:
parallel_setups.append(
ParallelSetup(
@ -82,7 +103,6 @@ class CPTestSettings:
runner=runner,
test_options=CPTestOptions(
multi_node_only=multi_node_only,
load_format=load_format,
attn_backend=attn_backend,
),
)
@ -101,7 +121,24 @@ class CPTestSettings:
)
def _compare_cp_with_tp(
CP_TEXT_GENERATION_MODELS = {
"deepseek-ai/DeepSeek-V2-Lite-Chat": [
CPTestSettings.detailed(
dcp_multipliers=[0.5, 1], cp_kv_cache_interleave_size=64
),
],
"Qwen/Qwen2.5-1.5B-Instruct": [
CPTestSettings.detailed(
cp_kv_cache_interleave_size=16, attn_backend="FLASH_ATTN"
),
CPTestSettings.detailed(
cp_kv_cache_interleave_size=16, attn_backend="FLASHINFER"
),
],
}
def _test_cp_gsm8k(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
@ -121,7 +158,7 @@ def _compare_cp_with_tp(
chunked_prefill,
) = parallel_setup
multi_node_only, load_format, attn_backend = test_options
multi_node_only, attn_backend = test_options
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_transformers_version(on_fail="skip")
@ -130,22 +167,7 @@ def _compare_cp_with_tp(
tokenizer_mode = model_info.tokenizer_mode
hf_overrides = model_info.hf_overrides
if load_format == "dummy":
# Avoid OOM
text_overrides = {
"num_hidden_layers": 4,
"hidden_size": 512,
"intermediate_size": 800,
"num_attention_heads": 4,
"num_key_value_heads": 1,
}
if is_multimodal:
hf_overrides.update({"text_config": text_overrides})
else:
hf_overrides.update(text_overrides)
else:
model_info.check_available_online(on_fail="skip")
model_info.check_available_online(on_fail="skip")
if num_gpus_available < tp_size * pp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
@ -157,90 +179,70 @@ def _compare_cp_with_tp(
if multi_node_only and not VLLM_MULTI_NODE:
pytest.skip("Not in multi-node setting")
common_args = [
server_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
"4096",
"--max-num-seqs",
"8",
"64",
]
if chunked_prefill:
common_args.append("--enable-chunked-prefill")
server_args.append("--enable-chunked-prefill")
if eager_mode:
common_args.append("--enforce-eager")
server_args.append("--enforce-eager")
if runner != "auto":
common_args.extend(["--runner", runner])
server_args.extend(["--runner", runner])
if trust_remote_code:
common_args.append("--trust-remote-code")
server_args.append("--trust-remote-code")
if tokenizer_mode:
common_args.extend(["--tokenizer-mode", tokenizer_mode])
if load_format:
common_args.extend(["--load-format", load_format])
server_args.extend(["--tokenizer-mode", tokenizer_mode])
if hf_overrides:
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
server_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
if not attn_backend:
cp_env = tp_env = {}
else:
cp_env = tp_env = {
"VLLM_ATTENTION_BACKEND": attn_backend,
}
cp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
"--pipeline-parallel-size",
str(pp_size),
"--decode-context-parallel-size",
str(dcp_size),
"--dcp-kv-cache-interleave-size",
str(cp_kv_cache_interleave_size),
"--distributed-executor-backend",
distributed_backend,
]
tp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
"--pipeline-parallel-size",
str(pp_size),
"--distributed-executor-backend",
distributed_backend,
]
compare_two_settings(
model_id,
cp_args,
tp_args,
cp_env,
tp_env,
method=method,
max_wait_seconds=720,
server_args.extend(
[
"--tensor-parallel-size",
str(tp_size),
"--pipeline-parallel-size",
str(pp_size),
"--decode-context-parallel-size",
str(dcp_size),
"--dcp-kv-cache-interleave-size",
str(cp_kv_cache_interleave_size),
"--distributed-executor-backend",
distributed_backend,
]
)
server_env = {}
if attn_backend:
server_env["VLLM_ATTENTION_BACKEND"] = attn_backend
CP_TEXT_GENERATION_MODELS = {
"deepseek-ai/DeepSeek-V2-Lite-Chat": [
CPTestSettings.detailed(),
CPTestSettings.detailed(tp_base=2),
CPTestSettings.detailed(tp_base=2, cp_kv_cache_interleave_size=64),
],
"bigcode/gpt_bigcode-santacoder": [
CPTestSettings.detailed(),
CPTestSettings.detailed(tp_base=2),
],
}
with RemoteOpenAIServer(
model_id,
server_args,
env_dict=server_env,
max_wait_seconds=720,
) as remote_server:
host = f"http://{remote_server.host}"
port = remote_server.port
CP_TEST_MODELS = [
# TODO support other models
# [LANGUAGE GENERATION]
"deepseek-ai/DeepSeek-V2-Lite-Chat",
"bigcode/gpt_bigcode-santacoder",
]
# Run GSM8K evaluation
results = evaluate_gsm8k(
num_questions=NUM_QUESTIONS,
num_shots=NUM_SHOTS,
host=host,
port=port,
)
# Validate accuracy is reasonable
accuracy = results["accuracy"]
min_accuracy = MIN_ACCURACY[model_id]
assert accuracy >= min_accuracy, (
f"TP+DCP accuracy too low: {accuracy:.3f} < {min_accuracy:.3f}"
)
@pytest.mark.parametrize(
@ -274,12 +276,12 @@ def test_cp_generation(
):
pytest.skip(reason="MLA+DCP requires compute capability of 9.0 or higher")
if (
model_id == "bigcode/gpt_bigcode-santacoder"
model_id == "Qwen/Qwen2.5-1.5B-Instruct"
and torch.cuda.get_device_capability() != (9, 0)
):
pytest.skip(reason="GQA+DCP currently requires compute capability of 9.0")
_compare_cp_with_tp(
_test_cp_gsm8k(
model_id,
parallel_setup,
distributed_backend,

View File

@ -4,7 +4,7 @@
import pytest
import torch
from vllm.distributed.eplb.rebalance_algo import rebalance_experts
from vllm.distributed.eplb.policy.default import DefaultEplbPolicy
def test_basic_rebalance():
@ -23,7 +23,7 @@ def test_basic_rebalance():
num_nodes = 2
num_gpus = 8
phy2log, log2phy, logcnt = rebalance_experts(
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
@ -77,7 +77,7 @@ def test_single_gpu_case():
num_nodes = 1
num_gpus = 1
phy2log, log2phy, logcnt = rebalance_experts(
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
@ -99,7 +99,7 @@ def test_equal_weights():
num_nodes = 2
num_gpus = 4
phy2log, log2phy, logcnt = rebalance_experts(
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
@ -122,7 +122,7 @@ def test_extreme_weight_imbalance():
num_nodes = 2
num_gpus = 4
phy2log, log2phy, logcnt = rebalance_experts(
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
@ -150,7 +150,7 @@ def test_multiple_layers():
num_nodes = 2
num_gpus = 4
phy2log, log2phy, logcnt = rebalance_experts(
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
@ -175,14 +175,14 @@ def test_parameter_validation():
# Test non-divisible case - this should handle normally without throwing
# errors because the function will fall back to global load balancing
# strategy
phy2log, log2phy, logcnt = rebalance_experts(weight, 8, 3, 2, 4)
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(weight, 8, 3, 2, 4)
assert phy2log.shape == (1, 8)
assert logcnt.shape == (1, 4)
# Test cases that will actually cause errors:
# num_physical_experts not divisible by num_gpus
with pytest.raises(AssertionError):
rebalance_experts(weight, 7, 2, 2, 4) # 7 not divisible by 4
DefaultEplbPolicy.rebalance_experts(weight, 7, 2, 2, 4) # 7 not divisible by 4
def test_small_scale_hierarchical():
@ -197,7 +197,7 @@ def test_small_scale_hierarchical():
num_nodes = 2 # 2 nodes
num_gpus = 4 # 4 GPUs
phy2log, log2phy, logcnt = rebalance_experts(
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
@ -224,7 +224,7 @@ def test_global_load_balance_fallback():
num_nodes = 2
num_gpus = 4
phy2log, log2phy, logcnt = rebalance_experts(
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
@ -246,7 +246,7 @@ def test_device_compatibility(device):
num_nodes = 1
num_gpus = 2
phy2log, log2phy, logcnt = rebalance_experts(
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
@ -263,7 +263,9 @@ def test_additional_cases():
weight1 = torch.tensor(
[[50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35]]
)
phy2log1, log2phy1, logcnt1 = rebalance_experts(weight1, 24, 8, 4, 8)
phy2log1, log2phy1, logcnt1 = DefaultEplbPolicy.rebalance_experts(
weight1, 24, 8, 4, 8
)
assert phy2log1.shape == (1, 24)
assert logcnt1.shape == (1, 16)
@ -276,7 +278,9 @@ def test_additional_cases():
[12, 25, 50, 100, 150, 200], # Increasing weights
]
)
phy2log2, log2phy2, logcnt2 = rebalance_experts(weight2, 10, 3, 1, 2)
phy2log2, log2phy2, logcnt2 = DefaultEplbPolicy.rebalance_experts(
weight2, 10, 3, 1, 2
)
assert phy2log2.shape == (2, 10)
assert logcnt2.shape == (2, 6)
@ -300,7 +304,7 @@ if __name__ == "__main__":
num_nodes = 2
num_gpus = 8
phy2log, log2phy, logcnt = rebalance_experts(
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
weight, num_replicas, num_groups, num_nodes, num_gpus
)
print(phy2log)

View File

@ -1,11 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from openai_harmony import Role
from openai.types.responses import ResponseFunctionToolCall, ResponseReasoningItem
from openai_harmony import Author, Message, Role, TextContent
from vllm.entrypoints.harmony_utils import (
has_custom_tools,
parse_input_to_harmony_message,
parse_output_message,
)
@ -257,6 +259,191 @@ class TestParseInputToHarmonyMessage:
assert messages[0].content[1].text == "actual text"
class TestParseOutputMessage:
    """Tests for parse_output_message function.

    Covers the routing rules for assistant output channels: the
    ``commentary`` channel maps to reasoning items when the recipient is
    None or a builtin tool (``python``/``browser``/``container``), to
    function-call items when the recipient is ``functions.<name>``, and
    raises for unknown recipients; the ``analysis`` channel always maps to
    reasoning items; non-assistant messages produce no output items.
    """
    def test_commentary_with_no_recipient_creates_reasoning(self):
        """Test that commentary with recipient=None (preambles) creates reasoning items.
        Per Harmony format, commentary channel can contain preambles to calling
        multiple functions - explanatory text with no recipient.
        """
        message = Message.from_role_and_content(
            Role.ASSISTANT, "I will now search for the weather information."
        )
        message = message.with_channel("commentary")
        # recipient is None by default, representing a preamble
        output_items = parse_output_message(message)
        assert len(output_items) == 1
        assert isinstance(output_items[0], ResponseReasoningItem)
        assert output_items[0].type == "reasoning"
        assert (
            output_items[0].content[0].text
            == "I will now search for the weather information."
        )
        assert output_items[0].content[0].type == "reasoning_text"
    def test_commentary_with_function_recipient_creates_function_call(self):
        """Test commentary with recipient='functions.X' creates function calls."""
        message = Message.from_role_and_content(
            Role.ASSISTANT, '{"location": "San Francisco", "units": "celsius"}'
        )
        message = message.with_channel("commentary")
        message = message.with_recipient("functions.get_weather")
        output_items = parse_output_message(message)
        assert len(output_items) == 1
        assert isinstance(output_items[0], ResponseFunctionToolCall)
        assert output_items[0].type == "function_call"
        # Function name is the recipient with the "functions." prefix stripped.
        assert output_items[0].name == "get_weather"
        assert (
            output_items[0].arguments
            == '{"location": "San Francisco", "units": "celsius"}'
        )
        # IDs are generated with conventional prefixes.
        assert output_items[0].call_id.startswith("call_")
        assert output_items[0].id.startswith("fc_")
    def test_commentary_with_python_recipient_creates_reasoning(self):
        """Test that commentary with recipient='python' creates reasoning items."""
        message = Message.from_role_and_content(
            Role.ASSISTANT, "import numpy as np\nprint(np.array([1, 2, 3]))"
        )
        message = message.with_channel("commentary")
        message = message.with_recipient("python")
        output_items = parse_output_message(message)
        assert len(output_items) == 1
        assert isinstance(output_items[0], ResponseReasoningItem)
        assert output_items[0].type == "reasoning"
        assert (
            output_items[0].content[0].text
            == "import numpy as np\nprint(np.array([1, 2, 3]))"
        )
    def test_commentary_with_browser_recipient_creates_reasoning(self):
        """Test that commentary with recipient='browser' creates reasoning items."""
        message = Message.from_role_and_content(
            Role.ASSISTANT, "Navigating to the specified URL"
        )
        message = message.with_channel("commentary")
        message = message.with_recipient("browser")
        output_items = parse_output_message(message)
        assert len(output_items) == 1
        assert isinstance(output_items[0], ResponseReasoningItem)
        assert output_items[0].type == "reasoning"
        assert output_items[0].content[0].text == "Navigating to the specified URL"
    def test_commentary_with_container_recipient_creates_reasoning(self):
        """Test that commentary with recipient='container' creates reasoning items."""
        message = Message.from_role_and_content(
            Role.ASSISTANT, "Running command in container"
        )
        message = message.with_channel("commentary")
        message = message.with_recipient("container")
        output_items = parse_output_message(message)
        assert len(output_items) == 1
        assert isinstance(output_items[0], ResponseReasoningItem)
        assert output_items[0].type == "reasoning"
        assert output_items[0].content[0].text == "Running command in container"
    def test_commentary_with_empty_content_and_no_recipient(self):
        """Test edge case: empty commentary with recipient=None."""
        message = Message.from_role_and_content(Role.ASSISTANT, "")
        message = message.with_channel("commentary")
        output_items = parse_output_message(message)
        assert len(output_items) == 1
        assert isinstance(output_items[0], ResponseReasoningItem)
        assert output_items[0].content[0].text == ""
    def test_commentary_with_multiple_contents_and_no_recipient(self):
        """Test multiple content items in commentary with no recipient."""
        contents = [
            TextContent(text="Step 1: Analyze the request"),
            TextContent(text="Step 2: Prepare to call functions"),
        ]
        message = Message.from_role_and_contents(Role.ASSISTANT, contents)
        message = message.with_channel("commentary")
        output_items = parse_output_message(message)
        # One reasoning item is produced per content entry.
        assert len(output_items) == 2
        assert all(isinstance(item, ResponseReasoningItem) for item in output_items)
        assert output_items[0].content[0].text == "Step 1: Analyze the request"
        assert output_items[1].content[0].text == "Step 2: Prepare to call functions"
    def test_commentary_with_multiple_function_calls(self):
        """Test multiple function calls in commentary channel."""
        contents = [
            TextContent(text='{"location": "San Francisco"}'),
            TextContent(text='{"location": "New York"}'),
        ]
        message = Message.from_role_and_contents(Role.ASSISTANT, contents)
        message = message.with_channel("commentary")
        message = message.with_recipient("functions.get_weather")
        output_items = parse_output_message(message)
        # One function-call item is produced per content entry, all sharing
        # the recipient's function name.
        assert len(output_items) == 2
        assert all(isinstance(item, ResponseFunctionToolCall) for item in output_items)
        assert output_items[0].name == "get_weather"
        assert output_items[1].name == "get_weather"
        assert output_items[0].arguments == '{"location": "San Francisco"}'
        assert output_items[1].arguments == '{"location": "New York"}'
    def test_commentary_with_unknown_recipient_raises_error(self):
        """Test that commentary with unknown recipient raises ValueError."""
        message = Message.from_role_and_content(Role.ASSISTANT, "some content")
        message = message.with_channel("commentary")
        message = message.with_recipient("unknown_recipient")
        try:
            parse_output_message(message)
            raise AssertionError("Expected ValueError to be raised")
        except ValueError as e:
            assert "Unknown recipient: unknown_recipient" in str(e)
    def test_analysis_channel_creates_reasoning(self):
        """Test that analysis channel creates reasoning items."""
        message = Message.from_role_and_content(
            Role.ASSISTANT, "Analyzing the problem step by step..."
        )
        message = message.with_channel("analysis")
        output_items = parse_output_message(message)
        assert len(output_items) == 1
        assert isinstance(output_items[0], ResponseReasoningItem)
        assert output_items[0].type == "reasoning"
        assert (
            output_items[0].content[0].text == "Analyzing the problem step by step..."
        )
    def test_non_assistant_message_returns_empty(self):
        """Test that non-assistant messages return empty list.
        Per the implementation, tool messages to assistant (e.g., search results)
        are not included in final output to align with OpenAI behavior.
        """
        message = Message.from_author_and_content(
            Author.new(Role.TOOL, "functions.get_weather"),
            "The weather is sunny, 72°F",
        )
        output_items = parse_output_message(message)
        assert len(output_items) == 0
def test_has_custom_tools() -> None:
assert not has_custom_tools(set())
assert not has_custom_tools({"web_search_preview", "code_interpreter", "container"})

View File

@ -113,12 +113,10 @@ def test_mrope(
is_neox_style = True
max_position = config.max_position_embeddings
partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
rotary_dim = int(head_dim * partial_rotary_factor)
mrope_helper_class = get_rope(
head_size=head_dim,
rotary_dim=rotary_dim,
rotary_dim=head_dim,
max_position=max_position,
is_neox_style=is_neox_style,
rope_parameters=config.rope_parameters,
@ -184,12 +182,10 @@ def test_mrope_torch_compile_tracing(
)
is_neox_style = True
max_position = config.max_position_embeddings
partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
rotary_dim = int(head_dim * partial_rotary_factor)
mrope_helper_class = get_rope(
head_size=head_dim,
rotary_dim=rotary_dim,
rotary_dim=head_dim,
max_position=max_position,
is_neox_style=is_neox_style,
rope_parameters=config.rope_parameters,

View File

@ -1,160 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import random
import torch
from tqdm import tqdm
from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import SimpleBuffer
from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe
# TODO: the test depends on a lot of fields in the current implementation.
# We should have standard interface instead direct field access
def test_run(my_rank, buffer, device):
    """Basic two-rank smoke test of the lookup buffer.

    Rank 0 inserts one (tokens, roi, key, value, hidden) entry; rank 1 pops
    it with drop_select and checks the tensors round-trip unchanged. Ranks
    synchronize via torch.distributed barriers, so this must run under an
    initialized two-rank process group.
    """
    # buffer should be empty in the beginning
    if my_rank == 0:
        assert buffer.buffer_size == 0
        assert len(buffer.buffer) == 0
    print(f"My rank: {my_rank}, device: {device}")
    # insert
    tokens = torch.tensor([1, 2, 3]).to(device)
    # roi marks all tokens as relevant (every token id > 0 here).
    roi = tokens > 0
    if my_rank == 0:
        key = 2.0 * torch.ones([5, 6]).to(device)
        value = 3.0 * torch.ones([5, 6]).to(device)
        # hidden-state slot; content is irrelevant for this test.
        placeholder = torch.tensor([1]).to(device)
        buffer.insert(tokens, roi, key, value, placeholder)
    torch.distributed.barrier()
    # drop_select
    if my_rank == 1:
        tok, roi_, key, value, hidden = buffer.drop_select(tokens, roi)
        assert torch.allclose(tokens, tok)
        assert torch.allclose(roi, roi_)
        assert torch.allclose(key, 2.0 * torch.ones([5, 6], device=device))
        assert torch.allclose(value, 3.0 * torch.ones([5, 6], device=device))
    torch.distributed.barrier()
    # drop_select must have removed the entry on the sender side.
    if my_rank == 0:
        assert buffer.buffer_size == 0
        assert len(buffer.buffer) == 0
    print(f"My rank: {my_rank}, Test run passed!")
def stress_test(my_rank, buf, device):
    """Stress the lookup buffer with 200 randomly-ordered requests.

    Both ranks build the same 200 entries (same torch seed), then shuffle
    them with rank-dependent seeds so the receiver frequently requests
    entries not yet inserted. drop_select returning all-None counts as a
    miss (n); at the end rank 1 sends its miss count to rank 0, which checks
    it equals the number of entries left in the buffer.
    """
    torch.distributed.barrier()
    # Same seed on both ranks -> identical request tensors.
    torch.manual_seed(100)
    reqs = [
        (
            torch.rand(100).to(device),  # tokens
            torch.ones(100).bool().to(device),  # roi
            torch.rand(100).to(device),  # key
            torch.rand(100).to(device),  # value
            torch.rand(100).to(device),  # hidden
        )
        for i in tqdm(range(200))
    ]
    # Rank-dependent shuffle order creates sender/receiver mismatch.
    random.seed(my_rank)
    random.shuffle(reqs)
    torch.distributed.barrier()
    n = 0
    # the buffer size can only store 100 reqs
    # so the sender will occasionally block to wait for the receiver.
    for req in tqdm(reqs):
        if my_rank == 0:
            buf.insert(*req)
        else:
            tok, roi, k, v, h = req
            tok_, roi_, k_, v_, h_ = buf.drop_select(tok, roi)
            if tok_ is None:
                # Miss: the entry was not in the buffer; all fields are None.
                assert roi_ is None
                assert k_ is None
                assert v_ is None
                assert h_ is None
                n += 1
            else:
                assert torch.allclose(tok, tok_)
                assert torch.allclose(roi, roi_)
                assert torch.allclose(k, k_)
                assert torch.allclose(v, v_)
                assert torch.allclose(h, h_)
    print(f"Rank {my_rank} done")
    torch.distributed.barrier()
    if my_rank == 0:
        x = torch.tensor([0])
        torch.distributed.recv(x, 1)
        # the # of None received is the kv that are not selected
        assert x.item() == len(buf.buffer)
        # each remaining entry contributes 1700 to buffer_size
        # NOTE(review): per-entry accounting presumed from SimpleBuffer — confirm
        print(buf.buffer_size)
        assert buf.buffer_size == 1700 * len(buf.buffer)
    else:
        torch.distributed.send(torch.tensor([n]), 0)
    print(f"My rank: {my_rank}, Passed stress test!")
if __name__ == "__main__":
    # Two-process entry point: launch this script twice with RANK=0 and
    # RANK=1 (see the companion shell runner). Uses a gloo process group on
    # a fixed local TCP port for rank coordination.
    my_rank = int(os.environ["RANK"])
    torch.distributed.init_process_group(
        backend="gloo",
        init_method="tcp://localhost:12398",
        world_size=2,
        rank=my_rank,
    )
    print(f"initialized! My rank is {my_rank}")
    config = KVTransferConfig(
        kv_connector="P2pNcclConnector",
        kv_buffer_device="cuda",
        kv_buffer_size=1e9,
        kv_rank=my_rank,
        kv_role="kv_both",  # this arg doesn't matter in this test
        kv_parallel_size=2,
        kv_ip="127.0.0.1",
        kv_port=12345,
    )
    # Separate pipes: GPU pipe for tensor payloads, CPU pipe for signaling;
    # port_offset keeps their listen ports distinct.
    data_pipe = PyNcclPipe(
        local_rank=my_rank,
        config=config,
        device="cuda",
        port_offset=0,
    )
    cpu_pipe = PyNcclPipe(
        local_rank=my_rank,
        config=config,
        device="cpu",
        port_offset=1,
    )
    # 170000 is the buffer capacity passed to SimpleBuffer (units defined by
    # SimpleBuffer's buffer_size accounting).
    buffer = SimpleBuffer(cpu_pipe, data_pipe, 170000)
    test_run(my_rank, buffer, data_pipe.device)
    stress_test(my_rank, buffer, data_pipe.device)
    buffer.close()
    data_pipe.close()
    cpu_pipe.close()
    print("Done")

View File

@ -1,8 +0,0 @@
#!/bin/bash
# Launch the two-rank lookup-buffer test: one background process per RANK.
RANK=0 python3 test_lookup_buffer.py &
PID0=$!
RANK=1 python3 test_lookup_buffer.py &
PID1=$!
# Block until both ranks finish; `wait PID` also propagates each child's
# exit status as this script's status if it fails.
wait $PID0
wait $PID1

View File

@ -1,62 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import subprocess
import sys

import pytest
import torch
def run_python_script(script_name, timeout):
    """Run ``script_name`` twice in parallel (RANK=0 and RANK=1) and fail the
    surrounding pytest test if either process exits non-zero or times out.

    Args:
        script_name: File name of the script under the ``kv_transfer/``
            directory.
        timeout: Per-process wait timeout in seconds.
    """
    script_name = f"kv_transfer/{script_name}"
    procs = []
    try:
        # Start both ranks asynchronously. Inherit the parent environment so
        # PATH/PYTHONPATH/CUDA settings survive into the child interpreter,
        # overriding only RANK (a bare env={...} would wipe everything else).
        for rank in ("0", "1"):
            procs.append(
                subprocess.Popen(
                    [sys.executable, script_name],
                    env={**os.environ, "RANK": rank},
                    stdout=sys.stdout,  # Pipe stdout to current stdout
                    stderr=sys.stderr,  # Pipe stderr to current stderr
                )
            )

        # Wait for both processes to complete, with a timeout each.
        for proc in procs:
            proc.wait(timeout=timeout)

        # Check the return status of both processes.
        for rank, proc in enumerate(procs):
            if proc.returncode != 0:
                pytest.fail(
                    f"Test {script_name} failed for RANK={rank}, {proc.returncode}"
                )

    except subprocess.TimeoutExpired:
        # If either process times out, terminate both and fail the test
        for proc in procs:
            proc.terminate()
        pytest.fail(f"Test {script_name} timed out")

    except Exception as e:
        pytest.fail(f"Test {script_name} failed with error: {str(e)}")
# Parametrized entry point: each tuple is (script under kv_transfer/, timeout).
@pytest.mark.parametrize(
    "script_name,timeout",
    [
        ("test_lookup_buffer.py", 60),  # 60-second timeout
        ("test_send_recv.py", 120),  # 120-second timeout
    ],
)
def test_run_python_script(script_name, timeout):
    """Skip unless two GPUs are present, then run the two-rank script."""
    gpu_count = torch.cuda.device_count()
    if gpu_count < 2:
        pytest.skip(f"Skipping test {script_name} because <2 GPUs are available")
    run_python_script(script_name, timeout)

View File

@ -1,154 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import time
import torch
from tqdm import tqdm
from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe
def test_run(my_rank, pipe):
print(f"rank {my_rank} test_run starts....")
# test run
x = torch.tensor([1]).to(pipe.device)
y = torch.tensor([[2.0, 3.0, 4.0, 8.0]]).to(pipe.device)
if my_rank == 0:
pipe.send_tensor(x)
print(f"rank {my_rank} sent tensor x")
pipe.send_tensor(y)
print(f"rank {my_rank} sent tensor y")
x2 = pipe.recv_tensor()
print(f"rank {my_rank} received x2 = ", x2)
y2 = pipe.recv_tensor()
print(f"rank {my_rank} received y2 = ", y2)
else:
x2 = pipe.recv_tensor()
print(f"rank {my_rank} received x2 = ", x2)
y2 = pipe.recv_tensor()
print(f"rank {my_rank} received y2 = ", y2)
pipe.send_tensor(x)
print(f"rank {my_rank} sent tensor x")
pipe.send_tensor(y)
print(f"rank {my_rank} sent tensor y")
assert torch.allclose(x, x2)
assert torch.allclose(y, y2)
print(f"rank {my_rank} test_run passed!")
def stress_test(my_rank, pipe):
    """Stress the pipe with 500 triples of (tensor, mean, std), where ~5% of
    the triples are replaced by three Nones to exercise None transmission.

    Both ranks build the identical `tensors` list (same manual seed and same
    sequence of RNG calls), so the receiving side can validate each payload
    against its own local copy.
    """
    print(f"rank {my_rank} stress_test starts....")
    tensors: list[torch.Tensor] = []
    torch.distributed.barrier()
    torch.manual_seed(0)
    for i in tqdm(range(500)):
        mean = torch.rand(1).item() * 100
        std = torch.rand(1).item() * 100
        size = torch.randint(900, 1000, (2,))
        x = torch.normal(mean * 1.0, std * 1.0, size=size.tolist()).to(pipe.device)

        # 5% probability of sending a None
        if torch.rand(1).item() < 0.05:
            tensors.append(None)
            tensors.append(None)
            tensors.append(None)
        else:
            tensors.append(x)
            tensors.append(x.mean().unsqueeze(0))
            tensors.append(x.std().unsqueeze(0))

    torch.distributed.barrier()

    for i in tqdm(range(500)):
        # Alternate roles in a 10-iteration pattern: rank 1 sends when
        # (i % 10) > 3, otherwise rank 0 sends. The other rank receives
        # and validates against its locally reconstructed triple.
        if my_rank == int((i % 10) > 3):
            pipe.send_tensor(tensors[3 * i])
            pipe.send_tensor(tensors[3 * i + 1])
            pipe.send_tensor(tensors[3 * i + 2])
        else:
            x = pipe.recv_tensor()
            mean = pipe.recv_tensor()
            std = pipe.recv_tensor()
            if x is None:
                # A None payload must arrive as a complete None triple.
                assert mean is None
                assert std is None
            else:
                assert torch.allclose(x, tensors[3 * i])
                assert x.mean() == mean[0]
                assert x.std() == std[0]

    torch.distributed.barrier()
def latency_test(my_rank, pipe, nelement, ntensor):
    """Measure one-way pipe latency over 500 iterations.

    Rank 0 sends ``ntensor`` payload tensors of ``nelement`` elements plus a
    send-side timestamp; rank 1 receives them and records the wall-clock
    delay. The mean latency is reported only by the rank that actually
    collected measurements.

    Args:
        my_rank: This process's rank (0 = sender, 1 = receiver).
        pipe: Pipe object exposing send_tensor/recv_tensor and a `.device`.
        nelement: Number of elements per payload tensor.
        ntensor: Number of payload tensors sent per iteration.
    """
    latencies = []
    torch.distributed.barrier()
    for i in tqdm(range(500)):
        tensors = []

        if my_rank == 0:
            # create tensor
            tensors = [torch.rand(nelement).to(pipe.device) for _ in range(ntensor)]

        torch.distributed.barrier()

        if my_rank == 0:
            t = torch.tensor([time.time()], dtype=torch.float64).to(pipe.device)
            for tensor in tensors:
                pipe.send_tensor(tensor)
            pipe.send_tensor(t)
        else:
            for _ in range(ntensor):
                pipe.recv_tensor()
            t = pipe.recv_tensor()
            latencies.append(time.time() - t.item())

    torch.distributed.barrier()

    print("Latency test passed.")
    # Only the receiving rank accumulates measurements; without this guard
    # rank 0 would print the mean of an empty tensor (NaN).
    if latencies:
        print("Latency:", torch.tensor(latencies).mean().item() * 1000, "ms")
if __name__ == "__main__":
    # Entry point: this script is launched twice (RANK=0 and RANK=1) by the
    # companion run.sh; both processes rendezvous over gloo on localhost
    # before exercising the NCCL pipe against each other.
    my_rank = int(os.environ["RANK"])

    torch.distributed.init_process_group(
        backend="gloo",
        init_method="tcp://localhost:12398",
        world_size=2,
        rank=my_rank,
    )

    config = KVTransferConfig(
        kv_connector="P2pNcclConnector",
        kv_buffer_device="cuda",
        kv_buffer_size=1e9,
        kv_rank=my_rank,
        kv_role="kv_both",  # this arg doesn't matter in this test
        kv_parallel_size=2,
        kv_ip="127.0.0.1",
        kv_port=12345,
    )

    pipe = PyNcclPipe(
        local_rank=my_rank,
        config=config,
    )

    test_run(my_rank, pipe)

    stress_test(my_rank, pipe)

    # Use this function if you want to test the latency of pipe impl.
    # latency_test(my_rank, pipe, 1024 * 8 * 128, 80)

View File

@ -1,9 +0,0 @@
#!/bin/bash
# Launch the two-rank send/recv test: one background process per RANK.
RANK=0 python3 test_send_recv.py &
PID0=$!
RANK=1 python3 test_send_recv.py &
PID1=$!
# Block until both ranks finish; `wait PID` also propagates each child's
# exit status as this script's status if it fails.
wait $PID0
wait $PID1

View File

@ -20,7 +20,7 @@ from vllm.config.multimodal import (
)
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
from vllm.multimodal.inputs import MultiModalInputs
from vllm.multimodal.inputs import MultiModalInputs, batched_tensors_equal
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.tokenizers import (
MistralTokenizer,
@ -418,4 +418,4 @@ def _assert_inputs_equal(
a_data.pop(key, None)
b_data.pop(key, None)
assert a_data == b_data, msg
assert batched_tensors_equal(a_data, b_data), msg

View File

@ -5,6 +5,7 @@ import pytest
from vllm.assets.video import VideoAsset
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import batched_tensors_equal
from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
from ...utils import build_model_context
@ -103,7 +104,7 @@ def test_video_loader_consistency(
dynamic_outputs = processor.apply(prompt, dynamic_mm_data, hf_processor_mm_kwargs)
assert static_outputs["prompt_token_ids"] == dynamic_outputs["prompt_token_ids"]
assert (
static_outputs["mm_kwargs"].get_data()
== dynamic_outputs["mm_kwargs"].get_data()
assert batched_tensors_equal(
static_outputs["mm_kwargs"].get_data(),
dynamic_outputs["mm_kwargs"].get_data(),
)

View File

@ -130,10 +130,9 @@ def create_batched_mm_kwargs(
hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
tokenization_kwargs=processor_inputs.tokenization_kwargs,
)["mm_kwargs"].require_data()
items = [item for modality in supported_mm_limits for item in mm_kwargs[modality]]
return group_mm_kwargs_by_modality(
items,
merge_by_field_config=model_cls.merge_by_field_config,
[item for modality in supported_mm_limits for item in mm_kwargs[modality]]
)

View File

@ -416,7 +416,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True,
),
"Qwen2ForCausalLM": _HfExamplesInfo(
"Qwen/Qwen2-0.5B-Instruct", extras={"2.5": "Qwen/Qwen2.5-0.5B-Instruct"}
"Qwen/Qwen2-0.5B-Instruct",
extras={
"2.5": "Qwen/Qwen2.5-0.5B-Instruct",
"2.5-1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
},
),
"Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
"Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),

View File

@ -85,12 +85,6 @@ def _dummy_items(
(_dummy_item("a", {"a1": 100}), 100),
(_dummy_item("a", {"a1": 100, "a2": 110}), 210),
(_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501
(
_dummy_items(
{"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}
).get_data(),
460,
), # noqa: E501
],
)
def test_cache_item_size(item, expected_size):
@ -107,6 +101,9 @@ def test_cache_item_size(item, expected_size):
cache[""] = MultiModalProcessorCacheItemMetadata(item, [prompt_update])
assert cache.currsize == expected_size
cache[""] = item.get_data()
assert cache.currsize == expected_size
def _create_vllm_config(
*,

View File

@ -1,91 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
pytestmark = pytest.mark.cpu_test
def assert_nested_tensors_equal(expected: "NestedTensors", actual: "NestedTensors"):
    """Recursively assert that two NestedTensors are structurally identical
    and elementwise equal.

    Raises AssertionError on any type, length, or value mismatch.
    (Annotations are stringified so the helper can be used without importing
    the NestedTensors alias at call time.)
    """
    assert type(expected) == type(actual)  # noqa: E721
    if isinstance(expected, torch.Tensor):
        assert torch.equal(expected, actual)
    else:
        # zip() alone would silently ignore trailing extra items, letting a
        # longer/shorter structure pass as "equal" — compare lengths first.
        assert len(expected) == len(actual)
        for expected_item, actual_item in zip(expected, actual):
            assert_nested_tensors_equal(expected_item, actual_item)
def assert_multimodal_inputs_equal(
    expected: MultiModalKwargs, actual: MultiModalKwargs
):
    """Assert that two multimodal kwargs hold the same keys and that every
    per-key value is an equal NestedTensors structure."""
    expected_keys = set(expected.keys())
    actual_keys = set(actual.keys())
    assert expected_keys == actual_keys

    for key in expected:
        assert_nested_tensors_equal(expected[key], actual[key])
def test_multimodal_input_batch_single_tensor():
    """A single item batches into a tensor with a new leading batch dim."""
    tensor = torch.rand([1, 2])
    batched = MultiModalKwargs.batch([{"image": tensor}])
    assert_multimodal_inputs_equal(batched, {"image": tensor.unsqueeze(0)})
def test_multimodal_input_batch_multiple_tensors():
    """Same-shape items from several inputs stack into one batched tensor."""
    tensors = [torch.rand([1, 1, 2]) for _ in range(3)]
    batched = MultiModalKwargs.batch([{"image": t} for t in tensors])
    assert_multimodal_inputs_equal(batched, {"image": torch.stack(tensors)})
def test_multimodal_input_batch_multiple_heterogeneous_tensors():
    """Mismatched shapes cannot be stacked, so batching yields a plain list."""
    tensors = [torch.rand([1, n, 2]) for n in (2, 3, 4)]
    batched = MultiModalKwargs.batch([{"image": t} for t in tensors])
    assert_multimodal_inputs_equal(batched, {"image": tensors})
def test_multimodal_input_batch_nested_tensors():
    """Single-element lists batch like bare tensors, gaining a leading dim."""
    items = [torch.rand([2, 3]) for _ in range(3)]
    batched = MultiModalKwargs.batch([{"image": [t]} for t in items])
    expected = torch.stack([t.unsqueeze(0) for t in items])
    assert_multimodal_inputs_equal(batched, {"image": expected})
def test_multimodal_input_batch_heterogeneous_lists():
    """Lists of different lengths batch into a list of per-item stacks."""
    x1 = torch.rand([1, 2, 3])
    x2 = torch.rand([1, 2, 3])
    x3 = torch.rand([1, 2, 3])
    batched = MultiModalKwargs.batch([{"image": [x1, x2]}, {"image": [x3]}])
    assert_multimodal_inputs_equal(
        batched, {"image": [torch.stack([x1, x2]), x3.unsqueeze(0)]}
    )
def test_multimodal_input_batch_multiple_batchable_lists():
    """Equal-length, same-shape lists stack into a single higher-rank tensor."""
    x1, x2, x3, x4 = (torch.rand([1, 2, 3]) for _ in range(4))
    batched = MultiModalKwargs.batch([{"image": [x1, x2]}, {"image": [x3, x4]}])
    expected = torch.stack([torch.stack([x1, x2]), torch.stack([x3, x4])])
    assert_multimodal_inputs_equal(batched, {"image": expected})
def test_multimodal_input_batch_mixed_stacking_depths():
    """Heterogeneous shapes force partial stacking at different nesting depths."""
    x1 = torch.rand([1, 2, 3])
    x2 = torch.rand([1, 3, 3])
    x3 = torch.rand([1, 4, 3])

    # Unstackable pair first: it stays a raw list, the singleton is batched.
    batched = MultiModalKwargs.batch([{"image": [x1, x2]}, {"image": [x3]}])
    assert_multimodal_inputs_equal(batched, {"image": [[x1, x2], x3.unsqueeze(0)]})

    # Singleton first: it is batched, the unstackable pair stays a raw list.
    batched = MultiModalKwargs.batch([{"image": [x1]}, {"image": [x2, x3]}])
    assert_multimodal_inputs_equal(batched, {"image": [x1.unsqueeze(0), [x2, x3]]})

View File

@ -0,0 +1,188 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
from vllm.reasoning.holo2_reasoning_parser import Holo2ReasoningParser
from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser
# Hugging Face model whose tokenizer is used by every test in this file.
REASONING_MODEL_NAME = "HCompany/Holo2-4B"


@pytest.fixture(scope="module")
def tokenizer():
    # Module-scoped so the (slow) tokenizer download/load happens only once.
    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
@pytest.mark.parametrize(
    "thinking,expected_parser_type",
    [
        (True, DeepSeekR1ReasoningParser),
        (False, IdentityReasoningParser),
    ],
)
def test_parser_selection(tokenizer, thinking, expected_parser_type):
    """The `thinking` chat-template kwarg selects the delegate parser class."""
    kwargs = {"thinking": thinking}
    parser = Holo2ReasoningParser(tokenizer, chat_template_kwargs=kwargs)
    assert isinstance(parser._parser, expected_parser_type)
def test_holo2_default_parser_is_deepseekr1(tokenizer):
    """Without chat_template_kwargs, Holo2 delegates to the DeepSeek-R1 parser."""
    default_parser = Holo2ReasoningParser(tokenizer)
    assert isinstance(default_parser._parser, DeepSeekR1ReasoningParser)
def test_holo2_supports_structured_output(tokenizer):
    """Check `is_reasoning_end`, which the structured output manager calls to
    decide whether the reasoning section is over before applying the grammar.

    Deliberately constructed without chat_template_kwargs, matching how the
    StructuredOutputManager instantiates the parser.
    """
    parser = Holo2ReasoningParser(tokenizer)
    end_token_id = tokenizer.encode("</think>", add_special_tokens=False)[0]

    # The end marker may appear last, mid-sequence, or not at all.
    assert parser.is_reasoning_end([1, 2, 4, end_token_id])
    assert parser.is_reasoning_end([1, 2, 4, end_token_id, 5])
    assert not parser.is_reasoning_end([1, 2, 4])
# Each case dict pairs the model's raw `output` string with the
# `reasoning`/`content` split the parser is expected to produce
# (None where that part is absent).

# thinking is True, non-streaming
WITH_THINK = {
    "output": "This is a reasoning section</think>This is the rest",
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
}
# thinking is True, streaming
WITH_THINK_STREAM = {
    "output": "This is a reasoning section</think>This is the rest",
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
}
# thinking is False, non-streaming
THINKING_DISABLED = {
    "output": "This is the rest",
    "reasoning": None,
    "content": "This is the rest",
}
# thinking is False, streaming
THINKING_DISABLED_STREAM = {
    "output": "This is the rest",
    "reasoning": None,
    "content": "This is the rest",
}
# thinking is False but the model output </think>, non-streaming
THINKING_DISABLED_WITH_CLOSE_TAG = {
    "output": "</think>This is the rest",
    "reasoning": None,
    "content": "</think>This is the rest",
}
# thinking is False but the model output </think>, streaming
THINKING_DISABLED_WITH_CLOSE_TAG_STREAM = {
    "output": "some text</think>This is the rest",
    "reasoning": None,
    "content": "some text</think>This is the rest",
}
COMPLETE_REASONING = {
    "output": "This is a reasoning section</think>",
    "reasoning": "This is a reasoning section",
    "content": None,
}

# (streaming, case dict, chat_template_kwargs) triples fed to test_reasoning.
TEST_CASES = [
    pytest.param(
        False,
        WITH_THINK,
        None,
        id="with_think",
    ),
    pytest.param(
        True,
        WITH_THINK_STREAM,
        None,
        id="with_think_stream",
    ),
    pytest.param(
        False,
        WITH_THINK,
        {"thinking": True},
        id="with_think_enabled",
    ),
    pytest.param(
        True,
        WITH_THINK_STREAM,
        {"thinking": True},
        id="with_think_stream_enabled",
    ),
    pytest.param(
        False,
        THINKING_DISABLED,
        {"thinking": False},
        id="thinking_disabled",
    ),
    pytest.param(
        True,
        THINKING_DISABLED_STREAM,
        {"thinking": False},
        id="thinking_disabled_stream",
    ),
    pytest.param(
        False,
        THINKING_DISABLED_WITH_CLOSE_TAG,
        {"thinking": False},
        id="thinking_disabled_with_close_tag",
    ),
    pytest.param(
        True,
        THINKING_DISABLED_WITH_CLOSE_TAG_STREAM,
        {"thinking": False},
        id="thinking_disabled_with_close_tag_stream",
    ),
    pytest.param(
        False,
        COMPLETE_REASONING,
        None,
        id="complete_reasoning",
    ),
    pytest.param(
        True,
        COMPLETE_REASONING,
        None,
        id="complete_reasoning_stream",
    ),
]
@pytest.mark.parametrize("streaming, param_dict, chat_template_kwargs", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    chat_template_kwargs: dict | None,
    tokenizer,
):
    """End-to-end: tokenize the case's raw output, run extraction, and compare
    the reasoning/content split against the case's expectations."""
    raw_tokens = tokenizer.tokenize(param_dict["output"])
    output_tokens: list[str] = [
        tokenizer.convert_tokens_to_string([tok]) for tok in raw_tokens
    ]

    parser_cls = ReasoningParserManager.get_reasoning_parser("holo2")
    parser: ReasoningParser = parser_cls(
        tokenizer,
        chat_template_kwargs=chat_template_kwargs,
    )

    reasoning, content = run_reasoning_extraction(
        parser, output_tokens, streaming=streaming
    )

    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]

View File

@ -460,23 +460,20 @@ def test_flat_product():
]
def test_o_legacy_syntax_deprecation(caplog_vllm):
"""Test that -O.* dotted syntax emits warnings and converts correctly to -cc syntax."""
def test_o_dotted_syntax_error():
"""Test that -O.* dotted syntax raises a clear error message."""
parser = FlexibleArgumentParser()
parser.add_argument("-cc", "--compilation-config", type=json.loads)
# Test that -O.backend gets converted correctly AND emits warning
args = parser.parse_args(["-O.backend=eager"])
assert args.compilation_config == {"backend": "eager"}
# Test that -O.* syntax raises a clear ValueError
with pytest.raises(ValueError, match=r"The -O\.\* syntax is no longer supported"):
parser.parse_args(["-O.backend=eager"])
# Check that deprecation warning was logged
assert len(caplog_vllm.records) >= 1
assert (
"The -O.* dotted syntax for --compilation-config is deprecated"
in caplog_vllm.text
)
with pytest.raises(ValueError, match=r"Please use -cc\.\* instead"):
parser.parse_args(["-O.mode=2"])
# Test that -O.mode gets converted correctly
# Note: warning_once won't emit again in same session
args = parser.parse_args(["-O.mode=2"])
assert args.compilation_config == {"mode": 2}
with pytest.raises(
ValueError,
match=r"replace '-O\.cudagraph_mode=NONE' with '-cc\.cudagraph_mode=NONE'",
):
parser.parse_args(["-O.cudagraph_mode=NONE"])

View File

@ -13,7 +13,7 @@ from vllm.v1.attention.backends.utils import (
split_attn_metadata,
split_decodes_and_prefills,
)
from vllm.v1.worker.ubatch_utils import create_ubatch_slices
from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices
@pytest.fixture
@ -294,8 +294,14 @@ def test_prefill_split_across_ubatches(
qsl_np = common.query_start_loc_cpu.numpy()
num_tokens = common.num_actual_tokens
ubatch_slices = create_ubatch_slices(num_scheduled_tokens, split_point)
assert len(ubatch_slices) == 2
ubatch_slices, _ = maybe_create_ubatch_slices(
True,
num_scheduled_tokens,
num_tokens,
batch_spec.batch_size,
split_point=split_point,
)
assert ubatch_slices is not None and len(ubatch_slices) == 2
first_meta = _make_metadata_with_slice(ubatch_slices[0], common)
second_meta = _make_metadata_with_slice(ubatch_slices[1], common)

View File

@ -21,6 +21,7 @@ def test_reset_prefix_cache_e2e(monkeypatch):
max_num_batched_tokens=32,
max_model_len=2048,
compilation_config={"mode": 0},
dtype="float16",
)
engine = LLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(

View File

@ -9,10 +9,22 @@ correctly with the DeepSeek-V2-Lite model using GSM8K evaluation.
"""
import pytest
import torch
from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k
from tests.utils import RemoteOpenAIServer
# Detect Blackwell / B200 (compute capability 10.x)
try:
if torch.cuda.is_available():
cap = torch.cuda.get_device_capability(0)
IS_BLACKWELL = cap[0] >= 10
else:
IS_BLACKWELL = False
except Exception:
# Be conservative: if we can't detect, don't xfail by default
IS_BLACKWELL = False
MODEL_NAME = "deepseek-ai/DeepSeek-V2-Lite-Chat"
DP_SIZE = 2
@ -33,6 +45,13 @@ DEEPEP_BACKENDS = [
@pytest.mark.parametrize("all2all_backend", DEEPEP_BACKENDS)
@pytest.mark.xfail(
IS_BLACKWELL,
reason=(
"Temporary: DBO accuracy unstable on Blackwell "
"(doesn't meet expectation of MIN_ACCURACY = 0.62)"
),
)
def test_dbo_dp_ep_gsm8k(all2all_backend: str, num_gpus_available):
"""
Test DBO with DP+EP using GSM8K evaluation.

View File

@ -1,9 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from concurrent.futures import Future
import pytest
from transformers import AutoTokenizer
from vllm.config import StructuredOutputsConfig, VllmConfig
from vllm.config.model import ModelConfig
from vllm.config.parallel import ParallelConfig
from vllm.config.speculative import SpeculativeConfig
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
from vllm.v1.request import Request
@ -116,3 +121,72 @@ def test_grammar_bitmask_with_specdec():
) # EOS not the final token
grammar_bitmask(request, prompt[i:]) # EOS not present
grammar_bitmask(request, prompt[i:] + [tokenizer.eos_token_id])
@pytest.mark.parametrize("async_grammar", [True, False])
def test_grammar_init_async_and_sync(async_grammar):
"""Test grammar initialization works correctly in both async and sync modes.
This test validates that the distributed_executor_backend config option
correctly controls whether grammar compilation happens asynchronously
(via executor.submit) or synchronously. When set to "external_launcher",
grammar compilation is synchronous to avoid deadlocks.
"""
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
prompt = tokenizer.encode('{"a": "b"}')
# Use "external_launcher" for sync mode, None for async mode
executor_backend = None if async_grammar else "external_launcher"
vllm_config = VllmConfig(
model_config=ModelConfig(tokenizer=TOKENIZER),
structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
parallel_config=ParallelConfig(distributed_executor_backend=executor_backend),
)
structured_output_manager = StructuredOutputManager(vllm_config)
sampling_params = SamplingParams(
structured_outputs=StructuredOutputsParams(
json='{"type": "object"}',
),
)
sampling_params.structured_outputs._backend = "guidance"
request = Request(
"test_request",
prompt_token_ids=prompt,
sampling_params=sampling_params,
pooling_params=None,
eos_token_id=tokenizer.eos_token_id,
)
structured_output_manager.grammar_init(request)
# Check the internal _grammar type immediately after init
# Before _check_grammar_completion is called, async mode should have a Future
raw_grammar = request.structured_output_request._grammar
if async_grammar:
assert isinstance(raw_grammar, Future), (
"Async mode should store a Future before completion"
)
else:
assert not isinstance(raw_grammar, Future), (
"Sync mode should store the grammar directly, not a Future"
)
# Wait for grammar to be ready (handles both async and sync cases)
start_time = time.time()
while not request.structured_output_request._check_grammar_completion():
if time.time() - start_time > 5: # 5-second timeout
pytest.fail("Grammar compilation timed out")
time.sleep(0.01)
# After completion, _grammar should no longer be a Future
assert not isinstance(request.structured_output_request._grammar, Future)
# Verify grammar is properly initialized and functional
grammar = request.structured_output_request.grammar
assert grammar is not None
assert not grammar.is_terminated()
# Verify the grammar can accept valid tokens
assert grammar.accept_tokens(request.request_id, prompt)

View File

@ -1842,6 +1842,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
random_seed=args.seed,
dataset_path=args.dataset_path,
disable_shuffle=args.disable_shuffle,
prefix_len=args.common_prefix_len,
).sample(
tokenizer=tokenizer,
num_requests=args.num_prompts,

View File

@ -1221,6 +1221,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
help="Repetition penalty sampling parameter. Only has effect on "
"openai-compatible backends.",
)
sampling_group.add_argument(
"--common-prefix-len",
type=int,
default=None,
help="Common prefix length shared by all prompts (used by random dataset)",
)
parser.add_argument(
"--tokenizer-mode",

View File

@ -402,6 +402,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
self.extra_traceback = False
def run(self, *args):
# maybe instead just assert inputs are fake?
fake_args = [
self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
for t in args
@ -416,11 +417,13 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
kwargs: dict[str, Any],
) -> Any:
assert isinstance(target, str)
output = super().call_module(target, args, kwargs)
if target in self.compile_submod_names:
index = self.compile_submod_names.index(target)
submod = self.fetch_attr(target)
sym_shape_indices = [
i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
]
@ -746,11 +749,21 @@ class VllmBackend:
if not item.is_splitting_graph
]
# Extract fake values from the graph to use them when needed.
all_fake_values = []
for i in graph.graph.find_nodes(op="placeholder"):
all_fake_values.append(i.meta["example_value"])
fake_args = [
all_fake_values[i] if isinstance(t, torch.Tensor) else t
for i, t in enumerate(example_inputs)
]
# propagate the split graph to the piecewise backend,
# compile submodules with symbolic shapes
PiecewiseCompileInterpreter(
self.split_gm, submod_names_to_compile, self.vllm_config, self
).run(*example_inputs)
).run(*fake_args)
graph_path = os.path.join(local_cache_dir, "computation_graph.py")
if not os.path.exists(graph_path):
@ -780,14 +793,7 @@ class VllmBackend:
)
# if we need to copy input buffers for cudagraph
from torch._guards import detect_fake_mode
fake_mode = detect_fake_mode()
fake_args = [
fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
for t in example_inputs
]
#
# index of tensors that have symbolic shapes (batch size)
# for weights and static buffers, they will have concrete shapes.
# symbolic shape only happens for input tensors.

View File

@ -433,7 +433,6 @@ def _support_torch_compile(
return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)
# This is the path for the first compilation.
# the first compilation needs to have dynamic shapes marked
_mark_dynamic_inputs(
self,

View File

@ -5,6 +5,7 @@ from collections.abc import Iterable
import torch.fx
from torch import SymInt
from torch.fx.experimental.symbolic_shapes import statically_known_true
from vllm.logger import init_logger
@ -116,12 +117,7 @@ class NoOpEliminationPass(VllmInductorPass):
2. The dimensions both correspond to the same SymInt
"""
# Case 1
if isinstance(i_dim, int) and isinstance(dim, int):
return dim == i_dim
# Case 2
if isinstance(i_dim, SymInt) and isinstance(dim, SymInt):
return dim == i_dim
return False
return statically_known_true(dim == i_dim)
def all_dims_equivalent(
self, dims: Iterable[int | SymInt], i_dims: Iterable[int | SymInt]

View File

@ -14,6 +14,7 @@ import torch._C._dynamo.guards
import vllm.envs as envs
from vllm.config import CompilationMode, CUDAGraphMode, get_current_vllm_config
from vllm.logger import init_logger
from vllm.utils.nvtx_pytorch_hooks import layerwise_nvtx_marker_context
logger = init_logger(__name__)
@ -92,12 +93,29 @@ class TorchCompileWithNoGuardsWrapper:
return self.forward(*args, **kwargs)
def _call_with_optional_nvtx_range(self, callable_fn, *args, **kwargs):
if self.layerwise_nvtx_tracing_enabled:
args_list = list(args)
kwargs_dict = dict(kwargs)
with layerwise_nvtx_marker_context(
"Torch Compiled Module (input):{}".format(self.__class__.__name__),
self,
in_tensor=args_list,
kwargs=kwargs_dict,
) as ctx:
ctx.result = callable_fn(*args, **kwargs)
return ctx.result
return callable_fn(*args, **kwargs)
def __init__(self):
self.compiled = False
vllm_config = get_current_vllm_config()
self.vllm_config = vllm_config
mode = vllm_config.compilation_config.mode
self.layerwise_nvtx_tracing_enabled = (
vllm_config.observability_config.enable_layerwise_nvtx_tracing
)
if mode is None:
raise RuntimeError("Compilation mode cannot be NO_COMPILATION")
@ -168,13 +186,19 @@ class TorchCompileWithNoGuardsWrapper:
# Make sure a compilation is triggered by clearing dynamo
# cache.
torch._dynamo.eval_frame.remove_from_cache(self.original_code_object())
return self._compiled_callable(*args, **kwargs)
return self._call_with_optional_nvtx_range(
self._compiled_callable, *args, **kwargs
)
else:
with self._dispatch_to_compiled_code():
return self.forward(*args, **kwargs)
return self._call_with_optional_nvtx_range(
self.forward, *args, **kwargs
)
else:
with _compilation_context():
return self._compiled_callable(*args, **kwargs)
return self._call_with_optional_nvtx_range(
self._compiled_callable, *args, **kwargs
)
@abstractmethod
def forward(self, *args, **kwargs): ...

View File

@ -29,7 +29,7 @@ CacheDType = Literal[
"fp8_inc",
"fp8_ds_mla",
]
MambaDType = Literal["auto", "float32"]
MambaDType = Literal["auto", "float32", "float16"]
PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor", "xxhash", "xxhash_cbor"]
KVOffloadingBackend = Literal["native", "lmcache"]

View File

@ -59,6 +59,11 @@ class ObservabilityConfig:
"""Enable CUDA graph metrics (number of padded/unpadded tokens, runtime cudagraph
dispatch modes, and their observed frequencies at every logging interval)."""
enable_layerwise_nvtx_tracing: bool = False
"""Enable layerwise NVTX tracing. This traces the execution of each layer or
module in the model and attach informations such as input/output shapes to
nvtx range markers. Noted that this doesn't work with CUDA graphs enabled."""
@cached_property
def collect_model_forward_time(self) -> bool:
"""Whether to collect model forward time for the request."""

View File

@ -35,6 +35,7 @@ logger = init_logger(__name__)
ExpertPlacementStrategy = Literal["linear", "round_robin"]
DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]
DataParallelBackend = Literal["ray", "mp"]
EPLBPolicyOption = Literal["default"]
@config
@ -65,6 +66,9 @@ class EPLBConfig:
Whether to use non-blocking EPLB.
"""
policy: EPLBPolicyOption = "default"
"""The policy type for expert parallel load balancing (EPLB)."""
@config
@dataclass

View File

@ -671,36 +671,22 @@ class VllmConfig:
if current_platform.support_static_graph_mode():
# if cudagraph_mode has full cudagraphs, we need to check support
if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
# decode context parallel does not support full cudagraphs
if self.parallel_config.decode_context_parallel_size > 1:
if (
self.compilation_config.cudagraph_mode.has_full_cudagraphs()
and self.model_config is not None
):
if self.model_config.pooler_config is not None:
logger.warning_once(
"Decode context parallel (DCP) is enabled, which is "
"incompatible with full CUDA graphs. "
"Pooling models do not support full cudagraphs. "
"Overriding cudagraph_mode to PIECEWISE."
)
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
# prefill context parallel do not support full cudagraphs
elif self.parallel_config.prefill_context_parallel_size > 1:
elif self.model_config.is_encoder_decoder:
logger.warning_once(
"Prefill context parallel (PCP) is enabled, which is "
"incompatible with full CUDA graphs. "
"Encoder-decoder models do not support full cudagraphs. "
"Overriding cudagraph_mode to PIECEWISE."
)
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
elif self.model_config is not None:
if self.model_config.pooler_config is not None:
logger.warning_once(
"Pooling models do not support full cudagraphs. "
"Overriding cudagraph_mode to PIECEWISE."
)
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
elif self.model_config.is_encoder_decoder:
logger.warning_once(
"Encoder-decoder models do not support full cudagraphs. "
"Overriding cudagraph_mode to PIECEWISE."
)
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
# disable cudagraph when enforce eager execution
if self.model_config is not None and self.model_config.enforce_eager:

View File

@ -1,8 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Expert parallelism load balancer (EPLB).
"""
from .eplb_state import *
from .rebalance_algo import *
"""Expert parallelism load balancer (EPLB)."""

View File

@ -45,7 +45,7 @@ from vllm.logger import init_logger
from vllm.model_executor.models.interfaces import MixtureOfExperts
from .async_worker import start_async_worker
from .rebalance_algo import rebalance_experts
from .policy import EPLB_POLICIES, AbstractEplbPolicy, DefaultEplbPolicy
from .rebalance_execute import move_from_buffer, rearrange_expert_weights_inplace
logger = init_logger(__name__)
@ -213,18 +213,23 @@ class EplbState:
self.parallel_config = parallel_config
self.device = device
self.model_states: dict[str, EplbModelState] = {}
self.policy: type[AbstractEplbPolicy] = DefaultEplbPolicy
"""
Selected EPLB algorithm class
"""
self.expert_load_window_step: int = 0
"""
Current step in the sliding window.
Different from `expert_rearrangement_step`,
each EP rank may have its own `expert_load_window_step`.
"""
self.expert_load_window_step: int = 0
self.expert_load_window_size: int = 0
"""
Size of the expert load sliding window.
This is a constant and is taken from the config.
"""
self.expert_load_window_size: int = 0
self.expert_rearrangement_step: int = 0
"""
Steps after last rearrangement.
Will trigger a rearrangement if it exceeds the threshold.
@ -415,6 +420,10 @@ class EplbState:
)
self.expert_rearrangement_step_interval = eplb_step_interval
# Set the policy based on the selected eplb algorithm type.
policy_type = self.parallel_config.eplb_config.policy
self.policy = EPLB_POLICIES[policy_type]
logger.debug("Selected EPLB policy: %d", policy_type)
if global_expert_load is not None:
ep_group = get_ep_group().device_group
assert global_expert_load.shape == (
@ -441,7 +450,7 @@ class EplbState:
new_physical_to_logical_map,
new_logical_to_physical_map,
new_logical_replica_count,
) = rebalance_experts(
) = self.policy.rebalance_experts(
global_expert_load,
num_replicas,
num_groups,
@ -776,6 +785,7 @@ class EplbState:
f"{num_gpus=}, {num_nodes=}"
)
# Get new expert mappings
for eplb_model_state, global_expert_load_window in zip(
self.model_states.values(), global_expert_load_windows
):
@ -784,7 +794,7 @@ class EplbState:
new_physical_to_logical_map,
new_logical_to_physical_map,
new_logical_replica_count,
) = rebalance_experts(
) = self.policy.rebalance_experts(
global_expert_load_window,
num_replicas,
num_groups,

View File

@ -0,0 +1,19 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import get_args
from vllm.config.parallel import EPLBPolicyOption
from .abstract import AbstractEplbPolicy
from .default import DefaultEplbPolicy
# Registry mapping config-selectable policy names to policy classes.
EPLB_POLICIES = {"default": DefaultEplbPolicy}
# Ensure that the EPLB_POLICIES keys match the EPLBPolicyOption values
# (i.e. every name accepted by the config has a registered class here).
assert set(EPLB_POLICIES.keys()) == set(get_args(EPLBPolicyOption))
__all__ = [
    "AbstractEplbPolicy",
    "DefaultEplbPolicy",
    "EPLB_POLICIES",
]

View File

@ -0,0 +1,40 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
import torch
class AbstractEplbPolicy(ABC):
    """Interface that every EPLB rebalancing policy must implement."""

    @classmethod
    @abstractmethod
    def rebalance_experts(
        cls,
        weight: torch.Tensor,
        num_replicas: int,
        num_groups: int,
        num_nodes: int,
        num_ranks: int,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute a new physical-expert placement from observed load.

        This is the entry point for the expert-parallelism load balancer.

        Args:
            weight: [layers, num_logical_experts], the load statistics
                for all logical experts.
            num_replicas: number of physical experts; must be a multiple
                of `num_ranks`.
            num_groups: number of expert groups.
            num_nodes: number of server nodes.
            num_ranks: number of ranks; must be a multiple of `num_nodes`.

        Returns:
            physical_to_logical_map: [layers, num_replicas], the expert
                index of each replica.
            logical_to_physical_map: [layers, num_logical_experts, X],
                the replica indices for each expert.
            expert_count: [layers, num_logical_experts], number of
                physical replicas for each logical expert.
        """
        raise NotImplementedError

View File

@ -0,0 +1,267 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Expert parallelism load balancer (EPLB) for vLLM.
This module implements the core rearrangement algorithm.
The rearrangement algorithm is adapted from
[DeepSeek EPLB](https://github.com/deepseek-ai/eplb).
Please find at [#12](https://github.com/deepseek-ai/EPLB/issues/12) an example
on how the EPLB algorithm works.
"""
import numpy as np
import torch
from .abstract import AbstractEplbPolicy
class DefaultEplbPolicy(AbstractEplbPolicy):
    """Default EPLB policy, adapted from DeepSeek EPLB.

    Packs expert groups onto nodes, replicates hot experts within each
    node, then packs the physical experts onto GPUs; see
    `rebalance_experts` for the entry point.
    """

    @classmethod
    def balanced_packing(
        cls, weight: torch.Tensor, num_packs: int
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Pack n weighted objects to m packs, such that each bin contains exactly
        n/m objects and the weights of all packs are as balanced as possible.
        Parameters:
            weight: [X, n], the weight of each item
            num_packs: number of packs
        Returns:
            pack_index: [X, n], the pack index of each item
            rank_in_pack: [X, n], the rank of the item in the pack
        """
        num_layers, num_groups = weight.shape
        assert num_groups % num_packs == 0
        groups_per_pack = num_groups // num_packs
        device = weight.device
        # Trivial case: exactly one item per pack -> identity assignment.
        if groups_per_pack == 1:
            pack_index = torch.arange(
                weight.size(-1), dtype=torch.int64, device=device
            ).expand(weight.shape)
            rank_in_pack = torch.zeros_like(weight, dtype=torch.int64, device=device)
            return pack_index, rank_in_pack
        # Greedy packing runs on CPU/numpy: it is inherently sequential.
        weight_np = weight.cpu().numpy()
        # Sort and get indices in descending order
        indices_np = np.argsort(-weight_np, axis=-1)
        pack_index_np = np.full((num_layers, num_groups), -1, dtype=np.int64)
        rank_in_pack_np = np.full((num_layers, num_groups), -1, dtype=np.int64)
        # Run the packing algorithm
        for i in range(num_layers):
            pack_weights = [0.0] * num_packs
            pack_items = [0] * num_packs
            for group in indices_np[i]:
                # Find a pack with capacity that has the lowest weight
                pack = min(
                    (j for j in range(num_packs) if pack_items[j] < groups_per_pack),
                    key=pack_weights.__getitem__,
                )
                assert pack_items[pack] < groups_per_pack
                pack_index_np[i, group] = pack
                rank_in_pack_np[i, group] = pack_items[pack]
                pack_weights[pack] += weight_np[i, group]
                pack_items[pack] += 1
        pack_index = torch.from_numpy(pack_index_np).to(device)
        rank_in_pack = torch.from_numpy(rank_in_pack_np).to(device)
        return pack_index, rank_in_pack
    @classmethod
    def replicate_experts(
        cls, weight: torch.Tensor, num_phy: int
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Replicate `num_log` experts to `num_phy` replicas, such that the maximum
        load of all replicas is minimized.
        Parameters:
            weight: [X, num_log]
            num_phy: total number of experts after replication
        Returns:
            phy2log: [X, num_phy], logical expert id of each physical expert
            rank: [X, num_phy], the replica rank
            logcnt: [X, num_log], number of replicas for each logical expert
        """
        n, num_log = weight.shape
        num_redundant = num_phy - num_log
        assert num_redundant >= 0
        device = weight.device
        phy2log = torch.arange(num_phy, dtype=torch.int64, device=device).repeat(n, 1)
        rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device)
        logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device)
        arangen = torch.arange(n, dtype=torch.int64, device=device)
        # Greedily assign each redundant slot to the expert whose
        # per-replica load (weight / current replica count) is highest.
        for i in range(num_log, num_phy):
            redundant_indices = (weight / logcnt).max(dim=-1).indices
            phy2log[:, i] = redundant_indices
            rank[:, i] = logcnt[arangen, redundant_indices]
            logcnt[arangen, redundant_indices] += 1
        return phy2log, rank, logcnt
    @classmethod
    def rebalance_experts_hierarchical(
        cls,
        weight: torch.Tensor,
        num_physical_experts: int,
        num_groups: int,
        num_nodes: int,
        num_gpus: int,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Parameters:
            weight: [num_moe_layers, num_logical_experts]
            num_physical_experts: number of physical experts after replication
            num_groups: number of expert groups
            num_nodes: number of server nodes, where the intra-node network
                (e.g., NVLink) is faster
            num_gpus: number of GPUs, must be a multiple of `num_nodes`
        Returns:
            phy2log: [layers, num_replicas], the expert
                index of each replica
            log2phy: [layers, num_logical_experts, X],
                the replica indices for each expert
            logcnt: [layers, num_logical_experts], number of
                physical replicas for each logical expert
        """
        num_layers, num_logical_experts = weight.shape
        assert num_logical_experts % num_groups == 0
        group_size = num_logical_experts // num_groups
        assert num_groups % num_nodes == 0
        groups_per_node = num_groups // num_nodes
        assert num_gpus % num_nodes == 0
        assert num_physical_experts % num_gpus == 0
        phy_experts_per_gpu = num_physical_experts // num_gpus
        # Naming: "mlog" = logical id after packing groups onto nodes,
        # "phy" = physical id within a node, "pphy" = physical id after
        # packing onto GPUs.
        def inverse(perm: torch.Tensor) -> torch.Tensor:
            # Row-wise inverse of a batch of permutations.
            inv = torch.empty_like(perm)
            inv.scatter_(
                1,
                perm,
                torch.arange(
                    perm.size(1), dtype=torch.int64, device=perm.device
                ).expand(perm.shape),
            )
            return inv
        # Step 1: pack groups to nodes
        tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1)
        group_pack_index, group_rank_in_pack = cls.balanced_packing(
            tokens_per_group, num_nodes
        )
        log2mlog = (
            (
                (group_pack_index * groups_per_node + group_rank_in_pack) * group_size
            ).unsqueeze(-1)
            + torch.arange(
                group_size, dtype=torch.int64, device=group_pack_index.device
            )
        ).flatten(-2)
        mlog2log = inverse(log2mlog)
        # Step 2: construct redundant experts within nodes
        # [num_layers * num_nodes, num_logical_experts // num_nodes]
        tokens_per_mlog = weight.gather(-1, mlog2log).view(
            -1, num_logical_experts // num_nodes
        )
        phy2mlog, phyrank, mlogcnt = cls.replicate_experts(
            tokens_per_mlog, num_physical_experts // num_nodes
        )
        # Step 3: pack physical_experts to GPUs
        # [num_layers * num_nodes, num_physical_experts // num_nodes]
        tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog)
        pack_index, rank_in_pack = cls.balanced_packing(
            tokens_per_phy, num_gpus // num_nodes
        )
        phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
        pphy2phy = inverse(phy2pphy)
        pphy2mlog = phy2mlog.gather(
            -1, pphy2phy
        )  # [num_layers * num_nodes, num_log_per_nodes]
        # Re-offset per-node mlog ids back into the global logical id space.
        pphy2mlog = (
            pphy2mlog.view(num_layers, num_nodes, -1)
            + torch.arange(
                0,
                num_logical_experts,
                num_logical_experts // num_nodes,
                device=group_pack_index.device,
            ).view(1, -1, 1)
        ).flatten(-2)
        pphy2log = mlog2log.gather(-1, pphy2mlog)
        pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1)
        logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog)
        return pphy2log, pphyrank, logcnt
    @classmethod
    def rebalance_experts(
        cls,
        weight: torch.Tensor,
        num_replicas: int,
        num_groups: int,
        num_nodes: int,
        num_ranks: int,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Entry point for expert-parallelism load balancer.
        Parameters:
            weight: [layers, num_logical_experts], the load statistics for all
                logical experts
            num_replicas: number of physical experts, must be a multiple of
                `num_ranks`
            num_groups: number of expert groups
            num_nodes: number of server nodes, where the intra-node network
                (e.g., NVLink) is faster
            num_ranks: number of ranks, must be a multiple of `num_nodes`
        Returns:
            phy2log: [layers, num_replicas], the expert
                index of each replica
            log2phy: [layers, num_logical_experts, X],
                the replica indices for each expert
            logcnt: [layers, num_logical_experts], number of
                physical replicas for each logical expert
        """
        num_layers, num_logical_experts = weight.shape
        weight = weight.float()
        if num_groups % num_nodes == 0:
            # use hierarchical load-balance policy
            phy2log, phyrank, logcnt = cls.rebalance_experts_hierarchical(
                weight, num_replicas, num_groups, num_nodes, num_ranks
            )
        else:
            # use global load-balance policy
            phy2log, phyrank, logcnt = cls.rebalance_experts_hierarchical(
                weight, num_replicas, 1, 1, num_ranks
            )
        num_redundant_experts = num_replicas - num_logical_experts
        maxlogcnt = num_redundant_experts + 1
        log2phy: torch.Tensor = torch.full(
            (num_layers, num_logical_experts, maxlogcnt),
            -1,
            dtype=torch.int64,
            device=logcnt.device,
        )
        # Scatter each physical index into slot (logical_id, replica_rank),
        # flattened as phy2log * maxlogcnt + phyrank; unused slots stay -1.
        log2phy.view(num_layers, -1).scatter_(
            -1,
            phy2log * maxlogcnt + phyrank,
            torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand(
                num_layers, -1
            ),
        )
        return phy2log, log2phy, logcnt

View File

@ -1,260 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Expert parallelism load balancer (EPLB) for vLLM.
This module implements the core rearrangement algorithm.
The rearrangement algorithm is adapted from
[DeepSeek EPLB](https://github.com/deepseek-ai/eplb).
Please find at [#12](https://github.com/deepseek-ai/EPLB/issues/12) an example
on how the EPLB algorithm works.
"""
import numpy as np
import torch
def balanced_packing(
    weight: torch.Tensor, num_packs: int
) -> tuple[torch.Tensor, torch.Tensor]:
    """Distribute weighted items over packs of equal cardinality.

    Each of the `num_packs` packs receives exactly ``n / num_packs`` items,
    chosen greedily (heaviest item first onto the lightest pack with spare
    capacity) so the total pack weights stay as balanced as possible.

    Parameters:
        weight: [X, n], the weight of each item
        num_packs: number of packs

    Returns:
        pack_index: [X, n], the pack index of each item
        rank_in_pack: [X, n], the rank of the item in the pack
    """
    layers, items = weight.shape
    assert items % num_packs == 0
    per_pack = items // num_packs
    dev = weight.device

    # One item per pack: the identity assignment is trivially balanced.
    if per_pack == 1:
        identity = torch.arange(items, dtype=torch.int64, device=dev)
        return (
            identity.expand(weight.shape),
            torch.zeros_like(weight, dtype=torch.int64, device=dev),
        )

    # The greedy loop is inherently sequential, so run it on CPU via numpy.
    loads = weight.cpu().numpy()
    order = np.argsort(-loads, axis=-1)  # heaviest items first
    pack_of = np.full((layers, items), -1, dtype=np.int64)
    slot_of = np.full((layers, items), -1, dtype=np.int64)
    for layer in range(layers):
        totals = [0.0] * num_packs
        counts = [0] * num_packs
        for item in order[layer]:
            # Lightest pack that still has capacity.
            target = min(
                (p for p in range(num_packs) if counts[p] < per_pack),
                key=totals.__getitem__,
            )
            assert counts[target] < per_pack
            pack_of[layer, item] = target
            slot_of[layer, item] = counts[target]
            totals[target] += loads[layer, item]
            counts[target] += 1
    return torch.from_numpy(pack_of).to(dev), torch.from_numpy(slot_of).to(dev)
def replicate_experts(
    weight: torch.Tensor, num_phy: int
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Replicate logical experts onto `num_phy` physical slots.

    Greedily minimizes the maximum per-replica load: each extra slot is
    given to the expert whose load divided by its current replica count
    is highest.

    Parameters:
        weight: [X, num_log]
        num_phy: total number of experts after replication

    Returns:
        phy2log: [X, num_phy], logical expert id of each physical expert
        rank: [X, num_phy], the replica rank
        logcnt: [X, num_log], number of replicas for each logical expert
    """
    rows, num_log = weight.shape
    assert num_phy - num_log >= 0
    dev = weight.device

    # Start from one replica per logical expert; the extra slots
    # [num_log, num_phy) are filled by the greedy loop below.
    phy2log = torch.arange(num_phy, dtype=torch.int64, device=dev).repeat(rows, 1)
    rank = torch.zeros(rows, num_phy, dtype=torch.int64, device=dev)
    logcnt = torch.ones(rows, num_log, dtype=torch.int64, device=dev)
    row_idx = torch.arange(rows, dtype=torch.int64, device=dev)
    for slot in range(num_log, num_phy):
        # Expert with the highest current per-replica load gets the slot.
        chosen = (weight / logcnt).max(dim=-1).indices
        phy2log[:, slot] = chosen
        rank[:, slot] = logcnt[row_idx, chosen]
        logcnt[row_idx, chosen] += 1
    return phy2log, rank, logcnt
def rebalance_experts_hierarchical(
    weight: torch.Tensor,
    num_physical_experts: int,
    num_groups: int,
    num_nodes: int,
    num_gpus: int,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Parameters:
        weight: [num_moe_layers, num_logical_experts]
        num_physical_experts: number of physical experts after replication
        num_groups: number of expert groups
        num_nodes: number of server nodes, where the intra-node network
            (e.g., NVLink) is faster
        num_gpus: number of GPUs, must be a multiple of `num_nodes`
    Returns:
        physical_to_logical_map (torch.Tensor):
            [num_moe_layers, num_physical_experts]
        logical_to_physical_map (torch.Tensor):
            [num_moe_layers, num_logical_experts, X]
        logical_count (torch.Tensor):
            [num_moe_layers, num_logical_experts]
    """
    num_layers, num_logical_experts = weight.shape
    assert num_logical_experts % num_groups == 0
    group_size = num_logical_experts // num_groups
    assert num_groups % num_nodes == 0
    groups_per_node = num_groups // num_nodes
    assert num_gpus % num_nodes == 0
    assert num_physical_experts % num_gpus == 0
    phy_experts_per_gpu = num_physical_experts // num_gpus
    # Naming: "mlog" = logical id after packing groups onto nodes,
    # "phy" = physical id within a node, "pphy" = physical id after
    # packing onto GPUs.
    def inverse(perm: torch.Tensor) -> torch.Tensor:
        # Row-wise inverse of a batch of permutations.
        inv = torch.empty_like(perm)
        inv.scatter_(
            1,
            perm,
            torch.arange(perm.size(1), dtype=torch.int64, device=perm.device).expand(
                perm.shape
            ),
        )
        return inv
    # Step 1: pack groups to nodes
    tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1)
    group_pack_index, group_rank_in_pack = balanced_packing(tokens_per_group, num_nodes)
    log2mlog = (
        (
            (group_pack_index * groups_per_node + group_rank_in_pack) * group_size
        ).unsqueeze(-1)
        + torch.arange(group_size, dtype=torch.int64, device=group_pack_index.device)
    ).flatten(-2)
    mlog2log = inverse(log2mlog)
    # Step 2: construct redundant experts within nodes
    # [num_layers * num_nodes, num_logical_experts // num_nodes]
    tokens_per_mlog = weight.gather(-1, mlog2log).view(
        -1, num_logical_experts // num_nodes
    )
    phy2mlog, phyrank, mlogcnt = replicate_experts(
        tokens_per_mlog, num_physical_experts // num_nodes
    )
    # Step 3: pack physical_experts to GPUs
    # [num_layers * num_nodes, num_physical_experts // num_nodes]
    tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog)
    pack_index, rank_in_pack = balanced_packing(tokens_per_phy, num_gpus // num_nodes)
    phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
    pphy2phy = inverse(phy2pphy)
    pphy2mlog = phy2mlog.gather(
        -1, pphy2phy
    )  # [num_layers * num_nodes, num_log_per_nodes]
    # Re-offset per-node mlog ids back into the global logical id space.
    pphy2mlog = (
        pphy2mlog.view(num_layers, num_nodes, -1)
        + torch.arange(
            0,
            num_logical_experts,
            num_logical_experts // num_nodes,
            device=group_pack_index.device,
        ).view(1, -1, 1)
    ).flatten(-2)
    pphy2log = mlog2log.gather(-1, pphy2mlog)
    pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1)
    logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog)
    return pphy2log, pphyrank, logcnt
def rebalance_experts(
    weight: torch.Tensor,
    num_replicas: int,
    num_groups: int,
    num_nodes: int,
    num_gpus: int,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Entry point for expert-parallelism load balancer.
    Parameters:
        weight: [layers, num_logical_experts], the load statistics for all
            logical experts
        num_replicas: number of physical experts, must be a multiple of
            `num_gpus`
        num_groups: number of expert groups
        num_nodes: number of server nodes, where the intra-node network
            (e.g., NVLink) is faster
        num_gpus: number of GPUs, must be a multiple of `num_nodes`
    Returns:
        physical_to_logical_map:
            [layers, num_replicas], the expert index of each replica
        logical_to_physical_map:
            [layers, num_logical_experts, X], the replica indices for each
            expert
        expert_count:
            [layers, num_logical_experts], number of physical
            replicas for each logical expert
    """
    num_layers, num_logical_experts = weight.shape
    weight = weight.float()
    if num_groups % num_nodes == 0:
        # use hierarchical load-balance policy
        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
            weight, num_replicas, num_groups, num_nodes, num_gpus
        )
    else:
        # use global load-balance policy
        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
            weight, num_replicas, 1, 1, num_gpus
        )
    num_redundant_experts = num_replicas - num_logical_experts
    maxlogcnt = num_redundant_experts + 1
    log2phy: torch.Tensor = torch.full(
        (num_layers, num_logical_experts, maxlogcnt),
        -1,
        dtype=torch.int64,
        device=logcnt.device,
    )
    # Scatter each physical index into slot (logical_id, replica_rank),
    # flattened as phy2log * maxlogcnt + phyrank; unused slots stay -1.
    log2phy.view(num_layers, -1).scatter_(
        -1,
        phy2log * maxlogcnt + phyrank,
        torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand(
            num_layers, -1
        ),
    )
    return phy2log, log2phy, logcnt
__all__ = ["rebalance_experts"]

View File

@ -1,179 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file contains a new class `KVLookupBufferBase` that allows developers to
think of KV cache operations as inserting new KV cache entries (`insert`)
into the lookup buffer and querying existing KV caches (`drop_select`)
from the lookup buffer.
This file also contains a new class `KVStoreBufferBase` that allows developers
to manage the KVCache buffer as a simple key-value storage buffer with basic
put/get operations.
These classes above are abstracted behind class `KVCacheBufferBase`.
"""
from abc import ABC, abstractmethod
import torch
class KVCacheBufferBase(ABC):
    """Root interface shared by all KVCache buffer variants.

    Concrete buffers must be closed explicitly so that any transport
    resources they hold can be released.
    """

    @abstractmethod
    def close(self) -> None:
        """Release every resource held by this buffer.

        Called when the KVCache buffer is no longer needed.

        Raises:
            NotImplementedError: Subclasses must override this method.
        """
        raise NotImplementedError
class KVLookupBufferBase(KVCacheBufferBase):
    """KVCache buffer addressed by ``(input_tokens, roi)`` lookup keys.

    The lookup key consists of:
      - input_tokens: token IDs of the request.
      - roi: a binary mask over ``input_tokens`` marking the subset the KV
        cache actually covers (e.g. when vLLM is connected to an external
        KV cache service only part of the prompt may have KV available).
      - NOTE: roi could be extended further to describe which shard of KV
        the current process holds under TP/PP (each process may only hold
        part of the KV); this is not implemented for now.

    Each entry stores:
      - key: the key tensor in the KV cache.
      - value: the value tensor in the KV cache.
      - hidden: the final hidden state produced by model forwarding, which
        lets vLLM skip further forwarding by transmitting it directly.
    """

    @abstractmethod
    def insert(
        self,
        input_tokens: torch.Tensor,
        roi: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        hidden: torch.Tensor,
    ) -> None:
        """Insert an entry into the lookup buffer.

        Roughly equivalent to::

            buffer[input_tokens, roi] = [key, value, hidden]

        FIXME: in the future, we should only have two arguments, key and
        value, where key is a tensor dict and value is a tensor dict.
        FIXME: we should transmit both sampler outputs and the hidden states.

        Args:
            input_tokens (torch.Tensor): token IDs.
            roi (torch.Tensor): binary mask over the input tokens.
            key (torch.Tensor): the key tensor in the KV cache.
            value (torch.Tensor): the value tensor in the KV cache.
            hidden (torch.Tensor): the final hidden state produced during
                model forwarding, transmitted to bypass forwarding.

        Raises:
            NotImplementedError: Subclasses must override this method.
        """
        raise NotImplementedError

    @abstractmethod
    def drop_select(
        self, input_tokens: torch.Tensor | None, roi: torch.Tensor | None
    ) -> list[torch.Tensor | None]:
        """Select and *remove* a matching entry from the lookup buffer.

        Roughly equivalent to::

            ret = buffer.pop(input_tokens, roi)
            return ret

        Passing ``None`` for both ``input_tokens`` and ``roi`` selects any
        entry in the buffer, returns it, and removes it — useful when
        offloading KV cache to a KV cache storage service.

        Args:
            input_tokens (torch.Tensor): token IDs.
            roi (torch.Tensor): binary mask over the input tokens.

        Returns:
            list[Optional[torch.Tensor]]: a list of tensors; may contain
            ``None`` elements.

        Raises:
            NotImplementedError: Subclasses must override this method.
        """
        raise NotImplementedError
class KVStoreBufferBase(KVCacheBufferBase):
    """KVCache buffer with plain key-value storage semantics.

    A minimal put/get abstraction — similar to a distributed key-value
    store — that gives callers fine-grained control over KVCache transfer:
      - Key: a unique string identifier for the cached entry.
      - Value: the tensor to store and retrieve, or ``None`` (indicating
        deletion or an empty value).
    """

    @abstractmethod
    def put(
        self,
        key: str,
        value: torch.Tensor | None,
    ) -> None:
        """Store ``value`` in the buffer under ``key``.

        Args:
            key (str): unique identifier for a tensor; the tensor may be a
                key cache tensor, value cache tensor, or hidden-state
                tensor produced during model forwarding.
            value (Optional[torch.Tensor]): tensor to be stored.

        Raises:
            NotImplementedError: Subclasses must override this method.
        """
        raise NotImplementedError

    @abstractmethod
    def get(
        self,
        key: str,
    ) -> torch.Tensor | None:
        """Return the tensor stored under ``key``.

        Args:
            key (str): unique identifier for a tensor; the tensor may be a
                key cache tensor, value cache tensor, or hidden-state
                tensor produced during model forwarding.

        Returns:
            Optional[torch.Tensor]: the stored tensor if it exists,
            ``None`` otherwise.

        Raises:
            NotImplementedError: Subclasses must override this method.
        """
        raise NotImplementedError

View File

@ -1,164 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file contains a new class `MooncakeStore` that allows developers to
think of KV cache transfer operations as putting new KV cache entries
into a remote KVStore-based lookup buffer and getting existing KV caches
from this remote lookup buffer.
"""
import json
import os
from dataclasses import dataclass
import torch
from safetensors.torch import load as safetensors_load
from safetensors.torch import save as safetensors_save
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_lookup_buffer.base import KVStoreBufferBase
from vllm.logger import init_logger
DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200 # 3.125 GiB
DEFAULT_LOCAL_BUFFER_SIZE = 1073741824 # 1.0 GiB
logger = init_logger(__name__)
@dataclass
class MooncakeStoreConfig:
    """Connection and buffer settings for a Mooncake distributed store."""

    local_hostname: str
    metadata_server: str
    global_segment_size: int
    local_buffer_size: int
    protocol: str
    device_name: str
    master_server_address: str

    @staticmethod
    def from_file(file_path: str) -> "MooncakeStoreConfig":
        """Load the config from a JSON file."""
        with open(file_path) as handle:
            raw = json.load(handle)
        return MooncakeStoreConfig(
            local_hostname=raw.get("local_hostname"),
            metadata_server=raw.get("metadata_server"),
            global_segment_size=raw.get(
                "global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE
            ),
            local_buffer_size=raw.get("local_buffer_size", DEFAULT_LOCAL_BUFFER_SIZE),
            protocol=raw.get("protocol", "tcp"),
            device_name=raw.get("device_name", ""),
            master_server_address=raw.get("master_server_address"),
        )

    @staticmethod
    def load_from_env() -> "MooncakeStoreConfig":
        """Load config from a file specified in the environment variable."""
        path = os.getenv("MOONCAKE_CONFIG_PATH")
        if path is None:
            raise ValueError(
                "The environment variable 'MOONCAKE_CONFIG_PATH' is not set."
            )
        return MooncakeStoreConfig.from_file(path)
class MooncakeStore(KVStoreBufferBase):
    """KVStoreBufferBase implementation backed by a remote Mooncake
    distributed store.

    Tensors are serialized with safetensors together with their source
    device id, so `get` can restore them to the original device.
    """

    def __init__(
        self,
        config: VllmConfig,
    ):
        # NOTE(review): `config` is not read here; the store's own settings
        # come from the JSON file named by MOONCAKE_CONFIG_PATH (via
        # MooncakeStoreConfig.load_from_env) — confirm this is intended.
        try:
            from mooncake.store import MooncakeDistributedStore
        except ImportError as e:
            raise ImportError(
                "Please install mooncake by following the instructions at "
                "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md " # noqa: E501
                "to run vLLM with MooncakeConnector."
            ) from e
        try:
            self.store = MooncakeDistributedStore()
            self.config = MooncakeStoreConfig.load_from_env()
            logger.info("Mooncake Configuration loaded successfully.")
            self.store.setup(
                self.config.local_hostname,
                self.config.metadata_server,
                self.config.global_segment_size,
                self.config.local_buffer_size,
                self.config.protocol,
                self.config.device_name,
                self.config.master_server_address,
            )
        except ValueError as e:
            # Raised by load_from_env when MOONCAKE_CONFIG_PATH is not set.
            logger.error("Configuration loading failed: %s", e)
            raise
        except Exception as exc:
            logger.error("An error occurred while loading the configuration: %s", exc)
            raise
    def close(self):
        # MooncakeDistributedStore will automatically call the destructor, so
        # it is unnecessary to close it manually.
        pass
    def put(
        self,
        key: str,
        value: torch.Tensor | None,
    ) -> None:
        """Store `value` under `key`; `None` values are silently skipped."""
        # A message queue needs to be introduced before making it asynchronous.
        if value is not None:
            self._put_impl(key, value)
    def get(
        self,
        key: str,
    ) -> torch.Tensor | None:
        """Return the tensor stored under `key`, or None if absent."""
        # A message queue needs to be introduced before making it asynchronous.
        value = self._get_impl(key)
        return value
    def _put_impl(
        self,
        key: str,
        value: torch.Tensor,
    ) -> None:
        """Put KVCache to Mooncake Store"""
        # Record the source CUDA device index (-1 for CPU) alongside the
        # tensor so _get_impl can restore the original placement.
        device_id = value.device.index if value.device.type == "cuda" else -1
        device_tensor = torch.tensor(device_id, dtype=torch.int32)
        value_bytes = safetensors_save({"tensor": value, "device_id": device_tensor})
        try:
            self.store.put(key, value_bytes)
        except TypeError as err:
            logger.error("Failed to put value into Mooncake Store: %s", err)
            raise TypeError("Mooncake Store Put Type Error.") from err
    def _get_impl(
        self,
        key: str,
    ) -> torch.Tensor | None:
        """Get KVCache from Mooncake Store"""
        try:
            data = self.store.get(key)
        except TypeError as err:
            logger.error("Failed to get value from Mooncake Store: %s", err)
            raise TypeError("Mooncake Store Get Type Error.") from err
        if data:
            loaded_tensors = safetensors_load(data)
            tensor = loaded_tensors["tensor"]
            device_id_tensor = loaded_tensors["device_id"]
            device_id = int(device_id_tensor.item())
            # Restore the tensor to the device it was stored from.
            device = (
                torch.device("cuda", device_id)
                if device_id >= 0
                else torch.device("cpu")
            )
            return tensor.to(device)
        return None

View File

@ -1,242 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Implements a distributed key-value (KV) cache transfer mechanism.
Key Features:
- Distributed KV cache transmission using PyNccl pipes.
- Non-blocking `insert`, blocking `drop_select`.
- Use CPU signal pipe to avoid racing condition
- Handles buffer size constraints and provide backpressure mechanism to
stop the prefill instance when the decode instance is slow.
"""
import threading
from collections import deque
import torch
from vllm.distributed.kv_transfer.kv_lookup_buffer.base import KVLookupBufferBase
from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
from vllm.logger import init_logger
logger = init_logger(__name__)
class SimpleBuffer(KVLookupBufferBase):
def __init__(
self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase, buffer_size_thresh: float
):
"""
signal_pipe: on CPU
NOTE: on-device recv will block all threads in the process, making the
KV cache producer unable to listen to new request while transmitting
KV cache. Luckily CPU recv only blocks the current thread so we use
CPU recv to listen to new request.
data_pipe: on device (e.g. GPU)
"""
self.buffer: deque[list[torch.Tensor]] = deque()
self.buffer_size = 0
self.buffer_size_threshold = buffer_size_thresh
self.buffer_cv = threading.Condition()
self.signal_pipe = signal_pipe
self.data_pipe = data_pipe
self.request_handling_thread: threading.Thread | None = None
self.normal_signal = torch.tensor([0], device="cpu")
self.end_signal = None
def _matches(
self,
tokens_roi_sender: list[torch.Tensor],
tokens_roi_recver: list[torch.Tensor],
):
# tokens_roi_sender: tokens and roi of the producer (in the buffer)
# tokens_roi_recver: tokens and roi of the consumer (query)
tokens_sender = tokens_roi_sender[0]
tokens_recver = tokens_roi_recver[0]
roi_sender = tokens_roi_sender[1]
roi_recver = tokens_roi_recver[1]
if tokens_recver is None:
# consumer sends an empty request
# semantics: DROP SELECT * LIMIT 1
# so any of the data in the buffer can be drop-selected
return True
# Assuming that roi is a binary mask on tokens
tokens_sender = tokens_sender[roi_sender]
tokens_recver = tokens_recver[roi_recver]
# simple common prefix matching
min_length = min(len(tokens_sender), len(tokens_recver))
if torch.allclose(tokens_sender[:min_length], tokens_recver[:min_length]):
return min_length
return 0
def _send_tensor_and_dec_size(self, tensor: torch.Tensor | None) -> None:
assert tensor is not None, "Use self.data_pipe.send(None) instead"
self.buffer_size -= tensor.element_size() * tensor.numel()
if tensor.dtype == torch.bool:
tensor = tensor.float()
self.data_pipe.send_tensor(tensor)
def _get_element_size(self, data: list | torch.Tensor | None):
if isinstance(data, torch.Tensor):
return data.element_size() * data.numel()
if not data:
# cannot perform `not data` on a tensor
# so this check needs to go after the check above
return 0
raise AssertionError(f"Unknown data type {type(data)}")
def _add_to_buffer(
self,
input_tokens: torch.Tensor,
roi: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
hidden: torch.Tensor,
):
if isinstance(input_tokens, torch.Tensor):
input_tokens = input_tokens.clone()
if isinstance(roi, torch.Tensor):
roi = roi.clone()
if isinstance(key, torch.Tensor):
key = key.clone()
if isinstance(value, torch.Tensor):
value = value.clone()
if isinstance(hidden, torch.Tensor):
hidden = hidden.clone()
buffer_item = [input_tokens, roi, key, value, hidden]
data_size = sum([self._get_element_size(data) for data in buffer_item])
with self.buffer_cv:
if self.buffer_size + data_size > self.buffer_size_threshold:
# log outside the while loop to avoid this message being logged
# repeatedly.
logger.debug("KV transfer buffer is full. Handling...")
while self.buffer_size + data_size > self.buffer_size_threshold:
self.buffer_cv.wait()
self.buffer_size += data_size
self.buffer.append(buffer_item)
self.buffer_cv.notify()
def _is_end_signal(self, signal):
return signal is None
def drop_select_handler(self):
    """Producer-side service loop answering consumer drop-select requests.

    Runs on the background thread started by `insert()`. For every normal
    signal received on `signal_pipe`, it reads a (tokens, roi) query from
    `data_pipe`, waits until a matching item is buffered, then pops that
    item and streams its five tensors back over `data_pipe`. The loop
    exits when the end signal (None) arrives, or silently when the peer
    closes the connection.
    """
    try:
        while True:
            signal = self.signal_pipe.recv_tensor()
            if self._is_end_signal(signal):
                logger.info("Received end signal!")
                break

            input_tokens = self.data_pipe.recv_tensor()
            roi = self.data_pipe.recv_tensor()
            assert roi is not None, (
                "Please provide the roi when sending drop-select request"
            )
            # roi travels as a float tensor (PyNccl cannot send bools);
            # re-binarize it here.
            roi = roi > 0.5
            tokens_roi_recver = [input_tokens, roi]

            def is_buffer_available(
                tokens_roi_recver: list[torch.Tensor],
            ) -> bool:
                # perform input tokens and roi matching
                # FIXME: this matching is O(n), ideally it should be O(1)
                # but this buffer size won't (and shouldn't) be too large so
                # the fix is not urgent.
                for _ in range(len(self.buffer)):
                    if self._matches(self.buffer[0], tokens_roi_recver) > 0:
                        return True
                    # rotate the element we just accessed to the end
                    self.buffer.rotate(-1)
                return False

            with self.buffer_cv:
                while not is_buffer_available(tokens_roi_recver):
                    logger.debug("KV transfer buffer is not available. Waiting...")
                    self.buffer_cv.wait()
                # On success the matched item was rotated to the front, so
                # popleft() removes exactly the matching entry. The buffered
                # tensors were cloned at insert time, so sending them here
                # is safe even if the producer's originals are gone.
                matched_item = self.buffer.popleft()
                for tensor in matched_item:
                    self._send_tensor_and_dec_size(tensor)
                self.buffer_cv.notify()

    except RuntimeError as e:
        # A peer disconnect is an expected shutdown path; anything else
        # is a real error and is re-raised.
        if "Connection closed by peer" not in str(e):
            raise e

    logger.debug("Closing drop_select_handler")
def drop_select(
    self, input_tokens: torch.Tensor | None, roi: torch.Tensor | None
) -> list[torch.Tensor | None]:
    """Consumer-side query: fetch (and remove) one matching item.

    Sends a normal signal plus the (tokens, roi) query to the producer,
    then blocks receiving the five tensors of the matched buffer item.

    Args:
        input_tokens: token ids to match, or None for "any item".
        roi: binary mask over ``input_tokens``; shipped as float because
            PyNccl cannot transmit bool tensors.

    Returns:
        ``[input_tokens, roi, key, value, hidden]`` as echoed back by the
        producer (entries may be None).
    """
    assert self.request_handling_thread is None, (
        "drop_select should be called by the KV cache consumer "
        "(e.g. the decode vLLM instance)"
    )

    # Clone to avoid mutating / racing with the caller's tensors.
    if isinstance(input_tokens, torch.Tensor):
        input_tokens = input_tokens.clone()
    if isinstance(roi, torch.Tensor):
        roi = roi.clone().float()

    self.signal_pipe.send_tensor(self.normal_signal)
    self.data_pipe.send_tensor(input_tokens)
    self.data_pipe.send_tensor(roi)

    input_tokens = self.data_pipe.recv_tensor()
    roi = self.data_pipe.recv_tensor()
    if roi is not None:
        # convert from float tensor to bool tensor
        # as PyNccl does not support sending bool tensor
        roi = roi > 0.5
    key = self.data_pipe.recv_tensor()
    value = self.data_pipe.recv_tensor()
    hidden = self.data_pipe.recv_tensor()

    return [input_tokens, roi, key, value, hidden]
def insert(
    self,
    input_tokens: torch.Tensor,
    roi: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    hidden: torch.Tensor,
) -> None:
    """Producer-side API: buffer one KV-cache item for later pickup.

    The item is cloned into the buffer (blocking if the buffer is full),
    and the drop-select handler thread is lazily started on first use so
    the producer begins listening for consumer requests.
    """
    self._add_to_buffer(input_tokens, roi, key, value, hidden)

    # when calling the insert, the current process is a sender
    # need to launch the request handler and start listening to request.
    if self.request_handling_thread is None:
        self.request_handling_thread = threading.Thread(
            target=self.drop_select_handler
        )
        self.request_handling_thread.start()
def close(self):
    """Shut the buffer down.

    On the producer (a handler thread exists) this waits for the handler
    to exit — which happens once it receives the end signal. On the
    consumer it sends that end signal to the producer.
    """
    if (
        hasattr(self, "request_handling_thread")
        and self.request_handling_thread is not None
    ):
        self.request_handling_thread.join()

    else:
        # TODO: have an explicit close signal and an explicit way to
        # check if it's requester
        self.signal_pipe.send_tensor(self.end_signal)

View File

@ -1,66 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file defines an interface `KVPipeBase`
that provides an abstraction for sending and receiving tensors, or None, via
distributed communications.
All classes instantiated from this interface are assumed to be a FIFO pipe.
If your distributed communication platform already supports key-value lookup,
you can bypass this interface and directly start from `kv_lookup_buffer`.
"""
from abc import ABC, abstractmethod
import torch
class KVPipeBase(ABC):
    """
    This class provides an interface for sending and receiving tensors, or
    None, by distributed communications. Implementations are assumed to
    behave as FIFO pipes: tensors are received in the order they were sent.
    """

    @abstractmethod
    def send_tensor(self, tensor: torch.Tensor | None) -> None:
        """Send a tensor, or None, via the pipe.

        Need to support sending None -- important for error handling.

        TODO: add a `key` argument so that we can use traditional
        key-value database as the distributed communication mechanism behind
        the pipe.

        Args:
            tensor (Optional[torch.Tensor]): The tensor to be sent. Can be None.

        Raises:
            NotImplementedError: This method must be implemented in subclasses.
        """
        raise NotImplementedError

    @abstractmethod
    def recv_tensor(self) -> torch.Tensor | None:
        """Receive a tensor (can be None) from the pipeline.

        Returns:
            Optional[torch.Tensor]: The tensor received from the pipeline. Can
            be None.

        Raises:
            NotImplementedError: This method must be implemented in subclasses.
        """
        raise NotImplementedError

    @abstractmethod
    def close(self) -> None:
        """Close the pipeline and release resources.

        This method is responsible for closing the communication pipeline
        and releasing any resources associated with it.

        Raises:
            NotImplementedError: This method must be implemented in subclasses.
        """
        raise NotImplementedError

View File

@ -1,295 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import os
import struct
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
import torch
import zmq
from safetensors.torch import load as safetensors_load
from safetensors.torch import save as safetensors_save
from vllm.config.kv_transfer import KVTransferConfig
from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
from vllm.logger import init_logger
from vllm.utils.network_utils import join_host_port, make_zmq_path, split_host_port
logger = init_logger(__name__)
NONE_INT = -150886311
@dataclass
class MooncakeTransferEngineConfig:
prefill_url: str
decode_url: str
metadata_backend: str | None
metadata_server: str
protocol: str
device_name: str
@staticmethod
def from_file(file_path: str) -> "MooncakeTransferEngineConfig":
"""Load the config from a JSON file."""
with open(file_path) as fin:
config = json.load(fin)
return MooncakeTransferEngineConfig(
prefill_url=config.get("prefill_url"),
decode_url=config.get("decode_url"),
metadata_backend=config.get("metadata_backend", None),
metadata_server=config.get("metadata_server"),
protocol=config.get("protocol", "tcp"),
device_name=config.get("device_name", ""),
)
@staticmethod
def load_from_env() -> "MooncakeTransferEngineConfig":
"""Load config from a file specified in the environment variable."""
config_file_path = os.getenv("MOONCAKE_CONFIG_PATH")
if config_file_path is None:
raise ValueError(
"The environment variable 'MOONCAKE_CONFIG_PATH' is not set."
)
return MooncakeTransferEngineConfig.from_file(config_file_path)
class MooncakeTransferEngine:
    """Handles the transfer of data using mooncake_vllm_adaptor and ZeroMQ.

    ZeroMQ carries only small (pointer, length) descriptors and ACKs; the
    bulk payload moves through Mooncake managed buffers via
    `transfer_sync`.
    """

    def __init__(self, kv_rank: int, local_rank: int):
        try:
            from mooncake.engine import TransferEngine
        except ImportError as e:
            raise ImportError(
                "Please install mooncake by following the instructions at "
                "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md "  # noqa: E501
                "to run vLLM with MooncakeConnector."
            ) from e

        self.engine = TransferEngine()
        self.local_rank = local_rank

        try:
            self.config = MooncakeTransferEngineConfig.load_from_env()
            logger.info("Mooncake Configuration loaded successfully.")
        except ValueError as e:
            logger.error(e)
            raise
        except Exception as exc:
            logger.error("An error occurred while loading the configuration: %s", exc)
            raise

        prefill_host, base_prefill_port = split_host_port(self.config.prefill_url)
        decode_host, base_decode_port = split_host_port(self.config.decode_url)

        # Avoid ports conflict when running prefill and decode on the same node
        if prefill_host == decode_host and base_prefill_port == base_decode_port:
            base_decode_port = base_decode_port + 100

        # Each local rank gets its own port, offset from the base.
        prefill_port = base_prefill_port + self.local_rank
        decode_port = base_decode_port + self.local_rank
        self.prefill_url = join_host_port(prefill_host, prefill_port)
        self.decode_url = join_host_port(decode_host, decode_port)

        # kv_rank 0 is the prefill side; kv_rank 1 the decode side.
        self.initialize(
            self.prefill_url if kv_rank == 0 else self.decode_url,
            self.config.metadata_server,
            self.config.protocol,
            self.config.device_name,
            self.config.metadata_backend,
        )

        self.remote_url = self.decode_url if kv_rank == 0 else self.prefill_url

        # Initialize ZeroMQ context and sockets
        self.context = zmq.Context()  # type: ignore[attr-defined]
        self.sender_socket = self.context.socket(zmq.constants.PUSH)
        self.receiver_socket = self.context.socket(zmq.constants.PULL)
        self.sender_ack = self.context.socket(zmq.constants.PULL)
        self.receiver_ack = self.context.socket(zmq.constants.PUSH)

        # Single worker that frees send-side buffers once ACKed.
        self.buffer_cleaner = ThreadPoolExecutor(max_workers=1)
        self._setup_metadata_sockets(
            kv_rank, prefill_host, base_prefill_port, decode_host, base_decode_port
        )

    def _setup_metadata_sockets(
        self, kv_rank: int, p_host: str, p_port: int, d_host: str, d_port: int
    ) -> None:
        """Set up ZeroMQ sockets for sending and receiving data.

        The bind/connect pattern is mirrored between the two ranks so that
        each side binds its own sockets and connects to the peer's.
        """
        # Offsets < 8 are left for initialization in case tp and pp are enabled
        p_rank_offset = p_port + 8 + self.local_rank * 2
        d_rank_offset = d_port + 8 + self.local_rank * 2
        if kv_rank == 0:
            self.sender_socket.bind(make_zmq_path("tcp", p_host, p_rank_offset + 1))
            self.receiver_socket.connect(
                make_zmq_path("tcp", d_host, d_rank_offset + 1)
            )
            self.sender_ack.connect(make_zmq_path("tcp", d_host, d_rank_offset + 2))
            self.receiver_ack.bind(make_zmq_path("tcp", p_host, p_rank_offset + 2))
        else:
            self.receiver_socket.connect(
                make_zmq_path("tcp", p_host, p_rank_offset + 1)
            )
            self.sender_socket.bind(make_zmq_path("tcp", d_host, d_rank_offset + 1))
            self.receiver_ack.bind(make_zmq_path("tcp", d_host, d_rank_offset + 2))
            self.sender_ack.connect(make_zmq_path("tcp", p_host, p_rank_offset + 2))

    def initialize(
        self,
        local_hostname: str,
        metadata_server: str,
        protocol: str,
        device_name: str,
        metadata_backend: str | None,
    ) -> None:
        """Initialize the mooncake instance.

        Raises:
            ValueError: if `metadata_backend` is set but not one of the
                supported backends ("etcd", "redis").
        """
        if metadata_backend is None:
            self.engine.initialize(
                local_hostname, metadata_server, protocol, device_name
            )
        else:
            supported_backend = ["etcd", "redis"]
            metadata_backend = metadata_backend.lower()
            if metadata_backend not in supported_backend:
                raise ValueError(
                    "Mooncake Configuration error. `metadata_backend`"
                    f" should be one of {supported_backend}."
                )

            self.engine.initialize_ext(
                local_hostname, metadata_server, protocol, device_name, metadata_backend
            )

    def allocate_managed_buffer(self, length: int) -> int:
        """Allocate a managed buffer of the specified length.

        Returns the buffer address as an int; raises on failure.
        """
        ret = self.engine.allocate_managed_buffer(length)
        if ret <= 0:
            logger.error("Allocation Return Error")
            raise Exception("Allocation Return Error")
        return ret

    def free_managed_buffer(self, buffer: int, length: int) -> int:
        """Free a previously allocated managed buffer."""
        return self.engine.free_managed_buffer(buffer, length)

    def transfer_sync(self, buffer: int, peer_buffer_address: int, length: int) -> int:
        """Synchronously transfer data to the specified address.

        NOTE: this is a read from the peer (`transfer_sync_read`): it pulls
        `length` bytes from `peer_buffer_address` on the remote into the
        local `buffer`.
        """
        ret = self.engine.transfer_sync_read(
            self.remote_url, buffer, peer_buffer_address, length
        )
        if ret < 0:
            logger.error("Transfer Return Error")
            raise Exception("Transfer Return Error")
        return ret

    def write_bytes_to_buffer(self, buffer: int, user_data: bytes, length: int) -> int:
        """Write bytes to the allocated buffer."""
        return self.engine.write_bytes_to_buffer(buffer, user_data, length)

    def read_bytes_from_buffer(self, buffer: int, length: int) -> bytes:
        """Read bytes from the allocated buffer."""
        return self.engine.read_bytes_from_buffer(buffer, length)

    def wait_for_ack(self, src_ptr: int, length: int) -> None:
        """Asynchronously wait for ACK from the receiver.

        Runs on the `buffer_cleaner` worker; frees the send-side buffer
        once the receiver has pulled the data (the buffer is freed even on
        a bad ACK, after logging the error).
        """
        ack = self.sender_ack.recv()
        if ack != b"ACK":
            logger.error("Failed to receive ACK from the receiver")

        self.free_managed_buffer(src_ptr, length)

    def send_bytes(self, user_data: bytes) -> None:
        """Send bytes to the remote process.

        Only the (pointer, length) descriptor travels over ZMQ; the
        receiver pulls the payload itself via `transfer_sync`.
        """
        length = len(user_data)
        src_ptr = self.allocate_managed_buffer(length)
        self.write_bytes_to_buffer(src_ptr, user_data, length)
        self.sender_socket.send_multipart(
            [struct.pack("!Q", src_ptr), struct.pack("!Q", length)]
        )
        # The buffer is freed by wait_for_ack on the cleaner worker.
        self.buffer_cleaner.submit(self.wait_for_ack, src_ptr, length)

    def recv_bytes(self) -> bytes:
        """Receive bytes from the remote process.

        Reads the (pointer, length) descriptor, pulls the payload into a
        local managed buffer, ACKs the sender, then frees the local buffer.
        """
        data = self.receiver_socket.recv_multipart()
        src_ptr = struct.unpack("!Q", data[0])[0]
        length = struct.unpack("!Q", data[1])[0]
        dst_ptr = self.allocate_managed_buffer(length)
        self.transfer_sync(dst_ptr, src_ptr, length)
        ret = self.read_bytes_from_buffer(dst_ptr, length)

        # Buffer cleanup
        self.receiver_ack.send(b"ACK")
        self.free_managed_buffer(dst_ptr, length)

        return ret
class MooncakePipe(KVPipeBase):
    """MooncakeTransferEngine based Pipe implementation.

    Tensors are serialized with safetensors and moved through the
    Mooncake transfer engine. `None` payloads are encoded as a
    one-element sentinel tensor (NONE_INT).
    """

    def __init__(
        self, local_rank: int, config: KVTransferConfig, device: str | None = None
    ):
        """Initialize the mooncake pipe and set related parameters."""
        self.config = config
        self.local_rank = local_rank
        self.kv_rank = self.config.kv_rank
        assert self.kv_rank is not None
        if device is None:
            self.device = self._select_device(self.config.kv_buffer_device)
        else:
            self.device = self._select_device(device)

        self.transfer_engine = MooncakeTransferEngine(self.kv_rank, self.local_rank)
        # Lazily created single-worker executor: keeps transfers ordered
        # while letting send_tensor return without blocking.
        self.transport_thread: ThreadPoolExecutor | None = None
        # Placeholder tensor transmitted in place of `None` payloads.
        self.none_tensor = torch.tensor([NONE_INT], device=self.device)

    def _select_device(self, device: str) -> torch.device:
        """Select available device (CUDA or CPU)."""
        logger.info("Selecting device: %s", device)
        if device == "cuda":
            return torch.device(f"cuda:{self.local_rank}")
        else:
            return torch.device("cpu")

    def tensor_hash(self, tensor: torch.Tensor) -> int:
        """Calculate the hash value of the tensor.

        NOTE(review): this hashes the tensor's data pointer (its storage
        address), not its contents — equal tensors at different addresses
        hash differently. Confirm callers only need identity semantics.
        """
        return hash(tensor.data_ptr())

    def _send_impl(self, tensor: torch.Tensor) -> None:
        """Implement the tensor sending logic using safetensors."""
        self.transfer_engine.send_bytes(safetensors_save({"tensor": tensor}))

    def _recv_impl(self) -> torch.Tensor:
        """Implement the tensor receiving logic using safetensors."""
        data = self.transfer_engine.recv_bytes()
        return safetensors_load(data)["tensor"].to(self.device)

    def send_tensor(self, tensor: torch.Tensor | None) -> None:
        """Send tensor to the target process.

        `None` is replaced by the sentinel tensor; the actual transfer
        happens asynchronously on `transport_thread`.
        """
        if self.transport_thread is None:
            self.transport_thread = ThreadPoolExecutor(max_workers=1)
        tensor = tensor if tensor is not None else self.none_tensor
        # 0-d tensors are not supported on this path.
        assert len(tensor.shape) > 0
        self.transport_thread.submit(self._send_impl, tensor)

    def recv_tensor(self) -> torch.Tensor | None:
        """Receive tensor from other processes. Blocking call."""
        if self.transport_thread is None:
            self.transport_thread = ThreadPoolExecutor(max_workers=1)
        tensor = self.transport_thread.submit(self._recv_impl).result()
        if tensor.numel() == 1 and tensor.item() == NONE_INT:
            # The sentinel round-trips back to None.
            return None
        else:
            return tensor

    def close(self) -> None:
        """Cleanup logic when closing the pipe.

        NOTE(review): this closes the ZMQ sockets and context but does not
        shut down `transport_thread`; confirm pending sends are drained
        elsewhere before close is called.
        """
        self.transfer_engine.sender_socket.close()
        self.transfer_engine.receiver_socket.close()
        self.transfer_engine.sender_ack.close()
        self.transfer_engine.receiver_ack.close()
        self.transfer_engine.context.term()  # Terminate the ZMQ context
        logger.info("Closed the transfer engine and cleaned up resources.")

View File

@ -1,285 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This module implements a PyNccl pipe for sending and receiving
Optional[torch.Tensor] between distributed ranks with advanced
communication features.
Key Features:
- Supports sending and receiving tensors with metadata
- Handles both CUDA and CPU device communications
- Implements a non-blocking tensor transfer mechanism
- Manages buffer size and provides backpressure control
- Supports distributed process groups with configurable parameters
"""
import threading
import time
from collections.abc import Callable
from concurrent.futures import ThreadPoolExecutor
import torch
from vllm.config.kv_transfer import KVTransferConfig
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
from vllm.distributed.utils import StatelessProcessGroup
from vllm.logger import init_logger
logger = init_logger(__name__)
class BrokenPipeException(Exception):
    """Raised when the peer closes the pipe mid-transfer."""

    def __init__(self, message):
        super().__init__(message)
        # Keep the text accessible as an attribute as well.
        self.message = message
Metadata = dict[str, torch.Tensor | None]
class PyNcclPipe(KVPipeBase):
    """Pipe for Optional[torch.Tensor] built on a StatelessProcessGroup.

    Tensor metadata (dtype/shape) travels through the group's object
    send/recv; tensor payloads travel via PyNCCL on CUDA, or via the same
    object channel on CPU. Sends are queued on a single-worker executor
    with byte-count backpressure; receives are blocking.
    """

    # NOTE(review): these three constants are not referenced anywhere in
    # this class as shown — possibly legacy; confirm before removing.
    METADATA_LENGTH = 16
    MAX_TENSOR_DIMENSIONS = 14
    METADATA_DTYPE = torch.int64

    def __init__(
        self,
        local_rank: int,
        config: KVTransferConfig,
        device: str | None = None,
        port_offset: int = 0,
    ):
        self.config = config
        self.local_rank = local_rank
        self.kv_rank = self.config.kv_rank
        assert self.kv_rank is not None
        self.kv_parallel_size = self.config.kv_parallel_size
        if device is None:
            self.device = self._select_device(self.config.kv_buffer_device)
        else:
            self.device = self._select_device(device)

        # build distributed connection and send/recv implementation
        store_timeout = self.config.get_from_extra_config("store_timeout", 300)
        self.group = StatelessProcessGroup.create(
            host=self.config.kv_ip,
            port=self.config.kv_port + port_offset,
            rank=self.kv_rank,
            world_size=self.kv_parallel_size,
            store_timeout=store_timeout,
        )
        # add a barrier to make sure the connection is initiated properly
        self.group.barrier()
        impl = self._get_device_send_recv_impl(self.group)
        self.device_send_func, self.device_recv_func = impl
        # set target rank: the kv ranks form a ring — send to the next
        # rank, receive from the previous one (modulo world size)
        self.target_rank_for_send = (self.kv_rank + 1) % self.kv_parallel_size
        self.target_rank_for_recv = (self.kv_rank - 1) % self.kv_parallel_size

        # transportation-related variables
        self.transport_thread: ThreadPoolExecutor | None = None
        self.buffer_size = 0
        self.buffer_size_lock = threading.Lock()
        self.buffer_size_thresh = self.config.kv_buffer_size

    def _get_device_send_recv_impl(
        self, group: StatelessProcessGroup
    ) -> tuple[
        Callable[[torch.Tensor, int], None], Callable[[torch.Tensor, int], None]
    ]:
        """Pick the payload send/recv pair for the selected device."""
        send: Callable[[torch.Tensor, int], None]
        recv: Callable[[torch.Tensor, int], None]
        if self.device.type == "cuda":
            # use PyNCCL for send / recv
            comm = PyNcclCommunicator(group, device=self.local_rank)
            comm.disabled = False
            send, recv = comm.send, comm.recv  # type: ignore
        else:
            # This send / recv implementation here is NOT intended to transfer
            # KV caches (and should NOT be repurposed to transfer KV caches).
            # Currently it is only used to transmit control-plane messages
            # for PyNcclBuffer.
            send = group.send_obj

            def my_recv(x, src):
                # Fill the pre-allocated buffer in place to match the
                # (tensor, src) signature of the CUDA recv.
                x[...] = group.recv_obj(src)

            recv = my_recv
        return send, recv

    def _select_device(self, device: str):
        """Map a device string to a torch.device (CUDA per-rank or CPU)."""
        logger.info("Selecting device: %s", device)
        if device == "cuda":
            return torch.device(f"cuda:{self.local_rank}")
        else:
            return torch.device("cpu")

    def _make_metadata(self, tensor: torch.Tensor | None) -> Metadata:
        """
        Create the metadata as a dictionary based on the input tensor.

        Args:
            tensor: The input tensor or None if no tensor is provided.

        Returns:
            metadata: A dictionary with the following keys:
                - "dtype": The data type of the tensor or None.
                - "shape": The shape of the tensor or None.
        """
        if tensor is None:
            return {"dtype": None, "shape": None}
        else:
            return {"dtype": tensor.dtype, "shape": tensor.shape}

    def _prepare_recv_buffer(self, metadata: Metadata) -> torch.Tensor:
        """
        Create a buffer to receive the tensor based on the provided metadata.

        Args:
            metadata: A dictionary with keys "dtype" and "shape",
                describing the tensor's data type and shape.

        Returns:
            buffer: A tensor of the specified type and shape,
                allocated on `self.device`.
        """
        return torch.empty(
            metadata["shape"], dtype=metadata["dtype"], device=self.device
        )

    def _send_metadata(self, metadata: Metadata):
        """
        Send the metadata dictionary to the target rank.

        Args:
            metadata: A dictionary with keys "dtype" and "shape".
        """
        self.group.send_obj(metadata, self.target_rank_for_send)

    def _recv_metadata(self) -> Metadata:
        """
        Receive the metadata dictionary from the target rank.

        Returns:
            metadata: A dictionary with keys "dtype" and "shape"
                describing the tensor.
        """
        return self.group.recv_obj(self.target_rank_for_recv)

    def _send_impl(self, tensor: torch.Tensor | None) -> None:
        """
        The actual implementation of sending the tensor and its metadata to the
        target rank.

        Metadata always goes first; the payload follows only when the
        tensor is not None.

        Args:
            tensor: The input tensor to be sent, or `None` if no tensor is
                being sent.
        """
        metadata = self._make_metadata(tensor)
        self._send_metadata(metadata)
        if tensor is not None:
            self.device_send_func(tensor.to(self.device), self.target_rank_for_send)

    def _recv_impl(self) -> torch.Tensor | None:
        """
        The actual implementation of receiving a tensor and its metadata from
        the target rank.

        A metadata dict with dtype None signals a `None` payload.

        Returns:
            buffer: The received tensor, or `None` if no tensor is received.
        """
        metadata = self._recv_metadata()
        if metadata["dtype"] is None:
            return None
        buffer = self._prepare_recv_buffer(metadata)
        self.device_recv_func(buffer, self.target_rank_for_recv)

        return buffer

    def send_tensor_wrapper(
        self, tensor: torch.Tensor | None, tensor_size: int
    ) -> None:
        """
        Wrapper for _send_impl to handle exceptions and update buffer size.

        Runs on the transport worker; releases the backpressure bytes that
        send_tensor reserved. Exceptions are logged rather than propagated
        (the submitting caller never inspects the future).
        """
        try:
            self._send_impl(tensor)

            with self.buffer_size_lock:
                self.buffer_size -= tensor_size
        except Exception as e:
            logger.error(
                "[rank%d]: Exception when trying to send %s, msg: %s",
                torch.distributed.get_rank(),
                str(tensor),
                str(e),
            )
            import traceback

            traceback.print_exc()

    def block_if_full(self):
        """
        Block the current thread if the buffer size is larger than the
        threshold.
        """
        while self.buffer_size > self.buffer_size_thresh:
            logger.debug("KV cache transfer pipe is full. Waiting...")
            time.sleep(0.05)

    def send_tensor(self, tensor: torch.Tensor | None) -> None:
        """
        Sends a tensor and its metadata to the destination rank in a
        non-blocking way.

        Applies backpressure first (blocks while the in-flight byte count
        exceeds the threshold), then queues the actual send on the
        single transport worker.

        Args:
            tensor: The tensor to send, or `None` if no tensor is being sent.
        """
        if self.transport_thread is None:
            self.transport_thread = ThreadPoolExecutor(max_workers=1)

        if tensor is not None:
            tensor_size = tensor.element_size() * tensor.numel()
        else:
            tensor_size = 0

        self.block_if_full()

        with self.buffer_size_lock:
            self.buffer_size += tensor_size

        self.transport_thread.submit(self.send_tensor_wrapper, tensor, tensor_size)

    def recv_tensor(self) -> torch.Tensor | None:
        """
        Receives a tensor and its metadata from the source rank. Blocking call.

        Returns:
            The received tensor, or `None` if no tensor is received.
        """
        if self.transport_thread is None:
            self.transport_thread = ThreadPoolExecutor(max_workers=1)

        future = self.transport_thread.submit(self._recv_impl)

        try:
            tensor = future.result()
        except Exception as e:
            logger.error("Encountering exception in KV receiving thread")
            logger.error("%s", e)
            logger.error("My device: %s", self.device)
            import traceback

            traceback.print_exc()
            raise e

        return tensor

    def close(self):
        """
        Close the pipe and release associated resources.
        """
        if hasattr(self, "transport_thread") and self.transport_thread is not None:
            self.transport_thread.shutdown()

View File

@ -520,6 +520,9 @@ class EngineArgs:
ObservabilityConfig, "kv_cache_metrics_sample"
)
cudagraph_metrics: bool = ObservabilityConfig.cudagraph_metrics
enable_layerwise_nvtx_tracing: bool = (
ObservabilityConfig.enable_layerwise_nvtx_tracing
)
scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
@ -1032,6 +1035,10 @@ class EngineArgs:
"--cudagraph-metrics",
**observability_kwargs["cudagraph_metrics"],
)
observability_group.add_argument(
"--enable-layerwise-nvtx-tracing",
**observability_kwargs["enable_layerwise_nvtx_tracing"],
)
# Scheduler arguments
scheduler_kwargs = get_kwargs(SchedulerConfig)
@ -1711,6 +1718,7 @@ class EngineArgs:
kv_cache_metrics=self.kv_cache_metrics,
kv_cache_metrics_sample=self.kv_cache_metrics_sample,
cudagraph_metrics=self.cudagraph_metrics,
enable_layerwise_nvtx_tracing=self.enable_layerwise_nvtx_tracing,
)
# Compilation config overrides

View File

@ -455,11 +455,13 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
output_items.extend(_parse_function_call(message, recipient))
# Built-in tools on commentary channel are treated as reasoning for now
elif recipient is not None and (
recipient.startswith("python")
or recipient.startswith("browser")
or recipient.startswith("container")
elif (
recipient is None # Preambles: explanatory text before tool calls
or recipient.startswith(("python", "browser", "container"))
):
# Per Harmony format, commentary channel can contain preambles to calling
# multiple functions - explanatory text with no recipient. Built-in tool
# recipients (python/browser/container) also generate reasoning output.
output_items.extend(_parse_reasoning_content(message))
else:
raise ValueError(f"Unknown recipient: {recipient}")

View File

@ -1072,10 +1072,15 @@ class OpenAIServingChat(OpenAIServing):
# wasn't ready to send a token, then
# get the next token without streaming a chunk
if delta_message is None:
if output.finish_reason is None:
# NOTE: If return_token_ids is enabled, we still need to
# send a chunk with token_ids even if delta_message is None
# to ensure all tokens are included in the response
if (
output.finish_reason is None
and not request.return_token_ids
):
continue
else:
delta_message = DeltaMessage()
delta_message = DeltaMessage()
# Log streaming delta if output logging is enabled
if self.enable_log_outputs and self.request_logger:

View File

@ -345,6 +345,10 @@ class FusedMoEQuantConfig:
def use_mxfp4_w4a16(self) -> bool:
return self._a1.dtype is None and self._w1.dtype == "mxfp4"
@property
def use_mxfp4_w4a4(self) -> bool:
return self._a1.dtype == "mxfp4" and self._w1.dtype == "mxfp4"
@property
def use_nvfp4_w4a4(self) -> bool:
return self.quant_dtype == "nvfp4"

View File

@ -863,7 +863,8 @@ class FusedMoE(CustomOp):
use_chunked_impl: bool,
) -> tuple[bool, torch.Tensor | None]:
use_shared_experts_stream = (
has_separate_shared_experts
current_platform.is_cuda()
and has_separate_shared_experts
and not use_chunked_impl
and self.shared_experts_stream is not None
and (

View File

@ -221,8 +221,8 @@ def rocm_aiter_fused_experts(
else:
quant_method = QuantMethod.NO.value
# quark moe for mxfp4 w_dtype
if quant_config.use_mxfp4_w4a16:
# quark moe for mxfp4 w_dtype mxfp4 a_dtype
if quant_config.use_mxfp4_w4a4:
quant_method = QuantMethod.BLOCK_1X32.value
# w8a8 block-scaled
if quant_config.block_shape is not None and quant_config.use_fp8_w8a8:

View File

@ -124,12 +124,16 @@ class Fp8MoeBackend(Enum):
def get_fp8_moe_backend(
block_quant: bool, moe_parallel_config: FusedMoEParallelConfig
block_quant: bool,
moe_parallel_config: FusedMoEParallelConfig,
with_lora_support: bool,
) -> Fp8MoeBackend:
"""
Select the primary FP8 MoE backend
Note: Shape-specific fallbacks may still occur at runtime.
"""
if with_lora_support:
return Fp8MoeBackend.TRITON
# Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100.
if (
current_platform.is_cuda()
@ -665,7 +669,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
self.weight_block_size = self.quant_config.weight_block_size
self.block_quant: bool = self.weight_block_size is not None
self.fp8_backend = get_fp8_moe_backend(
self.block_quant, layer.moe_parallel_config
self.block_quant, layer.moe_parallel_config, self.moe.is_lora_enabled
)
self.marlin_input_dtype = None
@ -1084,6 +1088,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
from vllm.model_executor.layers.fused_moe import (
BatchedDeepGemmExperts,
BatchedTritonExperts,
TritonExperts,
TritonOrDeepGemmExperts,
)
@ -1116,7 +1121,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
num_dispatchers=prepare_finalize.num_dispatchers(),
quant_config=self.moe_quant_config,
)
elif self.moe.is_lora_enabled:
return TritonExperts(quant_config=self.moe_quant_config)
elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
# Select GEMM experts with block-scale when weights are block-quantized
experts = select_cutlass_fp8_gemm_impl(

View File

@ -30,7 +30,6 @@ def get_rope(
is_neox_style: bool = True,
rope_parameters: dict[str, Any] | None = None,
dtype: torch.dtype | None = None,
partial_rotary_factor: float = 1.0,
dual_chunk_attention_config: dict[str, Any] | None = None,
) -> RotaryEmbedding:
if dtype is None:
@ -55,6 +54,10 @@ def get_rope(
else:
dual_chunk_attention_args = None
partial_rotary_factor = 1.0
if rope_parameters is not None:
partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0)
if partial_rotary_factor < 1.0:
rotary_dim = int(rotary_dim * partial_rotary_factor)
key = (

View File

@ -148,8 +148,6 @@ class ApertusAttention(nn.Module):
if head_dim is None:
head_dim = self.hidden_size // self.total_num_heads
self.head_dim = head_dim
# Phi models introduced a partial_rotary_factor parameter in the config
self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
@ -228,11 +226,10 @@ class ApertusAttention(nn.Module):
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=int(self.partial_rotary_factor * self.head_dim),
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor,
)

View File

@ -127,8 +127,6 @@ class BailingAttention(nn.Module):
prefix=f"{prefix}.dense",
)
self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
self.rotary_dim = getattr(config, "rotary_dim", self.head_dim)
self.rotary_emb = get_rope(
@ -137,7 +135,6 @@ class BailingAttention(nn.Module):
max_position=config.max_position_embeddings,
rope_parameters=config.rope_parameters,
is_neox_style=True,
partial_rotary_factor=self.partial_rotary_factor,
)
self.attn = Attention(

View File

@ -178,9 +178,7 @@ class BambaAttentionDecoderLayer(nn.Module):
self.scaling = self.head_dim**-0.5
self.max_position_embeddings = max_position_embeddings
if hasattr(config, "partial_rotary_factor"):
rotary_dim = int(self.head_dim * config.partial_rotary_factor)
elif hasattr(config, "attn_rotary_emb"):
if hasattr(config, "attn_rotary_emb"):
rotary_dim = config.attn_rotary_emb # for backward compatibility
else:
rotary_dim = self.head_dim # default

View File

@ -8,7 +8,6 @@ import vllm.envs as envs
from vllm.logger import init_logger
from vllm.model_executor.models import ModelRegistry
from vllm.platforms import current_platform
from vllm.transformers_utils.config import set_default_rope_theta
from vllm.utils.math_utils import cdiv, round_up
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
@ -78,8 +77,6 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig):
if not model_config.enforce_eager:
max_position = round_up(max_position, 8)
set_default_rope_theta(config, default_theta=config.rotary_emb_base)
config.rotary_kwargs = {
"head_size": head_dim,
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
@ -119,8 +116,6 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
rotary_emb_dim = int(head_dim * config.rotary_emb_fraction)
max_trained_positions = getattr(config, "max_trained_positions", 2048)
set_default_rope_theta(config, default_theta=config.rotary_emb_base)
config.rotary_kwargs = {
"head_size": head_dim,
"rotary_dim": rotary_emb_dim,
@ -490,6 +485,26 @@ class DeepseekV32ForCausalLM(VerifyAndUpdateConfig):
logger.info("Using bfloat16 kv-cache for DeepSeekV3.2")
class NemotronHForCausalLMConfig(VerifyAndUpdateConfig):
@staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
"""Update mamba_ssm_cache_dtype for NemotronH models when set to 'auto'
(or not explicitly set), to the value specified in the HF config, or to
float16 if not specified.
"""
cache_config = vllm_config.cache_config
if cache_config.mamba_ssm_cache_dtype == "auto":
hf_config = vllm_config.model_config.hf_config
mamba_ssm_cache_dtype = getattr(
hf_config, "mamba_ssm_cache_dtype", "float16"
)
logger.info(
"Updating mamba_ssm_cache_dtype to '%s' for NemotronH model",
mamba_ssm_cache_dtype,
)
cache_config.mamba_ssm_cache_dtype = mamba_ssm_cache_dtype
MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
"GteModel": SnowflakeGteNewModelConfig,
"GteNewModel": GteNewModelConfig,
@ -507,4 +522,5 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
"Mamba2ForCausalLM": MambaModelConfig,
"FalconMambaForCausalLM": MambaModelConfig,
"DeepseekV32ForCausalLM": DeepseekV32ForCausalLM,
"NemotronHForCausalLM": NemotronHForCausalLMConfig,
}

View File

@ -27,7 +27,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargs,
MultiModalKwargsItems,
NestedTensors,
)
from vllm.multimodal.parse import (
@ -305,7 +305,7 @@ class DeepseekOCRMultiModalProcessor(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)

View File

@ -242,9 +242,7 @@ class FalconH1AttentionDecoderLayer(nn.Module):
self.scaling = self.head_dim**-0.5
self.max_position_embeddings = max_position_embeddings
if hasattr(config, "partial_rotary_factor"):
rotary_dim = self.head_dim * config.partial_rotary_factor
elif hasattr(config, "attn_rotary_emb"):
if hasattr(config, "attn_rotary_emb"):
rotary_dim = config.attn_rotary_emb # for backward compatibility
else:
rotary_dim = self.head_dim # default

View File

@ -10,7 +10,8 @@ from .utils import PPMissingLayer
class GlmForCausalLM(LlamaForCausalLM):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
vllm_config.model_config.hf_config.partial_rotary_factor = 0.5
hf_config = vllm_config.model_config.hf_config
hf_config.rope_parameters["partial_rotary_factor"] = 0.5
super().__init__(vllm_config=vllm_config, prefix=prefix)
# Hack Llama model to fit HF format GLM implementation
# Attention difference between GLM and Llama:

View File

@ -78,7 +78,7 @@ class Glm4Attention(nn.Module):
# Number of KV heads is less than TP size, so we replicate
# the KV heads across multiple tensor parallel GPUs.
assert tp_size % self.total_num_kv_heads == 0
partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
self.head_dim = head_dim or hidden_size // self.total_num_heads
self.rotary_dim = self.head_dim
@ -106,7 +106,6 @@ class Glm4Attention(nn.Module):
rotary_dim=self.rotary_dim,
max_position=max_position,
rope_parameters=config.rope_parameters,
partial_rotary_factor=partial_rotary_factor,
is_neox_style=False,
)
self.attn = Attention(

View File

@ -282,13 +282,12 @@ class Glm4MoeAttention(nn.Module):
prefix=f"{prefix}.o_proj",
)
partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
rope_parameters=config.rope_parameters,
partial_rotary_factor=partial_rotary_factor,
)
self.attn = Attention(
self.num_heads,

View File

@ -89,16 +89,14 @@ class GPTNeoXAttention(nn.Module):
quant_config=quant_config,
prefix=f"{prefix}.dense",
)
scaling = self.head_size**-0.5
rotary_dim = int(self.head_size * config.rotary_pct)
assert rotary_dim % 2 == 0
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.rotary_emb = get_rope(
self.head_size,
rotary_dim=rotary_dim,
rotary_dim=self.head_size,
max_position=max_position_embeddings,
rope_parameters=config.rope_parameters,
)
scaling = self.head_size**-0.5
self.attn = Attention(
self.num_heads,
self.head_size,

View File

@ -78,7 +78,7 @@ class SupportsMultiModal(Protocol):
`multimodal_config.mm_encoder_tp_mode="data"`.
"""
merge_by_field_config: ClassVar[bool] = False
merge_by_field_config: ClassVar[bool] = True
"""
A flag that indicates which implementation of
`vllm.multimodal.utils.group_mm_kwargs_by_modality` to use.

View File

@ -28,7 +28,7 @@ from vllm.model_executor.models.utils import (
)
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.cache import BaseMultiModalProcessorCache
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems
from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
from vllm.multimodal.processing import (
BaseMultiModalProcessor,
@ -103,7 +103,7 @@ class LightOnOCRMultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingIn
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config()
image_token_id = hf_config.image_token_index

View File

@ -149,8 +149,6 @@ class LlamaAttention(nn.Module):
if head_dim is None:
head_dim = self.hidden_size // self.total_num_heads
self.head_dim = head_dim
# Phi models introduced a partial_rotary_factor parameter in the config
self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
@ -265,7 +263,6 @@ class LlamaAttention(nn.Module):
max_position=self.max_position_embeddings,
rope_parameters=getattr(config, "rope_parameters", None),
is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor,
)

View File

@ -52,7 +52,6 @@ from vllm.multimodal.evs import (
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargs,
MultiModalKwargsItems,
VideoItem,
)
@ -849,17 +848,18 @@ class NanoNemotronBaseVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
if "image_num_patches" in out_mm_kwargs:
image_num_patches = out_mm_kwargs["image_num_patches"]
out_mm_data = out_mm_kwargs.get_data()
if "image_num_patches" in out_mm_data:
image_num_patches = out_mm_data["image_num_patches"]
assert isinstance(image_num_patches, torch.Tensor)
image_num_patches = image_num_patches.tolist()
elif "image_embeds" in out_mm_kwargs:
elif "image_embeds" in out_mm_data:
# to compute num_patches (similar to Qwen2-VL)
image_num_patches = [None] * len(out_mm_kwargs["image_embeds"])
image_num_patches = [None] * len(out_mm_data["image_embeds"])
else:
image_num_patches = []

View File

@ -178,7 +178,6 @@ class NemotronAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.partial_rotary_factor = config.partial_rotary_factor
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@ -203,7 +202,6 @@ class NemotronAttention(nn.Module):
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
rope_parameters=config.rope_parameters,
partial_rotary_factor=self.partial_rotary_factor,
)
self.attn = Attention(
self.num_heads,

View File

@ -122,7 +122,6 @@ class DeciLMAttention(LlamaAttention):
max_position=self.max_position_embeddings,
rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor,
)

View File

@ -23,7 +23,7 @@ from vllm.config import VllmConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalFieldConfig,
MultiModalKwargs,
MultiModalKwargsItems,
)
from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
from vllm.multimodal.processing import (
@ -153,7 +153,7 @@ class OpenCUAMultiModalProcessor(BaseMultiModalProcessor[OpenCUAProcessingInfo])
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)

View File

@ -62,7 +62,7 @@ from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFeatureSpec,
MultiModalFieldConfig,
MultiModalKwargs,
MultiModalKwargsItems,
)
from vllm.multimodal.parse import (
ImageProcessorItems,
@ -307,7 +307,7 @@ class PaddleOCRVLMultiModalProcessor(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
hf_config = self.info.get_hf_config()

View File

@ -40,7 +40,6 @@ from .siglip import SiglipVisionModel
from .utils import (
AutoWeightsLoader,
WeightsMapper,
flatten_bn,
init_vllm_registered_model,
maybe_prefix,
)
@ -252,6 +251,8 @@ class PaliGemmaMultiModalProcessor(BaseMultiModalProcessor[PaliGemmaProcessingIn
dummy_inputs=PaliGemmaDummyInputsBuilder,
)
class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
packed_modules_mapping = {
"qkv_proj": [
"q_proj",
@ -327,9 +328,8 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
return None
if pixel_values is not None:
pixel_values = flatten_bn(pixel_values, concat=True)
h = w = self.config.vision_config.image_size
return PaliGemmaImagePixelInputs(
type="pixel_values",
data=pixel_values,
@ -337,8 +337,6 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
)
if image_embeds is not None:
image_embeds = flatten_bn(image_embeds, concat=True)
return PaliGemmaImageEmbeddingInputs(
type="image_embeds",
data=image_embeds,

View File

@ -106,7 +106,6 @@ class PersimmonAttention(nn.Module):
self.num_heads = self.total_num_heads // tensor_parallel_world_size
self.head_dim = self.hidden_size // self.total_num_heads
self.max_position_embeddings = config.max_position_embeddings
self.partial_rotary_factor = config.partial_rotary_factor
self.is_causal = True
assert (self.head_dim * self.total_num_heads) == self.hidden_size
@ -138,7 +137,6 @@ class PersimmonAttention(nn.Module):
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
rope_parameters=config.rope_parameters,
partial_rotary_factor=self.partial_rotary_factor,
)
self.scaling = self.head_dim**-0.5
self.attn = Attention(

View File

@ -109,10 +109,7 @@ class PhiAttention(nn.Module):
)
scaling = self.head_size**-0.5
rotary_dim = int(
config.partial_rotary_factor
* (config.hidden_size // config.num_attention_heads)
)
rotary_dim = config.hidden_size // config.num_attention_heads
assert rotary_dim % 2 == 0
max_position_embeddings = getattr(config, "max_position_embeddings", 2048)

View File

@ -77,7 +77,7 @@ from vllm.multimodal.evs import (
from vllm.multimodal.inputs import (
MultiModalFeatureSpec,
MultiModalFieldConfig,
MultiModalKwargs,
MultiModalKwargsItems,
)
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import PromptReplacement, PromptUpdate
@ -973,7 +973,7 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)

View File

@ -750,7 +750,6 @@ class Qwen3NextAttention(nn.Module):
rotary_dim=self.head_dim,
max_position=config.max_position_embeddings,
rope_parameters=config.rope_parameters,
partial_rotary_factor=config.partial_rotary_factor,
dual_chunk_attention_config=self.dual_chunk_attention_config,
)

View File

@ -103,7 +103,7 @@ from .qwen2_5_vl import (
Qwen2_5_VLVideoInputs,
Qwen2_5_VLVideoPixelInputs,
)
from .qwen2_vl import Qwen2VLProcessingInfo
from .qwen2_vl import Qwen2VLMultiModalDataParser, Qwen2VLProcessingInfo
from .qwen3 import Qwen3ForCausalLM, Qwen3Model
from .utils import (
AutoWeightsLoader,
@ -884,7 +884,10 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]):
def _get_data_parser(self) -> MultiModalDataParser:
return MultiModalDataParser(video_needs_metadata=True)
return Qwen2VLMultiModalDataParser(
self.info.get_hf_config().vision_config.spatial_merge_size,
video_needs_metadata=True,
)
def _call_hf_processor(
self,

View File

@ -119,9 +119,6 @@ class StablelmAttention(nn.Module):
self.num_key_value_heads = max(1, self.total_num_key_value_heads // tp_size)
self.head_dim = self.hidden_size // self.total_num_heads
self.max_position_embeddings = config.max_position_embeddings
self.partial_rotary_factor = getattr(
config, "rope_pct", getattr(config, "partial_rotary_factor", 1)
)
self.scaling = self.head_dim**-0.5
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_key_value_heads * self.head_dim
@ -154,7 +151,6 @@ class StablelmAttention(nn.Module):
rotary_dim=self.head_dim,
max_position=self.config.max_position_embeddings,
rope_parameters=self.config.rope_parameters,
partial_rotary_factor=self.partial_rotary_factor,
)
self.attn = Attention(
self.num_heads,

View File

@ -25,7 +25,6 @@ from .inputs import (
MultiModalBatchedField,
MultiModalFeatureSpec,
MultiModalFieldElem,
MultiModalKwargs,
MultiModalKwargsItem,
MultiModalKwargsItems,
NestedTensors,
@ -90,7 +89,6 @@ MultiModalCacheValue: TypeAlias = (
| MultiModalProcessorCacheItemMetadata
| MultiModalKwargsItems
| MultiModalKwargsItem
| MultiModalKwargs
| Mapping[str, NestedTensors]
)
@ -108,12 +106,7 @@ class MultiModalCache:
# These are not subclasses of dict
if isinstance(
leaf,
(
MultiModalKwargs,
MultiModalKwargsItems,
MultiModalKwargsItem,
MultiModalFieldElem,
),
(MultiModalKwargsItems, MultiModalKwargsItem, MultiModalFieldElem),
):
return cls.get_item_size(leaf.data) # type: ignore

View File

@ -3,7 +3,7 @@
from abc import ABC, abstractmethod
from collections import UserDict, defaultdict
from collections.abc import Mapping, Sequence
from collections.abc import Mapping, Sequence, Set
from dataclasses import dataclass
from functools import partial
from itertools import accumulate
@ -201,8 +201,10 @@ Uses a list instead of a tensor if the dimensions of each element do not match.
def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
"""Equality check between
[`NestedTensors`][vllm.multimodal.inputs.NestedTensors] objects."""
"""
Equality check between
[`NestedTensors`][vllm.multimodal.inputs.NestedTensors] objects.
"""
if isinstance(a, torch.Tensor):
return isinstance(b, torch.Tensor) and torch.equal(a, b)
elif isinstance(b, torch.Tensor):
@ -224,10 +226,24 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
BatchedTensorInputs: TypeAlias = dict[str, NestedTensors]
"""
A dictionary containing nested tensors which have been batched via
[`MultiModalKwargs.batch`][vllm.multimodal.inputs.MultiModalKwargs.batch].
[`MultiModalKwargsItems.get_data`][vllm.multimodal.inputs.MultiModalKwargsItems.get_data].
"""
def batched_tensors_equal(a: BatchedTensorInputs, b: BatchedTensorInputs) -> bool:
"""
Equality check between
[`BatchedTensorInputs`][vllm.multimodal.inputs.BatchedTensorInputs] objects.
"""
for k in a:
if k not in b:
return False
if not nested_tensors_equal(a[k], b[k]):
return False
return True
@dataclass
class MultiModalFeatureSpec:
"""
@ -823,7 +839,14 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]):
return self # type: ignore[return-value]
def get_data(self, *, pin_memory: bool = False) -> "MultiModalKwargs":
def get_data(
self,
*,
device: torch.types.Device = None,
pin_memory: bool = False,
cpu_fields: Set[str] = frozenset(),
) -> BatchedTensorInputs:
"""Construct a dictionary of keyword arguments to pass to the model."""
elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
for modality, items in self.items():
for i, item in enumerate(items):
@ -835,12 +858,23 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]):
for key, elem in item.items():
elems_by_key[key].append(elem)
return MultiModalKwargs(
{
key: elems[0].field.reduce_data(elems, pin_memory=pin_memory)
for key, elems in elems_by_key.items()
}
)
data = {
key: elems[0].field.reduce_data(elems, pin_memory=pin_memory)
for key, elems in elems_by_key.items()
}
if device is not None:
for k in data.keys() - cpu_fields:
data[k] = json_map_leaves(
(
lambda x: x.to(device=device, non_blocking=True)
if isinstance(x, torch.Tensor)
else x
),
data[k],
)
return data
MultiModalKwargsOptionalItems: TypeAlias = (
@ -849,6 +883,7 @@ MultiModalKwargsOptionalItems: TypeAlias = (
)
@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.13.")
class MultiModalKwargs(UserDict[str, NestedTensors]):
"""
A dictionary that represents the keyword arguments to
@ -882,91 +917,6 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
):
return MultiModalKwargsItems.from_seq(items).get_data(pin_memory=pin_memory)
@staticmethod
def _try_stack(
nested_tensors: NestedTensors, pin_memory: bool = False
) -> NestedTensors:
"""
Stack the inner dimensions that have the same shape in
a nested list of tensors.
Thus, a dimension represented by a list means that the inner
dimensions are different for each element along that dimension.
"""
if isinstance(nested_tensors, torch.Tensor):
return nested_tensors
# TODO: Remove these once all models have been migrated
if isinstance(nested_tensors, np.ndarray):
return torch.from_numpy(nested_tensors)
if isinstance(nested_tensors, (int, float)):
return torch.tensor(nested_tensors)
stacked = [MultiModalKwargs._try_stack(t, pin_memory) for t in nested_tensors]
if not is_list_of(stacked, torch.Tensor, check="all"):
# Only tensors (not lists) can be stacked.
return stacked
tensors_ = cast(list[torch.Tensor], stacked)
if len(tensors_) == 1:
# An optimization when `tensors_` contains only one tensor:
# - produce exactly same result as `torch.stack(tensors_)`
# - will achieve zero-copy if the tensor is contiguous
return tensors_[0].unsqueeze(0).contiguous()
if any(t.shape != tensors_[0].shape for t in tensors_):
# The tensors have incompatible shapes and can't be stacked.
return tensors_
outputs = torch.empty(
len(tensors_),
*tensors_[0].shape,
dtype=tensors_[0].dtype,
device=tensors_[0].device,
pin_memory=pin_memory,
)
return torch.stack(tensors_, out=outputs)
@staticmethod
def batch(
inputs_list: list["MultiModalKwargs"], pin_memory: bool = False
) -> BatchedTensorInputs:
"""
Batch multiple inputs together into a dictionary.
The resulting dictionary has the same keys as the inputs.
If the corresponding value from each input is a tensor and they all
share the same shape, the output value is a single batched tensor;
otherwise, the output value is a list containing the original value
from each input.
"""
if len(inputs_list) == 0:
return {}
# We need to consider the case where each item in the batch
# contains different modalities (i.e. different keys).
item_lists = defaultdict[str, list[NestedTensors]](list)
for inputs in inputs_list:
for k, v in inputs.items():
item_lists[k].append(v)
return {
k: MultiModalKwargs._try_stack(item_list, pin_memory)
for k, item_list in item_lists.items()
}
@staticmethod
def as_kwargs(
batched_inputs: BatchedTensorInputs,
*,
device: torch.types.Device,
) -> BatchedTensorInputs:
return json_map_leaves(
lambda x: x.to(device=device, non_blocking=True),
batched_inputs,
)
def __getitem__(self, key: str):
if key not in self:
raise KeyError(

View File

@ -19,7 +19,6 @@ from PIL import Image, UnidentifiedImageError
import vllm.envs as envs
from vllm.connections import HTTPConnection, global_http_connection
from vllm.logger import init_logger
from vllm.utils.jsontree import json_map_leaves
from vllm.utils.registry import ExtensionManager
from .audio import AudioEmbeddingMediaIO, AudioMediaIO
@ -427,59 +426,25 @@ def group_mm_kwargs_by_modality(
Yields:
A tuple `(modality, num_items, grouped_kwargs)`.
"""
if merge_by_field_config is None:
raise RuntimeError(
"`group_mm_kwargs_by_modality` now requires "
"`merge_by_field_config` arg, please update your model runner "
"according to https://github.com/vllm-project/vllm/pull/25676."
)
if merge_by_field_config is False:
# TODO: After v0.13, remove merge_by_field_config attribute from model impls
if merge_by_field_config is not None:
logger.warning_once(
"The legacy code for batching multi-modal kwargs is deprecated and "
"will be removed in v0.12. Please update your model with "
"`merge_by_field_config=True` to use the new code defined by "
"`MultiModalFieldConfig`. You can refer to "
"https://github.com/vllm-project/vllm/issues/26149 "
"for some examples on how to do this."
"The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` "
"is deprecated and will be removed in v0.13."
)
from vllm.multimodal.inputs import MultiModalKwargs, MultiModalKwargsItems
from vllm.multimodal.inputs import MultiModalKwargsItems
for modality, items in groupby(mm_kwargs, key=lambda item: item.modality):
items_lst = list(items)
mm_kwargs_items = MultiModalKwargsItems.from_seq(items_lst)
mm_kwargs_data = mm_kwargs_items.get_data(
device=device,
pin_memory=pin_memory,
cpu_fields=multimodal_cpu_fields,
)
if merge_by_field_config:
mm_kwargs_group: BatchedTensorInputs = dict(
MultiModalKwargsItems.from_seq(items_lst).get_data(
pin_memory=pin_memory
)
)
if device is not None:
mm_kwargs_group = {
k: json_map_leaves(
lambda x: x.to(device=device, non_blocking=True)
if isinstance(x, torch.Tensor)
else x,
v,
)
if k not in multimodal_cpu_fields
else v
for k, v in mm_kwargs_group.items()
}
else:
mm_kwargs_group = MultiModalKwargs.as_kwargs(
MultiModalKwargs.batch(
[
MultiModalKwargsItems.from_seq([item]).get_data()
for item in items_lst
],
pin_memory=pin_memory,
),
device=device,
)
yield modality, len(items_lst), mm_kwargs_group
yield modality, len(items_lst), mm_kwargs_data
def fetch_audio(

View File

@ -233,6 +233,23 @@ class CudaPlatformBase(Platform):
from vllm.config import CUDAGraphMode
compilation_config = vllm_config.compilation_config
if compilation_config.cudagraph_mode.has_full_cudagraphs():
# decode context parallel does not support full cudagraphs
if parallel_config.decode_context_parallel_size > 1:
logger.warning_once(
"Decode context parallel (DCP) is enabled, which is "
"incompatible with full CUDA graphs. "
"Overriding cudagraph_mode to PIECEWISE."
)
compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
# prefill context parallel do not support full cudagraphs
elif parallel_config.prefill_context_parallel_size > 1:
logger.warning_once(
"Prefill context parallel (PCP) is enabled, which is "
"incompatible with full CUDA graphs. "
"Overriding cudagraph_mode to PIECEWISE."
)
compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
if (
parallel_config.all2all_backend == "deepep_high_throughput"
and parallel_config.data_parallel_size > 1

Some files were not shown because too many files have changed in this diff Show More