Merge branch 'vllm-project:main' into wye-refactor-quant-folder

This commit is contained in:
Wentao Ye 2025-08-12 15:18:34 -04:00 committed by GitHub
commit 66d491c494
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
318 changed files with 13639 additions and 4013 deletions

View File

@ -128,7 +128,7 @@ run_and_track_test() {
# --- Actual Test Execution ---
run_and_track_test 1 "test_struct_output_generate.py" \
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 2 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 3 "test_lora.py" \
@ -139,6 +139,8 @@ run_and_track_test 5 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
run_and_track_test 7 "test_tpu_int8.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then

View File

@ -134,7 +134,7 @@ run_and_track_test 1 "test_compilation.py" \
run_and_track_test 2 "test_basic.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
run_and_track_test 4 "test_quantization_accuracy.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
run_and_track_test 5 "examples/offline_inference/tpu.py" \

View File

@ -57,9 +57,10 @@ steps:
- vllm/
- tests/mq_llm_engine
- tests/async_engine
- tests/test_inputs
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- tests/test_utils
- tests/utils_
- tests/worker
- tests/standalone_tests/lazy_imports.py
commands:
@ -70,7 +71,7 @@ steps:
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s multimodal
- pytest -v -s test_utils.py # Utils
- pytest -v -s utils_ # Utils
- pytest -v -s worker # Worker
- label: Python-only Installation Test
@ -426,7 +427,6 @@ steps:
- label: Tensorizer Test # 11min
mirror_hardwares: [amdexperimental]
soft_fail: true
source_file_dependencies:
- vllm/model_executor/model_loader
- tests/tensorizer_loader
@ -535,8 +535,6 @@ steps:
- vllm/
- tests/models/language
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m core_model
@ -547,8 +545,10 @@ steps:
- vllm/
- tests/models/language/generation
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m hybrid_model
- label: Language Models Test (Extended Generation) # 1hr20min

10
.github/CODEOWNERS vendored
View File

@ -9,7 +9,7 @@
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
@ -20,7 +20,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people
/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
@ -34,16 +34,16 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
/tests/multi_step @alexm-redhat @comaniac
/tests/multimodal @DarkLight1337 @ywang96
/tests/prefix_caching @comaniac @KuntaiDu
/tests/quantization @mgoin @robertgshaw2-redhat
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/weight_loading @mgoin @youkaichao
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
# Docs

14
.github/mergify.yml vendored
View File

@ -118,6 +118,20 @@ pull_request_rules:
add:
- qwen
- name: label-gpt-oss
description: Automatically apply gpt-oss label
conditions:
- or:
- files~=^examples/.*gpt[-_]?oss.*\.py
- files~=^tests/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
- title~=(?i)gpt[-_]?oss
actions:
label:
add:
- gpt-oss
- name: label-rocm
description: Automatically apply rocm label
conditions:

6
.gitignore vendored
View File

@ -4,6 +4,9 @@
# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
# triton jit
.triton
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@ -147,7 +150,8 @@ venv.bak/
# mkdocs documentation
/site
docs/argparse
docs/examples
docs/examples/*
!docs/examples/README.md
# mypy
.mypy_cache/

View File

@ -428,6 +428,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"

View File

@ -18,14 +18,15 @@ Easy, fast, and cheap LLM serving for everyone
*Latest News* 🔥
- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
<details>
<summary>Previous News</summary>
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
@ -121,6 +122,7 @@ Cash Donations:
Compute Resources:
- Alibaba Cloud
- AMD
- Anyscale
- AWS
@ -160,7 +162,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Contact Us
<!-- --8<-- [start:contact-us] -->
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues)
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature

View File

@ -31,7 +31,7 @@ class RequestFuncInput:
model_name: Optional[str] = None
logprobs: Optional[int] = None
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict] = None
multi_modal_content: Optional[dict | list[dict]] = None
ignore_eos: bool = False
language: Optional[str] = None
@ -364,7 +364,15 @@ async def async_request_openai_chat_completions(
) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content)
mm_content = request_func_input.multi_modal_content
if isinstance(mm_content, list):
content.extend(mm_content)
elif isinstance(mm_content, dict):
content.append(mm_content)
else:
raise TypeError(
"multi_modal_content must be a dict or list[dict] for openai-chat"
)
payload = {
"model": request_func_input.model_name
if request_func_input.model_name
@ -491,7 +499,10 @@ async def async_request_openai_audio(
buffer.seek(0)
return buffer
with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
mm_audio = request_func_input.multi_modal_content
if not isinstance(mm_audio, dict) or "audio" not in mm_audio:
raise TypeError("multi_modal_content must be a dict containing 'audio'")
with to_bytes(*mm_audio["audio"]) as f:
form = aiohttp.FormData()
form.add_field("file", f, content_type="audio/wav")
for key, value in payload.items():

View File

@ -52,7 +52,7 @@ class SampleRequest:
prompt: Union[str, Any]
prompt_len: int
expected_output_len: int
multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
multi_modal_data: Optional[Union[MultiModalDataDict, dict, list[dict]]] = None
lora_request: Optional[LoRARequest] = None

View File

@ -263,7 +263,14 @@ async def benchmark(
input_requests[0].multi_modal_data,
)
assert test_mm_content is None or isinstance(test_mm_content, dict)
assert (
test_mm_content is None
or isinstance(test_mm_content, dict)
or (
isinstance(test_mm_content, list)
and all(isinstance(item, dict) for item in test_mm_content)
)
), "multi_modal_data must be a dict or list[dict]"
test_input = RequestFuncInput(
model=model_id,
model_name=model_name,

View File

@ -22,10 +22,10 @@ from vllm.utils import FlexibleArgumentParser
FP8_DTYPE = current_platform.fp8_dtype()
def ensure_divisibility(numerator, denominator):
def ensure_divisibility(numerator, denominator, text):
"""Ensure that numerator is divisible by the denominator."""
assert numerator % denominator == 0, (
"intermediate_size {} is not divisible by tp {}.".format(numerator, denominator)
assert numerator % denominator == 0, "{} {} is not divisible by tp {}.".format(
text, numerator, denominator
)
@ -577,12 +577,10 @@ def main(args: argparse.Namespace):
E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k
intermediate_size = config.ffn_config.ffn_hidden_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] == "JambaForCausalLM":
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] in (
"DeepseekV3ForCausalLM",
"DeepseekV2ForCausalLM",
@ -591,17 +589,14 @@ def main(args: argparse.Namespace):
E = config.n_routed_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"):
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
E = config.num_experts
topk = config.moe_topk[0]
intermediate_size = config.moe_intermediate_size[0]
shard_intermediate_size = 2 * intermediate_size // args.tp_size
else:
# Support for llama4
config = config.get_text_config()
@ -609,8 +604,14 @@ def main(args: argparse.Namespace):
E = config.num_local_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
enable_ep = bool(args.enable_expert_parallel)
if enable_ep:
ensure_divisibility(E, args.tp_size, "Number of experts")
E = E // args.tp_size
shard_intermediate_size = 2 * intermediate_size
else:
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
shard_intermediate_size = 2 * intermediate_size // args.tp_size
ensure_divisibility(intermediate_size, args.tp_size)
hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
@ -742,6 +743,7 @@ if __name__ == "__main__":
parser.add_argument(
"--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2
)
parser.add_argument("--enable-expert-parallel", "-enable-ep", action="store_true")
parser.add_argument(
"--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
)

View File

@ -0,0 +1,328 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# This script benchmarks the mrope kernel (mainly for Qwen2VL and Qwen2.5VL models).
# It generates test data, runs benchmarks, and saves results to a CSV file.
#
# The CSV file (named with current date/time) contains these columns:
# model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
# speedup
#
# == Usage Examples ==
#
# Single model benchmark:
# python3 benchmark_mrope.py --model-name Qwen/Qwen2-VL-7B-Instruct --tp-size 1 \
# --warmup-iter 10 --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024
#
# All models benchmark:
# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \
# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024
#
# All models with different TP sizes:
# python3 benchmark_mrope.py --model-name "" --tp-size 1 2 4 8 --warmup-iter 10 \
# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024
#
# All models with different token counts:
# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \
# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 4096 16384
import csv
import os
import time
from datetime import datetime
from typing import Any
import numpy as np
import torch
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.utils import FlexibleArgumentParser
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def generate_test_data(
num_tokens: int,
num_q_heads: int,
num_kv_heads: int,
head_size: int,
max_position_embeddings: int,
dtype: torch.dtype,
device: torch.device,
):
"""Generate test data for given configuration."""
# Create 2D positions (3, num_tokens) for multimodal case
positions = torch.randint(
0, max_position_embeddings // 4, (3, num_tokens), device=device
)
# Create query and key tensors
query = torch.randn(num_tokens, num_q_heads * head_size, dtype=dtype, device=device)
key = torch.randn(num_tokens, num_kv_heads * head_size, dtype=dtype, device=device)
return positions, query, key
def calculate_stats(times: list[float]) -> dict[str, float]:
"""Calculate statistics from a list of times."""
times_array = np.array(times)
return {
"mean": np.mean(times_array),
"median": np.median(times_array),
"p99": np.percentile(times_array, 99),
"min": np.min(times_array),
"max": np.max(times_array),
}
def benchmark_mrope(
model_name: str,
num_tokens: int,
head_dim: int,
tp_size: int,
num_heads: int,
num_kv_heads: int,
max_position: int = 8192,
rope_theta: float = 10000,
is_neox_style: bool = True,
rope_scaling: dict[str, Any] = None,
dtype: torch.dtype = torch.bfloat16,
seed: int = 0,
warmup_iter: int = 10,
benchmark_iter: int = 100,
csv_writer=None,
):
current_platform.seed_everything(seed)
torch.set_default_device(device)
# the parameters to compute the q k v size based on tp_size
mrope_helper_class = get_rope(
head_size=head_dim,
rotary_dim=head_dim,
max_position=max_position,
base=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=rope_scaling,
dtype=dtype,
).to(device=device)
print(80 * "=")
print(
f"Evaluating model: {model_name} "
f"with tp_size: {tp_size} "
f"and num_tokens: {num_tokens}, "
f"dtype: {dtype}"
)
# create q k v input tensors
# create rotary pos emb input tensors
positions, query, key = generate_test_data(
num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device
)
# Warm up
for _ in range(warmup_iter):
mrope_helper_class.forward_native(
positions,
query.clone(),
key.clone(),
)
mrope_helper_class.forward_cuda(
positions,
query.clone(),
key.clone(),
)
torch.cuda.synchronize()
# Time reference implementation
torch_times = []
for _ in range(benchmark_iter):
query_clone = query.clone()
key_clone = key.clone()
torch.cuda.synchronize()
start_time = time.time()
mrope_helper_class.forward_native(
positions,
query_clone,
key_clone,
)
torch.cuda.synchronize()
torch_times.append(time.time() - start_time)
# Time triton kernel implementation
triton_times = []
for _ in range(benchmark_iter):
query_clone = query.clone()
key_clone = key.clone()
torch.cuda.synchronize()
start_time = time.time()
mrope_helper_class.forward_cuda(
positions,
query_clone,
key_clone,
)
torch.cuda.synchronize()
triton_times.append(time.time() - start_time)
# Calculate statistics
torch_stats = calculate_stats(torch_times)
triton_stats = calculate_stats(triton_times)
print(f"\nPerformance for config ({num_tokens}, {num_heads}, {num_kv_heads}):")
print(
f"Torch implementation: "
f"mean={torch_stats['mean']:.8f}s, "
f"median={torch_stats['median']:.8f}s, "
f"p99={torch_stats['p99']:.8f}s"
)
print(
f"Triton implementation: "
f"mean={triton_stats['mean']:.8f}s, "
f"median={triton_stats['median']:.8f}s, "
f"p99={triton_stats['p99']:.8f}s"
)
print(
f"Triton Speedup over Torch: {torch_stats['mean'] / triton_stats['mean']:.8f}x"
)
# Write to CSV
if csv_writer:
row = [
model_name,
tp_size,
num_tokens,
num_heads,
num_kv_heads,
head_dim,
max_position,
rope_theta,
is_neox_style,
str(rope_scaling),
str(dtype).split(".")[-1],
torch_stats["mean"],
torch_stats["median"],
torch_stats["p99"],
torch_stats["min"],
torch_stats["max"],
triton_stats["mean"],
triton_stats["median"],
triton_stats["p99"],
triton_stats["min"],
triton_stats["max"],
torch_stats["mean"] / triton_stats["mean"], # speedup
]
csv_writer.writerow(row)
return torch_stats, triton_stats
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="Benchmark the rotary embedding kernels."
)
parser.add_argument("--model-name", type=str, default="")
parser.add_argument("--tp-size", type=int, default=1)
parser.add_argument("--warmup-iter", type=int, default=10)
parser.add_argument("--benchmark-iter", type=int, default=100)
parser.add_argument("--dtype", type=str, choices=["bfloat16"], default="bfloat16")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--num-tokens", type=int, nargs="+", required=False)
parser.add_argument("--trust-remote-code", action="store_true")
parser.add_argument("--output-csv", type=str, default="mrope_benchmark_results.csv")
args = parser.parse_args()
print(args)
# Create CSV file for results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_filename = f"{os.path.splitext(args.output_csv)[0]}_{timestamp}.csv"
with open(csv_filename, "w", newline="") as csvfile:
csv_writer = csv.writer(csvfile)
# Write header
header = [
"model_name",
"tp_size",
"num_tokens",
"num_heads",
"num_kv_heads",
"head_dim",
"max_position",
"rope_theta",
"is_neox_style",
"rope_scaling",
"dtype",
"torch_mean",
"torch_median",
"torch_p99",
"torch_min",
"torch_max",
"triton_mean",
"triton_median",
"triton_p99",
"triton_min",
"triton_max",
"speedup",
]
csv_writer.writerow(header)
model_tp_dict = {}
if args.model_name == "":
model_tp_dict = {
"Qwen/Qwen2-VL-2B-Instruct": [1],
"Qwen/Qwen2-VL-7B-Instruct": [1],
"Qwen/Qwen2-VL-72B-Instruct": [2, 4, 8],
"Qwen/Qwen2.5-VL-3B-Instruct": [1, 2, 4, 8],
"Qwen/Qwen2.5-VL-7B-Instruct": [1, 2, 4, 8],
"Qwen/Qwen2.5-VL-72B-Instruct": [2, 4, 8],
}
else:
model_tp_dict[args.model_name] = [args.tp_size]
if args.num_tokens is None:
num_tokens_list = [2**i for i in range(0, 18)]
else:
num_tokens_list = args.num_tokens
for model_name, tp_list in model_tp_dict.items():
config = get_config(model_name, trust_remote_code=args.trust_remote_code)
for tp_size in tp_list:
# get the model config
total_num_kv_heads = config.num_key_value_heads
total_num_heads = config.num_attention_heads
num_heads = total_num_heads // tp_size
num_kv_heads = max(1, total_num_kv_heads // tp_size)
head_dim = config.hidden_size // total_num_heads
q_size = num_heads * head_dim
kv_size = num_kv_heads * head_dim
is_neox_style = True
rope_theta = config.rope_theta
max_position = config.max_position_embeddings
for num_tokens in num_tokens_list:
benchmark_mrope(
model_name=model_name,
num_tokens=num_tokens,
head_dim=head_dim,
tp_size=tp_size,
num_heads=num_heads,
num_kv_heads=num_kv_heads,
max_position=max_position,
rope_theta=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=config.rope_scaling,
dtype=getattr(torch, args.dtype),
seed=args.seed,
warmup_iter=args.warmup_iter,
benchmark_iter=args.benchmark_iter,
csv_writer=csv_writer,
)
print(f"Benchmark results saved to {csv_filename}")

View File

@ -0,0 +1,71 @@
# Benchmark KV Cache Offloading with Multi-Turn Conversations
The requirements (pip) for `benchmark_serving_multi_turn.py` can be found in `requirements.txt`
First start serving your model
```bash
export MODEL_NAME=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
vllm serve $MODEL_NAME --disable-log-requests
```
## Synthetic Multi-Turn Conversations
Download the following text file (used for generation of synthetic conversations)
```bash
wget https://www.gutenberg.org/ebooks/1184.txt.utf-8
mv 1184.txt.utf-8 pg1184.txt
```
The filename `pg1184.txt` is used in `generate_multi_turn.json` (see `"text_files"`).
But you may use other text files if you prefer (using this specific file is not required).
Then run the benchmarking script
```bash
export MODEL_NAME=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
python benchmark_serving_multi_turn.py --model $MODEL_NAME --input-file generate_multi_turn.json \
--num-clients 2 --max-active-conversations 6
```
You can edit the file `generate_multi_turn.json` to change the conversation parameters (number of turns, etc.).
If successful, you will see the following output
```bash
----------------------------------------------------------------------------------------------------
Statistics summary:
runtime_sec = 215.810
requests_per_sec = 0.769
----------------------------------------------------------------------------------------------------
count mean std min 25% 50% 75% 90% 99% max
ttft_ms 166.0 78.22 67.63 45.91 59.94 62.26 64.43 69.66 353.18 567.54
tpot_ms 166.0 25.37 0.57 24.40 25.07 25.31 25.50 25.84 27.50 28.05
latency_ms 166.0 2591.07 326.90 1998.53 2341.62 2573.01 2860.10 3003.50 3268.46 3862.94
input_num_turns 166.0 7.43 4.57 1.00 3.00 7.00 11.00 13.00 17.00 17.00
input_num_tokens 166.0 2006.20 893.56 522.00 1247.75 2019.00 2718.00 3233.00 3736.45 3899.00
output_num_tokens 166.0 100.01 11.80 80.00 91.00 99.00 109.75 116.00 120.00 120.00
output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75 115.00 119.00 119.00
----------------------------------------------------------------------------------------------------
```
## ShareGPT Conversations
To run with the ShareGPT data, download the following ShareGPT dataset:
`https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json`
Use the `convert_sharegpt_to_openai.py` script to convert the dataset to a format supported by `benchmark_serving_multi_turn.py`
```bash
python convert_sharegpt_to_openai.py sharegpt_20230401_clean_lang_split.json sharegpt_conv_128.json --seed=99 --max-items=128
```
The script will convert the ShareGPT dataset to a dataset with the standard user/assistant roles.
The flag `--max-items=128` is used to sample 128 conversations from the original dataset (change as needed).
Use the output JSON file `sharegpt_conv_128.json` as the `--input-file` for `benchmark_serving_multi_turn.py`.

View File

@ -0,0 +1,493 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from statistics import mean
from typing import Any, NamedTuple, Optional, Union
import numpy as np # type: ignore
import pandas as pd # type: ignore
from bench_utils import (
TEXT_SEPARATOR,
Color,
logger,
)
from transformers import AutoTokenizer # type: ignore
# Conversation ID is a string (e.g: "UzTK34D")
ConvId = str
# A list of dicts (dicts with keys "id" and "messages")
ShareGptConversations = list[dict[str, Any]]
# A list of dicts (dicts with keys "role" and "content")
MessagesList = list[dict[str, str]]
# Map conversation ID to conversation messages
ConversationsMap = list[ConvId, MessagesList]
class Distribution(ABC):
@abstractmethod
def sample(self, size: int = 1) -> np.ndarray:
pass
class UniformDistribution(Distribution):
def __init__(
self,
min_val: Union[int, float],
max_val: Union[int, float],
is_integer: bool = True,
) -> None:
self.min_val = min_val
self.max_val = max_val
self.is_integer = is_integer
def sample(self, size: int = 1) -> np.ndarray:
if self.is_integer:
return np.random.randint(
int(self.min_val), int(self.max_val + 1), size=size
)
else:
return np.random.uniform(self.min_val, self.max_val, size=size)
def __repr__(self) -> str:
return f"UniformDistribution[{self.min_val}, {self.max_val}]"
class ConstantDistribution(Distribution):
def __init__(self, value: Union[int, float]) -> None:
self.value = value
self.max_val = value
def sample(self, size: int = 1) -> np.ndarray:
return np.full(shape=size, fill_value=self.value)
def __repr__(self) -> str:
return f"Constant[{self.value}]"
class ZipfDistribution(Distribution):
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
self.alpha = alpha
self.max_val = max_val
def sample(self, size: int = 1) -> np.ndarray:
samples = np.random.zipf(self.alpha, size=size)
if self.max_val:
samples = np.minimum(samples, self.max_val)
return samples
def __repr__(self) -> str:
return f"ZipfDistribution[{self.alpha}]"
class PoissonDistribution(Distribution):
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
self.alpha = alpha
self.max_val = max_val
def sample(self, size: int = 1) -> np.ndarray:
samples = np.random.poisson(self.alpha, size=size)
if self.max_val:
samples = np.minimum(samples, self.max_val)
return samples
def __repr__(self) -> str:
return f"PoissonDistribution[{self.alpha}]"
class LognormalDistribution(Distribution):
def __init__(
self, mean: float, sigma: float, max_val: Optional[int] = None
) -> None:
self.mean = mean
self.sigma = sigma
self.max_val = max_val
def sample(self, size: int = 1) -> np.ndarray:
samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size)
if self.max_val:
samples = np.minimum(samples, self.max_val)
return np.round(samples).astype(int)
def __repr__(self) -> str:
return f"LognormalDistribution[{self.mean}, {self.sigma}]"
class GenConvArgs(NamedTuple):
num_conversations: int
text_files: list[str]
input_num_turns: Distribution
input_common_prefix_num_tokens: Distribution
input_prefix_num_tokens: Distribution
input_num_tokens: Distribution
output_num_tokens: Distribution
print_stats: bool
def verify_field_exists(
conf: dict, field_name: str, section: str, subsection: str
) -> None:
if field_name not in conf:
raise ValueError(
f"Missing field '{field_name}' in {section=} and {subsection=}"
)
def get_random_distribution(
conf: dict, section: str, subsection: str, optional: bool = False
) -> Distribution:
# section can be "prompt_input" or "prompt_output" (both required)
conf = conf[section]
if optional and subsection not in conf:
# Optional subsection, if not found assume the value is always 0
return ConstantDistribution(0)
# subsection can be "num_turns", "num_tokens" or "prefix_num_tokens"
if subsection not in conf:
raise ValueError(f"Missing subsection {subsection} in section {section}")
conf = conf[subsection]
distribution = conf.get("distribution")
if distribution is None:
raise ValueError(
f"Missing field 'distribution' in {section=} and {subsection=}"
)
if distribution == "constant":
verify_field_exists(conf, "value", section, subsection)
return ConstantDistribution(conf["value"])
elif distribution == "zipf":
verify_field_exists(conf, "alpha", section, subsection)
max_val = conf.get("max", None)
return ZipfDistribution(conf["alpha"], max_val=max_val)
elif distribution == "poisson":
verify_field_exists(conf, "alpha", section, subsection)
max_val = conf.get("max", None)
return PoissonDistribution(conf["alpha"], max_val=max_val)
elif distribution == "lognormal":
verify_field_exists(conf, "mean", section, subsection)
verify_field_exists(conf, "sigma", section, subsection)
max_val = conf.get("max", None)
return LognormalDistribution(conf["mean"], conf["sigma"], max_val=max_val)
elif distribution == "uniform":
verify_field_exists(conf, "min", section, subsection)
verify_field_exists(conf, "max", section, subsection)
min_value = conf["min"]
max_value = conf["max"]
assert min_value > 0
assert min_value <= max_value
is_integer = isinstance(min_value, int) and isinstance(max_value, int)
return UniformDistribution(min_value, max_value, is_integer)
else:
raise ValueError(f"Unknown distribution: {distribution}")
def parse_input_json_file(conf: dict) -> GenConvArgs:
# Validate the input file
assert isinstance(conf, dict)
required_fields = [
"filetype",
"num_conversations",
"text_files",
"prompt_input",
"prompt_output",
]
for field in required_fields:
assert field in conf, f"Missing field {field} in input {conf}"
assert conf["filetype"] == "generate_conversations"
assert conf["num_conversations"] > 0, "num_conversations should be larger than zero"
text_files = conf["text_files"]
assert isinstance(text_files, list), "Field 'text_files' should be a list"
assert len(text_files) > 0, (
"Field 'text_files' should be a list with at least one file"
)
# Parse the parameters for the prompt input/output workload
input_num_turns = get_random_distribution(conf, "prompt_input", "num_turns")
input_num_tokens = get_random_distribution(conf, "prompt_input", "num_tokens")
input_common_prefix_num_tokens = get_random_distribution(
conf, "prompt_input", "common_prefix_num_tokens", optional=True
)
input_prefix_num_tokens = get_random_distribution(
conf, "prompt_input", "prefix_num_tokens"
)
output_num_tokens = get_random_distribution(conf, "prompt_output", "num_tokens")
print_stats: bool = conf.get("print_stats", False)
assert isinstance(print_stats, bool), (
"Field 'print_stats' should be either 'true' or 'false'"
)
args = GenConvArgs(
num_conversations=conf["num_conversations"],
text_files=text_files,
input_num_turns=input_num_turns,
input_common_prefix_num_tokens=input_common_prefix_num_tokens,
input_prefix_num_tokens=input_prefix_num_tokens,
input_num_tokens=input_num_tokens,
output_num_tokens=output_num_tokens,
print_stats=print_stats,
)
return args
def print_conv_stats(conversations: ConversationsMap, tokenizer: AutoTokenizer) -> None:
# Collect statistics
conv_stats: list[dict[Any, Any]] = []
req_stats: list[int] = []
print("\nCollecting statistics...")
for messages in conversations.values():
# messages is a list of dicts
user_tokens: list[int] = []
assistant_tokens: list[int] = []
request_tokens: list[int] = []
req_tokens = 0
for m in messages:
content = m["content"]
num_tokens = len(tokenizer(content).input_ids)
if m["role"] == "user":
user_tokens.append(num_tokens)
# New user prompt including all chat history
req_tokens += num_tokens
request_tokens.append(req_tokens)
elif m["role"] == "assistant":
assistant_tokens.append(num_tokens)
# Update assistant answer
# (will be part of chat history for the next user prompt)
req_tokens += num_tokens
item_stats = {
"conversation_turns": len(messages),
"user_tokens": mean(user_tokens),
"assistant_tokens": mean(assistant_tokens),
}
conv_stats.append(item_stats)
req_stats.extend(request_tokens)
# Print statistics
percentiles = [0.25, 0.5, 0.75, 0.9, 0.99]
print(TEXT_SEPARATOR)
print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}")
print(TEXT_SEPARATOR)
df = pd.DataFrame(conv_stats)
print(df.describe(percentiles=percentiles).transpose())
print(TEXT_SEPARATOR)
print(f"{Color.YELLOW}Request statistics:{Color.RESET}")
print(TEXT_SEPARATOR)
df = pd.DataFrame(req_stats, columns=["request_tokens"])
print(df.describe(percentiles=percentiles).transpose())
print(TEXT_SEPARATOR)
def generate_conversations(
args: GenConvArgs, tokenizer: AutoTokenizer
) -> ConversationsMap:
# Text for all user prompts
# (text from the input text files will be appended to this line)
base_prompt_text = "Please rewrite the following text and add more content: "
base_prompt_token_count = len(
tokenizer.encode(base_prompt_text, add_special_tokens=False)
)
logger.info(f"{Color.PURPLE}Generating conversations...{Color.RESET}")
logger.info(args)
list_of_tokens = []
for filename in args.text_files:
# Load text file that will be used to generate prompts
with open(filename) as file:
data = file.read()
tokens_in_file = tokenizer.encode(data, add_special_tokens=False)
list_of_tokens.extend(tokens_in_file)
conversations: ConversationsMap = {}
conv_id = 0
# Generate number of turns for every conversation
turn_count: np.ndarray = args.input_num_turns.sample(args.num_conversations)
# Turn count should be at least 2 (one user prompt and one assistant answer)
turn_count = np.maximum(turn_count, 2)
# Round up to an even number (every user prompt should have an answer)
turn_count = turn_count + (turn_count % 2)
# Generate number of prefix tokens for every conversation
conv_prefix_tokens: np.ndarray = args.input_prefix_num_tokens.sample(
args.num_conversations
)
# Used to reduce shared text between conversations
# (jump/skip over text sections between conversations)
base_offset = 0
# Common prefix size for all conversations (only 1 sample required)
common_prefix_text = ""
common_prefix_tokens: int = args.input_common_prefix_num_tokens.sample(1)[0]
if common_prefix_tokens > 0:
# Using "." at the end to separate sentences
common_prefix_text = (
tokenizer.decode(list_of_tokens[: common_prefix_tokens - 2]) + "."
)
base_offset += common_prefix_tokens
for conv_id in range(args.num_conversations):
# Generate a single conversation
messages: MessagesList = []
nturns = turn_count[conv_id]
# User prompt token count per turn (with lower limit)
input_token_count: np.ndarray = args.input_num_tokens.sample(nturns)
input_token_count = np.maximum(input_token_count, base_prompt_token_count)
# Assistant answer token count per turn (with lower limit)
output_token_count: np.ndarray = args.output_num_tokens.sample(nturns)
output_token_count = np.maximum(output_token_count, 1)
user_turn = True
for turn_id in range(nturns):
if user_turn:
role = "user"
num_tokens = input_token_count[turn_id]
# Generate the user prompt,
# use a unique prefix (the conv_id) for each conversation
# (to avoid shared prefix between conversations)
content = f"{conv_id} is a nice number... "
if len(common_prefix_text) > 0 and turn_id == 0:
content = common_prefix_text + content
# Update the number of tokens left for the content
num_tokens -= len(tokenizer.encode(content, add_special_tokens=False))
if turn_id == 0:
prefix_num_tokens = conv_prefix_tokens[conv_id]
if prefix_num_tokens > 0:
# Add prefix text (context) to the first turn
start_offset = base_offset
end_offset = start_offset + prefix_num_tokens
assert len(list_of_tokens) > end_offset, (
"Not enough input text to generate "
f"{prefix_num_tokens} tokens for the "
f"prefix text ({start_offset=}, {end_offset=})"
)
content += f"{conv_id}, " + tokenizer.decode(
list_of_tokens[start_offset:end_offset]
)
base_offset += prefix_num_tokens
# Add the actual user prompt/question after the prefix text
content += base_prompt_text
num_tokens -= base_prompt_token_count
if num_tokens > 0:
# Add text from the input file (to reach the desired token count)
start_offset = base_offset + turn_id * input_token_count.max()
end_offset = start_offset + num_tokens
assert len(list_of_tokens) > end_offset, (
f"Not enough input text to generate {num_tokens} tokens "
f"for the prompt ({start_offset=}, {end_offset=})"
)
# Convert tokens back to text
content += tokenizer.decode(list_of_tokens[start_offset:end_offset])
else:
role = "assistant"
# This content will not be used as input to the LLM server
# (actual answers will be used instead).
# Content is only required to determine the min_tokens/max_tokens
# (inputs to the LLM server).
num_tokens = output_token_count[turn_id]
assert len(list_of_tokens) > num_tokens, (
f"Not enough input text to generate {num_tokens} "
"tokens for assistant content"
)
content = tokenizer.decode(list_of_tokens[:num_tokens])
# Append the user/assistant message to the list of messages
messages.append({"role": role, "content": content})
user_turn = not user_turn
# Add the new conversation
conversations[f"CONV_ID_{conv_id}"] = messages
# Increase base offset for the next conversation
base_offset += nturns
if args.print_stats:
print_conv_stats(conversations, tokenizer)
return conversations
def conversations_list_to_dict(input_list: ShareGptConversations) -> ConversationsMap:
conversations: ConversationsMap = {}
for item in input_list:
conv_id: str = item["id"]
assert isinstance(conv_id, str)
assert conv_id not in conversations, (
f"Conversation ID {conv_id} found more than once in the input"
)
messages: MessagesList = item["messages"]
assert isinstance(messages, list), (
f"Conversation messages should be a list (ID: {conv_id})"
)
assert len(messages) > 0, f"Conversation with no messages (ID: {conv_id})"
conversations[conv_id] = messages
logger.info(f"Using {len(conversations)} unique conversations (IDs)")
assert len(conversations) == len(input_list)
# Print statistics about the selected conversations
stats: list[dict[str, Any]] = []
for conv_data in conversations.values():
stats.append({"num_turns": len(conv_data)})
print(TEXT_SEPARATOR)
print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}")
print(TEXT_SEPARATOR)
percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999]
conv_stats = pd.DataFrame(stats).describe(percentiles=percentiles)
print(conv_stats.transpose())
print(TEXT_SEPARATOR)
return conversations
def conversations_dict_to_list(input_dict: ConversationsMap) -> ShareGptConversations:
output: ShareGptConversations = []
for conv_id, conv_data in input_dict.items():
new_item = {"id": conv_id, "messages": conv_data}
output.append(new_item)
return output

View File

@ -0,0 +1,28 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import logging
from enum import Enum
class Color(Enum):
RED = "\033[91m"
GREEN = "\033[92m"
BLUE = "\033[94m"
PURPLE = "\033[95m"
CYAN = "\033[96m"
YELLOW = "\033[93m"
RESET = "\033[0m"
def __str__(self):
return self.value
TEXT_SEPARATOR = "-" * 100
# Configure the logger
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] - %(message)s",
datefmt="%d-%m-%Y %H:%M:%S",
)
logger = logging.getLogger(__name__)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,354 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Download dataset from:
https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json
Convert to OpenAI API:
export INPUT_FILE=sharegpt_20230401_clean_lang_split.json
python convert_sharegpt_to_openai.py $INPUT_FILE sharegpt_conv_128.json --max-items=128
"""
import argparse
import json
import random
from statistics import mean
from typing import Any, Optional
import pandas as pd # type: ignore
import tqdm # type: ignore
from transformers import AutoTokenizer # type: ignore
def has_non_english_chars(text: str) -> bool:
return not text.isascii()
def content_is_valid(
content: str, min_content_len: Optional[int], max_content_len: Optional[int]
) -> bool:
if min_content_len and len(content) < min_content_len:
return False
if max_content_len and len(content) > max_content_len:
return False
return has_non_english_chars(content)
def print_stats(
conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
) -> None:
# Collect statistics
stats = []
print("\nCollecting statistics...")
for item in tqdm.tqdm(conversations):
# item has "id" and "messages"
messages = item["messages"]
user_turns = 0
assistant_turns = 0
user_words = 0
assistant_words = 0
conv_chars = 0
user_tokens: list[int] = []
assistant_tokens: list[int] = []
for m in messages:
content = m["content"]
conv_chars += len(content)
content_num_words = content.count(" ") + 1
num_tokens = 0
if tokenizer:
num_tokens = len(tokenizer(m["content"]).input_ids)
if m["role"] == "user":
user_turns += 1
user_words += content_num_words
if tokenizer:
user_tokens.append(num_tokens)
elif m["role"] == "assistant":
assistant_turns += 1
assistant_words += content_num_words
if tokenizer:
assistant_tokens.append(num_tokens)
# assert user_turns == assistant_turns, \
# f"Invalid conversation ID {item['id']}"
conv_words = user_words + assistant_words
item_stats = {
"user_turns": user_turns,
"assistant_turns": assistant_turns,
"user_words": user_words,
"assistant_words": assistant_words,
"conv_turns": len(messages),
"conv_words": conv_words,
"conv_characters": conv_chars,
}
if len(user_tokens) > 0:
item_stats["user_tokens"] = int(mean(user_tokens))
if len(assistant_tokens) > 0:
item_stats["assistant_tokens"] = int(mean(assistant_tokens))
stats.append(item_stats)
print("\nStatistics:")
percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999]
df = pd.DataFrame(stats)
print(df.describe(percentiles=percentiles).transpose())
def convert_sharegpt_to_openai(
seed: int,
input_file: str,
output_file: str,
max_items: Optional[int],
min_content_len: Optional[int] = None,
max_content_len: Optional[int] = None,
min_turns: Optional[int] = None,
max_turns: Optional[int] = None,
model: Optional[str] = None,
) -> None:
if min_turns and max_turns:
assert min_turns <= max_turns
if min_content_len and max_content_len:
# Verify that min is not larger than max if both were given
assert min_content_len <= max_content_len
print(
f"Input parameters:\n{seed=}, {max_items=}, {min_content_len=},"
f" {max_content_len=}, {min_turns=}, {max_turns=}\n"
)
random.seed(seed)
tokenizer = None
if model is not None:
print(f"Loading tokenizer from: {model}")
tokenizer = AutoTokenizer.from_pretrained(model)
# Read the ShareGPT JSON file
print(f"Reading file: {input_file}")
with open(input_file, encoding="utf-8") as f:
# Should be a list of dicts
# Each dict should have "id" (string) and "conversations" (list of dicts)
sharegpt_data = json.load(f)
assert isinstance(sharegpt_data, list), "Input file should contain a list of dicts"
print(f"Total items in input file: {len(sharegpt_data):,}")
print(f"Shuffling dataset with seed {seed}")
random.shuffle(sharegpt_data)
# Map conversation ID to the all the messages
conversation_parts: dict[str, list[Any]] = {}
for item in tqdm.tqdm(sharegpt_data):
assert "id" in item, "Missing key 'id'"
assert "conversations" in item, "Missing key 'conversations'"
# Conversation ID (e.g: "hiWPlMD") and part/session (0, 1, 2, etc.)
conv_id, _ = item["id"].split("_")
new_turns = item["conversations"]
if conv_id not in conversation_parts:
# Start new conversation
conversation_parts[conv_id] = []
elif len(conversation_parts[conv_id]) > 0 and len(new_turns) > 0:
prev_turns = conversation_parts[conv_id][-1]
if prev_turns[-1]["from"] == new_turns[0]["from"]:
new_turns = new_turns[1:]
if len(new_turns) > 0:
# We assume that parts are in order in the ShareGPT dataset
conversation_parts[conv_id].append(new_turns)
dataset: list[dict[str, Any]] = []
for conv_id, conv_parts in conversation_parts.items():
new_item = {"id": conv_id}
conversations: list[dict[str, str]] = []
# Merge all parts
for conv_part in conv_parts:
conversations.extend(conv_part)
if len(conversations) > 0:
new_item["conversations"] = conversations
dataset.append(new_item)
print(f"Total unique conversations (IDs) in input file: {len(dataset):,}")
# Final output data
final_openai_dataset: list[dict] = []
# Filter conversations from the ShareGPT dataset and convert to OpenAI format
for item in tqdm.tqdm(dataset):
messages: list[dict] = []
assert "id" in item, "Missing key 'id'"
assert "conversations" in item, "Missing key 'conversations'"
conv_id = item["id"]
conversations = item["conversations"]
if min_turns is not None and len(conversations) < min_turns:
# Skip short conversations
continue
# Convert each message in the conversation, up to max_turns if specified
for i, turn in enumerate(conversations):
assert "from" in turn and "value" in turn, (
f"Invalid conversation ID {conv_id} - missing 'from' or 'value'"
)
role = None
turn_from = turn["from"]
if turn_from in {"human", "user"}:
role = "user"
elif turn_from in {"gpt", "bing", "chatgpt", "bard"}:
role = "assistant"
elif turn_from == "system":
role = "system"
assert role is not None, (
f"Invalid conversation ID {conv_id} - 'from'='{turn_from}' is invalid"
)
if i == 0 and role != "user":
# If the first message is from assistant (gpt), skip it.
# this happens when the conversation is a follow-up
# to a previous conversation (from the same user).
continue
if max_turns is not None and i >= max_turns:
break
# Convert message to OpenAI format (with "role" and "content")
content = turn["value"]
messages.append({"role": role, "content": content})
# Add the converted conversation to the OpenAI format
if len(messages) > 0:
valid_messages = True
# First turn should always be from the user
user_turn = True
for m in messages:
# Make sure that turns alternate between user and assistant
if (user_turn and m["role"] != "user") or (
not user_turn and m["role"] != "assistant"
):
valid_messages = False
break
user_turn = not user_turn
content = m["content"]
valid_messages = content_is_valid(
content, min_content_len, max_content_len
)
if not valid_messages:
break
if valid_messages is True:
final_openai_dataset.append({"id": conv_id, "messages": messages})
assert len(final_openai_dataset) > 0, "Final number of conversations is zero"
print_stats(final_openai_dataset)
print_stats_again = False
if max_items is not None and len(final_openai_dataset) > max_items:
print(f"\n\nSampling {max_items} items from the dataset...")
print_stats_again = True
final_openai_dataset = random.sample(final_openai_dataset, max_items)
if print_stats_again:
# Print stats after the dataset changed
print_stats(final_openai_dataset, tokenizer)
# Write the converted data to a new JSON file
final_size = len(final_openai_dataset)
print(f"\nTotal conversations converted (after filtering): {final_size:,}")
print(f"\nWriting file: {output_file}")
with open(output_file, "w", encoding="utf-8") as f:
json.dump(final_openai_dataset, f, ensure_ascii=False, indent=2)
def main() -> None:
parser = argparse.ArgumentParser(
description="Convert ShareGPT dataset to OpenAI API format"
)
parser.add_argument("input_file", help="Path to the input ShareGPT JSON file")
parser.add_argument(
"output_file", help="Path to the output OpenAI format JSON file"
)
parser.add_argument(
"--seed", type=int, default=0, help="Seed for random number generators"
)
parser.add_argument(
"--max-items",
type=int,
default=None,
help="Maximum number of items in the output file",
)
parser.add_argument(
"--min-turns",
type=int,
default=None,
help="Minimum number of turns per conversation",
)
parser.add_argument(
"--max-turns",
type=int,
default=None,
help="Maximum number of turns per conversation",
)
parser.add_argument(
"--min-content-len",
type=int,
default=None,
help="Min number of characters in the messages' content",
)
parser.add_argument(
"--max-content-len",
type=int,
default=None,
help="Max number of characters in the messages' content",
)
parser.add_argument(
"--model",
type=str,
default=None,
help="LLM model, only the tokenizer will be used",
)
args = parser.parse_args()
convert_sharegpt_to_openai(
args.seed,
args.input_file,
args.output_file,
args.max_items,
args.min_content_len,
args.max_content_len,
args.min_turns,
args.max_turns,
args.model,
)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,35 @@
{
"filetype": "generate_conversations",
"num_conversations": 24,
"text_files": ["pg1184.txt"],
"print_stats": false,
"prompt_input": {
"num_turns": {
"distribution": "uniform",
"min": 12,
"max": 18
},
"common_prefix_num_tokens": {
"distribution": "constant",
"value": 500
},
"prefix_num_tokens": {
"distribution": "lognormal",
"mean": 6,
"sigma": 4,
"max": 1500
},
"num_tokens": {
"distribution": "uniform",
"min": 120,
"max": 160
}
},
"prompt_output": {
"num_tokens": {
"distribution": "uniform",
"min": 80,
"max": 120
}
}
}

View File

@ -0,0 +1,5 @@
numpy>=1.24
pandas>=2.0.0
aiohttp>=3.10
transformers>=4.46
xlsxwriter>=3.2.1

View File

@ -19,7 +19,7 @@ else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
GIT_TAG 575f7724b9762f265bbee5889df9c7d630801845
GIT_TAG 0e43e774597682284358ff2c54530757b654b8d1
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
@ -37,9 +37,9 @@ cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
set(FlashMLA_SOURCES
${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_bf16_sm90.cu
${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_fp16_sm90.cu
${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_metadata.cu)
${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu)
set(FlashMLA_INCLUDES
${flashmla_SOURCE_DIR}/csrc/cutlass/include

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 6dbc6e011a3ebe9349eeb74578940dd7095436ba
GIT_TAG 93cf5a08f421a3efd0c4a7e005ef8f742b578ce0
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -60,3 +60,13 @@ struct enable_sm100_only : Kernel {
#endif
}
};
template <typename Kernel>
struct enable_sm120_only : Kernel {
template <typename... Args>
CUTLASS_DEVICE void operator()(Args&&... args) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1200
Kernel::operator()(std::forward<Args>(args)...);
#endif
}
};

View File

@ -0,0 +1,23 @@
#include "scaled_mm_kernels.hpp"
#include "scaled_mm_blockwise_sm120_fp8_dispatch.cuh"
#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
namespace vllm {
void cutlass_scaled_mm_blockwise_sm120_fp8(torch::Tensor& out,
torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
if (out.dtype() == torch::kBFloat16) {
cutlass_gemm_blockwise_sm120_fp8_dispatch<cutlass::bfloat16_t>(
out, a, b, a_scales, b_scales);
} else {
TORCH_CHECK(out.dtype() == torch::kFloat16);
cutlass_gemm_blockwise_sm120_fp8_dispatch<cutlass::half_t>(
out, a, b, a_scales, b_scales);
}
}
} // namespace vllm

View File

@ -0,0 +1,183 @@
#pragma once
#include "cuda_utils.h"
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler_params.h"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass_extensions/gemm/dispatch_policy.hpp"
#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
#include "cutlass_gemm_caller.cuh"
namespace vllm {
using namespace cute;
// clang-format off
template <class OutType, int ScaleGranularityM,
int ScaleGranularityN, int ScaleGranularityK,
class MmaTileShape, class ClusterShape,
class EpilogueScheduler, class MainloopScheduler>
struct cutlass_3x_gemm_fp8_blockwise {
using ElementAB = cutlass::float_e4m3_t;
using ElementA = ElementAB;
using LayoutA = cutlass::layout::RowMajor;
using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
using ElementB = ElementAB;
// ColumnMajor is used for B to match the CUTLASS convention.
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
using ElementD = OutType;
using LayoutD = cutlass::layout::RowMajor;
using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose<LayoutD>::type;
static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
using ElementC = void; // TODO: support bias
using LayoutC = LayoutD;
using LayoutC_Transpose = LayoutD_Transpose;
static constexpr int AlignmentC = AlignmentD;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBlockScale = float;
using ScaleConfig = cutlass::detail::Sm120BlockwiseScaleConfig<
ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
cute::UMMA::Major::MN, cute::UMMA::Major::K>;
// layout_SFA and layout_SFB cannot be swapped since they are deduced.
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
using ArchTag = cutlass::arch::Sm120;
using OperatorClass = cutlass::arch::OpClassTensorOp;
static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
using ElementScalar = float;
using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag,
OperatorClass,
MmaTileShape,
ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator,
ElementCompute,
ElementC,
LayoutC,
AlignmentC,
ElementD,
LayoutD,
AlignmentD,
EpilogueScheduler,
DefaultOperation
>::CollectiveOp;
using StageCountType = cutlass::gemm::collective::StageCountAuto;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag,
OperatorClass,
ElementA,
cute::tuple<LayoutA, LayoutSFA>,
AlignmentA,
ElementB,
cute::tuple<LayoutB, LayoutSFB>,
AlignmentB,
ElementAccumulator,
MmaTileShape,
ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopScheduler
>::CollectiveOp;
using KernelType = enable_sm120_only<cutlass::gemm::kernel::GemmUniversal<
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
struct GemmKernel : public KernelType {};
};
template <typename Gemm>
void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
using GemmKernel = typename Gemm::GemmKernel;
using StrideA = typename Gemm::GemmKernel::StrideA;
using StrideB = typename Gemm::GemmKernel::StrideB;
using StrideD = typename Gemm::GemmKernel::StrideD;
using StrideC = typename Gemm::GemmKernel::StrideC;
using LayoutSFA = typename Gemm::LayoutSFA;
using LayoutSFB = typename Gemm::LayoutSFB;
using ScaleConfig = typename Gemm::ScaleConfig;
using ElementAB = typename Gemm::ElementAB;
using ElementD = typename Gemm::ElementD;
int32_t m = a.size(0), n = b.size(1), k = a.size(1);
StrideA a_stride;
StrideB b_stride;
StrideC c_stride;
a_stride =
cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
b_stride =
cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
c_stride =
cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));
LayoutSFA layout_SFA =
ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
LayoutSFB layout_SFB =
ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
auto mainloop_args = [&](){
return typename GemmKernel::MainloopArguments{
a_ptr, a_stride, b_ptr, b_stride,
a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
};
}();
auto prob_shape = cute::make_shape(m, n, k, 1);
auto c_ptr = static_cast<ElementD*>(out.data_ptr());
typename GemmKernel::EpilogueArguments epilogue_args{
{}, c_ptr, c_stride, c_ptr, c_stride};
c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
epilogue_args);
}
template <typename OutType>
void cutlass_gemm_blockwise_sm120_fp8_dispatch(torch::Tensor& out,
torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
// TODO: better heuristics
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, 128, 128, Shape<_128, _128, _128>,
Shape<_1, _1, _1>, cutlass::epilogue::collective::EpilogueScheduleAuto,
cutlass::gemm::collective::KernelScheduleAuto>>(
out, a, b, a_scales, b_scales);
}
} // namespace vllm

View File

@ -47,4 +47,10 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales);
void cutlass_scaled_mm_blockwise_sm120_fp8(torch::Tensor& out,
torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales);
} // namespace vllm

View File

@ -1,11 +1,9 @@
#include <cudaTypedefs.h>
#include "c3x/scaled_mm_helper.hpp"
#include "c3x/scaled_mm_kernels.hpp"
#include "cuda_utils.h"
/*
This file defines quantized GEMM operations using the CUTLASS 3.x API, for
NVIDIA GPUs with sm120 (Blackwell Geforce).
NVIDIA GPUs with sm120 (Blackwell).
*/
#if defined ENABLE_SCALED_MM_SM120 && ENABLE_SCALED_MM_SM120
@ -15,20 +13,10 @@ void cutlass_scaled_mm_sm120(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
std::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
int M = a.size(0), N = b.size(1), K = a.size(1);
TORCH_CHECK(
(a_scales.numel() == 1 || a_scales.numel() == a.size(0)) &&
(b_scales.numel() == 1 || b_scales.numel() == b.size(1)),
"Currently, block scaled fp8 gemm is not implemented for Blackwell");
// Standard per-tensor/per-token/per-channel scaling
TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn,
"Currently, only fp8 gemm is implemented for Blackwell");
vllm::cutlass_scaled_mm_sm120_fp8(c, a, b, a_scales, b_scales, bias);
dispatch_scaled_mm(c, a, b, a_scales, b_scales, bias,
vllm::cutlass_scaled_mm_sm120_fp8,
nullptr, // int8 not supported on SM120
vllm::cutlass_scaled_mm_blockwise_sm120_fp8);
}
#endif

View File

@ -270,7 +270,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
const int num_kv_heads,
const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ seq_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
@ -304,12 +304,12 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
const auto max_num_partitions = gridDim.y;
const int context_len = context_lens[seq_idx];
const int seq_len = seq_lens[seq_idx];
const int partition_start_token_idx =
partition_idx * T_PAR_SIZE; // partition_size;
// exit if partition is out of context for seq
if (partition_start_token_idx >= context_len) {
if (partition_start_token_idx >= seq_len) {
return;
}
@ -361,8 +361,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
// output layout from QKmfma : QH16xT4x4 16 qheads across 16 lanes, 16 tokens
// across 4 rows x 4 tokens per lane
const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
const int last_ctx_block = num_context_blocks - 1;
const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
const int last_seq_block = num_seq_blocks - 1;
const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq;
@ -373,9 +373,9 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
const int klocal_token_idx =
TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
const int kblock_idx = (kglobal_token_idx < context_len)
const int kblock_idx = (kglobal_token_idx < seq_len)
? kglobal_token_idx / BLOCK_SIZE
: last_ctx_block;
: last_seq_block;
kphysical_block_number[token_depth] = block_table_seq[kblock_idx];
}
@ -476,9 +476,9 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
// tokens
const int vglobal_token_idx =
partition_start_token_idx + vlocal_token_idx;
const int vblock_idx = (vglobal_token_idx < context_len)
const int vblock_idx = (vglobal_token_idx < seq_len)
? vglobal_token_idx / BLOCK_SIZE
: last_ctx_block;
: last_seq_block;
vphysical_block_number[vtoken_depth][vblock_depth] =
block_table_seq[vblock_idx];
}
@ -554,7 +554,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
if constexpr (ALIBI_ENABLED) {
for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
const int local_token_idx = qkout_token_idx + token_depth * 16;
const int alibi_offset = local_token_idx - context_len + 1;
const int alibi_offset = local_token_idx - seq_len + 1;
for (int i = 0; i < 4; i++) {
d_out[token_depth][i] += alibi_slope * (alibi_offset + i);
}
@ -568,9 +568,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
const int local_token_idx = qkout_token_idx + token_depth * 16;
for (int i = 0; i < 4; i++) {
const float tmp = (local_token_idx + i < context_len)
? d_out[token_depth][i]
: -FLT_MAX;
const float tmp =
(local_token_idx + i < seq_len) ? d_out[token_depth][i] : -FLT_MAX;
qk_max = fmaxf(qk_max, tmp);
}
}
@ -582,7 +581,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
const int local_token_idx = qkout_token_idx + token_depth * 16;
for (int i = 0; i < 4; i++) {
const float tmp = (local_token_idx + i < context_len)
const float tmp = (local_token_idx + i < seq_len)
? __expf(d_out[token_depth][i] - qk_max)
: 0.0f;
d_out[token_depth][i] = tmp;
@ -780,7 +779,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
const int num_kv_heads,
const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ seq_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
@ -809,10 +808,10 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
const auto partition_size = blockDim.x;
const auto max_num_partitions = gridDim.y;
const int context_len = context_lens[seq_idx];
const int seq_len = seq_lens[seq_idx];
const int partition_start_token_idx = partition_idx * partition_size;
// exit if partition is out of context for seq
if (partition_start_token_idx >= context_len) {
if (partition_start_token_idx >= seq_len) {
return;
}
// every 4 lanes fetch 4 different qheads
@ -855,7 +854,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
const int warp_start_token_idx =
partition_start_token_idx + warpid * WARP_SIZE;
if (warp_start_token_idx >= context_len) { // warp out of context
if (warp_start_token_idx >= seq_len) { // warp out of context
#pragma unroll
for (int h = 0; h < GQA_RATIO4; h++) {
shared_qk_max[warpid][h] = -FLT_MAX;
@ -863,8 +862,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
}
} else { // warp within context
const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
const int last_ctx_block = num_context_blocks - 1;
const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
const int last_seq_block = num_seq_blocks - 1;
const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
// token id within partition
@ -873,9 +872,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
const int global_token_idx = partition_start_token_idx + local_token_idx;
// fetch block number for k
const int block_idx = (global_token_idx < context_len)
const int block_idx = (global_token_idx < seq_len)
? global_token_idx / BLOCK_SIZE
: last_ctx_block;
: last_seq_block;
// fetch k physical block number
// int32 physical_block_number leads to overflow when multiplied with
@ -888,7 +887,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
for (int b = 0; b < VBLOCKS; b++) {
const int vblock_idx = warp_start_block_idx + b;
const int vblock_idx_ctx =
(vblock_idx <= last_ctx_block) ? vblock_idx : last_ctx_block;
(vblock_idx <= last_seq_block) ? vblock_idx : last_seq_block;
vphysical_blocks[b] = block_table[vblock_idx_ctx];
}
@ -1057,7 +1056,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
const int lane4_token_idx = 4 * (global_token_idx >> 2);
if constexpr (ALIBI_ENABLED) {
const int alibi_offset = lane4_token_idx - context_len + 1;
const int alibi_offset = lane4_token_idx - seq_len + 1;
for (int h = 0; h < QHLOOP; h++) {
for (int i = 0; i < 4; i++) {
d_out[h][i] += alibi_slope[h] * (alibi_offset + i);
@ -1070,7 +1069,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
for (int h = 0; h < QHLOOP; h++) {
qk_max[h] = -FLT_MAX;
for (int i = 0; i < 4; i++) {
qk_max[h] = (lane4_token_idx + i < context_len)
qk_max[h] = (lane4_token_idx + i < seq_len)
? fmaxf(qk_max[h], d_out[h][i])
: qk_max[h];
}
@ -1101,7 +1100,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
for (int h = 0; h < QHLOOP; h++) {
exp_sum[h] = 0.0f;
for (int i = 0; i < 4; i++) {
d_out[h][i] = (lane4_token_idx + i < context_len)
d_out[h][i] = (lane4_token_idx + i < seq_len)
? __expf(d_out[h][i] - qk_max[h])
: 0.0f;
exp_sum[h] += d_out[h][i];
@ -1181,7 +1180,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
}
}
if (warp_start_token_idx >= context_len) { // warp out of context
if (warp_start_token_idx >= seq_len) { // warp out of context
for (int qh = 0; qh < QHLOOP; qh++) {
for (int vh = 0; vh < VHELOOP; vh++) {
vout_shared[qh][vh][laneid][warpid] = {0};
@ -1279,7 +1278,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
// max_num_partitions]
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
// max_num_partitions, head_size]
const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ seq_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) {
const auto num_heads = gridDim.x;
@ -1293,8 +1292,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
return;
}
const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
const int seq_len = seq_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE);
const auto warpid = threadIdx.x / WARP_SIZE;
__shared__ float shared_global_exp_sum;
@ -1581,7 +1580,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
// head_size, block_size]
const int num_kv_heads, const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ seq_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
@ -1615,11 +1614,11 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
const int max_num_partitions = gridDim.y;
const int context_len = context_lens[seq_idx]; // length of a seq
const int seq_len = seq_lens[seq_idx]; // length of a seq
const int partition_start_token_idx = partition_idx * T_PAR_SIZE;
// exit if partition is out of context for seq
if (partition_start_token_idx >= context_len) {
if (partition_start_token_idx >= seq_len) {
return;
}
@ -1715,8 +1714,8 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
}
}
const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
const int last_ctx_block = num_context_blocks - 1;
const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
const int last_seq_block = num_seq_blocks - 1;
const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq;
@ -1727,9 +1726,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
const int klocal_token_idx =
TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
const int kblock_idx = (kglobal_token_idx < context_len)
const int kblock_idx = (kglobal_token_idx < seq_len)
? kglobal_token_idx / BLOCK_SIZE
: last_ctx_block;
: last_seq_block;
kphysical_block_number[token_depth] = block_table_seq[kblock_idx];
}
@ -1781,9 +1780,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
vblock_depth * BLOCK_SIZE;
const int vglobal_token_idx =
partition_start_token_idx + vlocal_token_idx;
const int vblock_idx = (vglobal_token_idx < context_len)
const int vblock_idx = (vglobal_token_idx < seq_len)
? vglobal_token_idx / BLOCK_SIZE
: last_ctx_block;
: last_seq_block;
vphysical_block_number[vtoken_depth][vblock_depth] =
block_table_seq[vblock_idx];
}
@ -1836,9 +1835,8 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
const int local_token_idx = qkout_token_idx + token_depth * 16;
for (int i = 0; i < 8; i++) {
const float tmp = (local_token_idx + 2 * i < context_len)
? dout[token_depth][i]
: -FLT_MAX;
const float tmp =
(local_token_idx + 2 * i < seq_len) ? dout[token_depth][i] : -FLT_MAX;
qk_max = fmaxf(qk_max, tmp);
}
}
@ -1848,7 +1846,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
const int local_token_idx = qkout_token_idx + token_depth * 16;
for (int i = 0; i < 8; i++) {
const float tmp = (local_token_idx + 2 * i < context_len)
const float tmp = (local_token_idx + 2 * i < seq_len)
? __expf(dout[token_depth][i] - qk_max)
: 0.0f;
dout[token_depth][i] = tmp;
@ -2019,7 +2017,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
// head_size, block_size]
const int num_kv_heads, const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ seq_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
@ -2046,7 +2044,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
// max_num_partitions]
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
// max_num_partitions, head_size]
const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ seq_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) {
const auto num_heads = gridDim.x;
@ -2060,8 +2058,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
return;
}
const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
const int seq_len = seq_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE);
const int warpid = threadIdx.x / WARP_SIZE;
__shared__ float shared_global_exp_sum;
@ -2349,7 +2347,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
// head_size, block_size]
const int num_kv_heads, const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ seq_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
@ -2382,11 +2380,11 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
const int max_num_partitions = gridDim.y;
const int context_len = context_lens[seq_idx]; // length of a seq
const int seq_len = seq_lens[seq_idx]; // length of a seq
const int partition_start_token_idx = partition_idx * T_PAR_SIZE;
// exit if partition is out of context for seq
if (partition_start_token_idx >= context_len) {
if (partition_start_token_idx >= seq_len) {
return;
}
@ -2482,8 +2480,8 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
}
}
const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
const int last_ctx_block = num_context_blocks - 1;
const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
const int last_seq_block = num_seq_blocks - 1;
const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq;
@ -2494,9 +2492,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
const int klocal_token_idx =
TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
const int kblock_idx = (kglobal_token_idx < context_len)
const int kblock_idx = (kglobal_token_idx < seq_len)
? kglobal_token_idx / BLOCK_SIZE
: last_ctx_block;
: last_seq_block;
kphysical_block_number[token_depth] = block_table_seq[kblock_idx];
}
@ -2548,9 +2546,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
rowid * VTOKENS_PER_LANE + vblock_depth * BLOCK_SIZE;
const int vglobal_token_idx =
partition_start_token_idx + vlocal_token_idx;
const int vblock_idx = (vglobal_token_idx < context_len)
const int vblock_idx = (vglobal_token_idx < seq_len)
? vglobal_token_idx / BLOCK_SIZE
: last_ctx_block;
: last_seq_block;
vphysical_block_number[vtoken_depth][vblock_depth] =
block_table_seq[vblock_idx];
}
@ -2604,7 +2602,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
const int local_token_idx = qkout_token_idx + token_depth * 16;
for (int i = 0; i < 8; i++) {
const float tmp =
(local_token_idx + i < context_len) ? dout[token_depth][i] : -FLT_MAX;
(local_token_idx + i < seq_len) ? dout[token_depth][i] : -FLT_MAX;
qk_max = fmaxf(qk_max, tmp);
}
}
@ -2614,7 +2612,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
const int local_token_idx = qkout_token_idx + token_depth * 16;
for (int i = 0; i < 8; i++) {
const float tmp = (local_token_idx + i < context_len)
const float tmp = (local_token_idx + i < seq_len)
? __expf(dout[token_depth][i] - qk_max)
: 0.0f;
dout[token_depth][i] = tmp;
@ -2751,7 +2749,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
// head_size, block_size]
const int num_kv_heads, const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ seq_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
@ -2778,7 +2776,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
// max_num_partitions]
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
// max_num_partitions, head_size]
const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ seq_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) {
const auto num_heads = gridDim.x;
@ -2792,8 +2790,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
return;
}
const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
const int seq_len = seq_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE);
const int warpid = threadIdx.x / WARP_SIZE;
__shared__ float shared_global_exp_sum;
@ -2980,7 +2978,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel(
const int num_kv_heads,
const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ seq_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
@ -3007,7 +3005,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
const int num_kv_heads,
const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ seq_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
@ -3031,7 +3029,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
const float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ seq_lens, // [num_seqs]
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) {
UNREACHABLE_CODE
@ -3046,7 +3044,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
GQA_RATIO> \
<<<grid, block, 0, stream>>>( \
query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \
block_tables_ptr, seq_lens_ptr, query_start_loc_ptr, \
max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \
kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \
max_ctx_blocks, k_scale_ptr, v_scale_ptr);
@ -3057,18 +3055,17 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
GQA_RATIO> \
<<<grid, block, 0, stream>>>( \
query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \
block_tables_ptr, seq_lens_ptr, query_start_loc_ptr, \
max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \
kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \
max_ctx_blocks, k_scale_ptr, v_scale_ptr);
#define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS) \
paged_attention_ll4mi_reduce_kernel<T, OUTT, HEAD_SIZE, HEAD_SIZE, \
PARTITION_SIZE, NPAR_LOOPS> \
<<<reduce_grid, reduce_block, 0, stream>>>( \
out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, \
context_lens_ptr, query_start_loc_ptr, max_num_partitions, \
fp8_out_scale_ptr);
#define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS) \
paged_attention_ll4mi_reduce_kernel<T, OUTT, HEAD_SIZE, HEAD_SIZE, \
PARTITION_SIZE, NPAR_LOOPS> \
<<<reduce_grid, reduce_block, 0, stream>>>( \
out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \
query_start_loc_ptr, max_num_partitions, fp8_out_scale_ptr);
template <typename T, typename KVT, vllm::Fp8KVCacheDataType KV_DTYPE,
int BLOCK_SIZE, int HEAD_SIZE, typename OUTT, int PARTITION_SIZE_OLD,
@ -3077,8 +3074,8 @@ void paged_attention_custom_launcher(
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, const int num_kv_heads, float scale,
torch::Tensor& block_tables, torch::Tensor& context_lens,
const std::optional<torch::Tensor>& query_start_loc, int max_context_len,
torch::Tensor& block_tables, torch::Tensor& seq_lens,
const std::optional<torch::Tensor>& query_start_loc, int max_seq_len,
const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale,
torch::Tensor& v_scale, const std::optional<torch::Tensor>& fp8_out_scale) {
int num_seqs = block_tables.size(0);
@ -3109,7 +3106,7 @@ void paged_attention_custom_launcher(
KVT* key_cache_ptr = reinterpret_cast<KVT*>(key_cache.data_ptr());
KVT* value_cache_ptr = reinterpret_cast<KVT*>(value_cache.data_ptr());
int* block_tables_ptr = block_tables.data_ptr<int>();
int* context_lens_ptr = context_lens.data_ptr<int>();
int* seq_lens_ptr = seq_lens.data_ptr<int>();
const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());
// NOTE: fp8_out_scale is optional.
@ -3119,13 +3116,12 @@ void paged_attention_custom_launcher(
: nullptr;
OUTT* out_ptr = reinterpret_cast<OUTT*>(out.data_ptr());
const int max_ctx_blocks = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE);
const int max_ctx_blocks = DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE);
// partition size is fixed at 256 since both mfma4 and mfma16 kernels support
// it mfma4 kernel also supports partition size 512
constexpr int PARTITION_SIZE = 256;
const int max_num_partitions =
DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
const int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
const int gqa_ratio = num_heads / num_kv_heads;
assert(num_heads % num_kv_heads == 0);
assert(head_size == HEAD_SIZE);
@ -3234,8 +3230,8 @@ void paged_attention_custom_launcher_navi(
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, const int num_kv_heads, float scale,
torch::Tensor& block_tables, torch::Tensor& context_lens,
const std::optional<torch::Tensor>& query_start_loc, int max_context_len,
torch::Tensor& block_tables, torch::Tensor& seq_lens,
const std::optional<torch::Tensor>& query_start_loc, int max_seq_len,
const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale,
torch::Tensor& v_scale) {
int num_seqs = block_tables.size(0);
@ -3263,7 +3259,7 @@ void paged_attention_custom_launcher_navi(
KVT* key_cache_ptr = reinterpret_cast<KVT*>(key_cache.data_ptr());
KVT* value_cache_ptr = reinterpret_cast<KVT*>(value_cache.data_ptr());
int* block_tables_ptr = block_tables.data_ptr<int>();
int* context_lens_ptr = context_lens.data_ptr<int>();
int* seq_lens_ptr = seq_lens.data_ptr<int>();
const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());
@ -3271,11 +3267,10 @@ void paged_attention_custom_launcher_navi(
const auto fp8_out_scale_ptr = nullptr;
OUTT* out_ptr = reinterpret_cast<OUTT*>(out.data_ptr());
const int max_ctx_blocks = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE);
const int max_ctx_blocks = DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE);
constexpr int PARTITION_SIZE = 256;
const int max_num_partitions =
DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
const int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
const int gqa_ratio = num_heads / num_kv_heads;
assert(num_heads % num_kv_heads == 0);
assert(head_size == HEAD_SIZE);
@ -3407,14 +3402,14 @@ void paged_attention_custom_launcher_navi(
paged_attention_custom_launcher<T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \
OUTT, PSIZE, ALIBI_ENABLED>( \
out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
num_kv_heads, scale, block_tables, context_lens, query_start_loc, \
max_context_len, alibi_slopes, k_scale, v_scale, fp8_out_scale); \
num_kv_heads, scale, block_tables, seq_lens, query_start_loc, \
max_seq_len, alibi_slopes, k_scale, v_scale, fp8_out_scale); \
} else { \
paged_attention_custom_launcher_navi< \
T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, ALIBI_ENABLED>( \
out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
num_kv_heads, scale, block_tables, context_lens, query_start_loc, \
max_context_len, alibi_slopes, k_scale, v_scale); \
num_kv_heads, scale, block_tables, seq_lens, query_start_loc, \
max_seq_len, alibi_slopes, k_scale, v_scale); \
}
#define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \
@ -3502,9 +3497,9 @@ void paged_attention(
int64_t num_kv_heads,
double scale,
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
torch::Tensor& context_lens, // [num_seqs]
torch::Tensor& seq_lens, // [num_seqs]
const std::optional<torch::Tensor>& query_start_loc, // [num_seqs]
int64_t block_size, int64_t max_context_len,
int64_t block_size, int64_t max_seq_len,
const std::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
torch::Tensor& v_scale,

View File

@ -15,8 +15,8 @@ void paged_attention(
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
torch::Tensor& block_tables, torch::Tensor& context_lens,
torch::Tensor& block_tables, torch::Tensor& seq_lens,
const std::optional<torch::Tensor>& query_start_loc, int64_t block_size,
int64_t max_context_len, const std::optional<torch::Tensor>& alibi_slopes,
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
torch::Tensor& v_scale, const std::optional<torch::Tensor>& fp8_out_scale);

View File

@ -41,10 +41,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
" Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads,"
" float scale, Tensor block_tables,"
" Tensor context_lens,"
" Tensor seq_lens,"
" Tensor? query_start_loc,"
" int block_size,"
" int max_context_len,"
" int max_seq_len,"
" Tensor? alibi_slopes,"
" str kv_cache_dtype,"
" Tensor k_scale, Tensor v_scale,"

View File

@ -210,16 +210,7 @@ ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# Flag to control whether to use pre-built vLLM wheels
ARG VLLM_USE_PRECOMPILED
# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed
ENV VLLM_USE_PRECOMPILED=""
RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
export VLLM_USE_PRECOMPILED=1 && \
echo "Using precompiled wheels"; \
else \
unset VLLM_USE_PRECOMPILED && \
echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
fi
ARG VLLM_USE_PRECOMPILED=""
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
@ -236,6 +227,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
&& export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \
@ -249,6 +242,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Clean any existing CMake artifacts
rm -rf .deps && \
mkdir -p .deps && \
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
@ -392,7 +387,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
# We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel.
ARG FLASHINFER_GIT_REF="v0.2.10"
ARG FLASHINFER_GIT_REF="v0.2.11"
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
. /etc/environment
git clone --depth 1 --recursive --shallow-submodules \

View File

@ -1,9 +1,12 @@
# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually.
FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base
FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base
RUN rm /etc/apt/sources.list.d/intel-graphics.list
RUN apt-get update -y && \
RUN apt clean && apt-get update -y && \
apt-get install -y software-properties-common && \
add-apt-repository ppa:deadsnakes/ppa && \
apt-get install -y python3.10 python3.10-distutils && \
curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \
apt-get install -y --no-install-recommends --fix-missing \
curl \
ffmpeg \
@ -14,11 +17,13 @@ RUN apt-get update -y && \
libgl1 \
lsb-release \
numactl \
python3 \
python3-dev \
python3-pip \
python3.10-dev \
wget
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1
WORKDIR /workspace/vllm
COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt
COPY requirements/common.txt /workspace/vllm/requirements/common.txt

View File

@ -1,25 +1,17 @@
nav:
- Home:
- vLLM: README.md
- Home: README.md
- User Guide:
- usage/README.md
- Getting Started:
- getting_started/quickstart.md
- getting_started/installation
- Examples:
- examples/README.md
- Offline Inference: examples/offline_inference
- Online Serving: examples/online_serving
- Others: examples/others
- Quick Links:
- User Guide: usage/README.md
- Developer Guide: contributing/README.md
- API Reference: api/README.md
- CLI Reference: cli/README.md
- Timeline:
- Roadmap: https://roadmap.vllm.ai
- Releases: https://github.com/vllm-project/vllm/releases
- User Guide:
- Summary: usage/README.md
- usage/v1_guide.md
- General:
- usage/v1_guide.md
- usage/*
- Inference and Serving:
- serving/offline_inference.md
@ -32,7 +24,7 @@ nav:
- deployment/integrations
- Training: training
- Configuration:
- Summary: configuration/README.md
- configuration/README.md
- configuration/*
- Models:
- models/supported_models.md
@ -45,11 +37,11 @@ nav:
- features/*
- features/quantization
- Developer Guide:
- Summary: contributing/README.md
- contributing/README.md
- General:
- glob: contributing/*
flatten_single_child_sections: true
- Model Implementation:
- Model Implementation:
- contributing/model/README.md
- contributing/model/basic.md
- contributing/model/registration.md
@ -58,12 +50,9 @@ nav:
- CI: contributing/ci
- Design Documents: design
- API Reference:
- Summary: api/README.md
- Contents:
- glob: api/vllm/*
preserve_directory_names: true
- CLI Reference:
- Summary: cli/README.md
- api/README.md
- api/vllm/*
- CLI Reference: cli
- Community:
- community/*
- Blog: https://blog.vllm.ai

View File

@ -21,6 +21,17 @@ vLLM is a fast and easy-to-use library for LLM inference and serving.
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
Where to get started with vLLM depends on the type of user. If you are looking to:
- Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md)
- Build applications with vLLM, we recommend starting with the [User Guide](./usage)
- Build vLLM, we recommend starting with [Developer Guide](./contributing)
For information about the development of vLLM, see:
- [Roadmap](https://roadmap.vllm.ai)
- [Releases](https://github.com/vllm-project/vllm/releases)
vLLM is fast with:
- State-of-the-art serving throughput

View File

@ -1,7 +1,5 @@
# Summary
[](){ #configuration }
## Configuration
API documentation for vLLM's configuration classes.

1
docs/cli/.meta.yml Normal file
View File

@ -0,0 +1 @@
toc_depth: 3

8
docs/cli/.nav.yml Normal file
View File

@ -0,0 +1,8 @@
nav:
- README.md
- serve.md
- chat.md
- complete.md
- run-batch.md
- vllm bench:
- bench/*.md

View File

@ -1,7 +1,3 @@
---
toc_depth: 4
---
# vLLM CLI Guide
The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
@ -18,37 +14,46 @@ vllm {chat,complete,serve,bench,collect-env,run-batch}
## serve
Start the vLLM OpenAI Compatible API server.
Starts the vLLM OpenAI Compatible API server.
??? console "Examples"
Start with a model:
```bash
# Start with a model
vllm serve meta-llama/Llama-2-7b-hf
```bash
vllm serve meta-llama/Llama-2-7b-hf
```
# Specify the port
vllm serve meta-llama/Llama-2-7b-hf --port 8100
Specify the port:
# Check with --help for more options
# To list all groups
vllm serve --help=listgroup
```bash
vllm serve meta-llama/Llama-2-7b-hf --port 8100
```
# To view a argument group
vllm serve --help=ModelConfig
Serve over a Unix domain socket:
# To view a single argument
vllm serve --help=max-num-seqs
```bash
vllm serve meta-llama/Llama-2-7b-hf --uds /tmp/vllm.sock
```
# To search by keyword
vllm serve --help=max
Check with --help for more options:
# To view full help with pager (less/more)
vllm serve --help=page
```
```bash
# To list all groups
vllm serve --help=listgroup
### Options
# To view a argument group
vllm serve --help=ModelConfig
--8<-- "docs/argparse/serve.md"
# To view a single argument
vllm serve --help=max-num-seqs
# To search by keyword
vllm serve --help=max
# To view full help with pager (less/more)
vllm serve --help=page
```
See [vllm serve](./serve.md) for the full reference of all available arguments.
## chat
@ -65,6 +70,8 @@ vllm chat --url http://{vllm-serve-host}:{vllm-serve-port}/v1
vllm chat --quick "hi"
```
See [vllm chat](./chat.md) for the full reference of all available arguments.
## complete
Generate text completions based on the given prompt via the running API server.
@ -80,7 +87,7 @@ vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1
vllm complete --quick "The future of AI is"
```
</details>
See [vllm complete](./complete.md) for the full reference of all available arguments.
## bench
@ -107,6 +114,8 @@ vllm bench latency \
--load-format dummy
```
See [vllm bench latency](./bench/latency.md) for the full reference of all available arguments.
### serve
Benchmark the online serving throughput.
@ -121,6 +130,8 @@ vllm bench serve \
--num-prompts 5
```
See [vllm bench serve](./bench/serve.md) for the full reference of all available arguments.
### throughput
Benchmark offline inference throughput.
@ -134,6 +145,8 @@ vllm bench throughput \
--load-format dummy
```
See [vllm bench throughput](./bench/throughput.md) for the full reference of all available arguments.
## collect-env
Start collecting environment information.
@ -146,24 +159,25 @@ vllm collect-env
Run batch prompts and write results to file.
<details>
<summary>Examples</summary>
Running with a local file:
```bash
# Running with a local file
vllm run-batch \
-i offline_inference/openai_batch/openai_example_batch.jsonl \
-o results.jsonl \
--model meta-llama/Meta-Llama-3-8B-Instruct
```
# Using remote file
Using remote file:
```bash
vllm run-batch \
-i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
-o results.jsonl \
--model meta-llama/Meta-Llama-3-8B-Instruct
```
</details>
See [vllm run-batch](./run-batch.md) for the full reference of all available arguments.
## More Help

View File

@ -0,0 +1,9 @@
# vllm bench latency
## JSON CLI Arguments
--8<-- "docs/cli/json_tip.inc.md"
## Options
--8<-- "docs/argparse/bench_latency.md"

9
docs/cli/bench/serve.md Normal file
View File

@ -0,0 +1,9 @@
# vllm bench serve
## JSON CLI Arguments
--8<-- "docs/cli/json_tip.inc.md"
## Options
--8<-- "docs/argparse/bench_serve.md"

View File

@ -0,0 +1,9 @@
# vllm bench throughput
## JSON CLI Arguments
--8<-- "docs/cli/json_tip.inc.md"
## Options
--8<-- "docs/argparse/bench_throughput.md"

5
docs/cli/chat.md Normal file
View File

@ -0,0 +1,5 @@
# vllm chat
## Options
--8<-- "docs/argparse/chat.md"

5
docs/cli/complete.md Normal file
View File

@ -0,0 +1,5 @@
# vllm complete
## Options
--8<-- "docs/argparse/complete.md"

9
docs/cli/json_tip.inc.md Normal file
View File

@ -0,0 +1,9 @@
When passing JSON CLI arguments, the following sets of arguments are equivalent:
- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`
- `--json-arg.key1 value1 --json-arg.key2.key3 value2`
Additionally, list elements can be passed individually using `+`:
- `--json-arg '{"key4": ["value3", "value4", "value5"]}'`
- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`

9
docs/cli/run-batch.md Normal file
View File

@ -0,0 +1,9 @@
# vllm run-batch
## JSON CLI Arguments
--8<-- "docs/cli/json_tip.inc.md"
## Options
--8<-- "docs/argparse/run-batch.md"

9
docs/cli/serve.md Normal file
View File

@ -0,0 +1,9 @@
# vllm serve
## JSON CLI Arguments
--8<-- "docs/cli/json_tip.inc.md"
## Options
--8<-- "docs/argparse/serve.md"

View File

@ -2,6 +2,7 @@
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
- [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA), August 2nd 2025. [[Slides]](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) [[Recording]](https://www.chaspark.com/#/live/1166916873711665152).
- [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing)
- [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama), March 27th 2025. [[Slides]](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).

View File

@ -15,6 +15,7 @@ Cash Donations:
Compute Resources:
- Alibaba Cloud
- AMD
- Anyscale
- AWS

View File

@ -11,6 +11,8 @@ Engine arguments control the behavior of the vLLM engine.
The engine argument classes, [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs], are a combination of the configuration classes defined in [vllm.config][]. Therefore, if you are interested in developer documentation, we recommend looking at these configuration classes as they are the source of truth for types, defaults and docstrings.
--8<-- "docs/cli/json_tip.inc.md"
## `EngineArgs`
--8<-- "docs/argparse/engine_args.md"

View File

@ -96,7 +96,7 @@ Although its common to do this with GPUs, don't try to fragment 2 or 8 differ
### Tune your workloads
Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](gh-file:benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
### Future Topics We'll Cover

View File

@ -131,19 +131,6 @@ MAX_JOBS=16 uv pip install --system \
--no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
```
### Mamba
```bash
uv pip install --system \
--no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5"
```
### causal-conv1d
```bash
uv pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
```
## Update all the different vLLM platforms
Rather than attempting to update all vLLM platforms in a single pull request, it's more manageable

View File

@ -117,7 +117,7 @@ For models with interleaving sliding windows (e.g. `google/gemma-2-2b-it` and `m
To support a model with interleaving sliding windows, we need to take care of the following details:
- Make sure the model's `config.json` contains `sliding_window_pattern`. vLLM then sets `self.hf_text_config.interleaved_sliding_window` to the value of `self.hf_text_config.sliding_window` and deletes `sliding_window` from `self.hf_text_config`. The model will then be treated as a full-attention model.
- Make sure the model's `config.json` contains `layer_types`.
- In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171).
With these two steps, interleave sliding windows should work with the model.

View File

@ -540,8 +540,10 @@ return a schema of the tensors outputted by the HF processor that are related to
The shape of `image_patches` outputted by `FuyuImageProcessor` is therefore
`(1, num_images, num_patches, patch_width * patch_height * num_channels)`.
In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA,
we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]:
In order to support the use of
[MultiModalFieldConfig.batched][vllm.multimodal.inputs.MultiModalFieldConfig.batched]
like in LLaVA, we remove the extra batch dimension by overriding
[BaseMultiModalProcessor._call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor]:
??? code
@ -816,7 +818,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
After you have defined [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo] (Step 2),
[BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] (Step 3),
and [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] (Step 4),
decorate the model class with [MULTIMODAL_REGISTRY.register_processor][vllm.multimodal.processing.MultiModalRegistry.register_processor]
decorate the model class with [MULTIMODAL_REGISTRY.register_processor][vllm.multimodal.registry.MultiModalRegistry.register_processor]
to register them to the multi-modal registry:
```diff

View File

@ -57,11 +57,11 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics`
- `vllm:spec_decode_num_draft_tokens_total` (Counter)
- `vllm:spec_decode_num_emitted_tokens_total` (Counter)
These are documented under [Inferencing and Serving -> Production Metrics](../../usage/metrics.md).
These are documented under [Inferencing and Serving -> Production Metrics](../usage/metrics.md).
### Grafana Dashboard
vLLM also provides [a reference example](../../examples/online_serving/prometheus_grafana.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
vLLM also provides [a reference example](../examples/online_serving/prometheus_grafana.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:
@ -455,7 +455,7 @@ In general:
[an escape hatch](https://kubernetes.io/docs/concepts/cluster-administration/system-metrics/#show-hidden-metrics)
for some time before deleting them.
See the [deprecation policy](../../contributing/deprecation_policy.md) for
See the [deprecation policy](../contributing/deprecation_policy.md) for
the project-wide deprecation policy.
### Unimplemented - `vllm:tokens_total`
@ -655,7 +655,7 @@ v0 has support for OpenTelemetry tracing:
- Added by <gh-pr:4687>
- Configured with `--oltp-traces-endpoint` and `--collect-detailed-traces`
- [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/)
- [User-facing docs](../../examples/online_serving/opentelemetry.md)
- [User-facing docs](../examples/online_serving/opentelemetry.md)
- [Blog post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f)
- [IBM product docs](https://www.ibm.com/docs/en/instana-observability/current?topic=mgaa-monitoring-large-language-models-llms-vllm-public-preview)

7
docs/examples/README.md Normal file
View File

@ -0,0 +1,7 @@
# Examples
vLLM's examples are split into three categories:
- If you are using vLLM from within Python code, see [Offline Inference](./offline_inference/)
- If you are using vLLM from an HTTP application or client, see [Online Serving](./online_serving/)
- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see [Others](./others/)

View File

@ -1,7 +1,4 @@
---
title: FP8 INC
---
[](){ #inc }
# FP8 INC
vLLM supports FP8 (8-bit floating point) weight and activation quantization using Intel® Neural Compressor (INC) on Intel® Gaudi® 2 and Intel® Gaudi® 3 AI accelerators.
Currently, quantization is validated only in Llama models.

View File

@ -0,0 +1,80 @@
# Sleep Mode
vLLM's Sleep Mode allows you to temporarily release most GPU memory used by a model, including model weights and KV cache, without stopping the server or unloading the Docker container. This is especially useful for RLHF, training, or cost-saving scenarios where GPU resources need to be freed between inference workloads.
Key benefits:
- **Frees GPU memory**: Offloads model weights to CPU RAM and discards KV cache, releasing up to 90%+ of GPU memory for other tasks.
- **Fast resume**: Quickly wake up the engine and resume inference without full model reload.
- **API endpoints**: Control sleep/wake_up state via HTTP endpoints or Python API.
- **Supports distributed workloads**: Works with tensor parallelism, pipeline parallelism, etc.
- **Fine-grained control**: Optionally wake up only model weights or KV cache to avoid OOM during weight updates.
!!! note
This feature is only supported on CUDA platform.
## Sleep levels
Level 1 sleep will offload the model weights and discard the KV cache. The content of KV cache is forgotten. Level 1 sleep is good for sleeping and waking up the engine to run the same model again. The model weights are backed up in CPU memory. Please make sure there's enough CPU memory to store the model weights. Level 2 sleep will discard both the model weights and the KV cache (while the model's buffers are kept in CPU, like rope scaling tensors). The content of both the model weights and KV cache is forgotten. Level 2 sleep is good for sleeping and waking up the engine to run a different model or update the model, where previous model weights are not needed, e.g. RLHF weight update.
## Usage
### Offline inference
Enable sleep mode by passing `enable_sleep_mode=True` to the `LLM` class.
```python
from vllm import LLM
llm = LLM("Qwen/Qwen3-0.6B", enable_sleep_mode=True)
```
#### Python API
```python
# Put the engine to sleep (level=1: offload weights to CPU RAM, discard KV cache)
llm.sleep(level=1)
# Wake up the engine (restore weights)
llm.wake_up()
```
#### RLHF weight updates
During RLHF training, vLLM allows you to selectively wake up only the model weights or the KV cache using the tags argument in wake_up(). This fine-grained control is especially useful when updating model weights: by waking up just the weights (e.g., llm.wake_up(tags=["weights"])), you avoid allocating memory for the KV cache until after the weight update is complete. This approach helps prevent GPU out-of-memory (OOM) errors, particularly with large models, by minimizing peak memory usage during weight synchronization and update operations.
Use `tags=["weights"]` or `tags=["kv_cache"]` to control which resources are restored, useful for RLHF and weight updates. **Note** that `is_sleeping` will report `true` until all components are awake.
```python
# Put engine to deep sleep (level=2)
llm.sleep(level=2)
# ... Get the new weights
# Wake up only weights to avoid OOM
llm.wake_up(tags=["weights"])
# ... Update the weights
# wake up KV cache after weights are updated
llm.wake_up(tags=["kv_cache"])
```
### Online Serving
To enable sleep mode in a vLLM server you need to initialize it with the flag `VLLM_SERVER_DEV_MODE=1` and pass `--enable-sleep-mode` to the vLLM server.
#### Server in development mode
When using the flag `VLLM_SERVER_DEV_MODE=1` you enable development endpoints, and these endpoints should not be exposed to users.
```bash
VLLM_SERVER_DEV_MODE=1 python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen3-0.6B \
--enable-sleep-mode \
--port 8000
```
#### HTTP endpoints
- `POST /sleep?level=1` — Put the model to sleep (`level=1`).
- `POST /wake_up` — Wake up the model. Supports optional `tags` query parameters for partial wake-up (e.g., `?tags=weights`).
- `GET /is_sleeping` — Check if the model is sleeping.
!!! note
These endpoints are only available when passing `VLLM_SERVER_DEV_MODE=1`.

View File

@ -203,6 +203,7 @@ an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"draft_tensor_parallel_size": 1,
"num_speculative_tokens": 2,
"method": "eagle",
},
)
@ -231,6 +232,9 @@ A few important things to consider when using the EAGLE based draft models:
reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
investigation and tracked here: <gh-issue:9565>.
4. When using EAGLE-3 based draft model, option "method" must be set to "eagle3".
That is, to specify `"method": "eagle3"` in `speculative_config`.
A variety of EAGLE draft models are available on the Hugging Face hub:
| Base Model | EAGLE on Hugging Face | # EAGLE Parameters |

View File

@ -6,7 +6,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
# --8<-- [start:requirements]
- OS: Linux
- CPU flags: `avx512f`, `avx512_bf16` (Optional), `avx512_vnni` (Optional)
- CPU flags: `avx512f` (Recommended), `avx512_bf16` (Optional), `avx512_vnni` (Optional)
!!! tip
Use `lscpu` to check the CPU flags.
@ -28,7 +28,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
[https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo)
!!! warning
If deploying the pre-built images on machines only contain `avx512f`, `Illegal instruction` error may be raised. It is recommended to build images for these machines with `--build-arg VLLM_CPU_AVX512BF16=false` and `--build-arg VLLM_CPU_AVX512VNNI=false`.
If deploying the pre-built images on machines without `avx512f`, `avx512_bf16`, or `avx512_vnni` support, an `Illegal instruction` error may be raised. It is recommended to build images for these machines with the appropriate build arguments (e.g., `--build-arg VLLM_CPU_DISABLE_AVX512=true`, `--build-arg VLLM_CPU_AVX512BF16=false`, or `--build-arg VLLM_CPU_AVX512VNNI=false`) to disable unsupported features. Please note that without `avx512f`, AVX2 will be used and this version is not recommended because it only has basic feature support.
# --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source]
@ -37,6 +37,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
docker build -f docker/Dockerfile.cpu \
--build-arg VLLM_CPU_AVX512BF16=false (default)|true \
--build-arg VLLM_CPU_AVX512VNNI=false (default)|true \
--build-arg VLLM_CPU_DISABLE_AVX512=false (default)|true \
--tag vllm-cpu-env \
--target vllm-openai .

View File

@ -15,8 +15,14 @@ sys.modules["aiohttp"] = MagicMock()
sys.modules["blake3"] = MagicMock()
sys.modules["vllm._C"] = MagicMock()
from vllm.benchmarks import latency # noqa: E402
from vllm.benchmarks import serve # noqa: E402
from vllm.benchmarks import throughput # noqa: E402
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402
from vllm.entrypoints.openai.cli_args import make_arg_parser # noqa: E402
from vllm.entrypoints.cli.openai import ChatCommand # noqa: E402
from vllm.entrypoints.cli.openai import CompleteCommand # noqa: E402
from vllm.entrypoints.openai import cli_args # noqa: E402
from vllm.entrypoints.openai import run_batch # noqa: E402
from vllm.utils import FlexibleArgumentParser # noqa: E402
logger = logging.getLogger("mkdocs")
@ -68,7 +74,8 @@ class MarkdownFormatter(HelpFormatter):
self._markdown_output.append(
f"Possible choices: {metavar}\n\n")
self._markdown_output.append(f"{action.help}\n\n")
if action.help:
self._markdown_output.append(f"{action.help}\n\n")
if (default := action.default) != SUPPRESS:
self._markdown_output.append(f"Default: `{default}`\n\n")
@ -78,7 +85,7 @@ class MarkdownFormatter(HelpFormatter):
return "".join(self._markdown_output)
def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
def create_parser(add_cli_args, **kwargs) -> FlexibleArgumentParser:
"""Create a parser for the given class with markdown formatting.
Args:
@ -88,18 +95,12 @@ def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
Returns:
FlexibleArgumentParser: A parser with markdown formatting for the class.
"""
parser = FlexibleArgumentParser()
parser = FlexibleArgumentParser(add_json_tip=False)
parser.formatter_class = MarkdownFormatter
with patch("vllm.config.DeviceConfig.__post_init__"):
return cls.add_cli_args(parser, **kwargs)
def create_serve_parser() -> FlexibleArgumentParser:
"""Create a parser for the serve command with markdown formatting."""
parser = FlexibleArgumentParser()
parser.formatter_class = lambda prog: MarkdownFormatter(
prog, starting_heading_level=4)
return make_arg_parser(parser)
_parser = add_cli_args(parser, **kwargs)
# add_cli_args might be in-place so return parser if _parser is None
return _parser or parser
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
@ -113,10 +114,24 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
# Create parsers to document
parsers = {
"engine_args": create_parser(EngineArgs),
"async_engine_args": create_parser(AsyncEngineArgs,
async_args_only=True),
"serve": create_serve_parser(),
"engine_args":
create_parser(EngineArgs.add_cli_args),
"async_engine_args":
create_parser(AsyncEngineArgs.add_cli_args, async_args_only=True),
"serve":
create_parser(cli_args.make_arg_parser),
"chat":
create_parser(ChatCommand.add_cli_args),
"complete":
create_parser(CompleteCommand.add_cli_args),
"bench_latency":
create_parser(latency.add_cli_args),
"bench_throughput":
create_parser(throughput.add_cli_args),
"bench_serve":
create_parser(serve.add_cli_args),
"run-batch":
create_parser(run_batch.make_arg_parser),
}
# Generate documentation for each parser

View File

@ -105,7 +105,7 @@ class Example:
return fix_case(self.path.stem.replace("_", " ").title())
def generate(self) -> str:
content = f"---\ntitle: {self.title}\n---\n\n"
content = f"# {self.title}\n\n"
content += f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n"
# Use long code fence to avoid issues with

View File

@ -23,6 +23,13 @@ a:not(:has(svg)):not(.md-icon):not(.autorefs-external) {
}
}
a[href*="localhost"]::after,
a[href*="127.0.0.1"]::after,
a[href*="org.readthedocs.build"]::after,
a[href*="docs.vllm.ai"]::after {
display: none !important;
}
/* Light mode: darker section titles */
body[data-md-color-scheme="default"] .md-nav__item--section > label.md-nav__link .md-ellipsis {
color: rgba(0, 0, 0, 0.7) !important;

View File

@ -4,7 +4,7 @@ vLLM provides first-class support for generative models, which covers most of LL
In vLLM, generative models implement the[VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface.
Based on the final hidden states of the input, these models output log probabilities of the tokens to generate,
which are then passed through [Sampler][vllm.model_executor.layers.Sampler] to obtain the final text.
which are then passed through [Sampler][vllm.model_executor.layers.sampler.Sampler] to obtain the final text.
## Configuration
@ -19,7 +19,7 @@ Run a model in generation mode via the option `--runner generate`.
## Offline Inference
The [LLM][vllm.LLM] class provides various methods for offline inference.
See [configuration][configuration] for a list of options when initializing the model.
See [configuration](../api/summary.md#configuration) for a list of options when initializing the model.
### `LLM.generate`

View File

@ -81,7 +81,7 @@ which takes priority over both the model's and Sentence Transformers's defaults.
## Offline Inference
The [LLM][vllm.LLM] class provides various methods for offline inference.
See [configuration][configuration] for a list of options when initializing the model.
See [configuration](../api/summary.md#configuration) for a list of options when initializing the model.
### `LLM.embed`

View File

@ -320,7 +320,7 @@ th {
}
</style>
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ | ✅︎ |
@ -331,7 +331,7 @@ th {
| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | |
| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | |
| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ |
| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | | ✅︎ | ✅︎ |
@ -349,9 +349,10 @@ th {
| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Gemma3nForConditionalGeneration` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Glm4MoeForCausalLM` | GLM-4.5 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | ✅︎ |
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ |
@ -404,16 +405,19 @@ th {
| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | |
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | |
| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | ✅︎ |
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | ✅︎ |
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | ✅︎ |
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
| `SmolLM3ForCausalLM` | SmolLM3 | `HuggingFaceTB/SmolLM3-3B` | ✅︎ | ✅︎ | ✅︎ |
!!! note
Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.
!!! note
Only text inputs are currently supported for `Gemma3nForConditionalGeneration`. To use this model, please upgrade Hugging Face Transformers to version 4.53.0.
### Pooling Models
See [this page](./pooling_models.md) for more information on how to use pooling models.
@ -426,7 +430,7 @@ See [this page](./pooling_models.md) for more information on how to use pooling
These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) API.
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
| `BertModel`<sup>C</sup> | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | |
| `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | | ✅︎ |
@ -466,7 +470,7 @@ of the whole prompt are extracted from the normalized hidden state corresponding
These models primarily support the [`LLM.classify`](./pooling_models.md#llmclassify) API.
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | |
| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ |
@ -483,7 +487,7 @@ If your model is not in the above list, we will try to automatically convert the
Cross-encoder and reranker models are a subset of classification models that accept two prompts as input.
These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API.
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | |
| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | ✅︎ |
@ -521,7 +525,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
These models primarily support the [`LLM.reward`](./pooling_models.md#llmreward) API.
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `LlamaForCausalLM`<sup>C</sup> | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | ✅︎ |
@ -583,6 +587,9 @@ See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inp
**This is no longer required if you are using vLLM V1.**
!!! tip
For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g, `--limit-mm-per-prompt '{"image":0}`) so that their multimodal modules will not be loaded to free up more GPU memory for KV cache.
!!! note
vLLM currently only supports adding LoRA to the language backbone of multimodal models.
@ -594,20 +601,21 @@ See [this page](generative_models.md) for more information on how to use generat
These models primarily accept the [`LLM.generate`](./generative_models.md#llmgenerate) API. Chat/Instruct models additionally support the [`LLM.chat`](./generative_models.md#llmchat) API.
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------|
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | | ✅︎ |
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ |
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ |
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | ✅︎ |
| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ |
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ |
| `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | |
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ |
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Glm4MoeForCausalLM` | GLM-4.5 | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Glm4v_moeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
@ -647,7 +655,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------|
| `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ | ✅︎ |
@ -674,6 +682,15 @@ Some models are supported only via the [Transformers backend](#transformers). Th
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
!!! note
`Gemma3nForConditionalGeneration` is only supported on V1 due to shared KV caching and it depends on `timm>=1.0.17` to make use of its
MobileNet-v5 vision backbone.
Performance is not yet fully optimized mainly due to:
- Both audio and vision MM encoders use `transformers.AutoModel` implementation.
- There's no PLE caching or out-of-memory swapping support, as described in [Google's blog](https://developers.googleblog.com/en/introducing-gemma-3n/). These features might be too model-specific for vLLM, and swapping in particular may be better suited for constrained setups.
!!! note
Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently.
@ -726,7 +743,7 @@ Some models are supported only via the [Transformers backend](#transformers). Th
Speech2Text models trained specifically for Automatic Speech Recognition.
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | |
| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | | ✅︎ | ✅︎ |
@ -744,7 +761,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
The following table lists those that are tested in vLLM.
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------|
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | | |
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ | |
@ -760,7 +777,7 @@ The following table lists those that are tested in vLLM.
Cross-encoder and reranker models are a subset of classification models that accept two prompts as input.
These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API.
| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------|
| `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | | | ✅︎ |

View File

@ -1,4 +1,4 @@
# Distributed inference and serving
# Parallelism and Scaling
## Distributed inference strategies for a single-model replica

View File

@ -1,6 +1,8 @@
# Using vLLM
vLLM supports the following usage patterns:
First, vLLM must be [installed](../getting_started/installation) for your chosen device in either a Python or Docker environment.
Then, vLLM supports the following usage patterns:
- [Inference and Serving](../serving/offline_inference.md): Run a single instance of a model.
- [Deployment](../deployment/docker.md): Scale up model instances for production.

View File

@ -289,7 +289,7 @@ Traceback (most recent call last):
...
```
This indicates vLLM failed to initialize the NCCL communicator, possibly due to a missing `IPC_LOCK` linux capability or an unmounted `/dev/shm`. Refer to [Distributed Inference and Serving](../serving/distributed_serving.md#running-vllm-on-multiple-nodes) for guidance on properly configuring the environment for distributed serving.
This indicates vLLM failed to initialize the NCCL communicator, possibly due to a missing `IPC_LOCK` linux capability or an unmounted `/dev/shm`. Refer to [Enabling GPUDirect RDMA](../serving/parallelism_scaling.md#enabling-gpudirect-rdma) for guidance on properly configuring the environment for GPUDirect RDMA.
## Known Issues

View File

@ -59,12 +59,13 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
### Hardware
| Hardware | Status |
|------------|------------------------------------|
| **NVIDIA** | <nobr>🚀</nobr> |
| **AMD** | <nobr>🟢</nobr> |
| **TPU** | <nobr>🟢</nobr> |
| **CPU** | <nobr>🟢 (x86) 🟡 (MacOS) </nobr> |
| Hardware | Status |
|------------|-----------------------------------------------|
| **NVIDIA** | <nobr>🚀</nobr> |
| **AMD** | <nobr>🟢</nobr> |
| **INTEL GPU** | <nobr>🟢</nobr> |
| **TPU** | <nobr>🟢</nobr> |
| **CPU** | <nobr>🟢 (x86\_64/aarch64) 🟡 (MacOS) </nobr> |
!!! note
@ -72,6 +73,7 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
- [vllm-ascend](https://github.com/vllm-project/vllm-ascend)
- [vllm-spyre](https://github.com/vllm-project/vllm-spyre)
- [vllm-gaudi](https://github.com/vllm-project/vllm-gaudi)
- [vllm-openvino](https://github.com/vllm-project/vllm-openvino)
Please check their corresponding repositories for more details.
@ -111,6 +113,10 @@ Models that combine Mamba-2 and Mamba-1 layers with standard attention layers ar
`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). Please note that
these models currently require disabling prefix caching and using the FlashInfer attention backend in V1.
Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`).
Please note that these models currently require disabling prefix caching, enforcing eager mode, and using the FlashInfer
attention backend in V1.
#### Encoder-Decoder Models
Models requiring cross-attention between separate encoder and decoder (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`)

View File

@ -96,6 +96,25 @@ def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
)
# Gemma3N
def run_gemma3n(question: str, audio_count: int) -> ModelRequestData:
model_name = "google/gemma-3n-E2B-it"
engine_args = EngineArgs(
model=model_name,
max_model_len=2048,
max_num_batched_tokens=2048,
max_num_seqs=2,
limit_mm_per_prompt={"audio": audio_count},
enforce_eager=True,
)
prompt = f"<start_of_turn>user\n<audio_soft_token>{question}"
"<end_of_turn>\n<start_of_turn>model\n"
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
)
# Granite Speech
def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
# NOTE - the setting in this example are somehat different than what is
@ -331,6 +350,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
model_example_map = {
"voxtral": run_voxtral,
"gemma3n": run_gemma3n,
"granite_speech": run_granite_speech,
"minicpmo": run_minicpmo,
"phi4_mm": run_phi4mm,

View File

@ -126,6 +126,29 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
)
def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "CohereLabs/command-a-vision-07-2025"
engine_args = EngineArgs(
model=model_name,
max_model_len=32768,
tensor_parallel_size=4,
limit_mm_per_prompt={modality: 1},
)
prompts = [
f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><|IMG_PATCH|>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Deepseek-VL2
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -211,7 +234,33 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
)
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Gemma3N
def run_gemma3n(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "google/gemma-3n-E2B-it"
engine_args = EngineArgs(
model=model_name,
max_model_len=2048,
max_num_seqs=2,
limit_mm_per_prompt={modality: 1},
enforce_eager=True,
)
prompts = [
(
"<start_of_turn>user\n"
f"<image_soft_token>{question}<end_of_turn>\n"
"<start_of_turn>model\n"
)
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
@ -1391,10 +1440,12 @@ model_example_map = {
"aya_vision": run_aya_vision,
"blip-2": run_blip2,
"chameleon": run_chameleon,
"command_a_vision": run_command_a_vision,
"deepseek_vl_v2": run_deepseek_vl2,
"florence2": run_florence2,
"fuyu": run_fuyu,
"gemma3": run_gemma3,
"gemma3n": run_gemma3n,
"glm4v": run_glm4v,
"glm4_1v": run_glm4_1v,
"h2ovl_chat": run_h2ovl,

View File

@ -107,6 +107,42 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_command_a_vision(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "CohereLabs/command-a-vision-07-2025"
# NOTE: This model is 122B parameters and requires tensor parallelism
# Recommended to use tp=4 on H100 GPUs
engine_args = EngineArgs(
model=model_name,
max_model_len=32768,
tensor_parallel_size=4,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [
{
"role": "user",
"content": [
*placeholders,
{"type": "text", "text": question},
],
}
]
processor = AutoProcessor.from_pretrained(model_name)
prompt = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
)
def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "deepseek-ai/deepseek-vl2-tiny"
@ -1031,6 +1067,7 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
model_example_map = {
"aria": load_aria,
"aya_vision": load_aya_vision,
"command_a_vision": load_command_a_vision,
"deepseek_vl_v2": load_deepseek_vl2,
"gemma3": load_gemma3,
"h2ovl_chat": load_h2ovl,

View File

@ -15,6 +15,14 @@ else
MODEL=$2
fi
# The prefillers and decoders in LMCache use the same hash seed for all chunk keys.
# This seed must be aligned so that decoders can identify and retrieve KV cache
# entries stored by prefillers.
#
# WARNING: Using a fixed hash seed is insecure and makes the application vulnerable to
# denial-of-service attacks. In a production environment, this should be set to a
# secure random value. This is set to a fixed value for demonstration purposes only.
export PYTHONHASHSEED=${VLLM_PYTHON_HASH_SEED:-123}
if [[ $1 == "prefiller" ]]; then
# Prefiller listens on port 8100

View File

@ -34,11 +34,13 @@ theme:
- content.action.edit
- content.code.copy
- content.tabs.link
- navigation.instant
- navigation.instant.progress
- navigation.tracking
- navigation.tabs
- navigation.tabs.sticky
- navigation.sections
- navigation.prune
- navigation.indexes
- navigation.top
- search.highlight
- search.share
@ -51,11 +53,6 @@ hooks:
- docs/mkdocs/hooks/generate_argparse.py
- docs/mkdocs/hooks/url_schemes.py
# Required to stop api-autonav from raising an error
# https://github.com/tlambert03/mkdocs-api-autonav/issues/16
nav:
- api
plugins:
- meta
- search

View File

@ -73,8 +73,6 @@ line-length = 80
"vllm/engine/**/*.py" = ["UP006", "UP035"]
"vllm/executor/**/*.py" = ["UP006", "UP035"]
"vllm/worker/**/*.py" = ["UP006", "UP035"]
# Python 3.8 typing - skip utils for ROCm
"vllm/utils/__init__.py" = ["UP006", "UP035"]
[tool.ruff.lint]
select = [

View File

@ -12,7 +12,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp
openai >= 1.98.0 # For Responses API with reasoning content
openai >= 1.99.1 # For Responses API with reasoning content
pydantic >= 2.10
prometheus_client >= 0.18.0
pillow # Required for image processing

View File

@ -29,3 +29,5 @@ setproctitle
torch
transformers
zmq
uvloop
prometheus-client

View File

@ -10,7 +10,7 @@ pytest-timeout
# testing utils
backoff # required for phi4mm test
blobfile # required for kimi-vl test
einops # required for MPT, qwen-vl and Mamba
einops # required for MPT, qwen-vl
httpx
librosa # required for audio tests
vector_quantize_pytorch # required for minicpmo_26 test
@ -21,12 +21,11 @@ ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline paralleli
sentence-transformers # required for embedding tests
soundfile # required for audio tests
jiwer # required for audio tests
timm # required for internvl test
timm >=1.0.17 # required for internvl and gemma3n-mm test
torch==2.7.1
torchaudio==2.7.1
torchvision==0.22.1
transformers_stream_generator # required for qwen-vl test
mamba_ssm==2.2.5 # required for plamo2 test
matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.2 # required for voxtral test
num2words # required for smolvlm test
@ -53,4 +52,4 @@ runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
fastsafetensors>=0.1.10
pydantic>=2.10 # 2.9 leads to error on python 3.10
terratorch==1.1rc2 # required for PrithviMAE test
terratorch==1.1rc2 # required for PrithviMAE test

View File

@ -178,7 +178,6 @@ einops==0.8.1
# via
# -r requirements/test.in
# encodec
# mamba-ssm
# terratorch
# torchgeo
# vector-quantize-pytorch
@ -417,8 +416,6 @@ lxml==5.3.0
# sacrebleu
mako==1.3.10
# via alembic
mamba-ssm==2.2.5
# via -r requirements/test.in
markdown==3.8.2
# via mlflow
markdown-it-py==3.0.0
@ -475,8 +472,6 @@ networkx==3.2.1
# via
# scikit-image
# torch
ninja==1.11.1.3
# via mamba-ssm
nltk==3.9.1
# via rouge-score
num2words==0.5.14
@ -629,7 +624,6 @@ packaging==24.2
# lazy-loader
# lightning
# lightning-utilities
# mamba-ssm
# matplotlib
# mlflow-skinny
# peft
@ -973,7 +967,6 @@ sentencepiece==0.2.0
setuptools==77.0.3
# via
# lightning-utilities
# mamba-ssm
# pytablewriter
# torch
# triton
@ -1058,7 +1051,7 @@ tiktoken==0.7.0
# via
# lm-eval
# mistral-common
timm==1.0.15
timm==1.0.17
# via
# -r requirements/test.in
# open-clip-torch
@ -1085,7 +1078,6 @@ torch==2.7.1+cu128
# lightly
# lightning
# lm-eval
# mamba-ssm
# mteb
# open-clip-torch
# peft
@ -1152,16 +1144,13 @@ transformers==4.55.0
# -r requirements/test.in
# genai-perf
# lm-eval
# mamba-ssm
# peft
# sentence-transformers
# transformers-stream-generator
transformers-stream-generator==0.0.5
# via -r requirements/test.in
triton==3.3.1
# via
# mamba-ssm
# torch
# via torch
tritonclient==2.51.0
# via
# -r requirements/test.in

View File

@ -10,15 +10,10 @@ wheel
jinja2>=3.1.6
datasets # for benchmark scripts
numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
torch==2.7.0+xpu
--extra-index-url=https://download.pytorch.org/whl/xpu
torch==2.8.0+xpu
torchaudio
torchvision
pytorch-triton-xpu
--extra-index-url=https://download.pytorch.org/whl/xpu
# Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu
# FIXME: This will be fix in ipex 2.7. just leave this here for awareness.
intel-extension-for-pytorch==2.7.10+xpu
oneccl_bind_pt==2.7.0+xpu
--extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch==2.8.10+xpu

187
setup.py
View File

@ -7,6 +7,7 @@ import json
import logging
import os
import re
import shutil
import subprocess
import sys
from pathlib import Path
@ -281,10 +282,81 @@ class cmake_build_ext(build_ext):
self.copy_file(file, dst_file)
class repackage_wheel(build_ext):
class precompiled_build_ext(build_ext):
"""Disables extension building when using precompiled binaries."""
def run(self) -> None:
assert _is_cuda(
), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
def build_extensions(self) -> None:
print("Skipping build_ext: using precompiled extensions.")
return
class precompiled_wheel_utils:
"""Extracts libraries and other files from an existing wheel."""
def get_base_commit_in_main_branch(self) -> str:
@staticmethod
def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict:
import tempfile
import zipfile
temp_dir = None
try:
if not os.path.isfile(wheel_url_or_path):
wheel_filename = wheel_url_or_path.split("/")[-1]
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
wheel_path = os.path.join(temp_dir, wheel_filename)
print(f"Downloading wheel from {wheel_url_or_path} "
f"to {wheel_path}")
from urllib.request import urlretrieve
urlretrieve(wheel_url_or_path, filename=wheel_path)
else:
wheel_path = wheel_url_or_path
print(f"Using existing wheel at {wheel_path}")
package_data_patch = {}
with zipfile.ZipFile(wheel_path) as wheel:
files_to_copy = [
"vllm/_C.abi3.so",
"vllm/_moe_C.abi3.so",
"vllm/_flashmla_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/cumem_allocator.abi3.so",
]
compiled_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
file_members = list(
filter(lambda x: x.filename in files_to_copy,
wheel.filelist))
file_members += list(
filter(lambda x: compiled_regex.match(x.filename),
wheel.filelist))
for file in file_members:
print(f"[extract] {file.filename}")
target_path = os.path.join(".", file.filename)
os.makedirs(os.path.dirname(target_path), exist_ok=True)
with wheel.open(file.filename) as src, open(
target_path, "wb") as dst:
shutil.copyfileobj(src, dst)
pkg = os.path.dirname(file.filename).replace("/", ".")
package_data_patch.setdefault(pkg, []).append(
os.path.basename(file.filename))
return package_data_patch
finally:
if temp_dir is not None:
print(f"Removing temporary directory {temp_dir}")
shutil.rmtree(temp_dir)
@staticmethod
def get_base_commit_in_main_branch() -> str:
# Force to use the nightly wheel. This is mainly used for CI testing.
if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
return "nightly"
@ -297,6 +369,10 @@ class repackage_wheel(build_ext):
]).decode("utf-8")
upstream_main_commit = json.loads(resp_json)["sha"]
# In Docker build context, .git may be immutable or missing.
if envs.VLLM_DOCKER_BUILD_CONTEXT:
return upstream_main_commit
# Check if the upstream_main_commit exists in the local repo
try:
subprocess.check_output(
@ -329,86 +405,6 @@ class repackage_wheel(build_ext):
"wheel may not be compatible with your dev branch: %s", err)
return "nightly"
def run(self) -> None:
assert _is_cuda(
), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
if wheel_location is None:
base_commit = self.get_base_commit_in_main_branch()
wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
# Fallback to nightly wheel if latest commit wheel is unavailable,
# in this rare case, the nightly release CI hasn't finished on main.
if not is_url_available(wheel_location):
wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
import zipfile
if os.path.isfile(wheel_location):
wheel_path = wheel_location
print(f"Using existing wheel={wheel_path}")
else:
# Download the wheel from a given URL, assume
# the filename is the last part of the URL
wheel_filename = wheel_location.split("/")[-1]
import tempfile
# create a temporary directory to store the wheel
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
wheel_path = os.path.join(temp_dir, wheel_filename)
print(f"Downloading wheel from {wheel_location} to {wheel_path}")
from urllib.request import urlretrieve
try:
urlretrieve(wheel_location, filename=wheel_path)
except Exception as e:
from setuptools.errors import SetupError
raise SetupError(
f"Failed to get vLLM wheel from {wheel_location}") from e
with zipfile.ZipFile(wheel_path) as wheel:
files_to_copy = [
"vllm/_C.abi3.so",
"vllm/_moe_C.abi3.so",
"vllm/_flashmla_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/cumem_allocator.abi3.so",
# "vllm/_version.py", # not available in nightly wheels yet
]
file_members = list(
filter(lambda x: x.filename in files_to_copy, wheel.filelist))
# vllm_flash_attn python code:
# Regex from
# `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
compiled_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
file_members += list(
filter(lambda x: compiled_regex.match(x.filename),
wheel.filelist))
for file in file_members:
print(f"Extracting and including {file.filename} "
"from existing wheel")
package_name = os.path.dirname(file.filename).replace("/", ".")
file_name = os.path.basename(file.filename)
if package_name not in package_data:
package_data[package_name] = []
wheel.extract(file)
if file_name.endswith(".py"):
# python files shouldn't be added to package_data
continue
package_data[package_name].append(file_name)
def _no_device() -> bool:
return VLLM_TARGET_DEVICE == "empty"
@ -639,6 +635,29 @@ package_data = {
]
}
# If using precompiled, extract and patch package_data (in advance of setup)
if envs.VLLM_USE_PRECOMPILED:
assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
if wheel_location is not None:
wheel_url = wheel_location
else:
base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
from urllib.request import urlopen
try:
with urlopen(wheel_url) as resp:
if resp.status != 200:
wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
except Exception as e:
print(f"[warn] Falling back to nightly wheel: {e}")
wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
wheel_url)
for pkg, files in patch.items():
package_data.setdefault(pkg, []).extend(files)
if _no_device():
ext_modules = []
@ -647,7 +666,7 @@ if not ext_modules:
else:
cmdclass = {
"build_ext":
repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
precompiled_build_ext if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
}
setup(
@ -665,7 +684,7 @@ setup(
"mistral_common[audio]"], # Required for audio processing
"video": [], # Kept for backwards compatibility
# FlashInfer should be updated together with the Dockerfile
"flashinfer": ["flashinfer-python==0.2.10"],
"flashinfer": ["flashinfer-python==0.2.11"],
},
cmdclass=cmdclass,
package_data=package_data,

View File

@ -10,8 +10,7 @@ import torch.distributed as dist
from vllm.distributed.communication_op import ( # noqa
tensor_model_parallel_all_reduce)
from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
get_tp_group, graph_capture)
from vllm.distributed.parallel_state import get_tp_group, graph_capture
from ..utils import (ensure_model_parallel_initialized,
init_test_distributed_environment, multi_process_parallel)
@ -37,7 +36,7 @@ def graph_allreduce(
init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port)
ensure_model_parallel_initialized(tp_size, pp_size)
group = get_tensor_model_parallel_group().device_group
group = get_tp_group().device_group
# A small all_reduce for warmup.
# this is needed because device communicators might be created lazily

View File

@ -10,8 +10,7 @@ import torch.distributed as dist
from vllm.distributed.communication_op import ( # noqa
tensor_model_parallel_all_reduce)
from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
get_tp_group, graph_capture)
from vllm.distributed.parallel_state import get_tp_group, graph_capture
from vllm.platforms import current_platform
from ..utils import (ensure_model_parallel_initialized,
@ -42,7 +41,7 @@ def graph_quickreduce(
init_test_distributed_environment(tp_size, pp_size, rank,
distributed_init_port)
ensure_model_parallel_initialized(tp_size, pp_size)
group = get_tensor_model_parallel_group().device_group
group = get_tp_group().device_group
# A small all_reduce for warmup.
# this is needed because device communicators might be created lazily

View File

@ -93,32 +93,6 @@ class NestedConfig:
"""field"""
@config
@dataclass
class FromCliConfig1:
field: int = 1
"""field"""
@classmethod
def from_cli(cls, cli_value: str):
inst = cls(**json.loads(cli_value))
inst.field += 1
return inst
@config
@dataclass
class FromCliConfig2:
field: int = 1
"""field"""
@classmethod
def from_cli(cls, cli_value: str):
inst = cls(**json.loads(cli_value))
inst.field += 2
return inst
@config
@dataclass
class DummyConfig:
@ -144,10 +118,6 @@ class DummyConfig:
"""Dict which will be JSON in CLI"""
nested_config: NestedConfig = field(default_factory=NestedConfig)
"""Nested config"""
from_cli_config1: FromCliConfig1 = field(default_factory=FromCliConfig1)
"""Config with from_cli method"""
from_cli_config2: FromCliConfig2 = field(default_factory=FromCliConfig2)
"""Different config with from_cli method"""
@pytest.mark.parametrize(("type_hint", "expected"), [
@ -199,9 +169,6 @@ def test_get_kwargs():
assert json_tip in kwargs["json_tip"]["help"]
# nested config should should construct the nested config
assert kwargs["nested_config"]["type"]('{"field": 2}') == NestedConfig(2)
# from_cli configs should be constructed with the correct method
assert kwargs["from_cli_config1"]["type"]('{"field": 2}').field == 3
assert kwargs["from_cli_config2"]["type"]('{"field": 2}').field == 4
@pytest.mark.parametrize(

View File

@ -96,9 +96,6 @@ def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
more_args = None
if current_platform.is_tpu():
# Limit compilation time for TPU V1
# xet doesn't work well for Qwen/Qwen3-1.7B
m.setenv("HF_HUB_DISABLE_XET", "1")
more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
# Add TP test (if provided)

View File

@ -65,3 +65,9 @@ def test_pooling_params(llm: LLM):
assert torch.allclose(
softmax(wo_activation), w_activation, atol=1e-2
), "w_activation should be close to activation(wo_activation)."
def test_encode_api(llm: LLM):
err_msg = "pooling_task must be one of.+"
with pytest.raises(ValueError, match=err_msg):
llm.encode(prompts, use_tqdm=False)

View File

@ -2,15 +2,12 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import contextlib
import random
import time
from typing import Callable
import openai
import pytest
import pytest_asyncio
import requests
from tests.utils import RemoteOpenAIServer
@ -87,54 +84,3 @@ async def test_with_and_without_truncate(
responses = await asyncio.gather(*[get_status_code(**b) for b in bodies])
assert 500 not in responses
@pytest.mark.asyncio
@pytest.mark.parametrize(
ids=["single completion", "multiple completions", "chat"],
argnames=["create_func_gen", "content_body"],
argvalues=[
(lambda x: x.completions.create, {
"prompt": " ".join(['A'] * 300_000)
}),
(lambda x: x.completions.create, {
"prompt": [" ".join(['A'] * 300_000)] * 2
}),
(lambda x: x.chat.completions.create, {
"messages": [{
"role": "user",
"content": " ".join(['A'] * 300_000)
}]
}),
],
)
async def test_healthcheck_response_time(
server: RemoteOpenAIServer,
client: openai.AsyncOpenAI,
create_func_gen: Callable,
content_body: dict,
):
num_requests = 50
create_func = create_func_gen(client)
body = {"model": MODEL_NAME, **content_body, "max_tokens": 10}
def get_response_time(url):
start_time = time.monotonic()
res = requests.get(url)
end_time = time.monotonic()
assert res.status_code == 200
return end_time - start_time
no_load_response_time = get_response_time(server.url_for("health"))
tasks = [
asyncio.create_task(create_func(**body)) for _ in range(num_requests)
]
await asyncio.sleep(1) # give the tasks a chance to start running
load_response_time = get_response_time(server.url_for("health"))
with contextlib.suppress(openai.APIStatusError):
await asyncio.gather(*tasks)
assert load_response_time < 100 * no_load_response_time
assert load_response_time < 0.1

View File

@ -23,6 +23,8 @@ MAXIMUM_AUDIOS = 2
@pytest.fixture(scope="module")
def server():
args = [
"--dtype",
"float32",
"--max-model-len",
"2048",
"--max-num-seqs",

View File

@ -211,3 +211,18 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str):
assert torch.allclose(
F.softmax(wo_activation, dim=-1), w_activation, atol=1e-2
), "w_activation should be close to activation(wo_activation)."
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_pooling(server: RemoteOpenAIServer, model_name: str):
# pooling api uses ALL pooling, which does not support chunked prefill.
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": "test",
"encoding_format": "float"
},
)
assert response.json()["error"]["type"] == "BadRequestError"

View File

@ -126,7 +126,9 @@ def test_invocations(server: RemoteOpenAIServer):
invocation_output["results"]):
assert rerank_result.keys() == invocations_result.keys()
assert rerank_result["relevance_score"] == pytest.approx(
invocations_result["relevance_score"], rel=0.01)
invocations_result["relevance_score"], rel=0.05)
# TODO: reset this tolerance to 0.01 once we find
# an alternative to flash_attn with bfloat16
@pytest.mark.asyncio

View File

@ -0,0 +1,624 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import time
import pytest
import pytest_asyncio
import requests
from openai import BadRequestError, NotFoundError, OpenAI
from ...utils import RemoteOpenAIServer
pytest.skip(allow_module_level=True, reason="gpt-oss can't run on CI yet.")
MODEL_NAME = "openai/gpt-oss-20b"
DTYPE = "bfloat16"
@pytest.fixture(scope="module")
def server():
args = ["--enforce-eager", "--tool-server", "demo"]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input="What is 13 * 24?",
)
assert response is not None
print("response: ", response)
assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic_with_instructions(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input="What is 13 * 24?",
instructions="Respond in Korean.",
)
assert response is not None
assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input="What is the capital of South Korea?",
reasoning={"effort": "low"},
)
assert response is not None
assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input=[
{
"role": "system",
"content": "Respond in Korean."
},
{
"role": "user",
"content": "Hello!"
},
{
"role": "assistant",
"content": "Hello! How can I help you today?"
},
{
"role": "user",
"content": "What is 13 * 24? Explain your answer."
},
],
)
assert response is not None
assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_with_input_type(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input=[
{
"role": "user",
"content": [{
"type": "input_text",
"text": "What is 13*24?"
}],
},
],
)
assert response is not None
assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_structured_output(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input=[
{
"role": "system",
"content": "Extract the event information."
},
{
"role": "user",
"content":
"Alice and Bob are going to a science fair on Friday.",
},
],
text={
"format": {
"type": "json_schema",
"name": "calendar_event",
"schema": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"date": {
"type": "string"
},
"participants": {
"type": "array",
"items": {
"type": "string"
}
},
},
"required": ["name", "date", "participants"],
"additionalProperties": False,
},
"description": "A calendar event.",
"strict": True,
}
},
)
assert response is not None
assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_structured_output_with_parse(client: OpenAI, model_name: str):
from pydantic import BaseModel
class CalendarEvent(BaseModel):
name: str
date: str
participants: list[str]
response = await client.responses.parse(
model=model_name,
input="Alice and Bob are going to a science fair on Friday",
instructions="Extract the event information",
text_format=CalendarEvent,
)
assert response is not None
assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_store(client: OpenAI, model_name: str):
for store in [True, False]:
response = await client.responses.create(
model=model_name,
input="What is 13 * 24?",
store=store,
)
assert response is not None
try:
_retrieved_response = await client.responses.retrieve(response.id)
is_not_found = False
except NotFoundError:
is_not_found = True
assert is_not_found == (not store)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_background(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input="What is 13 * 24?",
background=True,
)
assert response is not None
retries = 0
max_retries = 30
while retries < max_retries:
response = await client.responses.retrieve(response.id)
if response.status == "completed":
break
time.sleep(1)
retries += 1
assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_background_cancel(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input="Write a long story about a cat.",
background=True,
)
assert response is not None
time.sleep(1)
cancelled_response = await client.responses.cancel(response.id)
assert cancelled_response is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_stateful_multi_turn(client: OpenAI, model_name: str):
response1 = await client.responses.create(
model=model_name,
input="What is 13 * 24?",
)
assert response1 is not None
assert response1.status == "completed"
response2 = await client.responses.create(
model=model_name,
input="What if I increase both numbers by 1?",
previous_response_id=response1.id,
)
assert response2 is not None
assert response2.status == "completed"
response3 = await client.responses.create(
model=model_name,
input="Divide the result by 2.",
previous_response_id=response2.id,
)
assert response3 is not None
assert response3.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming(client: OpenAI, model_name: str):
prompts = [
"tell me a story about a cat in 20 words",
"What is 13 * 24? Use python to calculate the result.",
"When did Jensen found NVIDIA? Search it and answer the year only.",
]
for prompt in prompts:
response = await client.responses.create(
model=model_name,
input=prompt,
reasoning={"effort": "low"},
tools=[
{
"type": "web_search_preview"
},
{
"type": "code_interpreter",
"container": {
"type": "auto"
}
},
],
stream=True,
)
events = []
current_event_mode = None
async for event in response:
if current_event_mode != event.type:
current_event_mode = event.type
print(f"\n[{event.type}] ", end="", flush=True)
if "text.delta" in event.type:
print(event.delta, end="", flush=True)
elif "reasoning_text.delta" in event.type:
print(f"{event.delta}", end="", flush=True)
elif "response.code_interpreter_call_code.done" in event.type:
print(f"Code: {event.code}", end="", flush=True)
elif ("response.output_item.added" in event.type
and event.item.type == "web_search_call"):
print(f"Web search: {event.item.action}", end="", flush=True)
events.append(event)
assert len(events) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_web_search(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input="Who is the president of South Korea as of now?",
tools=[{
"type": "web_search_preview"
}],
)
assert response is not None
assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_code_interpreter(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input="Multiply 64548*15151 using builtin python interpreter.",
tools=[{
"type": "code_interpreter",
"container": {
"type": "auto"
}
}],
)
assert response is not None
assert response.status == "completed"
def get_weather(latitude, longitude):
response = requests.get(
f"https://api.open-meteo.com/v1/forecast?latitude={latitude}&longitude={longitude}&current=temperature_2m,wind_speed_10m&hourly=temperature_2m,relative_humidity_2m,wind_speed_10m" # noqa
)
data = response.json()
return data["current"]["temperature_2m"]
def get_place_to_travel():
return "Paris"
def call_function(name, args):
if name == "get_weather":
return get_weather(**args)
elif name == "get_place_to_travel":
return get_place_to_travel()
else:
raise ValueError(f"Unknown function: {name}")
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling(client: OpenAI, model_name: str):
tools = [{
"type": "function",
"name": "get_weather",
"description":
"Get current temperature for provided coordinates in celsius.", # noqa
"parameters": {
"type": "object",
"properties": {
"latitude": {
"type": "number"
},
"longitude": {
"type": "number"
},
},
"required": ["latitude", "longitude"],
"additionalProperties": False,
},
"strict": True,
}]
response = await client.responses.create(
model=model_name,
input="What's the weather like in Paris today?",
tools=tools,
)
assert response is not None
assert response.status == "completed"
assert len(response.output) == 2
assert response.output[0].type == "reasoning"
assert response.output[1].type == "function_call"
tool_call = response.output[1]
name = tool_call.name
args = json.loads(tool_call.arguments)
result = call_function(name, args)
response_2 = await client.responses.create(
model=model_name,
input=[{
"type": "function_call_output",
"call_id": tool_call.call_id,
"output": str(result),
}],
tools=tools,
previous_response_id=response.id,
)
assert response_2 is not None
assert response_2.status == "completed"
assert response_2.output_text is not None
# NOTE: chain-of-thought should be removed.
response_3 = await client.responses.create(
model=model_name,
input="What's the weather like in Paris today?",
tools=tools,
previous_response_id=response_2.id,
)
assert response_3 is not None
assert response_3.status == "completed"
assert response_3.output_text is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
tools = [
{
"type": "function",
"name": "get_place_to_travel",
"description": "Get a random place to travel",
"parameters": {
"type": "object",
"properties": {},
"required": [],
"additionalProperties": False,
},
"strict": True,
},
{
"type": "function",
"name": "get_weather",
"description":
"Get current temperature for provided coordinates in celsius.", # noqa
"parameters": {
"type": "object",
"properties": {
"latitude": {
"type": "number"
},
"longitude": {
"type": "number"
},
},
"required": ["latitude", "longitude"],
"additionalProperties": False,
},
"strict": True,
},
]
response = await client.responses.create(
model=model_name,
input=
"Help me plan a trip to a random place. And tell me the weather there.",
tools=tools,
)
assert response is not None
assert response.status == "completed"
assert len(response.output) == 2
assert response.output[0].type == "reasoning"
assert response.output[1].type == "function_call"
tool_call = response.output[1]
name = tool_call.name
args = json.loads(tool_call.arguments)
result = call_function(name, args)
response_2 = await client.responses.create(
model=model_name,
input=[{
"type": "function_call_output",
"call_id": tool_call.call_id,
"output": str(result),
}],
tools=tools,
previous_response_id=response.id,
)
assert response_2 is not None
assert response_2.status == "completed"
assert len(response_2.output) == 2
assert response_2.output[0].type == "reasoning"
assert response_2.output[1].type == "function_call"
tool_call = response_2.output[1]
name = tool_call.name
args = json.loads(tool_call.arguments)
result = call_function(name, args)
response_3 = await client.responses.create(
model=model_name,
input=[{
"type": "function_call_output",
"call_id": tool_call.call_id,
"output": str(result),
}],
tools=tools,
previous_response_id=response_2.id,
)
assert response_3 is not None
assert response_3.status == "completed"
assert response_3.output_text is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_required(client: OpenAI, model_name: str):
tools = [{
"type": "function",
"name": "get_weather",
"description":
"Get current temperature for provided coordinates in celsius.", # noqa
"parameters": {
"type": "object",
"properties": {
"latitude": {
"type": "number"
},
"longitude": {
"type": "number"
},
},
"required": ["latitude", "longitude"],
"additionalProperties": False,
},
"strict": True,
}]
with pytest.raises(BadRequestError):
await client.responses.create(
model=model_name,
input="What's the weather like in Paris today?",
tools=tools,
tool_choice="required",
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_full_history(client: OpenAI, model_name: str):
tools = [{
"type": "function",
"name": "get_weather",
"description":
"Get current temperature for provided coordinates in celsius.", # noqa
"parameters": {
"type": "object",
"properties": {
"latitude": {
"type": "number"
},
"longitude": {
"type": "number"
},
},
"required": ["latitude", "longitude"],
"additionalProperties": False,
},
"strict": True,
}]
input_messages = [{
"role": "user",
"content": "What's the weather like in Paris today?"
}]
response = await client.responses.create(
model=model_name,
input=input_messages,
tools=tools,
)
assert response is not None
assert response.status == "completed"
tool_call = response.output[-1]
name = tool_call.name
args = json.loads(tool_call.arguments)
result = call_function(name, args)
input_messages.extend(
response.output) # append model's function call message
input_messages.append(
{ # append result message
"type": "function_call_output",
"call_id": tool_call.call_id,
"output": str(result),
}
)
response_2 = await client.responses.create(
model=model_name,
input=input_messages,
tools=tools,
)
assert response_2 is not None
assert response_2.status == "completed"
assert response_2.output_text is not None

View File

@ -220,7 +220,9 @@ class TestModel:
invocation_output["data"]):
assert score_data.keys() == invocation_data.keys()
assert score_data["score"] == pytest.approx(
invocation_data["score"], rel=0.01)
invocation_data["score"], rel=0.05)
# TODO: reset this tolerance to 0.01 once we find
# an alternative to flash_attn with bfloat16
def test_activation(self, server: RemoteOpenAIServer, model: dict[str,
Any]):

View File

@ -44,7 +44,7 @@ def model_uri(tmp_dir):
def tensorize_model_and_lora(tmp_dir, model_uri):
tensorizer_config = TensorizerConfig(tensorizer_uri=model_uri,
lora_dir=tmp_dir)
args = EngineArgs(model=MODEL_NAME, device="cuda")
args = EngineArgs(model=MODEL_NAME)
tensorize_lora_adapter(LORA_PATH, tensorizer_config)
tensorize_vllm_model(args, tensorizer_config)

View File

@ -0,0 +1,43 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from tempfile import TemporaryDirectory
import httpx
import pytest
from vllm.version import __version__ as VLLM_VERSION
from ...utils import RemoteOpenAIServer
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def server():
with TemporaryDirectory() as tmpdir:
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--enforce-eager",
"--max-num-seqs",
"128",
"--uds",
f"{tmpdir}/vllm.sock",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.mark.asyncio
async def test_show_version(server: RemoteOpenAIServer):
transport = httpx.HTTPTransport(uds=server.uds)
client = httpx.Client(transport=transport)
response = client.get(server.url_for("version"))
response.raise_for_status()
assert response.json() == {"version": VLLM_VERSION}

View File

@ -0,0 +1,215 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from transformers import AutoConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.platforms import current_platform
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int,
head_size: int, max_position_embeddings: int,
dtype: torch.dtype, device: torch.device):
"""Generate test data for given configuration."""
# Create 2D positions (3, num_tokens) for multimodal case
positions = torch.randint(0,
max_position_embeddings // 4, (3, num_tokens),
device=device)
# Create query and key tensors
query = torch.randn(num_tokens,
num_q_heads * head_size,
dtype=dtype,
device=device)
key = torch.randn(num_tokens,
num_kv_heads * head_size,
dtype=dtype,
device=device)
return positions, query, key
def unroll_model_tp_dict(model_tp_dict):
return [(model_name, tp_size)
for model_name, tp_sizes in model_tp_dict.items()
for tp_size in tp_sizes]
model_tp_dict = {
"Qwen/Qwen2-VL-7B-Instruct": [1, 2],
"Qwen/Qwen2-VL-72B-Instruct": [1, 2],
"Qwen/Qwen2.5-VL-72B-Instruct": [1, 2],
"zai-org/GLM-4.1V-9B-Thinking": [1, 2],
}
# https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317
dtype_atol_rtol_list = [
[torch.bfloat16, 1e-2, 1.6e-2],
]
num_tokens_list = [11, 8192]
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
reason="Skipping CUDA/ROCm only tests.")
@pytest.mark.parametrize("model_name, tp_size",
unroll_model_tp_dict(model_tp_dict))
@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list)
@pytest.mark.parametrize("num_tokens", num_tokens_list)
def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens):
config = AutoConfig.from_pretrained(model_name)
# get the model config
total_num_kv_heads = config.num_key_value_heads
total_num_heads = config.num_attention_heads
num_heads = total_num_heads // tp_size
num_kv_heads = max(1, total_num_kv_heads // tp_size)
head_dim = config.hidden_size // total_num_heads
is_neox_style = True
rope_theta = config.rope_theta
max_position = config.max_position_embeddings
partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
rotary_dim = int(head_dim * partial_rotary_factor)
mrope_helper_class = get_rope(
head_size=head_dim,
rotary_dim=rotary_dim,
max_position=max_position,
base=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=config.rope_scaling,
dtype=dtype,
).to(device=device)
# create q k v input tensors
# create rotary pos emb input tensors
positions, query, key = generate_test_data(num_tokens, num_heads,
num_kv_heads, head_dim,
max_position, dtype, device)
query_native, key_native = mrope_helper_class.forward_native(
positions,
query.clone(),
key.clone(),
)
query_cuda, key_cuda = mrope_helper_class.forward_cuda(
positions,
query.clone(),
key.clone(),
)
torch.testing.assert_close(query_native, query_cuda, atol=atol, rtol=rtol)
torch.testing.assert_close(key_native, key_cuda, atol=atol, rtol=rtol)
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
reason="Skipping CUDA/ROCm only tests.")
@pytest.mark.parametrize(
"model_name, tp_size",
unroll_model_tp_dict({
"Qwen/Qwen2-VL-7B-Instruct": [1, 2],
"zai-org/GLM-4.1V-9B-Thinking": [1, 2]
}))
@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list)
@pytest.mark.parametrize("num_tokens", [4])
def test_mrope_torch_compile_tracing(model_name, tp_size, dtype, atol, rtol,
num_tokens):
config = AutoConfig.from_pretrained(model_name)
# get the model config
total_num_kv_heads = config.num_key_value_heads
total_num_heads = config.num_attention_heads
num_heads = total_num_heads // tp_size
num_kv_heads = max(1, total_num_kv_heads // tp_size)
head_dim = config.hidden_size // total_num_heads
is_neox_style = True
rope_theta = config.rope_theta
max_position = config.max_position_embeddings
partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
rotary_dim = int(head_dim * partial_rotary_factor)
mrope_helper_class = get_rope(
head_size=head_dim,
rotary_dim=rotary_dim,
max_position=max_position,
base=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=config.rope_scaling,
dtype=dtype,
).to(device=device)
# Generate test data
positions, query, key = generate_test_data(num_tokens, num_heads,
num_kv_heads, head_dim,
max_position, dtype, device)
# Create a wrapper that makes the in-place function appear functional
def functional_forward_cuda(pos, q, k):
"""Wrapper that converts in-place operation to functional style
CUDA Graph does not support in-place operations.
This wrapper creates working copies of the
input tensors and modifies them.
"""
q_work = q.clone() # Create working copies
k_work = k.clone()
# Your in-place function modifies q_work and k_work
mrope_helper_class.forward_cuda(pos, q_work, k_work)
return q_work, k_work # Return the modified tensors
# Get reference results
query_native, key_native = mrope_helper_class.forward_native(
positions,
query.clone(),
key.clone(),
)
try:
compiled_forward_cuda = torch.compile(functional_forward_cuda,
fullgraph=True,
backend="inductor",
mode="reduce-overhead",
dynamic=False)
# Run compiled version
query_compiled_cuda, key_compiled_cuda = compiled_forward_cuda(
positions,
query,
key,
)
# Run original version for comparison
query_cuda = query.clone()
key_cuda = key.clone()
mrope_helper_class.forward_cuda(positions, query_cuda, key_cuda)
# Verify results
torch.testing.assert_close(query_compiled_cuda,
query_cuda,
atol=atol,
rtol=rtol)
torch.testing.assert_close(key_compiled_cuda,
key_cuda,
atol=atol,
rtol=rtol)
torch.testing.assert_close(query_compiled_cuda,
query_native,
atol=atol,
rtol=rtol)
torch.testing.assert_close(key_compiled_cuda,
key_native,
atol=atol,
rtol=rtol)
print("✓ forward_cuda successfully traced with torch.compile inductor")
except Exception as e:
pytest.fail(
f"forward_cuda failed to trace with torch.compile inductor: {e}")

View File

@ -187,7 +187,7 @@ def generate_continuous_batched_examples(example_lens_by_batch,
[torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("n_heads", [3, 4, 11, 16, 32])
@pytest.mark.parametrize("d_head", [5, 8, 19, 32, 128])
@pytest.mark.parametrize("seq_len_chunk_size", [(119, 17), (128, 32)])
@pytest.mark.parametrize("seq_len_chunk_size", [(112, 16), (128, 32)])
def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
itype):
@ -253,15 +253,15 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
(8, 8, 16, 32, 16),
]), # mode examples with varied lengths
# odd chunk_size
(64, 29, 2, [(11, 4), (13, 23), (19, 22),
(21, 15)]), # irregular sizes
# large-ish chunk_size (256)
(64, 256, 1, [(5, ), (1, ), (1, ),
(1, )]), # irregular sizes with small sequences
(64, 256, 2, [(5, 30), (1, 2), (1, 2),
(1, 2)]), # irregular sizes with small sequences
# we also need to test some large seqlen
# to catch errors with init states decay
(768, 128, 2, [(138, 225), (138, 225)]),
])
def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
itype):
@ -271,10 +271,9 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
seqlen, chunk_size, num_examples, cases = seq_len_chunk_size_cases
# TODO: the irregular chunk size cases have some issues and require higher
# tolerance. This is to be invesigated
if chunk_size not in {8, 256}:
atol, rtol = 5e-1, 5e-1
# This test can have larger error for longer sequences
if seqlen > 256:
atol, rtol = 1e-2, 5e-3
else:
atol, rtol = 5e-3, 5e-3

View File

@ -36,7 +36,6 @@ def _set_vllm_config(vllm_config: VllmConfig, world_size: int, rank: int,
import tempfile
temp_file = tempfile.mkstemp()[1]
set_current_vllm_config(vllm_config)
with set_current_vllm_config(vllm_config):
init_distributed_environment(
world_size=world_size,

View File

@ -16,7 +16,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_topk, modular_triton_fused_moe)
from vllm.platforms import current_platform
from vllm.utils import has_deep_gemm
from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used
from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used
dg_available = has_deep_gemm()
@ -224,7 +224,8 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed,
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.")
@pytest.mark.skipif(is_blackwell_deep_gemm_used(), reason="Not E8M0 scale MOE")
@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(),
reason="Not E8M0 scale MOE")
@torch.inference_mode()
def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed,
monkeypatch):

View File

@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEModularKernel)
from vllm.platforms import current_platform
from vllm.utils import has_deep_ep, has_deep_gemm
from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_used,
from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used,
is_deep_gemm_supported)
from .parallel_utils import ProcessGroupInfo, parallel_launch
@ -370,7 +370,7 @@ NUM_EXPERTS = [32]
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
@requires_deep_ep
@requires_deep_gemm
@pytest.mark.skipif(is_blackwell_deep_gemm_used(),
@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(),
reason="Skipping test for Blackwell DeepGEMM")
def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int,
topk: int, world_dp_size: tuple[int, int]):
@ -427,7 +427,7 @@ USE_FP8_DISPATCH = [False]
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
@requires_deep_ep
@requires_deep_gemm
@pytest.mark.skipif(is_blackwell_deep_gemm_used(),
@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(),
reason="Skipping test for Blackwell DeepGEMM")
def test_ll_deepep_deepgemm_moe(
mnk: tuple[int, int, int],

View File

@ -0,0 +1,453 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass, fields
import pytest
import torch
import torch.nn.functional as F
from vllm.utils import has_triton_kernels
if not has_triton_kernels():
pytest.skip(
"triton_kernels not found, skipping all related tests",
allow_module_level=True,
)
import triton_kernels.swiglu
from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
from triton_kernels.numerics import InFlexData
from triton_kernels.numerics_details.mxfp import (downcast_to_mxfp,
upcast_from_mxfp)
from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor
from triton_kernels.tensor_details import layout
from triton_kernels.testing import assert_close
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
BatchedPrepareAndFinalize)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
BatchedOAITritonExperts, triton_kernel_moe_forward)
from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEModularKernel)
from vllm.model_executor.layers.utils import shuffle_weight
from vllm.utils import round_up
def deshuffle(w: torch.Tensor):
first = w[..., ::2]
second = w[..., 1::2]
deshuffled = torch.concat((first, second), dim=-1)
return deshuffled
def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int):
randbits = [torch.randperm(E) for _ in range(M)]
x_list = [
(-1)**i *
((16384 +
((i * 512) % 4096) + bits).to(torch.int16).view(torch.bfloat16))
for i, bits in enumerate(randbits)
]
exp_data = torch.stack(x_list).to(
device="cuda") # simulating gate_output (M, E)
# create input tensor
x = torch.randn((M, K), dtype=torch.bfloat16, device="cuda")
w1 = torch.randn((E, 2 * N, K), dtype=torch.bfloat16, device="cuda")
w1_bias = torch.randn((E, 2 * N), dtype=torch.bfloat16, device="cuda")
w2 = torch.randn((E, K, N), dtype=torch.bfloat16, device="cuda")
w2_bias = torch.randn((E, K), dtype=torch.bfloat16, device="cuda")
exp_data_tri = exp_data.clone()
x_tri = x.clone()
w1_tri = w1.clone()
w2_tri = w2.clone()
w1_bias_tri = w1_bias.clone()
w2_bias_tri = w2_bias.clone()
w1_bias_tri = w1_bias_tri.to(torch.float32)
w2_bias_tri = w2_bias_tri.to(torch.float32)
dtype_dict = {
"bf16": torch.bfloat16,
"fp8_e4m3": torch.float8_e4m3fn,
"fp8_e5m2": torch.float8_e5m2,
}
x = x.to(dtype_dict[a_dtype]).to(torch.bfloat16)
if w_dtype != "mx4":
# simulate quantization support on reference impl
w1 = w1.to(dtype_dict[w_dtype]).to(torch.bfloat16)
w2 = w2.to(dtype_dict[w_dtype]).to(torch.bfloat16)
# triton moe kernel use transposed shape for matmul
w1_tri = w1_tri.transpose(-2, -1)
w2_tri = w2_tri.transpose(-2, -1)
# shuffle weights
w1_tri = shuffle_weight(w1_tri)
w1_bias_tri = shuffle_weight(w1_bias_tri)
# quant triton_weights
x_tri = x.to(dtype_dict[a_dtype])
if w_dtype != "mx4":
pytest.skip("NYI")
else: # quantize to mx4
# careful on the padding here, the activation padding need to be
# multiple of 64, the actual engine is not implemented
w1_bottom_pad = round_up(w1_tri.shape[1], 64) - w1_tri.shape[1]
w1_right_pad = round_up(w1_tri.shape[2], 128) - w1_tri.shape[2]
w2_bottom_pad = w1_right_pad // 2
w2_right_pad = w1_bottom_pad
x_pad = w1_bottom_pad
w1_tri = F.pad(
w1_tri,
(0, w1_right_pad, 0, w1_bottom_pad, 0, 0),
mode="constant",
value=0,
)
w2_tri = F.pad(
w2_tri,
(0, w2_right_pad, 0, w2_bottom_pad, 0, 0),
mode="constant",
value=0,
)
w1_bias_tri = F.pad(w1_bias_tri, (0, w1_right_pad, 0, 0),
mode="constant",
value=0)
w2_bias_tri = F.pad(w2_bias_tri, (0, w2_right_pad, 0, 0),
mode="constant",
value=0)
x_tri = F.pad(x_tri, (0, x_pad, 0, 0), mode="constant", value=0)
w_layout, w_layout_opts = layout.make_default_matmul_mxfp4_w_layout(
mx_axis=1)
w_scale_layout, w_scale_layout_opts = (
layout.make_default_matmul_mxfp4_w_scale_layout(
mx_axis=1, num_warps=num_warps))
w1_tri, w1_scale_tri = downcast_to_mxfp(w1_tri, torch.uint8, axis=1)
w1 = upcast_from_mxfp(w1_tri, w1_scale_tri, torch.bfloat16, axis=1)
w2_tri, w2_scale_tri = downcast_to_mxfp(w2_tri, torch.uint8, axis=1)
w2 = upcast_from_mxfp(w2_tri, w2_scale_tri, torch.bfloat16, axis=1)
w1_tri = convert_layout(wrap_torch_tensor(w1_tri, FP4), w_layout,
**w_layout_opts)
w1_scale_tri = convert_layout(
wrap_torch_tensor(w1_scale_tri),
w_scale_layout,
**w_scale_layout_opts,
)
w2_tri = convert_layout(wrap_torch_tensor(w2_tri, FP4), w_layout,
**w_layout_opts)
w2_scale_tri = convert_layout(
wrap_torch_tensor(w2_scale_tri),
w_scale_layout,
**w_scale_layout_opts,
)
pc1 = PrecisionConfig(weight_scale=w1_scale_tri,
flex_ctx=FlexCtx(rhs_data=InFlexData()))
pc2 = PrecisionConfig(weight_scale=w2_scale_tri,
flex_ctx=FlexCtx(rhs_data=InFlexData()))
# tucuate so the rest can run properly
w1 = w1[..., :K, :2 * N]
w2 = w2[..., :N, :K]
w1 = deshuffle(w1)
w1 = w1.transpose(-1, -2).contiguous()
w2 = w2.transpose(-1, -2).contiguous()
return (
x,
w1,
w1_bias,
w2,
w2_bias,
exp_data,
x_tri,
w1_tri,
w2_tri,
exp_data_tri,
w1_bias_tri,
w2_bias_tri,
pc1,
pc2,
)
@dataclass
class ModelConfig:
num_hidden_layers: int = 36
num_experts: int = 128
experts_per_token: int = 4
vocab_size: int = 201088
hidden_size: int = 2880
intermediate_size: int = 2880
head_dim: int = 64
num_attention_heads: int = 64
num_key_value_heads: int = 8
sliding_window: int = 128
initial_context_length: int = 4096
rope_theta: float = 150000.0
rope_scaling_factor: float = 32.0
rope_ntk_alpha: float = 1.0
rope_ntk_beta: float = 32.0
def swiglu(x, alpha: float = 1.702, limit: float = 1.0):
# Note we add an extra bias of 1 to the linear layer
x_glu, x_linear = torch.chunk(x, 2, dim=-1)
if limit is not None:
x_glu = x_glu.clamp(max=limit)
out_glu = x_glu * torch.sigmoid(alpha * x_glu)
if limit is not None:
x_linear = x_linear.clamp(min=-limit, max=limit)
return out_glu * (x_linear + 1)
def oai_moe_forward(
hidden_states: torch.Tensor, # (M, K)
w1: torch.Tensor, # (E, 2N)
w1_bias: torch.Tensor, # (E, 2N, K)
w2: torch.Tensor, # (E, K, N)
w2_bias: torch.Tensor, # (E, N)
gating_output: torch.Tensor, # (M, E)
topk: int,
):
# model.py 309:330, assuming gating and norm
t = hidden_states
experts = torch.topk(gating_output, k=topk, dim=-1, sorted=True)
expert_weights = torch.nn.functional.softmax(experts.values, dim=1)
expert_indices = experts.indices
# MLP #1
mlp1_weight = w1[expert_indices, ...]
mlp1_bias = w1_bias[expert_indices, ...]
t = torch.einsum("beck,bk->bec", mlp1_weight, t) + mlp1_bias
t = swiglu(t, limit=7)
# MLP #2
mlp2_weight = w2[expert_indices, ...]
mlp2_bias = w2_bias[expert_indices, ...]
t = torch.einsum("beck,bek->bec", mlp2_weight, t)
t += mlp2_bias
# Weighted sum of experts
t = torch.einsum("bec,be->bc", t, expert_weights)
return t
@dataclass
class Case:
a_dtype: str
w_dtype: str
@pytest.mark.parametrize(
", ".join(f.name for f in fields(Case)),
[
tuple(getattr(case, f.name) for f in fields(Case)) for case in [
# Case(a_dtype="bf16", w_dtype="bf16"),
# Case(a_dtype="fp8_e4m3", w_dtype="fp8_e5m2"),
Case(a_dtype="bf16", w_dtype="mx4")
]
],
)
@pytest.mark.parametrize("num_token", [2])
@pytest.mark.parametrize("tp", [1, 2, 4, 8])
def test_equiv(num_token, a_dtype, w_dtype, tp):
M = num_token
E = ModelConfig.num_experts
K = ModelConfig.hidden_size
N = ModelConfig.intermediate_size // tp
topk = ModelConfig.experts_per_token
(
x,
w1,
w1_bias,
w2,
w2_bias,
exp_data,
x_tri,
w1_tri,
w2_tri,
exp_data_tri,
w1_bias_tri,
w2_bias_tri,
pc1,
pc2,
) = init_compute_data(M, K, N, E, a_dtype, w_dtype, num_warps=8)
out_triton_monolithic = triton_kernel_moe_forward(
hidden_states=x_tri,
w1=w1_tri,
w2=w2_tri,
gating_output=exp_data_tri,
topk=topk,
renormalize=True,
w1_bias=w1_bias_tri,
w2_bias=w2_bias_tri,
w1_precision=pc1,
w2_precision=pc2,
)
out_triton_monolithic = out_triton_monolithic[..., :K]
out_ref = oai_moe_forward(
hidden_states=x,
w1=w1,
w1_bias=w1_bias,
w2=w2,
w2_bias=w2_bias,
gating_output=exp_data,
topk=topk,
)
assert_close(ref=out_ref,
tri=out_triton_monolithic,
maxtol=0.025,
rmstol=0.005)
def batched_moe(
a: torch.Tensor,
w1,
w2,
gating_output: torch.Tensor,
topk: int,
renormalize: bool,
w1_bias: torch.Tensor,
w2_bias: torch.Tensor,
w1_precision: PrecisionConfig,
w2_precision: PrecisionConfig,
) -> torch.Tensor:
max_num_tokens = round_up(a.shape[0], 64)
fused_experts = FusedMoEModularKernel(
BatchedPrepareAndFinalize(
max_num_tokens,
num_dispatchers=1,
num_local_experts=w1.shape[0],
rank=0,
),
BatchedOAITritonExperts(
None,
max_num_tokens=max_num_tokens,
num_dispatchers=1,
w1_precision=w1_precision,
w2_precision=w2_precision,
),
)
extra_expert_args = {
"w1_bias": w1_bias,
"w2_bias": w2_bias,
}
topk_weight, topk_ids, _ = fused_topk(a, gating_output, topk, renormalize)
return fused_experts(
a,
w1,
w2,
topk_weight,
topk_ids,
extra_expert_args=extra_expert_args,
)
@pytest.mark.parametrize(
", ".join(f.name for f in fields(Case)),
[
tuple(getattr(case, f.name) for f in fields(Case)) for case in [
# Case(a_dtype="bf16", w_dtype="bf16"),
# Case(a_dtype="fp8_e4m3", w_dtype="fp8_e5m2"),
Case(a_dtype="bf16", w_dtype="mx4")
]
],
)
@pytest.mark.parametrize("num_token", [64])
@pytest.mark.parametrize("ep", [1, 2, 4, 8])
def test_triton_kernel_batched_moe(num_token, a_dtype, w_dtype, ep):
M = num_token
E = ModelConfig.num_experts // ep
K = ModelConfig.hidden_size
N = ModelConfig.intermediate_size
topk = ModelConfig.experts_per_token
(
x,
w1,
w1_bias,
w2,
w2_bias,
exp_data,
x_tri,
w1_tri,
w2_tri,
exp_data_tri,
w1_bias_tri,
w2_bias_tri,
pc1,
pc2,
) = init_compute_data(M, K, N, E, a_dtype, w_dtype, num_warps=4)
out_tri = batched_moe(
a=x_tri,
w1=w1_tri,
w2=w2_tri,
gating_output=exp_data_tri,
topk=topk,
renormalize=True,
w1_bias=w1_bias_tri,
w2_bias=w2_bias_tri,
w1_precision=pc1,
w2_precision=pc2,
)
out_tri = out_tri[..., :K]
out_ref = oai_moe_forward(
hidden_states=x,
w1=w1,
w1_bias=w1_bias,
w2=w2,
w2_bias=w2_bias,
gating_output=exp_data,
topk=topk,
)
assert_close(ref=out_ref, tri=out_tri, maxtol=0.025, rmstol=0.005)
def test_unit_shuffle():
N = ModelConfig.intermediate_size
K = ModelConfig.hidden_size
m = torch.randn((K, 2 * N), dtype=torch.bfloat16, device="cuda")
x = torch.randn(K, dtype=torch.bfloat16, device="cuda")
m_shuffled = shuffle_weight(m)
out_ref = x @ m
out_ref = swiglu(out_ref, limit=1.0)
out = x @ m_shuffled
out = triton_kernels.swiglu.swiglu_torch(
out,
alpha=1.702,
precision_config=triton_kernels.swiglu.PrecisionConfig(limit=1.0),
)
assert_close(ref=out_ref, tri=out)

Some files were not shown because too many files have changed in this diff Show More