Merge branch 'main' into mlm-full-lora-support

This commit is contained in:
Jee Jee Li 2025-12-04 22:25:42 +08:00 committed by GitHub
commit c94cdf1c50
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
190 changed files with 7128 additions and 3855 deletions

View File

@ -1,46 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Generate an index.html linking the x86_64 and aarch64 vLLM wheels.

Given one wheel path via --wheel, the sibling wheel name for the other
architecture is derived by swapping the platform and manylinux tags, and
both filenames are written as links into index.html.
"""
import argparse
import os

# BUG FIX: closing tag was malformed ("</h1/>"), which is invalid HTML.
template = """<!DOCTYPE html>
<html>
<body>
<h1>Links for vLLM</h1>
<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
</body>
</html>
"""

parser = argparse.ArgumentParser()
parser.add_argument("--wheel", help="The wheel path.", required=True)
args = parser.parse_args()

filename = os.path.basename(args.wheel)

with open("index.html", "w") as f:
    print(f"Generated index.html for {args.wheel}")
    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
    if "x86_64" in filename:
        x86_wheel = filename
        arm_wheel = filename.replace("x86_64", "aarch64").replace(
            "manylinux1", "manylinux2014"
        )
    elif "aarch64" in filename:
        x86_wheel = filename.replace("aarch64", "x86_64").replace(
            "manylinux2014", "manylinux1"
        )
        arm_wheel = filename
    else:
        # BUG FIX: the f-string previously interpolated nothing; include the
        # offending filename so the failure is actionable.
        raise ValueError(f"Unsupported wheel: {filename}")
    # cloudfront requires escaping the '+' character
    f.write(
        template.format(
            x86_wheel=x86_wheel,
            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
            arm_wheel=arm_wheel,
            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
        )
    )

View File

@ -7,13 +7,14 @@
import argparse
import json
import re
import sys
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
from urllib.parse import quote
import regex as re
if not sys.version_info >= (3, 12):
raise RuntimeError("This script requires Python 3.12 or higher.")

View File

@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV SOC_VERSION="ascend910b1"
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \

View File

@ -0,0 +1,73 @@
#!/usr/bin/env bash
# Scheduled accuracy check: serve DeepSeek-V2-Lite with expert parallelism and
# asynchronous EPLB under each all2all backend, run the GSM8K eval, and fail
# the run if accuracy drops below THRESHOLD.
set -euxo pipefail
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
# Block until the server's /health endpoint responds (capped at 10 minutes).
wait_for_server() {
local port=$1
timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}
MODEL="deepseek-ai/DeepSeek-V2-lite"
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
else
# Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
# Stop the serve process: try a graceful kill, wait up to ~10s, then SIGKILL.
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
for _ in {1..20}; do
kill -0 "${SERVER_PID}" 2>/dev/null || break
sleep 0.5
done
kill -9 "${SERVER_PID}" 2>/dev/null || true
fi
}
# Make sure the server dies even when the eval or the assertion fails.
trap cleanup EXIT
# One serve + eval cycle per backend; each cycle gets a fresh port.
for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--tensor-parallel-size 2 \
--data-parallel-size 2 \
--enable-expert-parallel \
--enable-eplb \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--trust-remote-code \
--max-model-len 2048 \
--port $PORT &
SERVER_PID=$!
wait_for_server $PORT
# Sanitize the model name into a filesystem-friendly tag.
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
# Parse the saved results and assert accuracy clears the threshold.
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY
cleanup
SERVER_PID=
sleep 1
PORT=$((PORT+1))
done

View File

@ -50,6 +50,7 @@ for BACK in "${BACKENDS[@]}"; do
--data-parallel-size 2 \
--enable-expert-parallel \
--enable-eplb \
--eplb-config '{"window_size":200,"step_interval":600}' \
--trust-remote-code \
--max-model-len 2048 \
--port $PORT &

View File

@ -0,0 +1,74 @@
#!/usr/bin/env bash
# Scheduled accuracy check: serve Qwen3-Next-80B-A3B-Instruct with MTP
# speculative decoding, expert parallelism, and asynchronous EPLB under each
# all2all backend; run the GSM8K eval and fail if accuracy drops below
# THRESHOLD.
set -euxo pipefail
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8040}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
# Block until the server's /health endpoint responds (capped at 10 minutes).
wait_for_server() {
local port=$1
timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
else
# Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
# Stop the serve process: try a graceful kill, wait up to ~10s, then SIGKILL.
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
for _ in {1..20}; do
kill -0 "${SERVER_PID}" 2>/dev/null || break
sleep 0.5
done
kill -9 "${SERVER_PID}" 2>/dev/null || true
fi
}
# Make sure the server dies even when the eval or the assertion fails.
trap cleanup EXIT
# One serve + eval cycle per backend; each cycle gets a fresh port.
for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--tensor-parallel-size 4 \
--enable-expert-parallel \
--enable-eplb \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
--trust-remote-code \
--max-model-len 2048 \
--gpu-memory-utilization 0.9 \
--port $PORT &
SERVER_PID=$!
wait_for_server $PORT
# Sanitize the model name into a filesystem-friendly tag.
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
# Parse the saved results and assert accuracy clears the threshold.
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY
cleanup
SERVER_PID=
sleep 1
PORT=$((PORT+1))
done

View File

@ -81,7 +81,7 @@ else
alias_arg=""
fi
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
$PYTHON pip install regex && .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
# copy indices to /<commit>/ unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX"

View File

@ -987,7 +987,8 @@ steps:
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models Test (Extended) 1
- label: Multi-Modal Models Test (Extended) 1 # 60min
timeout_in_minutes: 120
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@ -1011,7 +1012,8 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
- label: Multi-Modal Models Test (Extended) 3
- label: Multi-Modal Models Test (Extended) 3 # 75min
timeout_in_minutes: 150
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking

View File

@ -387,6 +387,7 @@ steps:
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/entrypoints
- vllm/multimodal
- examples/
commands:
- pip install tensorizer # for tensorizer test
@ -1373,4 +1374,22 @@ steps:
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040

View File

@ -137,6 +137,7 @@ Compute Resources:
- Alibaba Cloud
- AMD
- Anyscale
- Arm
- AWS
- Crusoe Cloud
- Databricks

View File

@ -0,0 +1,120 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
This focuses on a single test payload shaped like the prefix-cache hash input:
(32-byte bytes object, 32-int tuple)
Usage:
python benchmarks/hash_micro_benchmark.py --iterations 20000
"""
from __future__ import annotations
import argparse
import random
import statistics
import time
from collections.abc import Callable, Iterable
from vllm.utils.hashing import sha256, xxhash
def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
"""Generate a deterministic test payload."""
random.seed(seed)
bytes_data = bytes(random.getrandbits(8) for _ in range(32))
int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
return (bytes_data, int_tuple)
def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
"""Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
times: list[float] = []
# Warm-up to avoid first-run noise.
for _ in range(200):
func(data)
for _ in range(iterations):
start = time.perf_counter()
func(data)
end = time.perf_counter()
times.append(end - start)
avg = statistics.mean(times)
std = statistics.stdev(times) if len(times) > 1 else 0.0
return avg, std
def _run_benchmarks(
benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
data: tuple,
iterations: int,
):
"""Yield (name, avg, std) for each benchmark, skipping unavailable ones."""
for name, func in benchmarks:
try:
avg, std = _benchmark_func(func, data, iterations)
except ModuleNotFoundError as exc:
print(f"Skipping {name}: {exc}")
continue
yield name, avg, std
def builtin_hash(data: tuple) -> int:
    """Hash `data` with Python's built-in hash()."""
    digest = hash(data)
    return digest
def main() -> None:
    """Parse CLI args, time every hash candidate, and print a summary."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--iterations",
        type=int,
        default=10_000,
        help="Number of measured iterations per hash function.",
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed for test payload."
    )
    args = parser.parse_args()

    payload = _generate_test_data(args.seed)
    candidates = (
        ("SHA256 (pickle)", sha256),
        ("xxHash (pickle)", xxhash),
        ("built-in hash()", builtin_hash),
    )

    rule = "=" * 60
    print(rule)
    print("HASH FUNCTION MICRO BENCHMARK")
    print(rule)
    print("Test data: (32-byte bytes object, 32-int tuple)")
    print(f"Iterations: {args.iterations:,}")
    print(rule)

    results = list(_run_benchmarks(candidates, payload, args.iterations))

    print("\nResults:")
    for name, avg, std in results:
        print(f" {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")

    # Use the built-in hash() timing as the baseline for relative numbers.
    baseline = next((r for r in results if r[0] == "built-in hash()"), None)
    if baseline is None:
        print("\nBuilt-in hash() result missing; cannot compute speed ratios.")
        return
    _, builtin_avg, _ = baseline
    print("\n" + rule)
    print("SUMMARY (relative to built-in hash())")
    print(rule)
    for name, avg, _ in results:
        if name != "built-in hash()":
            print(f"{name} is {avg / builtin_avg:.1f}x slower than built-in hash()")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,110 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Simple benchmark to compare prefix-cache block hashing algorithms.
Example:
python benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32
"""
from __future__ import annotations
import argparse
import random
import statistics
import sys
import time
from collections.abc import Callable, Iterable, Sequence
from vllm.utils.hashing import get_hash_fn_by_name
from vllm.v1.core.kv_cache_utils import BlockHash, hash_block_tokens, init_none_hash
SUPPORTED_ALGOS = ("sha256", "sha256_cbor", "xxhash", "xxhash_cbor")
def _generate_blocks(
num_blocks: int, block_size: int, vocab_size: int, seed: int
) -> list[list[int]]:
rng = random.Random(seed)
return [
[rng.randrange(vocab_size) for _ in range(block_size)]
for _ in range(num_blocks)
]
def _hash_all_blocks(
    hash_fn: Callable[[object], bytes],
    blocks: Iterable[Sequence[int]],
) -> float:
    """Hash every block once, chained, and return the elapsed seconds."""
    # Each hash chains off the previous block's hash, mirroring how the
    # prefix cache links block hashes together.
    prev: BlockHash | None = None
    begin = time.perf_counter()
    for token_block in blocks:
        prev = hash_block_tokens(hash_fn, prev, token_block, extra_keys=None)
    return time.perf_counter() - begin
def _benchmark(
    hash_algo: str,
    blocks: list[list[int]],
    trials: int,
) -> tuple[float, float, float] | None:
    """Time `trials` full hashing passes for one algorithm.

    Returns (avg_seconds, best_seconds, tokens_per_second), or None when the
    algorithm's optional dependency is not installed.
    """
    try:
        hash_fn = get_hash_fn_by_name(hash_algo)
        init_none_hash(hash_fn)
        timings = [_hash_all_blocks(hash_fn, blocks) for _ in range(trials)]
    except ModuleNotFoundError as exc:
        # e.g. xxhash/cbor2 not installed for this algorithm variant.
        print(f"Skipping {hash_algo}: {exc}", file=sys.stderr)
        return None
    avg = statistics.mean(timings)
    best = min(timings)
    # throughput: tokens / second, based on the fastest trial
    total_tokens = len(blocks) * len(blocks[0])
    return avg, best, total_tokens / best
def main() -> None:
    """Parse CLI options and benchmark each requested hash algorithm."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--num-blocks", type=int, default=10000, help="Block count.")
    parser.add_argument("--block-size", type=int, default=32, help="Tokens per block.")
    parser.add_argument(
        "--vocab-size", type=int, default=32000, help="Token id range [0, vocab_size)."
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed.")
    parser.add_argument(
        "--trials", type=int, default=5, help="Number of timed trials per algorithm."
    )
    parser.add_argument(
        "--algorithms",
        nargs="+",
        default=SUPPORTED_ALGOS,
        choices=SUPPORTED_ALGOS,
        help="Hash algorithms to benchmark.",
    )
    args = parser.parse_args()

    # One shared token set so every algorithm hashes identical data.
    blocks = _generate_blocks(
        args.num_blocks, args.block_size, args.vocab_size, args.seed
    )
    print(
        f"Benchmarking {len(args.algorithms)} algorithms on "
        f"{args.num_blocks} blocks (block size={args.block_size})."
    )
    for algo in args.algorithms:
        stats = _benchmark(algo, blocks, args.trials)
        if stats is None:
            # Missing optional dependency; already reported on stderr.
            continue
        avg, best, throughput = stats
        print(
            f"{algo:14s} avg: {avg:.6f}s best: {best:.6f}s "
            f"throughput: {throughput / 1e6:.2f}M tokens/s"
        )


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,244 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from enum import Enum
from itertools import product
from typing import Any
import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_per_token_group_quant_fp8_colmajor,
silu_mul_per_token_group_quant_fp8_colmajor,
)
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
from .utils import ArgPool, Bench, CudaGraphBenchParams
GROUP_SIZE = 128
FLOAT8_T = torch.float8_e4m3fn
def print_timers(timers: list[TMeasurement], cuda_graph_nops: int):
    """Print a normalization note plus the comparison table for `timers`.

    Each reported timing covers `cuda_graph_nops` consecutive invocations
    (the CUDA-graph replay batches that many calls), hence the note.

    NOTE(review): the note says "timings reported above" but is printed
    before compare.print() emits the table — confirm the intended ordering.
    """
    print(
        f"Note : The timings reported above is for {cuda_graph_nops} "
        "consecutive invocations of the benchmarking functions. "
        f"Please divide by {cuda_graph_nops} for single invocation "
        "timings."
    )
    compare = TBenchmark.Compare(timers)
    compare.print()
class ImplType(Enum):
    """Selects which silu-mul + group-quant implementation to benchmark."""

    SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR = 1
    REFERENCE = 2

    def get_impl(self):
        """Return the callable implementation for this member."""
        if self is ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
            return silu_mul_per_token_group_quant_fp8_colmajor
        if self is ImplType.REFERENCE:
            return reference
        raise ValueError(f"Unrecognized ImplType {self}")
@dataclass
class BenchmarkTensors:
    """Pre-allocated CUDA tensors for one (T, N) benchmark configuration."""

    # Activation input, shape (T, N), bfloat16 on CUDA.
    input: torch.Tensor
    # Fused-kernel quantized output, shape (T, N // 2), fp8.
    output: torch.Tensor
    # Reference act output tensor
    ref_act_out: torch.Tensor
    # Reference-path quantized output, shape (T, N // 2), fp8.
    ref_quant_out: torch.Tensor

    @staticmethod
    def make(T: int, N: int) -> "BenchmarkTensors":
        """Allocate tensors for T tokens and width N (both group-aligned)."""
        assert T % GROUP_SIZE == 0
        # N must hold two GROUP_SIZE-aligned halves (gate and up projections).
        assert N % (GROUP_SIZE * 2) == 0
        input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
        # silu_mul_per_token_group_quant_fp8_colmajor output.
        output = torch.rand((T, N // 2), dtype=torch.bfloat16, device="cuda").to(
            FLOAT8_T
        )
        # reference output.
        ref_act_out = torch.empty((T, N // 2), dtype=torch.bfloat16, device="cuda")
        ref_quant_out = torch.empty(
            (T, N // 2), dtype=torch.bfloat16, device="cuda"
        ).to(FLOAT8_T)
        return BenchmarkTensors(
            input=input,
            output=output,
            ref_act_out=ref_act_out,
            ref_quant_out=ref_quant_out,
        )

    @property
    def T(self):
        # Number of tokens (rows of the input).
        return self.input.size(0)

    @property
    def N(self):
        # Input width before the silu-mul halving.
        return self.input.size(1)

    def make_impl_kwargs(self, impl_type: ImplType) -> dict[str, Any]:
        """Build the keyword arguments expected by the given implementation."""
        if impl_type == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
            return {
                "input": self.input,
                "output": self.output,
                "use_ue8m0": is_deep_gemm_e8m0_used(),
            }
        elif impl_type == ImplType.REFERENCE:
            return {
                "input": self.input,
                "act_out": self.ref_act_out,
                "quant_out": self.ref_quant_out,
                "use_ue8m0": is_deep_gemm_e8m0_used(),
            }
        raise ValueError(f"Unrecognized impl_type {impl_type}")
def reference_quant(x: torch.Tensor, quant_out: torch.Tensor, use_ue8m0: bool):
    """
    Reference triton quant kernel from,
    vllm.model_executor.layers.quantization.utils.fp8_utils

    Quantizes `x` per token group of GROUP_SIZE elements into `quant_out`
    and returns (quantized tensor, per-group scales).
    """
    assert quant_out.size() == x.size()
    # Allocate the scale tensor column-major format.
    shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
    x_q = quant_out
    # permute(-1, -2) gives a column-major view of the scales.
    x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
    # One triton program per group of GROUP_SIZE elements.
    M = x.numel() // GROUP_SIZE
    N = GROUP_SIZE
    BLOCK = triton.next_power_of_2(N)
    # heuristics for number of warps
    num_warps = min(max(BLOCK // 256, 1), 8)
    num_stages = 1
    # fp8 representable range used for clamping inside the kernel.
    finfo = torch.finfo(FLOAT8_T)
    fp8_min = finfo.min
    fp8_max = finfo.max
    _per_token_group_quant_fp8_colmajor[(M,)](
        x,
        x_q,
        x_s,
        GROUP_SIZE,
        x.shape[1],
        x.stride(0),
        x_s.stride(1),
        eps=1e-10,
        fp8_min=fp8_min,
        fp8_max=fp8_max,
        use_ue8m0=use_ue8m0,
        BLOCK=BLOCK,
        num_warps=num_warps,
        num_stages=num_stages,
    )
    return x_q, x_s
def reference(
    input: torch.Tensor,
    act_out: torch.Tensor,
    quant_out: torch.Tensor,
    use_ue8m0: bool,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Two-step reference: silu_and_mul into `act_out`, then group quant.

    Returns (quantized tensor, per-group scales) from reference_quant.
    """
    torch.ops._C.silu_and_mul(act_out, input)
    return reference_quant(act_out, quant_out, use_ue8m0)
def bench_impl(
    bench_tensors: list[BenchmarkTensors], impl_type: ImplType
) -> TMeasurement:
    """Benchmark one implementation over a pool of pre-made tensor sets.

    All entries of `bench_tensors` share the same (T, N); the pool lets the
    CUDA-graph benchmark rotate argument sets between replays.
    """
    T = bench_tensors[0].T
    N = bench_tensors[0].N
    arg_pool_size = len(bench_tensors)
    kwargs_list = [bt.make_impl_kwargs(impl_type) for bt in bench_tensors]
    # warmup
    for kwargs in kwargs_list:
        impl_type.get_impl()(**kwargs)
    torch.cuda.synchronize()
    # Merge into a single kwargs and qualify arguments as ArgPool
    kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
    for _kwargs in kwargs_list:
        for k, v in _kwargs.items():
            kwargs[k].values.append(v)
    cuda_graph_params = None
    cuda_graph_params = CudaGraphBenchParams(arg_pool_size)
    timer = None
    # Bench captures the call in a CUDA graph and times replays.
    with Bench(
        cuda_graph_params,
        "silu-mul-quant",
        f"num_tokens={T}, N={N}",
        impl_type.name,
        impl_type.get_impl(),
        **kwargs,
    ) as bench:
        timer = bench.run()
    return timer
def test_correctness(T: int, N: int):
    """Assert the fused kernel matches the reference for one (T, N) shape."""
    print(f"Testing num_tokens={T}, N={N} ...")
    bench_tensor = BenchmarkTensors.make(T, N)

    def output_from_impl(impl: ImplType) -> tuple[torch.Tensor, torch.Tensor]:
        # Returns (quantized output, per-group scales) for the chosen impl.
        return impl.get_impl()(**bench_tensor.make_impl_kwargs(impl))

    # reference output
    ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
    # test output
    out_q, out_s = output_from_impl(
        ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
    )
    # Upcast the fp8 quantized outputs to float32 before comparing.
    torch.testing.assert_close(ref_out_q.to(torch.float32), out_q.to(torch.float32))
    torch.testing.assert_close(ref_out_s, out_s)
def run(Ts: list[int], Ns: list[int], arg_pool_size: int) -> list[TMeasurement]:
    """Benchmark fused vs. reference impls over the (T, N) grid.

    Each shape is correctness-checked first; per-shape timings are printed
    as the sweep progresses, and the cumulative table is printed at the end.
    """
    timers = []
    for N, T in product(Ns, Ts):
        test_correctness(T, N)
        bench_tensors: list[BenchmarkTensors] = [
            BenchmarkTensors.make(T, N) for _ in range(arg_pool_size)
        ]
        silu_mul_quant_timer = bench_impl(
            bench_tensors, ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
        )
        timers.append(silu_mul_quant_timer)
        reference_timer = bench_impl(bench_tensors, ImplType.REFERENCE)
        timers.append(reference_timer)
        # Per-shape side-by-side comparison for quick feedback.
        print_timers(
            [silu_mul_quant_timer, reference_timer], cuda_graph_nops=arg_pool_size
        )
    # Full comparison across every shape benchmarked above.
    print_timers(timers, cuda_graph_nops=arg_pool_size)
    return timers
if __name__ == "__main__":
    # Token-count sweep: fine steps up to 1920, then coarse steps to 131072,
    # crossed with three input widths.
    T = [128 * i for i in range(1, 16)] + [2048 * i for i in range(1, 65)]
    N = [2048, 4096, 8192]
    print(f"T = {T}, N = {N}")
    run(T, N, arg_pool_size=8)

View File

@ -150,6 +150,97 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################
#################### CSRC BUILD IMAGE ####################
FROM base AS csrc-build
ARG TARGETPLATFORM
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
# install build dependencies
COPY requirements/build.txt requirements/build.txt
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
WORKDIR /workspace
COPY pyproject.toml setup.py CMakeLists.txt ./
COPY cmake cmake/
COPY csrc csrc/
COPY vllm/envs.py vllm/envs.py
COPY vllm/__init__.py vllm/__init__.py
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# Flag to control whether to use pre-built vLLM wheels
ARG VLLM_USE_PRECOMPILED=""
ARG VLLM_MERGE_BASE_COMMIT=""
ARG VLLM_MAIN_CUDA_VERSION=""
# Use dummy version for csrc-build wheel (only .so files are extracted, version doesn't matter)
ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
&& export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
&& export VLLM_PRECOMPILED_WHEEL_COMMIT="${VLLM_MERGE_BASE_COMMIT}" \
&& export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \
fi
ARG vllm_target_device="cuda"
ENV VLLM_TARGET_DEVICE=${vllm_target_device}
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
if [ "$USE_SCCACHE" != "1" ]; then \
# Clean any existing CMake artifacts
rm -rf .deps && \
mkdir -p .deps && \
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
export VLLM_PRECOMPILED_WHEEL_COMMIT="${VLLM_MERGE_BASE_COMMIT}" && \
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
#################### CSRC BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG TARGETPLATFORM
@ -172,66 +263,28 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
WORKDIR /workspace
COPY --from=csrc-build /workspace/dist /precompiled-wheels
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# Flag to control whether to use pre-built vLLM wheels
ARG VLLM_USE_PRECOMPILED=""
ARG VLLM_MAIN_CUDA_VERSION=""
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
&& export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
&& export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \
fi
ARG vllm_target_device="cuda"
ENV VLLM_TARGET_DEVICE=${vllm_target_device}
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" != "1" ]; then \
# Clean any existing CMake artifacts
rm -rf .deps && \
mkdir -p .deps && \
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
# Skip adding +precompiled suffix to version (preserves git-derived version)
ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "${vllm_target_device}" = "cuda" ]; then \
export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl); \
fi && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
# Install DeepGEMM from source
ARG DEEPGEMM_GIT_REF
@ -527,7 +580,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
else \
BITSANDBYTES_VERSION="0.46.1"; \
fi; \
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.0'
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.3'
ENV VLLM_USAGE_SOURCE production-docker-image

Binary file not shown.

Before

Width:  |  Height:  |  Size: 146 KiB

After

Width:  |  Height:  |  Size: 174 KiB

View File

@ -670,6 +670,35 @@ vllm bench serve \
</details>
### 🧪 Hashing Benchmarks
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
Two helper scripts live in `benchmarks/` to compare hashing options used by prefix caching and related utilities. They are standalone (no server required) and help choose a hash algorithm before enabling prefix caching in production.
- `benchmarks/benchmark_hash.py`: Micro-benchmark that measures per-call latency of three implementations on a representative `(bytes, tuple[int])` payload.
```bash
python benchmarks/benchmark_hash.py --iterations 20000 --seed 42
```
- `benchmarks/benchmark_prefix_block_hash.py`: End-to-end block hashing benchmark that runs the full prefix-cache hash pipeline (`hash_block_tokens`) across many fake blocks and reports throughput.
```bash
python benchmarks/benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32 --trials 5
```
Supported algorithms: `sha256`, `sha256_cbor`, `xxhash`, `xxhash_cbor`. Install optional deps to exercise all variants:
```bash
uv pip install xxhash cbor2
```
If an algorithm's dependency is missing, the script will skip it and continue.
</details>
### ⚡ Request Prioritization Benchmark
<details class="admonition abstract" markdown="1">

View File

@ -18,6 +18,7 @@ Compute Resources:
- Alibaba Cloud
- AMD
- Anyscale
- Arm
- AWS
- Crusoe Cloud
- Databricks

View File

@ -57,15 +57,15 @@ vLLM also provides [a reference example](../../examples/online_serving/prometheu
The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:
- `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds.
- `vllm:prompt_tokens_total` - Prompt tokens.
- `vllm:generation_tokens_total` - Generation tokens.
- `vllm:prompt_tokens` - Prompt tokens.
- `vllm:generation_tokens` - Generation tokens.
- `vllm:time_per_output_token_seconds` - Inter-token latency (Time Per Output Token, TPOT) in seconds.
- `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds.
- `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in the RUNNING, WAITING, and SWAPPED states.
- `vllm:gpu_cache_usage_perc` - Percentage of used cache blocks by vLLM.
- `vllm:request_prompt_tokens` - Request prompt length.
- `vllm:request_generation_tokens` - Request generation length.
- `vllm:request_success_total` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.
- `vllm:request_success` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.
- `vllm:request_queue_time_seconds` - Queue time.
- `vllm:request_prefill_time_seconds` - Requests prefill time.
- `vllm:request_decode_time_seconds` - Requests decode time.
@ -571,9 +571,9 @@ model and then validate those tokens with the larger model.
- `vllm:spec_decode_draft_acceptance_rate` (Gauge)
- `vllm:spec_decode_efficiency` (Gauge)
- `vllm:spec_decode_num_accepted_tokens_total` (Counter)
- `vllm:spec_decode_num_draft_tokens_total` (Counter)
- `vllm:spec_decode_num_emitted_tokens_total` (Counter)
- `vllm:spec_decode_num_accepted_tokens` (Counter)
- `vllm:spec_decode_num_draft_tokens` (Counter)
- `vllm:spec_decode_num_emitted_tokens` (Counter)
There is a PR under review (<https://github.com/vllm-project/vllm/pull/12193>) to add "prompt lookup (ngram)"
speculative decoding to v1. Other techniques will follow. We should

View File

@ -90,7 +90,6 @@ To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels
| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],</br>[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
| marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
@ -114,5 +113,5 @@ The following table shows "families" of modular kernels that are intended to wor
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|---------|-----------------------------------------|----------------------------------------------|
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |

View File

@ -54,7 +54,7 @@ th:not(:first-child) {
| beam-search | ✅ | ✅ | ✅ | [](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](https://github.com/vllm-project/vllm/issues/7968) | ❔ | ✅ | ✅ | |
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
\* Chunked prefill and prefix caching are only applicable to last-token or all pooling with causal attention.
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
### Feature x Hardware

View File

@ -0,0 +1,58 @@
# MooncakeConnector Usage Guide
## About Mooncake
Mooncake aims to enhance the inference efficiency of large language models (LLMs), especially in slow object storage environments, by constructing a multi-level caching pool on high-speed interconnected DRAM/SSD resources. Compared to traditional caching systems, Mooncake utilizes (GPUDirect) RDMA technology to transfer data directly in a zero-copy manner, while maximizing the use of multi-NIC resources on a single machine.
For more details about Mooncake, please refer to [Mooncake project](https://github.com/kvcache-ai/Mooncake) and [Mooncake documents](https://kvcache-ai.github.io/Mooncake/).
## Prerequisites
### Installation
Install mooncake through pip: `uv pip install mooncake-transfer-engine`.
Refer to the [Mooncake official repository](https://github.com/kvcache-ai/Mooncake) for more installation instructions.
## Usage
### Prefiller Node (192.168.0.2)
```bash
vllm serve Qwen/Qwen2.5-7B-Instruct --port 8010 --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_producer"}'
```
### Decoder Node (192.168.0.3)
```bash
vllm serve Qwen/Qwen2.5-7B-Instruct --port 8020 --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_consumer"}'
```
### Proxy
```bash
python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --prefiller-host 192.168.0.2 --prefiller-port 8010 --decoder-host 192.168.0.3 --decoder-port 8020
```
> NOTE: The Mooncake Connector currently uses the proxy from nixl_integration. This will be replaced with a self-developed proxy in the future.
Now you can send requests to the proxy server through port 8000.
## Environment Variables
- `VLLM_MOONCAKE_BOOTSTRAP_PORT`: Port for Mooncake bootstrap server
- Default: 8998
- Required only for prefiller instances
- Each vLLM worker needs a unique port on its host; using the same port number across different hosts is fine
- For TP/DP deployments, each worker's port on a node is computed as: base_port + dp_rank * tp_size + tp_rank
- Used for the decoder notifying the prefiller
- `VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller's KV cache for a particular request. (Optional)
- Default: 480
- If a request is aborted and the decoder has not yet notified the prefiller, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.
## KV Role Options
- **kv_producer**: For prefiller instances that generate KV caches
- **kv_consumer**: For decoder instances that consume KV caches from prefiller
- **kv_both**: Enables symmetric functionality where the connector can act as both producer and consumer. This provides flexibility for experimental setups and scenarios where the role distinction is not predetermined.

View File

@ -795,14 +795,12 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
??? code
```python
from vllm.utils.serial_utils import tensor2base64
image_embedding = torch.load(...)
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
buffer = io.BytesIO()
torch.save(image_embedding, buffer)
buffer.seek(0)
binary_data = buffer.read()
base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
base64_image_embedding = tensor2base64(image_embedding)
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")

View File

@ -4,9 +4,6 @@ vLLM has experimental support for macOS with Apple Silicon. For now, users must
Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
!!! warning
There are no pre-built wheels or images for this device, so you must build vLLM from source.
# --8<-- [end:installation]
# --8<-- [start:requirements]
@ -20,6 +17,8 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
# --8<-- [end:set-up-using-python]
# --8<-- [start:pre-built-wheels]
Currently, there are no pre-built Apple silicon CPU wheels.
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]
@ -78,6 +77,8 @@ uv pip install -e .
# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]
Currently, there are no pre-built Arm silicon CPU images.
# --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source]

View File

@ -1,11 +1,6 @@
# --8<-- [start:installation]
vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform.
ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
!!! warning
There are no pre-built wheels or images for this device, so you must build vLLM from source.
vLLM offers basic model inferencing and serving on the Arm CPU platform, with NEON support and the FP32, FP16 and BF16 data types.
# --8<-- [end:installation]
# --8<-- [start:requirements]
@ -20,6 +15,23 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
# --8<-- [end:set-up-using-python]
# --8<-- [start:pre-built-wheels]
Pre-built vLLM wheels for Arm are available since version 0.11.2. These wheels contain pre-compiled C++ binaries.
Please replace `<version>` in the commands below with a specific version string (e.g., `0.11.2`).
```bash
uv pip install --pre vllm==<version>+cpu --extra-index-url https://wheels.vllm.ai/<version>%2Bcpu/
```
??? console "pip"
```bash
pip install --pre vllm==<version>+cpu --extra-index-url https://wheels.vllm.ai/<version>%2Bcpu/
```
The `uv` approach works for vLLM `v0.6.6` and later. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
!!! note
Nightly wheels are currently unsupported for this architecture (e.g., for bisecting a behavior change or a performance regression).
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]
@ -69,6 +81,8 @@ Testing has been conducted on AWS Graviton3 instances for compatibility.
# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]
Currently, there are no pre-built Arm CPU images.
# --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source]
```bash

View File

@ -46,11 +46,25 @@ vLLM is a Python library that supports the following CPU variants. Select your C
### Pre-built wheels
Please refer to the instructions for [pre-built wheels on GPU](./gpu.md#pre-built-wheels).
When specifying the index URL, please make sure to use the `cpu` variant subdirectory.
For example, the nightly build index is: `https://wheels.vllm.ai/nightly/cpu/`.
=== "Intel/AMD x86"
--8<-- "docs/getting_started/installation/cpu.x86.inc.md:pre-built-wheels"
=== "ARM AArch64"
--8<-- "docs/getting_started/installation/cpu.arm.inc.md:pre-built-wheels"
=== "Apple silicon"
--8<-- "docs/getting_started/installation/cpu.apple.inc.md:pre-built-wheels"
=== "IBM Z (S390X)"
--8<-- "docs/getting_started/installation/cpu.s390x.inc.md:pre-built-wheels"
### Build wheel from source
#### Set up using Python-only build (without compilation) {#python-only-build}
@ -87,6 +101,18 @@ VLLM_USE_PRECOMPILED=1 VLLM_PRECOMPILED_WHEEL_VARIANT=cpu VLLM_TARGET_DEVICE=cpu
--8<-- "docs/getting_started/installation/cpu.x86.inc.md:pre-built-images"
=== "ARM AArch64"
--8<-- "docs/getting_started/installation/cpu.arm.inc.md:pre-built-images"
=== "Apple silicon"
--8<-- "docs/getting_started/installation/cpu.apple.inc.md:pre-built-images"
=== "IBM Z (S390X)"
--8<-- "docs/getting_started/installation/cpu.s390x.inc.md:pre-built-images"
### Build image from source
=== "Intel/AMD x86"

View File

@ -4,9 +4,6 @@ vLLM has experimental support for s390x architecture on IBM Z platform. For now,
Currently, the CPU implementation for s390x architecture supports FP32 datatype only.
!!! warning
There are no pre-built wheels or images for this device, so you must build vLLM from source.
# --8<-- [end:installation]
# --8<-- [start:requirements]
@ -21,6 +18,8 @@ Currently, the CPU implementation for s390x architecture supports FP32 datatype
# --8<-- [end:set-up-using-python]
# --8<-- [start:pre-built-wheels]
Currently, there are no pre-built IBM Z CPU wheels.
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]
@ -69,6 +68,8 @@ Execute the following commands to build and install vLLM from source.
# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]
Currently, there are no pre-built IBM Z CPU images.
# --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source]

View File

@ -17,6 +17,8 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
# --8<-- [end:set-up-using-python]
# --8<-- [start:pre-built-wheels]
Currently, there are no pre-built x86 CPU wheels.
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]

View File

@ -5,9 +5,6 @@ vLLM supports AMD GPUs with ROCm 6.3 or above, and torch 2.8.0 and above.
!!! tip
[Docker](#set-up-using-docker) is the recommended way to use vLLM on ROCm.
!!! warning
There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
# --8<-- [end:installation]
# --8<-- [start:requirements]

View File

@ -2,9 +2,6 @@
vLLM initially supports basic model inference and serving on Intel GPU platform.
!!! warning
There are no pre-built wheels for this device, so you need to build vLLM from source. Alternatively, you can use pre-built images which are based on vLLM released versions.
# --8<-- [end:installation]
# --8<-- [start:requirements]

View File

@ -711,7 +711,6 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ |
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ |
| `PixtralForConditionalGeneration` | Ministral 3 (Mistral format), Mistral 3 (Mistral format), Mistral Large 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ |
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ |
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ |

View File

@ -23,31 +23,23 @@ def create_test_prompts(
# this is an example of using quantization without LoRA
(
"My name is",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
None,
),
# the next three examples use quantization with LoRA
(
"my name is",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
LoRARequest("lora-test-1", 1, lora_path),
),
(
"The capital of USA is",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
LoRARequest("lora-test-2", 1, lora_path),
),
(
"The capital of France is",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
LoRARequest("lora-test-3", 1, lora_path),
),
]

View File

@ -27,9 +27,7 @@ def create_test_prompts(
return [
(
"A robot may not injure a human being",
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
),
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
None,
),
(
@ -41,22 +39,12 @@ def create_test_prompts(
),
(
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
SamplingParams(
temperature=0.0,
logprobs=1,
prompt_logprobs=1,
max_tokens=128,
),
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
LoRARequest("sql-lora", 1, lora_path),
),
(
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
SamplingParams(
temperature=0.0,
logprobs=1,
prompt_logprobs=1,
max_tokens=128,
),
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
LoRARequest("sql-lora2", 2, lora_path),
),
]

View File

@ -309,6 +309,28 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
)
# HunyuanOCR
def load_hunyuan_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a ModelRequestData for tencent/HunyuanOCR over the given images."""
    num_images = len(image_urls)
    engine_args = EngineArgs(
        model="tencent/HunyuanOCR",
        max_model_len=8192,
        limit_mm_per_prompt={"image": num_images},
    )
    # One image placeholder triple (begin/content/end tokens) per image.
    image_tokens = (
        "<hy_place▁holder▁no▁100><hy_place▁holder▁no▁102><hy_place▁holder▁no▁101>"  # noqa: E501
        * num_images
    )
    return ModelRequestData(
        engine_args=engine_args,
        prompt=f"<hy_begin▁of▁sentence>{image_tokens}{question}<hy_User>",
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_hyperclovax_seed_vision(
question: str, image_urls: list[str]
) -> ModelRequestData:
@ -1322,6 +1344,7 @@ model_example_map = {
"deepseek_ocr": load_deepseek_ocr,
"gemma3": load_gemma3,
"h2ovl_chat": load_h2ovl,
"hunyuan_vl": load_hunyuan_vl,
"hyperclovax_seed_vision": load_hyperclovax_seed_vision,
"idefics3": load_idefics3,
"interns1": load_interns1,

View File

@ -28,13 +28,11 @@ Dependencies:
- openai
"""
import base64
import io
import torch
import transformers
from openai import OpenAI
from vllm.utils.serial_utils import tensor2base64
def main():
client = OpenAI(
@ -58,11 +56,7 @@ def main():
prompt_embeds = embedding_layer(token_ids).squeeze(0)
# Prompt embeddings
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
buffer.seek(0)
binary_data = buffer.read()
encoded_embeds = base64.b64encode(binary_data).decode("utf-8")
encoded_embeds = tensor2base64(prompt_embeds)
completion = client.completions.create(
model=model_name,

View File

@ -150,7 +150,8 @@ def run_siglip(client: OpenAI, model: str):
Start the server using:
vllm serve google/siglip-base-patch16-224 \
--runner pooling
--runner pooling \
--chat-template template_basic.jinja
"""
response = create_chat_embeddings(

View File

@ -46,6 +46,7 @@ scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu
pybase64 # fast base64 implementation
cbor2 # Required for cross-language serialization of hashable objects
ijson # Required for mistral streaming tool parser
setproctitle # Used to set process names for better debugging and monitoring
openai-harmony >= 0.0.3 # Required for gpt-oss
anthropic == 0.71.0

View File

@ -3,7 +3,6 @@ ninja
packaging>=24.2
setuptools>=77.0.3,<81.0.0
setuptools-scm>=8
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64"
scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL)

View File

@ -4,7 +4,6 @@
numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding
# Dependencies for CPUs
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64"

View File

@ -42,6 +42,6 @@ tritonclient==2.51.0
numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer[s3,gcs]==0.15.0
runai-model-streamer[s3,gcs]==0.15.3
fastsafetensors>=0.1.10
pydantic>=2.12 # 2.11 leads to error on python 3.13

View File

@ -12,7 +12,7 @@ tensorizer==2.10.1
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
runai-model-streamer[s3,gcs]==0.15.0
runai-model-streamer[s3,gcs]==0.15.3
conch-triton-kernels==1.2.1
timm>=1.0.17
fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459

View File

@ -51,7 +51,7 @@ tritonclient==2.51.0
arctic-inference == 0.1.1 # Required for suffix decoding test
numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer[s3,gcs]==0.15.0
runai-model-streamer[s3,gcs]==0.15.3
fastsafetensors>=0.1.10
pydantic>=2.12 # 2.11 leads to error on python 3.13
decord==0.6.0

View File

@ -965,11 +965,11 @@ rsa==4.9.1
# via google-auth
rtree==1.4.0
# via torchgeo
runai-model-streamer==0.15.0
runai-model-streamer==0.15.3
# via -r requirements/test.in
runai-model-streamer-gcs==0.15.0
runai-model-streamer-gcs==0.15.3
# via runai-model-streamer
runai-model-streamer-s3==0.15.0
runai-model-streamer-s3==0.15.3
# via runai-model-streamer
s3transfer==0.10.3
# via boto3

View File

@ -346,10 +346,13 @@ class precompiled_wheel_utils:
The order of preference is:
1. user-specified wheel location (can be either local or remote, via
VLLM_PRECOMPILED_WHEEL_LOCATION)
2. user-specified variant from nightly repo (current main commit via
VLLM_PRECOMPILED_WHEEL_VARIANT)
2. user-specified variant (VLLM_PRECOMPILED_WHEEL_VARIANT) from nightly repo
3. the variant corresponding to VLLM_MAIN_CUDA_VERSION from nightly repo
4. the default variant from nightly repo (current main commit)
4. the default variant from nightly repo
If downloading from the nightly repo, the commit can be specified via
VLLM_PRECOMPILED_WHEEL_COMMIT; otherwise, the head commit in the main branch
is used.
"""
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
if wheel_location is not None:
@ -362,10 +365,13 @@ class precompiled_wheel_utils:
# try to fetch the wheel metadata from the nightly wheel repo
main_variant = "cu" + envs.VLLM_MAIN_CUDA_VERSION.replace(".", "")
variant = os.getenv("VLLM_PRECOMPILED_WHEEL_VARIANT", main_variant)
commit = os.getenv(
"VLLM_PRECOMPILED_WHEEL_COMMIT",
precompiled_wheel_utils.get_base_commit_in_main_branch(),
)
commit = os.getenv("VLLM_PRECOMPILED_WHEEL_COMMIT", "").lower()
if not commit or len(commit) != 40:
print(
f"VLLM_PRECOMPILED_WHEEL_COMMIT not valid: {commit}"
", trying to fetch base commit in main branch"
)
commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
print(f"Using precompiled wheel commit {commit} with variant {variant}")
try_default = False
wheels, repo_url, download_filename = None, None, None
@ -461,14 +467,22 @@ class precompiled_wheel_utils:
"vllm/cumem_allocator.abi3.so",
]
compiled_regex = re.compile(
flash_attn_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)
triton_kernels_regex = re.compile(
r"vllm/third_party/triton_kernels/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)
file_members = list(
filter(lambda x: x.filename in files_to_copy, wheel.filelist)
)
file_members += list(
filter(lambda x: compiled_regex.match(x.filename), wheel.filelist)
filter(lambda x: flash_attn_regex.match(x.filename), wheel.filelist)
)
file_members += list(
filter(
lambda x: triton_kernels_regex.match(x.filename), wheel.filelist
)
)
for file in file_members:
@ -494,10 +508,6 @@ class precompiled_wheel_utils:
@staticmethod
def get_base_commit_in_main_branch() -> str:
# Force to use the nightly wheel. This is mainly used for CI testing.
if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
return "nightly"
try:
# Get the latest commit hash of the upstream main branch.
resp_json = subprocess.check_output(
@ -508,6 +518,7 @@ class precompiled_wheel_utils:
]
).decode("utf-8")
upstream_main_commit = json.loads(resp_json)["sha"]
print(f"Upstream main branch latest commit: {upstream_main_commit}")
# In Docker build context, .git may be immutable or missing.
if envs.VLLM_DOCKER_BUILD_CONTEXT:
@ -648,7 +659,7 @@ def get_vllm_version() -> str:
if envs.VLLM_TARGET_DEVICE == "empty":
version += f"{sep}empty"
elif _is_cuda():
if envs.VLLM_USE_PRECOMPILED:
if envs.VLLM_USE_PRECOMPILED and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX:
version += f"{sep}precompiled"
else:
cuda_version = str(get_nvcc_cuda_version())
@ -786,7 +797,7 @@ setup(
"bench": ["pandas", "matplotlib", "seaborn", "datasets"],
"tensorizer": ["tensorizer==2.10.1"],
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.0"],
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
"audio": [
"librosa",
"soundfile",

View File

@ -392,39 +392,48 @@ def test_pass_config_deprecation(caplog_vllm):
assert "enable_fusion is deprecated" in caplog_vllm.text
assert config.fuse_norm_quant is True
assert config.fuse_act_quant is True
assert config.enable_fusion is None
assert config.enable_fusion is True
# Test enable_attn_fusion -> fuse_attn_quant
caplog_vllm.clear()
config = PassConfig(enable_attn_fusion=True)
assert "enable_attn_fusion is deprecated" in caplog_vllm.text
assert config.fuse_attn_quant is True
assert config.enable_attn_fusion is None
assert config.enable_attn_fusion is True
# Test enable_noop -> eliminate_noops
caplog_vllm.clear()
config = PassConfig(enable_noop=True)
assert "enable_noop is deprecated" in caplog_vllm.text
assert config.eliminate_noops is True
assert config.enable_noop is None
assert config.enable_noop is True
# Test enable_sequence_parallelism -> enable_sp
caplog_vllm.clear()
config = PassConfig(enable_sequence_parallelism=True)
assert "enable_sequence_parallelism is deprecated" in caplog_vllm.text
assert config.enable_sp is True
assert config.enable_sequence_parallelism is None
assert config.enable_sequence_parallelism is True
# Test enable_async_tp -> fuse_gemm_comms
caplog_vllm.clear()
config = PassConfig(enable_async_tp=True)
assert "enable_async_tp is deprecated" in caplog_vllm.text
assert config.fuse_gemm_comms is True
assert config.enable_async_tp is None
assert config.enable_async_tp is True
# Test enable_fi_allreduce_fusion -> fuse_allreduce_rms
caplog_vllm.clear()
config = PassConfig(enable_fi_allreduce_fusion=True)
assert "enable_fi_allreduce_fusion is deprecated" in caplog_vllm.text
assert config.fuse_allreduce_rms is True
assert config.enable_fi_allreduce_fusion is None
assert config.enable_fi_allreduce_fusion is True
# Test hash consistency
config_old = PassConfig(enable_fusion=True)
config_new = PassConfig(fuse_norm_quant=True, fuse_act_quant=True)
assert config_old.compute_hash() == config_new.compute_hash()
config_old = PassConfig(enable_async_tp=True)
config_new = PassConfig(fuse_gemm_comms=True)
assert config_old.compute_hash() == config_new.compute_hash()

View File

@ -6,6 +6,7 @@ import lm_eval
import pytest
from tests.utils import large_gpu_mark
from vllm.platforms import current_platform
def get_model_args(
@ -45,6 +46,12 @@ def get_model_args(
return model_args
pytestmark = pytest.mark.skipif(
current_platform.is_rocm(),
reason="EPLB with Spec Decode is a work in progress on ROCm.",
)
@pytest.mark.parametrize(
"model_setup",
[

View File

@ -232,7 +232,7 @@ async def test_server_load(server: RemoteOpenAIServer):
@pytest.mark.asyncio
async def test_health_check_engine_dead_error():
# Import the health function directly to test it in isolation
from vllm.entrypoints.openai.api_server import health
from vllm.entrypoints.serve.instrumentator.health import health
# Create a mock request that simulates what FastAPI would provide
mock_request = Mock(spec=Request)

View File

@ -69,9 +69,20 @@ async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):
stream=True,
)
first_chunk = None
chunk_count = 0
async for chunk in resp:
chunk_count += 1
if first_chunk is None and chunk.type == "message_start":
first_chunk = chunk
print(chunk.model_dump_json())
assert chunk_count > 0
assert first_chunk is not None, "message_start chunk was never observed"
assert first_chunk.usage is not None, "first chunk should include usage stats"
assert first_chunk.usage["output_tokens"] == 0
assert first_chunk.usage["input_tokens"] > 5
@pytest.mark.asyncio
async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic):

View File

@ -42,6 +42,24 @@ async def test_basic(client: OpenAI, model_name: str):
assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_enable_response_messages(client: OpenAI, model_name: str):
    """When ``enable_response_messages`` is set, the Responses API should also
    return the raw input/output messages with their token ids."""
    response = await client.responses.create(
        model=model_name,
        input="Hello?",
        # Not part of the upstream OpenAI schema, so it is sent via extra_body.
        extra_body={"enable_response_messages": True},
    )
    assert response.status == "completed"
    # Each returned message entry carries the rendered text plus its token ids.
    assert response.input_messages[0]["type"] == "raw_message_tokens"
    assert type(response.input_messages[0]["message"]) is str
    assert len(response.input_messages[0]["message"]) > 10
    assert type(response.input_messages[0]["tokens"][0]) is int
    assert type(response.output_messages[0]["message"]) is str
    assert len(response.output_messages[0]["message"]) > 10
    assert type(response.output_messages[0]["tokens"][0]) is int
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_item(client: OpenAI, model_name: str):

View File

@ -2,64 +2,47 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import io
import numpy as np
import pytest
import requests
import torch
from vllm.utils.serial_utils import tensor2base64
from ...utils import RemoteOpenAIServer
MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
DTYPE = "float16"
def _terratorch_dummy_inputs(model_name: str):
def _terratorch_dummy_messages():
pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
buffer_tiff = io.BytesIO()
torch.save(pixel_values, buffer_tiff)
buffer_tiff.seek(0)
binary_data = buffer_tiff.read()
base64_tensor_embedding = base64.b64encode(binary_data).decode("utf-8")
buffer_coord = io.BytesIO()
torch.save(location_coords, buffer_coord)
buffer_coord.seek(0)
binary_data = buffer_coord.read()
base64_coord_embedding = base64.b64encode(binary_data).decode("utf-8")
return {
"model": model_name,
"additional_data": {"prompt_token_ids": [1]},
"encoding_format": "base64",
"messages": [
{
"role": "user",
"content": [
{
"type": "image_embeds",
"image_embeds": {
"pixel_values": base64_tensor_embedding,
"location_coords": base64_coord_embedding,
},
}
],
}
],
}
return [
{
"role": "user",
"content": [
{
"type": "image_embeds",
"image_embeds": {
"pixel_values": tensor2base64(pixel_values),
"location_coords": tensor2base64(location_coords),
},
}
],
}
]
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_request(model_name: str):
@pytest.mark.parametrize(
"model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
)
def test_single_request(model_name: str):
args = [
"--runner",
"pooling",
# use half precision for speed and memory savings in CI environment
"--dtype",
DTYPE,
"float16",
"--enforce-eager",
"--trust-remote-code",
"--max-num-seqs",
@ -70,11 +53,15 @@ async def test_single_request(model_name: str):
"--enable-mm-embeds",
]
with RemoteOpenAIServer(MODEL_NAME, args) as server:
prompt = _terratorch_dummy_inputs(model_name)
# test single pooling
response = requests.post(server.url_for("pooling"), json=prompt)
with RemoteOpenAIServer(model_name, args) as server:
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"messages": _terratorch_dummy_messages(),
"encoding_format": "base64",
},
)
response.raise_for_status()
output = response.json()["data"][0]["data"]

View File

@ -61,11 +61,8 @@ def test_pooling_params(llm: LLM):
@pytest.mark.skip_global_cleanup
def test_encode_api(llm: LLM):
# chunked prefill does not support all pooling
err_msg = "pooling_task must be one of.+"
with pytest.raises(ValueError, match=err_msg):
llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)
def test_token_classify(llm: LLM):
llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)
def test_score_api(llm: LLM):

View File

@ -255,21 +255,21 @@ async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str):
# token_classify uses ALL pooling, which does not support chunked prefill.
task = "token_classify"
input_text = ["This product was excellent and exceeded my expectations"]
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": "test",
"input": input_text,
"encoding_format": "float",
"task": task,
},
)
assert response.json()["error"]["type"] == "BadRequestError"
assert response.json()["error"]["message"].startswith(
f"Task {task} is not supported"
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == 8
assert len(poolings.data[0].data[0]) == 2
@pytest.mark.asyncio

View File

@ -42,7 +42,7 @@ def llm():
@pytest.mark.skip_global_cleanup
def test_encode_api(llm: LLM):
def test_token_embed(llm: LLM):
outputs = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False)
multi_vector = outputs[0].outputs.data
assert multi_vector.shape == (11, 384)

View File

@ -36,6 +36,13 @@ def llm():
cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_config(llm: LLM):
vllm_config = llm.llm_engine.vllm_config
assert vllm_config.cache_config.enable_prefix_caching
assert vllm_config.scheduler_config.enable_chunked_prefill
def test_pooling_params(llm: LLM):
def get_outputs(use_activation):
outputs = llm.reward(

View File

@ -29,6 +29,7 @@ from vllm.multimodal.utils import (
encode_video_base64,
)
from vllm.tokenizers import MistralTokenizer, get_tokenizer
from vllm.utils.serial_utils import tensor2base64
from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import VLLM_PATH
@ -85,11 +86,6 @@ def phi3v_model_config_image_embeds():
)
@pytest.fixture(scope="module")
def phi3v_tokenizer():
return get_tokenizer(PHI3V_MODEL_ID)
@pytest.fixture(scope="function")
def qwen2_audio_model_config():
return ModelConfig(
@ -115,11 +111,6 @@ def audio_embeds_model_config():
)
@pytest.fixture(scope="module")
def qwen2_audio_tokenizer():
return get_tokenizer(QWEN2AUDIO_MODEL_ID)
@pytest.fixture(scope="function")
def qwen25omni_model_config_mm_interleaved():
return ModelConfig(
@ -134,11 +125,6 @@ def qwen25omni_model_config_mm_interleaved():
)
@pytest.fixture(scope="module")
def qwen25omni_tokenizer():
return get_tokenizer(QWEN25OMNI_MODEL_ID)
@pytest.fixture(scope="function")
def mistral_model_config():
return ModelConfig(
@ -150,11 +136,6 @@ def mistral_model_config():
)
@pytest.fixture(scope="module")
def mistral_tokenizer():
return get_tokenizer(MISTRAL_MODEL_ID)
@pytest.fixture(scope="module")
def image_url():
image = ImageAsset("cherry_blossom")
@ -239,7 +220,6 @@ def _assert_mm_data_inputs(
def test_parse_chat_messages_single_image(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -253,7 +233,6 @@ def test_parse_chat_messages_single_image(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -266,7 +245,6 @@ def test_parse_chat_messages_single_image(
def test_parse_chat_messages_single_image_with_uuid(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid = str(hash(image_url))
@ -287,7 +265,6 @@ def test_parse_chat_messages_single_image_with_uuid(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -300,7 +277,6 @@ def test_parse_chat_messages_single_image_with_uuid(
def test_parse_chat_messages_single_empty_image_with_uuid(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid = str(hash(image_url))
@ -319,7 +295,6 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -332,7 +307,6 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
def test_parse_chat_messages_single_image_with_bad_uuid_format(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid = str(hash(image_url))
@ -354,7 +328,6 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -367,7 +340,6 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
def test_parse_chat_messages_multiple_images_with_uuids(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid1 = "my_uuid_1"
@ -397,7 +369,6 @@ def test_parse_chat_messages_multiple_images_with_uuids(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -413,7 +384,6 @@ def test_parse_chat_messages_multiple_images_with_uuids(
def test_parse_chat_messages_multiple_empty_images_with_uuids(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid1 = "my_uuid_1"
@ -439,7 +409,6 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -455,7 +424,6 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
def test_parse_chat_messages_mixed_empty_images_with_uuids(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid1 = "my_uuid_1"
@ -483,7 +451,6 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -500,7 +467,6 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
@pytest.mark.asyncio
async def test_parse_chat_messages_single_image_with_uuid_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid = str(hash(image_url))
@ -519,7 +485,6 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -533,7 +498,6 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
@pytest.mark.asyncio
async def test_parse_chat_messages_empty_image_with_uuid_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid = str(hash(image_url))
@ -552,7 +516,6 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -566,7 +529,6 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_uuids_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid1 = "my_uuid_1"
@ -592,7 +554,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -609,7 +570,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid1 = "my_uuid_1"
@ -635,7 +595,6 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -652,7 +611,6 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid2 = "my_uuid_2"
@ -676,7 +634,6 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -692,7 +649,6 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
def test_parse_chat_messages_empty_system(
mistral_model_config,
mistral_tokenizer,
):
# Test string format
conversation, _, _ = parse_chat_messages(
@ -704,7 +660,6 @@ def test_parse_chat_messages_empty_system(
},
],
mistral_model_config,
mistral_tokenizer,
content_format="string",
)
assert conversation == [
@ -722,7 +677,6 @@ def test_parse_chat_messages_empty_system(
},
],
mistral_model_config,
mistral_tokenizer,
content_format="openai",
)
assert conversation == [
@ -734,7 +688,6 @@ def test_parse_chat_messages_empty_system(
@pytest.mark.asyncio
async def test_parse_chat_messages_single_image_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
@ -748,7 +701,6 @@ async def test_parse_chat_messages_single_image_async(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -761,7 +713,6 @@ async def test_parse_chat_messages_single_image_async(
def test_parse_chat_messages_multiple_images(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -779,7 +730,6 @@ def test_parse_chat_messages_multiple_images(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -795,7 +745,6 @@ def test_parse_chat_messages_multiple_images(
def test_parse_chat_messages_empty_pil_image_with_uuid(
phi3v_model_config,
phi3v_tokenizer,
):
uuid = "abcd"
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -809,7 +758,6 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -825,7 +773,6 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
def test_parse_chat_messages_empty_image_embeds_with_uuid(
phi3v_model_config_image_embeds,
phi3v_tokenizer,
):
uuid = "abcd"
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -839,7 +786,6 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
}
],
phi3v_model_config_image_embeds,
phi3v_tokenizer,
content_format="string",
)
@ -857,7 +803,6 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
def test_parse_chat_messages_empty_audio_embeds_with_uuid(
audio_embeds_model_config,
qwen2_audio_tokenizer,
):
"""Test audio_embeds with UUID (no actual embeds data)."""
uuid = "test-audio-uuid-123"
@ -873,7 +818,6 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
}
],
audio_embeds_model_config,
qwen2_audio_tokenizer,
content_format="string",
)
@ -889,11 +833,8 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
def test_parse_chat_messages_audio_embeds_with_string(
audio_embeds_model_config,
qwen2_audio_tokenizer,
):
"""Test audio_embeds with base64 string embedding data."""
import base64
import io
import torch
@ -901,11 +842,7 @@ def test_parse_chat_messages_audio_embeds_with_string(
audio_embedding = torch.randn(1, 128, 768)
# Encode it as base64
buffer = io.BytesIO()
torch.save(audio_embedding, buffer)
buffer.seek(0)
binary_data = buffer.read()
base64_audio_embedding = base64.b64encode(binary_data).decode("utf-8")
base64_audio_embedding = tensor2base64(audio_embedding)
conversation, mm_data, mm_uuids = parse_chat_messages(
[
@ -921,7 +858,6 @@ def test_parse_chat_messages_audio_embeds_with_string(
}
],
audio_embeds_model_config,
qwen2_audio_tokenizer,
content_format="string",
)
@ -939,11 +875,8 @@ def test_parse_chat_messages_audio_embeds_with_string(
@pytest.mark.asyncio
async def test_parse_chat_messages_audio_embeds_async(
audio_embeds_model_config,
qwen2_audio_tokenizer,
):
"""Test audio_embeds with async futures."""
import base64
import io
import torch
@ -951,11 +884,7 @@ async def test_parse_chat_messages_audio_embeds_async(
audio_embedding = torch.randn(1, 128, 768)
# Encode it as base64
buffer = io.BytesIO()
torch.save(audio_embedding, buffer)
buffer.seek(0)
binary_data = buffer.read()
base64_audio_embedding = base64.b64encode(binary_data).decode("utf-8")
base64_audio_embedding = tensor2base64(audio_embedding)
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
[
@ -971,7 +900,6 @@ async def test_parse_chat_messages_audio_embeds_async(
}
],
audio_embeds_model_config,
qwen2_audio_tokenizer,
content_format="string",
)
@ -990,7 +918,6 @@ async def test_parse_chat_messages_audio_embeds_async(
@pytest.mark.asyncio
async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
phi3v_model_config_image_embeds,
phi3v_tokenizer,
):
uuid = "abcd"
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
@ -1004,7 +931,6 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
}
],
phi3v_model_config_image_embeds,
phi3v_tokenizer,
content_format="string",
)
@ -1024,7 +950,6 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
@ -1042,7 +967,6 @@ async def test_parse_chat_messages_multiple_images_async(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -1058,7 +982,6 @@ async def test_parse_chat_messages_multiple_images_async(
def test_parse_chat_messages_placeholder_already_in_prompt(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -1076,7 +999,6 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
assert conversation == [
@ -1091,7 +1013,6 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
def test_parse_chat_messages_placeholder_one_already_in_prompt(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -1110,7 +1031,6 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -1127,7 +1047,6 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
def test_parse_chat_messages_multiple_images_across_messages(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -1149,7 +1068,6 @@ def test_parse_chat_messages_multiple_images_across_messages(
},
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -1164,7 +1082,6 @@ def test_parse_chat_messages_multiple_images_across_messages(
def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
image_uuid = str(hash(image_url))
@ -1195,7 +1112,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
},
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -1210,7 +1126,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
def test_parse_chat_messages_context_text_format(
phi3v_model_config,
phi3v_tokenizer,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
[
@ -1222,7 +1137,6 @@ def test_parse_chat_messages_context_text_format(
{"role": "user", "content": "What about this one?"},
],
phi3v_model_config,
phi3v_tokenizer,
content_format="openai",
)
@ -1246,7 +1160,6 @@ def test_parse_chat_messages_context_text_format(
def test_parse_chat_messages_rejects_too_many_images_in_one_message(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
with warnings.catch_warnings():
@ -1277,14 +1190,12 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
def test_parse_chat_messages_rejects_too_many_images_across_messages(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
with warnings.catch_warnings():
@ -1322,14 +1233,12 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages(
},
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
def test_parse_chat_messages_multiple_images_uncommon_input(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -1344,7 +1253,6 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
}
],
phi3v_model_config,
phi3v_tokenizer,
content_format="string",
)
@ -1360,7 +1268,6 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
def test_parse_chat_messages_multiple_images_interleave(
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -1380,7 +1287,6 @@ def test_parse_chat_messages_multiple_images_interleave(
}
],
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
content_format="string",
)
@ -1398,7 +1304,6 @@ def test_parse_chat_messages_multiple_images_interleave(
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_interleave_async(
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages_futures(
@ -1418,7 +1323,6 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
}
],
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
content_format="string",
)
@ -1436,7 +1340,6 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
image_url,
):
image_uuid = str(hash(image_url))
@ -1465,7 +1368,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
}
],
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
content_format="string",
)
@ -1482,7 +1384,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
image_url,
):
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -1505,7 +1406,6 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
},
],
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
content_format="string",
)
@ -1523,7 +1423,6 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave(
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
image_url,
):
image_uuid = str(hash(image_url))
@ -1555,7 +1454,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
},
],
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
content_format="string",
)
@ -1573,7 +1471,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer,
image_url,
video_url,
audio_url,
@ -1601,7 +1498,6 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
},
],
qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer,
content_format="string",
)
@ -1627,7 +1523,6 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave(
qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer,
image_url,
video_url,
audio_url,
@ -1671,7 +1566,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
},
],
qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer,
content_format="string",
)
@ -1699,7 +1593,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave( # noqa: E501
qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer,
image_url,
video_url,
audio_url,
@ -1743,7 +1636,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
},
],
qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer,
content_format="string",
)
@ -1775,7 +1667,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave( # noqa: E501
qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer,
image_url,
video_url,
audio_url,
@ -1811,7 +1702,6 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
},
],
qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer,
content_format="string",
)
@ -1837,7 +1727,6 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
image_url,
):
with pytest.raises(
@ -1861,7 +1750,6 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
}
],
phi3v_model_config_mm_interleaved,
phi3v_tokenizer,
content_format="string",
)
@ -2237,9 +2125,7 @@ def test_resolve_content_format_examples(template_path, expected_format):
assert resolved_format == expected_format
def test_parse_chat_messages_include_thinking_chunk(
mistral_model_config, mistral_tokenizer
):
def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
messages = [
{
"role": "system",
@ -2269,7 +2155,6 @@ def test_parse_chat_messages_include_thinking_chunk(
conversation_with_thinking, _, _ = parse_chat_messages(
messages,
mistral_model_config,
mistral_tokenizer,
content_format="openai",
)
@ -2353,7 +2238,6 @@ def test_apply_mistral_chat_template_thinking_chunk():
def test_parse_chat_messages_single_empty_audio_with_uuid(
qwen2_audio_model_config,
qwen2_audio_tokenizer,
):
audio_uuid = "abcd"
conversation, mm_data, mm_uuids = parse_chat_messages(
@ -2371,7 +2255,6 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
}
],
qwen2_audio_model_config,
qwen2_audio_tokenizer,
content_format="string",
)
@ -2389,7 +2272,6 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
@pytest.mark.asyncio
async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
qwen2_audio_model_config,
qwen2_audio_tokenizer,
):
audio_uuid = "abcd"
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
@ -2407,7 +2289,6 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
}
],
qwen2_audio_model_config,
qwen2_audio_tokenizer,
content_format="string",
)

View File

@ -13,9 +13,6 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import (
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
BatchedDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (
BatchedTritonOrDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEQuantConfig,
@ -286,16 +283,6 @@ if has_deep_gemm() and is_deep_gemm_supported():
needs_matching_quant=False,
needs_deep_gemm=True,
)
register_experts(
BatchedTritonOrDeepGemmExperts,
batched_format,
common_float_and_int_types,
blocked_quantization_support=True,
supports_chunking=False,
supports_expert_map=False,
needs_matching_quant=True,
needs_deep_gemm=True,
)
register_experts(
TritonOrDeepGemmExperts,
standard_format,
@ -457,10 +444,6 @@ def make_fused_experts(
kwargs = batch_kwargs | quant_kwargs
print(f"Making BatchedTritonExperts {kwargs} ...")
experts = BatchedTritonExperts(**kwargs)
elif fused_experts_type == BatchedTritonOrDeepGemmExperts:
kwargs = batch_kwargs | quant_kwargs | deepgemm_kwargs
print(f"Making BatchedTritonOrDeepGemmExperts {kwargs} ...")
experts = BatchedTritonOrDeepGemmExperts(**kwargs)
elif fused_experts_type == DeepGemmExperts:
print(f"Making DeepGemmExperts {quant_config} ...")
experts = DeepGemmExperts(quant_config)

View File

@ -0,0 +1,86 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_per_token_group_quant_fp8_colmajor,
silu_mul_per_token_group_quant_fp8_colmajor,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
FLOAT8_DTYPE = torch.float8_e4m3fn
GROUP_SIZE = 128
def reference_quant(
    x: torch.Tensor, use_ue8m0: bool
) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize `x` to FP8 with per-token-group scales, as a reference.

    Launches the reference Triton kernel
    `_per_token_group_quant_fp8_colmajor` from
    vllm.model_executor.layers.quantization.utils.fp8_utils, one program
    per group of GROUP_SIZE elements.

    Args:
        x: 2-D input tensor; its last dimension must be divisible by
            GROUP_SIZE (one scale is produced per GROUP_SIZE elements).
        use_ue8m0: Whether scales use the UE8M0 (power-of-two) format,
            as required by some DeepGEMM configurations.

    Returns:
        (x_q, x_s): the FP8-quantized tensor (same shape as `x`) and the
        per-group scales laid out column-major (see below).
    """
    x_q = torch.empty_like(x, device=x.device, dtype=FLOAT8_DTYPE)
    # Allocate the scale tensor in column-major format.
    # Shape is (groups_per_row, num_tokens) permuted so that indexing is
    # (num_tokens, groups_per_row) while memory stays column-major.
    shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
    x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
    # One Triton program per quantization group.
    M = x.numel() // GROUP_SIZE
    N = GROUP_SIZE
    BLOCK = triton.next_power_of_2(N)
    # heuristics for number of warps
    num_warps = min(max(BLOCK // 256, 1), 8)
    num_stages = 1
    # FP8 representable range used for clamping inside the kernel.
    finfo = torch.finfo(FLOAT8_DTYPE)
    fp8_min = finfo.min
    fp8_max = finfo.max
    _per_token_group_quant_fp8_colmajor[(M,)](
        x,
        x_q,
        x_s,
        GROUP_SIZE,
        x.shape[1],
        x.stride(0),
        x_s.stride(1),
        eps=1e-10,
        fp8_min=fp8_min,
        fp8_max=fp8_max,
        use_ue8m0=use_ue8m0,
        BLOCK=BLOCK,
        num_warps=num_warps,
        num_stages=num_stages,
    )
    return x_q, x_s
def reference(x: torch.Tensor, use_ue8m0: bool) -> tuple[torch.Tensor, torch.Tensor]:
    """Reference path: unfused silu_and_mul, then reference FP8 group quant."""
    num_tokens, width = x.size()
    # silu_and_mul halves the hidden dimension: silu(x[:, :w/2]) * x[:, w/2:]
    activated = torch.empty(
        (num_tokens, width // 2), dtype=torch.bfloat16, device="cuda"
    )
    torch.ops._C.silu_and_mul(activated, x)
    return reference_quant(activated, use_ue8m0)
@pytest.mark.parametrize("T", [128, 256, 512])
@pytest.mark.parametrize("N", [128 * 2, 256 * 2, 768 * 2, 2048 * 2, 7168 * 2])
def test_silu_mul_fp8_quant_deep_gemm(T: int, N: int):
    """Fused silu+mul+quant kernel must match the two-step reference path."""
    current_platform.seed_everything(42)
    x = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
    use_ue8m0 = is_deep_gemm_e8m0_used()

    # Kernel under test: fused activation + column-major FP8 group quant.
    got_q, got_s = silu_mul_per_token_group_quant_fp8_colmajor(
        x, use_ue8m0=use_ue8m0
    )

    # Reference: unfused silu_and_mul followed by the reference quant kernel.
    want_q, want_s = reference(x, use_ue8m0)

    # Compare in float32 since FP8 tensors do not support direct comparison ops.
    torch.testing.assert_close(got_q.to(torch.float32), want_q.to(torch.float32))
    torch.testing.assert_close(got_s, want_s)

View File

@ -0,0 +1,39 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port
from vllm.v1.executor import UniProcExecutor
from vllm.v1.worker.worker_base import WorkerWrapperBase
# This is a dummy executor for patching in test_runai_model_streamer_s3.py.
# We cannot use the vllm_runner fixture here because it spawns a worker process;
# the worker process reimports the patched entities, so the patch is not applied.
class RunaiDummyExecutor(UniProcExecutor):
    """Single-process executor that builds its worker in-process.

    Used by the runai streamer tests so that monkeypatched entities stay
    visible to the worker (a spawned worker process would reimport the
    unpatched modules).
    """

    def _init_executor(self) -> None:
        # Derive the local rank from the configured device string,
        # e.g. "cuda:1" -> 1; device names without an index default to 0.
        device_str = str(self.vllm_config.device_config.device)
        _, sep, index = device_str.partition(":")
        local_rank = int(index) if sep else 0

        init_method = get_distributed_init_method(get_ip(), get_open_port())
        worker_kwargs = {
            "vllm_config": self.vllm_config,
            "local_rank": local_rank,
            "rank": 0,
            "distributed_init_method": init_method,
            "is_driver_worker": True,
        }

        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config)
        self.collective_rpc("init_worker", args=([worker_kwargs],))
        self.collective_rpc("init_device")

View File

@ -0,0 +1,52 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
from huggingface_hub import snapshot_download
from runai_model_streamer.safetensors_streamer.streamer_mock import StreamerPatcher
from vllm.engine.arg_utils import EngineArgs
from .conftest import RunaiDummyExecutor
load_format = "runai_streamer"
test_model = "openai-community/gpt2"
def test_runai_model_loader_download_files_s3_mocked_with_patch(
    vllm_runner,
    tmp_path: Path,
    monkeypatch,
):
    """Load a mocked s3:// model end-to-end through the runai streamer shims."""
    patcher = StreamerPatcher(str(tmp_path))

    # Fetch the real weights from HF into the directory the mock serves from.
    snapshot_download(repo_id=test_model, local_dir=f"{tmp_path}/gpt2")

    # Route every streamer entry point through the in-process mock.
    for target, shim in (
        (
            "vllm.transformers_utils.runai_utils.runai_list_safetensors",
            patcher.shim_list_safetensors,
        ),
        (
            "vllm.transformers_utils.runai_utils.runai_pull_files",
            patcher.shim_pull_files,
        ),
        (
            "vllm.model_executor.model_loader.weight_utils.SafetensorsStreamer",
            patcher.create_mock_streamer,
        ),
    ):
        monkeypatch.setattr(target, shim)

    vllm_config = EngineArgs(
        model="s3://my-mock-bucket/gpt2/",
        load_format=load_format,
        tensor_parallel_size=1,
    ).create_engine_config()

    # Loading the model exercises the patched list/pull/stream path.
    RunaiDummyExecutor(vllm_config).driver_worker.load_model()

View File

@ -0,0 +1,53 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from transformers import AutoModel
from tests.models.utils import check_embeddings_close
from vllm import TokensPrompt
@pytest.mark.parametrize(
    "model",
    ["Qwen/Qwen3-Embedding-0.6B"],
)
@torch.inference_mode
def test_embed_models(hf_runner, vllm_runner, model: str):
    """Token embeddings from vLLM with chunked prefill must match HF."""
    chunk_size = 10
    # Prompt lengths straddle the chunk size so prompts span multiple chunks.
    token_prompts = [[1024 + i for i in range(n)] for n in (55, 56, 57)]

    with vllm_runner(
        model,
        runner="pooling",
        max_model_len=128,
        max_num_batched_tokens=chunk_size,
        enforce_eager=True,
        # NOTE: VllmRunner would otherwise pass `False` rather than `None`;
        # force chunked prefill on explicitly.
        enable_chunked_prefill=True,
        enable_prefix_caching=True,
    ) as vllm_model:
        vllm_outputs = vllm_model.token_embed(
            [TokensPrompt(prompt_token_ids=t) for t in token_prompts],
        )

    with hf_runner(
        model,
        auto_cls=AutoModel,
    ) as hf_model:
        hf_outputs = []
        for prompt in token_prompts:
            batch = hf_model.wrap_device({"input_ids": torch.tensor([prompt])})
            hidden = hf_model.model(batch["input_ids"]).last_hidden_state
            hf_outputs.append(hidden.cpu().float()[0])

    for expected, actual in zip(hf_outputs, vllm_outputs):
        check_embeddings_close(
            embeddings_0_lst=expected,
            embeddings_1_lst=actual,
            name_0="hf",
            name_1="vllm",
            tol=1e-2,
        )

View File

@ -20,7 +20,6 @@ def test_extract_hidden_states(hf_runner, vllm_runner, model: str):
max_model_len=128,
enforce_eager=True,
runner="pooling",
enable_chunked_prefill=False,
enable_prefix_caching=True,
) as vllm_model:
pooling_outputs = vllm_model.llm.encode(

View File

@ -2,6 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM tests."""
import warnings
import torch
from vllm.platforms import current_platform
@ -14,6 +16,20 @@ def pytest_configure(config):
if not current_platform.is_rocm():
return
skip_patterns = ["test_granite_speech.py"]
if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
# Skip disabling SDP for Granite Speech tests on ROCm
return
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
# accuracy issues
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)
warnings.warn(
"ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
"to avoid HuggingFace Transformers accuracy issues",
UserWarning,
stacklevel=1,
)

View File

@ -137,7 +137,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
"qwen2_5_omni": VLMTestInfo(
@ -152,7 +152,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForTextToWaveform,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
"qwen3_vl": VLMTestInfo(
@ -173,7 +173,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[
pytest.mark.core_model,
],
@ -350,7 +350,7 @@ VLM_TEST_SETTINGS = {
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
stop_str=["<end▁of▁sentence>", "<begin▁of▁sentence>"],
image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
image_size_factors=[(1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
),
"fuyu": VLMTestInfo(
models=["adept/fuyu-8b"],
@ -403,12 +403,13 @@ VLM_TEST_SETTINGS = {
# So, we need to reduce the number of tokens for the test to pass.
max_tokens=8,
num_logprobs=10,
auto_cls=AutoModelForCausalLM,
marks=[large_gpu_mark(min_gb=32)],
),
"glm4_1v": VLMTestInfo(
models=["zai-org/GLM-4.1V-9B-Thinking"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
max_model_len=2048,
@ -423,6 +424,7 @@ VLM_TEST_SETTINGS = {
models=["zai-org/GLM-4.1V-9B-Thinking"],
# GLM4.1V require include video metadata for input
test_type=VLMTestType.CUSTOM_INPUTS,
prompt_formatter=lambda vid_prompt: f"[gMASK]<|user|>\n{vid_prompt}<|assistant|>\n", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
@ -707,7 +709,7 @@ VLM_TEST_SETTINGS = {
max_model_len=8192,
max_num_seqs=2,
auto_cls=AutoModelForCausalLM,
image_size_factors=[(), (0.25,)],
image_size_factors=[(0.25,)],
marks=[
pytest.mark.skipif(
Version(TRANSFORMERS_VERSION) == Version("4.57.3"),
@ -737,7 +739,13 @@ VLM_TEST_SETTINGS = {
max_model_len=8192,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=48)],
marks=[
large_gpu_mark(min_gb=48),
pytest.mark.skipif(
current_platform.is_rocm(),
reason="Model produces a vector of <UNK> output in HF on ROCm",
),
],
),
"qwen_vl": VLMTestInfo(
models=["Qwen/Qwen-VL"],
@ -760,7 +768,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.cpu_model],
),
"skywork_r1v": VLMTestInfo(
@ -812,7 +820,7 @@ VLM_TEST_SETTINGS = {
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.skip("Model initialization hangs")],
),
### Tensor parallel / multi-gpu broadcast tests

View File

@ -8,6 +8,7 @@ from transformers import AutoModelForSpeechSeq2Seq
from vllm.logprobs import SampleLogprobs
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
from ...registry import HF_EXAMPLE_MODELS
@ -34,6 +35,12 @@ audio_lora_path = MODEL_NAME
models = [MODEL_NAME]
@pytest.fixture(autouse=True)
def set_attention_backend_for_rocm(monkeypatch):
    """Force the Triton attention backend for every test when running on ROCm."""
    if not current_platform.is_rocm():
        return
    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
@ -111,8 +118,12 @@ def run_test(
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_model_len", [2048])
@pytest.mark.parametrize(
"dtype", ["float16"] if current_platform.is_rocm() else ["bfloat16"]
)
@pytest.mark.parametrize(
"max_model_len", [512] if current_platform.is_rocm() else [2048]
)
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(

View File

@ -1,281 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from collections.abc import Sequence
import librosa
import pytest
from huggingface_hub import snapshot_download
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import rescale_image_size
from ....conftest import (
IMAGE_ASSETS,
HfRunner,
PromptAudioInput,
PromptImageInput,
VllmRunner,
)
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom": "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
}
)
HF_MULTIIMAGE_IMAGE_PROMPT = (
"<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
)
model_path = snapshot_download(
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
)
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
speech_question = os.path.join(
model_path, "examples", "what_is_shown_in_this_image.wav"
)
models = [model_path]
target_dtype = "half"
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
):
    """Inference result should be the same between hf and vllm.

    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.

    Each element of ``inputs`` is one test case: (prompts, images, audios);
    logprobs from both runners are compared case-by-case at the end.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).

    # max_model_len should be greater than image_feature_size
    with vllm_runner(
        model,
        task="generate",
        max_model_len=max_model_len,
        max_num_seqs=2,
        dtype=dtype,
        limit_mm_per_prompt={"image": mm_limit},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enable_lora=True,
        # The vision LoRA shipped with Phi-4-multimodal uses a large rank,
        # so the default max rank is not sufficient here.
        max_lora_rank=320,
        gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
        enforce_eager=True,
        trust_remote_code=False,
    ) as vllm_model:
        # Apply the vision LoRA on top of the base model for every request.
        lora_request = LoRARequest("vision", 1, vision_lora_path)
        vllm_outputs_per_case = [
            vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=images,
                audios=audios,
                lora_request=lora_request,
            )
            for prompts, images, audios in inputs
        ]

    with hf_runner(model, dtype=dtype) as hf_model:
        # Mirror the vLLM setup: load the same vision LoRA adapter in HF.
        hf_model.model.load_adapter(
            vision_lora_path,
            adapter_name="vision",
        )
        hf_processor = hf_model.processor
        # Pass eos explicitly so HF generation stops like vLLM's does.
        eos_token_id = hf_processor.tokenizer.eos_token_id
        hf_outputs_per_case = [
            hf_model.generate_greedy_logprobs_limit(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                images=images,
                audios=audios,
                eos_token_id=eos_token_id,
            )
            for prompts, images, audios in inputs
        ]

    # Compare HF vs vLLM logprobs for each input case.
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(
    hf_runner,
    vllm_runner,
    image_assets,
    model,
    size_factors,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare single-image HF vs vLLM outputs across image scale factors."""
    pil_images = [asset.pil_image for asset in image_assets]

    # One case per asset: the asset's prompt repeated once per scale factor,
    # paired with the asset rescaled by each factor (no audio input).
    case_inputs = []
    for pil_image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        rescaled = [rescale_image_size(pil_image, factor) for factor in size_factors]
        case_inputs.append(([prompt] * len(size_factors), rescaled, None))

    run_test(
        hf_runner,
        vllm_runner,
        case_inputs,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        # [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(
    hf_runner,
    vllm_runner,
    image_assets,
    model,
    size_factors,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare HF vs vLLM outputs when two images share a single prompt."""
    pil_images = [asset.pil_image for asset in image_assets]

    # One prompt per scale factor; each prompt gets ALL assets rescaled by
    # that factor (multi-image input), and no audio.
    scaled_image_lists = []
    for factor in size_factors:
        scaled_image_lists.append(
            [rescale_image_size(pil_image, factor) for pil_image in pil_images]
        )
    multi_image_case = (
        [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors),
        scaled_image_lists,
        None,
    )

    run_test(
        hf_runner,
        vllm_runner,
        [multi_image_case],
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_vision_speech_models(
    hf_runner,
    vllm_runner,
    model,
    dtype: str,
    max_model_len: int,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare HF vs vLLM outputs for a combined image + audio prompt."""
    # use the example speech question so that the model outputs are reasonable
    speech_audio = librosa.load(speech_question, sr=16000)
    blossom_image = ImageAsset("cherry_blossom").pil_image.convert("RGB")

    combined_case = (
        ["<|user|><|image|><|audio|><|end|><|assistant|>"],
        [blossom_image],
        [speech_audio],
    )

    run_test(
        hf_runner,
        vllm_runner,
        [combined_case],
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )

View File

@ -15,6 +15,7 @@ from transformers import AutoProcessor
from vllm import SamplingParams, TextPrompt, TokensPrompt
from vllm.logprobs import Logprob, SampleLogprobs
from vllm.multimodal import MultiModalDataBuiltins
from vllm.platforms import current_platform
from ....utils import VLLM_PATH, large_gpu_test
from ...utils import check_logprobs_close
@ -165,6 +166,15 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
def test_chat(
vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
) -> None:
if (
model == MISTRAL_SMALL_3_1_ID
and max_model_len == 65536
and current_platform.is_rocm()
):
pytest.skip(
"OOM on ROCm: 24B model with 65536 context length exceeds GPU memory"
)
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])
with vllm_runner(
model,

View File

@ -62,6 +62,65 @@ def get_filtered_test_settings(
return matching_tests
def get_model_type_cases(
    model_type: str,
    test_info: VLMTestInfo,
    test_type: VLMTestType,
):
    """Expand one model type's VLMTestInfo into individual pytest params.

    This is essentially the same as nesting a bunch of mark.parametrize
    decorators, but done programmatically so that per-model overrides are
    possible while each combination still runs as its own pytest case.
    """

    def _as_tuple(value):
        # Ensure each axis below is an iterable, wrapping scalars if needed.
        return value if isinstance(value, (list, tuple)) else (value,)

    axes = OrderedDict()
    axes["model"] = _as_tuple(test_info.models)
    axes["max_tokens"] = _as_tuple(test_info.max_tokens)
    axes["num_logprobs"] = _as_tuple(test_info.num_logprobs)
    axes["dtype"] = _as_tuple(test_info.dtype)
    axes["distributed_executor_backend"] = _as_tuple(
        test_info.distributed_executor_backend
    )

    # num_frames is video only
    if test_type == VLMTestType.VIDEO:
        axes["num_video_frames"] = _as_tuple(test_info.num_video_frames)
        axes["needs_video_metadata"] = _as_tuple(test_info.needs_video_metadata)

    if test_type not in (
        VLMTestType.CUSTOM_INPUTS,
        VLMTestType.AUDIO,
    ):
        # No sizes passed for custom inputs, since inputs are directly provided
        wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
        if wrapped_sizes is None:
            raise ValueError(f"Sizes must be set for test type {test_type}")
        axes["size_wrapper"] = wrapped_sizes
    elif test_type == VLMTestType.CUSTOM_INPUTS:
        # Otherwise expand the custom test options instead
        if test_info.custom_test_opts is None:
            raise ValueError("Test has type CUSTOM_INPUTS, but none given")
        axes["custom_test_opts"] = test_info.custom_test_opts

    # Wrap every combination in a pytest parameter, passing marks through.
    marks = test_info.marks if test_info.marks is not None else []
    cases = []
    for combo in itertools.product(*axes.values()):
        expanded = ExpandableVLMTestArgs(**dict(zip(axes.keys(), combo)))
        cases.append(pytest.param(model_type, expanded, marks=marks))
    return cases
def get_parametrized_options(
test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
@ -76,64 +135,11 @@ def get_parametrized_options(
test_settings, test_type, create_new_process_for_each_test
)
# Ensure that something is wrapped as an iterable it's not already
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)
def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
# This is essentially the same as nesting a bunch of mark.parametrize
# decorators, but we do it programmatically to allow overrides for on
# a per-model basis, while still being able to execute each of these
# as individual test cases in pytest.
iter_kwargs = OrderedDict(
[
("model", ensure_wrapped(test_info.models)),
("max_tokens", ensure_wrapped(test_info.max_tokens)),
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
("dtype", ensure_wrapped(test_info.dtype)),
(
"distributed_executor_backend",
ensure_wrapped(test_info.distributed_executor_backend),
),
]
)
# num_frames is video only
if test_type == VLMTestType.VIDEO:
iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
iter_kwargs["needs_video_metadata"] = ensure_wrapped(
test_info.needs_video_metadata
)
# No sizes passed for custom inputs, since inputs are directly provided
if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO):
wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
if wrapped_sizes is None:
raise ValueError(f"Sizes must be set for test type {test_type}")
iter_kwargs["size_wrapper"] = wrapped_sizes
# Otherwise expand the custom test options instead
elif test_type == VLMTestType.CUSTOM_INPUTS:
if test_info.custom_test_opts is None:
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
# Wrap all model cases in a pytest parameter & pass marks through
return [
pytest.param(
model_type,
ExpandableVLMTestArgs(
**{k: v for k, v in zip(iter_kwargs.keys(), case)}
),
marks=test_info.marks if test_info.marks is not None else [],
)
for case in list(itertools.product(*iter_kwargs.values()))
]
# Get a list per model type, where each entry contains a tuple of all of
# that model type's cases, then flatten them into the top level so that
# we can consume them in one mark.parametrize call.
cases_by_model_type = [
get_model_type_cases(model_type, test_info)
get_model_type_cases(model_type, test_info, test_type)
for model_type, test_info in matching_tests.items()
]
return list(itertools.chain(*cases_by_model_type))

View File

@ -140,7 +140,7 @@ def video_with_metadata_glm4_1v():
metadata = VIDEO_ASSETS[0].metadata
question = "Describe the video."
video_prompt = "<|begin_of_video|><|video|><|end_of_video|>"
formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"
formatted_prompt = f"[gMASK]<|user|>\n{video_prompt}{question}<|assistant|>\n"
scales = [0.1, 0.2, 0.25]
video_input = [

View File

@ -25,6 +25,7 @@ from transformers import (
from transformers.video_utils import VideoMetadata
from vllm.logprobs import SampleLogprobs
from vllm.platforms import current_platform
from vllm.utils.collection_utils import is_list_of
from .....conftest import HfRunner, ImageAsset, ImageTestAssets
@ -366,6 +367,40 @@ def gemma3_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOut
def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for GLM4V."""
if current_platform.is_rocm():
import types
config = hf_model.model.config
if hasattr(config, "num_layers") and not hasattr(config, "num_hidden_layers"):
config.num_hidden_layers = config.num_layers
config.output_hidden_states = True
def patched_prepare_cache(
self, generation_config, model_kwargs, *args, **kwargs
):
model_kwargs["past_key_values"] = None
model_kwargs["use_cache"] = False
return model_kwargs
hf_model.model._prepare_cache_for_generation = types.MethodType(
patched_prepare_cache, hf_model.model
)
original_generate = hf_model.model.generate
def patched_generate(*args, **kwargs):
kwargs["output_hidden_states"] = True
kwargs["return_dict_in_generate"] = True
return original_generate(*args, **kwargs)
hf_model.model.generate = patched_generate
original_forward = hf_model.model.forward
def patched_forward(*args, **kwargs):
kwargs["output_hidden_states"] = True
return original_forward(*args, **kwargs)
hf_model.model.forward = patched_forward
hf_processor = hf_model.processor
def processor(*args, text="", images=None, **kwargs):
@ -406,7 +441,15 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
if videos is not None and is_list_of(videos, tuple):
# If videos is a list of tuples, we assume each tuple contains
# (video_array, metadata) as in the case of GLM4.1V.
video_metadata = [[VideoMetadata(**video[1])] for video in videos]
# Filter out 'do_sample_frames' as it's not a valid VideoMetadata arg
video_metadata = [
[
VideoMetadata(
**{k: v for k, v in video[1].items() if k != "do_sample_frames"}
)
]
for video in videos
]
videos = [[video[0]] for video in videos]
else:
video_metadata = None

View File

@ -50,8 +50,8 @@ MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PL
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
IMAGE_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
EMBEDDING_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0)]
IMAGE_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
EMBEDDING_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0)]
RunnerOutput = tuple[list[int], str, SampleLogprobs | None]

View File

@ -0,0 +1,24 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling tests."""
import os
import warnings
from vllm.platforms import current_platform
def pytest_collection_modifyitems(config, items):
    """Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
    if not current_platform.is_rocm():
        return
    # Only flip the backend when at least one SigLIP test was collected.
    collected_siglip = any("test_siglip" in item.nodeid for item in items)
    if collected_siglip:
        os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"
        warnings.warn(
            "ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests",
            UserWarning,
            stacklevel=1,
        )

View File

@ -396,28 +396,6 @@ def test_processing_correctness(
)
# Phi4MultimodalForCausalLM share same model repo with original format
# Phi4MMForCausalLM, so we add it as a separate test case
# Remove this test after conversion PR merged:
# https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/70
@pytest.mark.parametrize("model_arch", ["Phi4MultimodalForCausalLM"])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
def test_processing_correctness_phi4_multimodal(
model_arch: str,
hit_rate: float,
num_batches: int,
simplify_rate: float,
):
_test_processing_correctness(
model_arch,
hit_rate=hit_rate,
num_batches=num_batches,
simplify_rate=simplify_rate,
)
def _assert_inputs_equal(
a: MultiModalInputs,
b: MultiModalInputs,

View File

@ -47,6 +47,12 @@ QWEN2_CONFIG = GGUFTestConfig(
gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf",
)
QWEN3_CONFIG = GGUFTestConfig(
original_model="Qwen/Qwen3-0.6B",
gguf_repo="unsloth/Qwen3-0.6B-GGUF",
gguf_filename="Qwen3-0.6B-BF16.gguf",
)
PHI3_CONFIG = GGUFTestConfig(
original_model="microsoft/Phi-3.5-mini-instruct",
gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF",
@ -87,6 +93,7 @@ GEMMA3_CONFIG = GGUFTestConfig(
MODELS = [
# LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
QWEN2_CONFIG,
QWEN3_CONFIG,
PHI3_CONFIG,
GPT2_CONFIG,
STABLELM_CONFIG,

View File

@ -667,6 +667,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"moonshotai/Kimi-VL-A3B-Instruct",
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},
trust_remote_code=True,
max_transformers_version="4.53.3",
transformers_version_reason="HF model uses deprecated transformers API "
"(PytorchGELUTanh, DynamicCache.seen_tokens, and more). See: "
"https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31",
),
"LightOnOCRForConditionalGeneration": _HfExamplesInfo(
"lightonai/LightOnOCR-1B",
@ -767,10 +771,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Phi4MMForCausalLM": _HfExamplesInfo(
"microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
),
"Phi4MultimodalForCausalLM": _HfExamplesInfo(
"microsoft/Phi-4-multimodal-instruct",
revision="refs/pr/70",
),
"PixtralForConditionalGeneration": _HfExamplesInfo(
"mistralai/Pixtral-12B-2409",
extras={

View File

@ -112,7 +112,7 @@ class TestBaseThinkingReasoningParserMethods:
"""Test the is_reasoning_end method."""
parser = TestThinkingReasoningParser(test_tokenizer)
end_token_id = parser.end_token_id
start_token_id = parser.start_token_id
# Test with end token present
assert parser.is_reasoning_end([1, 2, end_token_id, 4]) is True
@ -122,6 +122,16 @@ class TestBaseThinkingReasoningParserMethods:
# Test with empty list
assert parser.is_reasoning_end([]) is False
# Test with interleaved thinking
assert parser.is_reasoning_end([1, start_token_id, 2, end_token_id]) is True
assert parser.is_reasoning_end([1, start_token_id, 2, 3]) is False
assert (
parser.is_reasoning_end(
[1, start_token_id, 2, end_token_id, 2, 2, start_token_id]
)
is False
)
def test_extract_content_ids(self, test_tokenizer):
"""Test the extract_content_ids method."""
parser = TestThinkingReasoningParser(test_tokenizer)

View File

@ -5,6 +5,10 @@
set -e
set -x
merge_base_commit=$(git merge-base HEAD origin/main)
echo "Current merge base commit with main: $merge_base_commit"
git show --oneline -s $merge_base_commit
cd /vllm-workspace/
# uninstall vllm
@ -18,7 +22,7 @@ apt autoremove -y
echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL=1 VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e .
VLLM_PRECOMPILED_WHEEL_COMMIT=$merge_base_commit VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e .
# Run the script
python3 -c 'import vllm'

View File

@ -629,8 +629,8 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files):
(
"internlm/internlm2-1_8b-reward",
"decoder",
False,
"Pooling models with all pooling does not support chunked prefill.",
True,
"Pooling models with causal attn and all pooling support chunked prefill.",
),
(
"BAAI/bge-base-en",
@ -748,8 +748,8 @@ def test_is_chunked_prefill_supported(
(
"internlm/internlm2-1_8b-reward",
"decoder",
False,
"Pooling models with all pooling does not support prefix caching.",
True,
"Pooling models with causal attn and all pooling support prefix caching.",
),
(
"BAAI/bge-base-en",

View File

@ -365,3 +365,54 @@ class TestEnvSetWithChoices:
with patch.dict(os.environ, {"TEST_ENV": "option1,option1,option2"}):
env_func = env_set_with_choices("TEST_ENV", [], ["option1", "option2"])
assert env_func() == {"option1", "option2"}
class TestVllmConfigureLogging:
    """Test cases for VLLM_CONFIGURE_LOGGING environment variable."""

    def test_configure_logging_defaults_to_true(self):
        """Test that VLLM_CONFIGURE_LOGGING defaults to True when not set."""
        # Ensure the env var is not set
        with patch.dict(os.environ, {}, clear=False):
            if "VLLM_CONFIGURE_LOGGING" in os.environ:
                del os.environ["VLLM_CONFIGURE_LOGGING"]
            # Clear cache if it exists
            # (envs may memoize attribute reads; clearing makes this read
            # observe the patched environment — presumably lru_cache-backed)
            if hasattr(envs.__getattr__, "cache_clear"):
                envs.__getattr__.cache_clear()
            result = envs.VLLM_CONFIGURE_LOGGING
            assert result is True
            assert isinstance(result, bool)

    def test_configure_logging_with_zero_string(self):
        """Test that VLLM_CONFIGURE_LOGGING='0' evaluates to False."""
        with patch.dict(os.environ, {"VLLM_CONFIGURE_LOGGING": "0"}):
            # Clear cache if it exists
            if hasattr(envs.__getattr__, "cache_clear"):
                envs.__getattr__.cache_clear()
            result = envs.VLLM_CONFIGURE_LOGGING
            assert result is False
            assert isinstance(result, bool)

    def test_configure_logging_with_one_string(self):
        """Test that VLLM_CONFIGURE_LOGGING='1' evaluates to True."""
        with patch.dict(os.environ, {"VLLM_CONFIGURE_LOGGING": "1"}):
            # Clear cache if it exists
            if hasattr(envs.__getattr__, "cache_clear"):
                envs.__getattr__.cache_clear()
            result = envs.VLLM_CONFIGURE_LOGGING
            assert result is True
            assert isinstance(result, bool)

    def test_configure_logging_with_invalid_value_raises_error(self):
        """Test that invalid VLLM_CONFIGURE_LOGGING value raises ValueError."""
        with patch.dict(os.environ, {"VLLM_CONFIGURE_LOGGING": "invalid"}):
            # Clear cache if it exists
            if hasattr(envs.__getattr__, "cache_clear"):
                envs.__getattr__.cache_clear()
            # Non-numeric strings fail the int() conversion inside envs.
            with pytest.raises(ValueError, match="invalid literal for int"):
                _ = envs.VLLM_CONFIGURE_LOGGING

View File

@ -0,0 +1,847 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from collections.abc import Generator
import partial_json_parser
import pytest
from mistral_common.protocol.instruct.messages import AssistantMessage
from mistral_common.protocol.instruct.request import InstructRequest
from mistral_common.protocol.instruct.tool_calls import FunctionCall, ToolCall
from partial_json_parser.core.options import Allow
from vllm.entrypoints.openai.protocol import DeltaMessage, DeltaToolCall
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolParser
from vllm.tokenizers import (
MistralTokenizer,
TokenizerLike,
get_tokenizer,
)
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
@pytest.fixture(scope="module")
def mistral_pre_v11_tokenizer():
    """Module-scoped pre-v11 Mistral tokenizer (can encode free text)."""
    MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
    return get_tokenizer(tokenizer_name=MODEL)
@pytest.fixture(scope="module")
def mistral_tokenizer():
    """Module-scoped v11+ tokenizer loaded in native "mistral" mode."""
    MODEL = "mistralai/Mistral-Small-3.2-24B-Instruct-2506"
    return get_tokenizer(tokenizer_name=MODEL, tokenizer_mode="mistral")
@pytest.fixture
def mistral_pre_v11_tool_parser(mistral_pre_v11_tokenizer):
    """Fresh MistralToolParser over the pre-v11 tokenizer for each test."""
    return MistralToolParser(mistral_pre_v11_tokenizer)
@pytest.fixture
def mistral_tool_parser(mistral_tokenizer):
    """Fresh MistralToolParser over the v11+ tokenizer for each test."""
    return MistralToolParser(mistral_tokenizer)
def assert_tool_calls(
    actual_tool_calls: list[ToolCall] | list[DeltaToolCall],
    expected_tool_calls: list[ToolCall],
):
    """Assert parsed tool calls match the expected ones, pairwise in order."""
    assert len(actual_tool_calls) == len(expected_tool_calls)

    for actual, expected in zip(actual_tool_calls, expected_tool_calls):
        # Generated call ids are random 9-character strings.
        assert isinstance(actual.id, str)
        assert len(actual.id) == 9

        if isinstance(actual, ToolCall):
            assert actual.type == "function"
        elif isinstance(actual, DeltaToolCall):
            # Streaming deltas must have fully-populated function fields.
            assert actual.function is not None
            assert actual.function.name is not None
            assert actual.function.arguments is not None

        assert actual.function is not None
        assert actual.function.name == expected.function.name, (
            f"got wrong function name:${actual.function.name}"
        )
        assert actual.function.arguments == expected.function.arguments, (
            f"got wrong function argument:${actual.function.arguments}"
        )
def fix_tool_call_tokenization(
    tokens: list[int],
    mistral_tool_parser: MistralToolParser,
    mistral_tokenizer: TokenizerLike,
):
    """
    Replaces the textual token sequence for [TOOL_CALLS]
    with its single special token ID.
    """
    # Token ids that spell out the textual [TOOL_CALLS] marker.
    # These must not contain special tokens like bos, eos etc.
    needle = mistral_tokenizer.encode(
        text=mistral_tool_parser.bot_token,
        add_special_tokens=False,
    )
    replacement = [mistral_tool_parser.bot_token_id]

    # If the input is too short to contain the sequence, no replacement is possible
    if not tokens or len(tokens) < len(needle):
        return tokens

    fixed: list[int] = []
    needle_len = len(needle)
    pos = 0
    while pos < len(tokens):
        if tokens[pos : pos + needle_len] == needle:
            # Found the textual marker: emit the special id and skip past it.
            fixed.extend(replacement)
            pos += needle_len
        else:
            # No match here; keep the token and advance by one.
            fixed.append(tokens[pos])
            pos += 1
    return fixed
def stream_delta_message_generator(
    mistral_tool_parser: MistralToolParser,
    mistral_tokenizer: TokenizerLike,
    model_output: str | None,
    tools: list[tuple[str, str]] | None,
) -> Generator[DeltaMessage, None, None]:
    """Feed tokens one at a time through the streaming tool parser.

    Tokenizes either ``model_output`` (pre-v11 tokenizers) or the given
    ``tools`` list (v11+ tokenizers), then replays the token stream through
    ``extract_tool_calls_streaming``, yielding every non-empty DeltaMessage.
    """
    if (
        isinstance(mistral_tokenizer, MistralTokenizer)
        and mistral_tokenizer.version >= 11
    ):
        # With the newer versions of the tokenizer,
        # we cannot tokenize free text
        # so we need to create a list of messages to get tokenized
        assert tools is not None
        assistant_msg = AssistantMessage(
            tool_calls=[
                ToolCall(
                    function=FunctionCall(
                        name=name,
                        arguments=arg,
                    )
                )
                for (name, arg) in tools
            ],
        )
        request = InstructRequest(
            messages=[assistant_msg],
        )
        all_token_ids = mistral_tokenizer.instruct.encode_instruct(request).tokens
    else:
        # Older versions of the tokenizer are
        # able to encode directly the model's output (free text) into tokens
        assert model_output is not None
        all_token_ids = mistral_tokenizer.encode(model_output, add_special_tokens=False)
        # Collapse the textual [TOOL_CALLS] spelling into its special token id
        # so the parser sees the marker the way the model would emit it.
        all_token_ids = fix_tool_call_tokenization(
            all_token_ids, mistral_tool_parser, mistral_tokenizer
        )

    # Incremental-detokenization state carried across iterations.
    previous_text = ""
    previous_tokens = None
    prefix_offset = 0
    read_offset = 0
    for i, delta_token in enumerate(all_token_ids):
        # Simulate streaming: exactly one new token per step.
        delta_token_ids = [delta_token]
        previous_token_ids = all_token_ids[:i]
        current_token_ids = all_token_ids[: i + 1]

        (new_tokens, delta_text, new_prefix_offset, new_read_offset) = (
            detokenize_incrementally(
                tokenizer=mistral_tokenizer,
                all_input_ids=current_token_ids,
                prev_tokens=previous_tokens,
                prefix_offset=prefix_offset,
                read_offset=read_offset,
                # MistralTokenizer renders special tokens itself, so skip them.
                skip_special_tokens=isinstance(mistral_tokenizer, MistralTokenizer),
                spaces_between_special_tokens=True,
            )
        )

        current_text = previous_text + delta_text

        delta_message = mistral_tool_parser.extract_tool_calls_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
            request=None,  # type: ignore[arg-type]
        )
        # Only yield steps where the parser actually produced a delta.
        if delta_message:
            yield delta_message

        # Advance the incremental-detokenization state for the next token.
        previous_text = current_text
        previous_tokens = (
            previous_tokens + new_tokens if previous_tokens else new_tokens
        )
        prefix_offset = new_prefix_offset
        read_offset = new_read_offset
def test_extract_tool_calls_no_tools(mistral_pre_v11_tool_parser):
    """Plain text without a [TOOL_CALLS] marker must pass through untouched:
    no tool calls are reported and the full text is returned as content."""
    text = "This is a test"
    result = mistral_pre_v11_tool_parser.extract_tool_calls(
        text,
        request=None,  # type: ignore[arg-type]
    )
    assert not result.tools_called
    assert result.tool_calls == []
    assert result.content == text
@pytest.mark.parametrize(
    ids=[
        "single_tool_add",
        "single_tool_weather",
        "argument_before_name",
        "argument_before_name_and_name_in_argument",
    ],
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        (
            """[TOOL_CALLS][{"name": "add", "arguments":{"a": 3.5, "b": 4}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
                    )
                )
            ],
            None,
        ),
        (
            """[TOOL_CALLS] [{"name": "get_current_weather", "arguments":{"city": "San Francisco", "state": "CA", "unit": "celsius"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            None,
        ),
        # "arguments" appearing before "name" in the JSON object must still parse.
        (
            """[TOOL_CALLS] [{"arguments":{"city": "San Francisco", "state": "CA", "unit": "celsius"}, "name": "get_current_weather"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            None,
        ),
        # A "name" key inside the arguments must not be confused with the tool name.
        (
            """[TOOL_CALLS] [{"arguments":{"name": "John Doe"}, "name": "get_age"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_age",
                        arguments=json.dumps(
                            {
                                "name": "John Doe",
                            }
                        ),
                    )
                )
            ],
            None,
        ),
    ],
)
def test_extract_tool_calls_pre_v11_tokenizer(
    mistral_pre_v11_tool_parser, model_output, expected_tool_calls, expected_content
):
    """Non-streaming extraction for the pre-v11 tokenizer format, where tool
    calls are emitted as a JSON list after a single [TOOL_CALLS] marker."""
    extracted_tool_calls = mistral_pre_v11_tool_parser.extract_tool_calls(
        model_output, request=None
    )  # type: ignore[arg-type]
    assert extracted_tool_calls.tools_called
    assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
    assert extracted_tool_calls.content == expected_content
@pytest.mark.parametrize(
    ids=[
        "single_tool_add",
        "single_tool_weather",
        "multiple_tool_calls",
    ],
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        (
            """[TOOL_CALLS]add_this_and_that{"a": 3.5, "b": 4}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add_this_and_that",
                        arguments=json.dumps({"a": 3.5, "b": 4}),
                    )
                )
            ],
            None,
        ),
        (
            """[TOOL_CALLS]get_current_weather{"city": "San Francisco", "state": "CA", "unit": "celsius"}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            None,
        ),
        # Two back-to-back [TOOL_CALLS] markers yield two parallel tool calls.
        (
            """[TOOL_CALLS]add{"a": 3.5, "b": 4}[TOOL_CALLS]multiply{"a": 3, "b": 6}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
                    )
                ),
                ToolCall(
                    function=FunctionCall(
                        name="multiply", arguments=json.dumps({"a": 3, "b": 6})
                    )
                ),
            ],
            None,
        ),
    ],
)
def test_extract_tool_calls(
    mistral_tool_parser, model_output, expected_tool_calls, expected_content
):
    """Non-streaming extraction for the v11+ tokenizer format, where each tool
    call is `[TOOL_CALLS]<name><json-args>` (no surrounding JSON list)."""
    extracted_tool_calls = mistral_tool_parser.extract_tool_calls(
        model_output, request=None
    )  # type: ignore[arg-type]
    assert extracted_tool_calls.tools_called
    assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
    assert extracted_tool_calls.content == expected_content
def _test_extract_tool_calls_streaming(
    tool_parser, tokenizer, model_output, tools, expected_tool_calls, expected_content
):
    """Drive the streaming parser delta-by-delta, reassemble the streamed tool
    calls and free-text content, and compare both against the expected values."""
    accumulated_content: str = ""
    streamed_names: list[str] = []
    args_buffers: list[str] = []
    active_index: int = -1
    call_ids: list[str | None] = []

    for delta in stream_delta_message_generator(
        tool_parser, tokenizer, model_output, tools
    ):
        # The tool parser must never stream a role.
        assert not delta.role

        if delta.content:
            accumulated_content += delta.content

        tool_call_deltas = delta.tool_calls
        if not tool_call_deltas:
            continue

        # Exactly one tool-call diff per delta, even for parallel calls.
        assert len(tool_call_deltas) == 1
        tc = tool_call_deltas[0]

        assert len(tool_parser.prev_tool_call_arr) > 0

        if tc.index != active_index:
            # A new tool call has started; allocate its buffers.
            active_index = tc.index
            args_buffers.append("")
            call_ids.append(None)

        # A call id may be streamed at most once per tool call.
        if tc.id and not call_ids[tc.index]:
            call_ids[tc.index] = tc.id

        if tc.function:
            if tc.function.name:
                # Function names must arrive whole, exactly one time.
                assert isinstance(tc.function.name, str)
                streamed_names.append(tc.function.name)
            if tc.function.arguments:
                # Argument fragments are strings appended in order.
                assert isinstance(tc.function.arguments, str)
                args_buffers[tc.index] += tc.function.arguments

    assert accumulated_content == expected_content

    reassembled_calls = [
        ToolCall(
            id=call_id,
            function=FunctionCall(
                name=name,
                arguments=partial_json_parser.ensure_json(
                    args_str, Allow.OBJ | Allow.STR
                ),
            ),
        )
        for call_id, name, args_str in zip(call_ids, streamed_names, args_buffers)
    ]
    assert_tool_calls(reassembled_calls, expected_tool_calls)
@pytest.mark.parametrize(
    ids=[
        "no_tools",
        "single_tool_add",
        "single_tool_add_strings",
        "single_tool_weather",
        "argument_before_name",
        "argument_before_name_and_name_in_argument",
        "multiple_tools",
    ],
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        ("""This is a test""", [], """This is a test"""),
        # Extra whitespace inside the JSON must be tolerated.
        (
            """[TOOL_CALLS] [ {"name":"add" , "arguments" : {"a": 3, "b": 4} } ]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3, "b": 4})
                    )
                )
            ],
            "",
        ),
        (
            """[TOOL_CALLS] [{"name": "add", "arguments":{"a": "3", "b": "4"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": "3", "b": "4"})
                    )
                )
            ],
            "",
        ),
        (
            """[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"city": "San Francisco", "state": "CA", "unit": "celsius"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            "",
        ),
        # "arguments" appearing before "name" must still parse while streaming.
        (
            """[TOOL_CALLS] [{"arguments": {"city": "San Francisco", "state": "CA", "unit": "celsius"}, "name": "get_current_weather"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            "",
        ),
        # A "name" key inside the arguments must not be confused with the tool name.
        (
            """[TOOL_CALLS] [{"arguments": {"name": "John Doe"}, "name": "get_age"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_age",
                        arguments=json.dumps(
                            {
                                "name": "John Doe",
                            }
                        ),
                    )
                )
            ],
            "",
        ),
        (
            """[TOOL_CALLS] [{"name": "add", "arguments": {"a": 3.5, "b": 4}}, {"name": "get_current_weather", "arguments":{"city": "San Francisco", "state": "CA", "unit": "celsius"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
                    )
                ),
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                ),
            ],
            "",
        ),
    ],
)
def test_extract_tool_calls_streaming_pre_v11_tokenizer(
    mistral_pre_v11_tool_parser,
    mistral_pre_v11_tokenizer,
    model_output,
    expected_tool_calls,
    expected_content,
):
    """Streaming extraction for the pre-v11 format; the free-text model output
    is tokenized directly, so no (name, args) tool list is needed."""
    _test_extract_tool_calls_streaming(
        mistral_pre_v11_tool_parser,
        mistral_pre_v11_tokenizer,
        model_output,
        None,
        expected_tool_calls,
        expected_content,
    )
@pytest.mark.parametrize(
    ids=[
        "single_tool_add",
        "single_tool_add_strings",
        "multiple_tools",
    ],
    # Newer tokenizers cannot encode free text, so each case supplies the tool
    # calls as (name, json-args) pairs instead of a raw model_output string.
    argnames=["tools", "expected_tool_calls", "expected_content"],
    argvalues=[
        (
            [("add", '{"a": 3, "b": 4}')],
            # [TOOL_CALLS]add{"a": 3, "b": 4}
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3, "b": 4})
                    )
                )
            ],
            "",
        ),
        (
            [("add_two_strings", '{"a": "3", "b": "4"}')],
            # [TOOL_CALLS]add_two_strings{"a": "3", "b": "4"}
            [
                ToolCall(
                    function=FunctionCall(
                        name="add_two_strings",
                        arguments=json.dumps({"a": "3", "b": "4"}),
                    )
                )
            ],
            "",
        ),
        (
            [
                ("add", '{"a": 3.5, "b": 4}'),
                (
                    "get_current_weather",
                    '{"city": "San Francisco", "state": "CA", "unit": "celsius"}',  # noqa: E501
                ),
            ],
            # [TOOL_CALLS]add{"a": 3.5, "b": 4}[TOOL_CALLS]get_current_weather{"city": "San Francisco", "state": "CA", "unit": "celsius"} # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
                    )
                ),
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                ),
            ],
            "",
        ),
    ],
)
def test_extract_tool_calls_streaming(
    mistral_tool_parser,
    mistral_tokenizer,
    tools,
    expected_tool_calls,
    expected_content,
):
    """Streaming extraction for the v11+ format; model_output is None because
    the tokens are produced from the structured tool list instead."""
    _test_extract_tool_calls_streaming(
        mistral_tool_parser,
        mistral_tokenizer,
        None,
        tools,
        expected_tool_calls,
        expected_content,
    )
@pytest.mark.parametrize(
    ids=[
        "single_tool_add",
        "single_tool_weather",
        "multiple_tool_calls",
        "content_before_tool",
    ],
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        (
            """[TOOL_CALLS]add_this_and_that{"a": 3.5, "b": 4}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add_this_and_that",
                        arguments=json.dumps({"a": 3.5, "b": 4}),
                    )
                )
            ],
            "",
        ),
        (
            """[TOOL_CALLS]get_current_weather{"city": "San Francisco", "state": "CA", "unit": "celsius"}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            "",
        ),
        (
            """[TOOL_CALLS]add{"a": 3.5, "b": 4}[TOOL_CALLS]multiply{"a": 3, "b": 6}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
                    )
                ),
                ToolCall(
                    function=FunctionCall(
                        name="multiply", arguments=json.dumps({"a": 3, "b": 6})
                    )
                ),
            ],
            "",
        ),
        (
            # Additional content should not be after the tool calls
            """bla[TOOL_CALLS]add_this_and_that{"a": 3.5, "b": 4}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add_this_and_that",
                        arguments=json.dumps({"a": 3.5, "b": 4}),
                    )
                )
            ],
            "bla",
        ),
    ],
)
def test_extract_tool_calls_streaming_one_chunk(
    mistral_tool_parser,
    mistral_tokenizer,
    model_output,
    expected_tool_calls,
    expected_content,
):
    """Streaming extraction where the whole model output arrives as a single
    delta: the parser must emit every tool call (and any leading content) in
    one DeltaMessage."""
    if isinstance(mistral_tokenizer, MistralTokenizer):
        all_token_ids = mistral_tokenizer.encode(model_output)
    else:
        # HF tokenizers may split the [TOOL_CALLS] marker into several tokens;
        # repair the sequence so it matches the special-token encoding.
        all_token_ids = mistral_tokenizer.encode(model_output, add_special_tokens=False)
        all_token_ids = fix_tool_call_tokenization(
            all_token_ids, mistral_tool_parser, mistral_tokenizer
        )

    # Feed the entire output as one delta (previous state is empty).
    delta_message = mistral_tool_parser.extract_tool_calls_streaming(
        previous_text="",
        current_text=model_output,
        delta_text=model_output,
        previous_token_ids=[],
        current_token_ids=all_token_ids,
        delta_token_ids=all_token_ids,
        request=None,
    )  # type: ignore[arg-type]

    assert isinstance(delta_message, DeltaMessage)
    assert len(delta_message.tool_calls) == len(expected_tool_calls)
    assert_tool_calls(delta_message.tool_calls, expected_tool_calls)
    # No content in the delta is equivalent to expecting the empty string.
    if delta_message.content is None:
        assert expected_content == ""
    else:
        assert delta_message.content == expected_content
@pytest.mark.parametrize(
    ids=[
        "no_tools",
        "single_tool_add",
        "single_tool_add_strings",
        "single_tool_weather",
        "argument_before_name",
        "argument_before_name_and_name_in_argument",
        "multiple_tools",
    ],
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        ("""This is a test""", [], """This is a test"""),
        # Extra whitespace inside the JSON must be tolerated.
        (
            """[TOOL_CALLS] [ {"name":"add" , "arguments" : {"a": 3, "b": 4} } ]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3, "b": 4})
                    )
                )
            ],
            "",
        ),
        (
            """[TOOL_CALLS] [{"name": "add", "arguments":{"a": "3", "b": "4"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": "3", "b": "4"})
                    )
                )
            ],
            "",
        ),
        (
            """[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"city": "San Francisco", "state": "CA", "unit": "celsius"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            "",
        ),
        # "arguments" appearing before "name" must still parse.
        (
            """[TOOL_CALLS] [{"arguments": {"city": "San Francisco", "state": "CA", "unit": "celsius"}, "name": "get_current_weather"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            "",
        ),
        # A "name" key inside the arguments must not be confused with the tool name.
        (
            """[TOOL_CALLS] [{"arguments": {"name": "John Doe"}, "name": "get_age"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_age",
                        arguments=json.dumps(
                            {
                                "name": "John Doe",
                            }
                        ),
                    )
                )
            ],
            "",
        ),
        (
            """[TOOL_CALLS] [{"arguments": {"a": 3.5, "b": 4}, "name": "add"}, {"arguments":{"city": "San Francisco", "state": "CA", "unit": "celsius"}, "name": "get_current_weather"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
                    )
                ),
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                ),
            ],
            "",
        ),
    ],
)
def test_extract_tool_calls_streaming_pre_v11_tokenizer_one_chunk(
    mistral_pre_v11_tool_parser,
    mistral_pre_v11_tokenizer,
    model_output,
    expected_tool_calls,
    expected_content,
):
    """Streaming extraction for the pre-v11 format where the whole model output
    arrives as a single delta; every tool call must be emitted in one
    DeltaMessage."""
    if isinstance(mistral_pre_v11_tokenizer, MistralTokenizer):
        all_token_ids = mistral_pre_v11_tokenizer.encode(model_output)
    else:
        # HF tokenizers may split the [TOOL_CALLS] marker into several tokens;
        # repair the sequence so it matches the special-token encoding.
        all_token_ids = mistral_pre_v11_tokenizer.encode(
            model_output, add_special_tokens=False
        )
        all_token_ids = fix_tool_call_tokenization(
            all_token_ids, mistral_pre_v11_tool_parser, mistral_pre_v11_tokenizer
        )

    # Feed the entire output as one delta (previous state is empty).
    delta_message = mistral_pre_v11_tool_parser.extract_tool_calls_streaming(
        previous_text="",
        current_text=model_output,
        delta_text=model_output,
        previous_token_ids=[],
        current_token_ids=all_token_ids,
        delta_token_ids=all_token_ids,
        request=None,
    )  # type: ignore[arg-type]

    assert isinstance(delta_message, DeltaMessage)
    assert len(delta_message.tool_calls) == len(expected_tool_calls)
    assert_tool_calls(delta_message.tool_calls, expected_tool_calls)
    # No content in the delta is equivalent to expecting the empty string.
    if delta_message.content is None:
        assert expected_content == ""
    else:
        assert delta_message.content == expected_content

View File

@ -123,7 +123,7 @@ CONFIGS: dict[str, ServerConfig] = {
"supports_parallel": True,
"extended": True,
},
"mistral": {
"mistral-7b": {
"model": "mistralai/Mistral-7B-Instruct-v0.3",
"arguments": [
"--enforce-eager",
@ -145,6 +145,32 @@ CONFIGS: dict[str, ServerConfig] = {
"call the tool. Otherwise, answer the user's query directly "
"without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
"to the user's question - just respond to it normally.",
"supports_parallel": True,
},
"mistral-small-3.2": {
"model": "mistralai/Mistral-Small-3.2-24B-Instruct-2506",
"arguments": [
"--enforce-eager",
"--no-enable-prefix-caching",
"--tool-call-parser",
"mistral",
"--tokenizer-mode",
"mistral",
"--config-format",
"mistral",
"--load-format",
"mistral",
"--tensor-parallel-size",
"4",
'--ignore-patterns="consolidated.safetensors"',
],
"system_prompt": "You are a helpful assistant with access to tools. If a tool"
" that you have would be helpful to answer a user query, "
"call the tool. Otherwise, answer the user's query directly "
"without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
"to the user's question - just respond to it normally.",
"supports_parallel": True,
"extended": True,
},
# FIXME: This test currently fails, need to debug why.
# "granite20b": {

View File

@ -11,7 +11,9 @@ PROMPTS = [
]
def test_reset_prefix_cache_e2e():
def test_reset_prefix_cache_e2e(monkeypatch):
# "spawn" is required for test to be deterministic
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
engine_args = EngineArgs(
model="Qwen/Qwen3-0.6B",
gpu_memory_utilization=0.2,

View File

@ -9,6 +9,7 @@ from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.usage.usage_lib import UsageContext
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.hashing import _xxhash
def test_prefix_caching_from_cli():
@ -48,6 +49,21 @@ def test_prefix_caching_from_cli():
args = parser.parse_args(["--prefix-caching-hash-algo", "invalid"])
@pytest.mark.skipif(_xxhash is None, reason="xxhash not installed")
def test_prefix_caching_xxhash_from_cli():
    """Both xxhash-based prefix-caching hash algorithms must round-trip from
    the CLI flag into the engine's cache config."""
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    for algo in ("xxhash", "xxhash_cbor"):
        args = parser.parse_args(["--prefix-caching-hash-algo", algo])
        vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
        assert vllm_config.cache_config.prefix_caching_hash_algo == algo
def test_defaults_with_usage_context():
engine_args = EngineArgs(model="facebook/opt-125m")
vllm_config: VllmConfig = engine_args.create_engine_config(UsageContext.LLM_CLASS)

View File

@ -1,8 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import io
import json
import openai # use the official client for correctness check
@ -13,6 +11,7 @@ from transformers import AutoConfig
from tests.conftest import ImageTestAssets
from tests.utils import RemoteOpenAIServer
from vllm.utils.serial_utils import tensor2base64
# any model with a chat template should work here
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
@ -50,18 +49,6 @@ async def client_with_image_embeds(server_with_image_embeds):
yield async_client
def encode_image_embedding_to_base64(image_embedding) -> str:
"""
Encode image embedding to base64 string
"""
buffer = io.BytesIO()
torch.save(image_embedding, buffer)
buffer.seek(0)
binary_data = buffer.read()
base64_image_embedding = base64.b64encode(binary_data).decode("utf-8")
return base64_image_embedding
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("dtype", [torch.half, torch.float16, torch.float32])
@ -73,7 +60,7 @@ async def test_completions_with_image_embeds(
):
# Test case: Single image embeds input
image_embeds = image_assets[0].image_embeds.to(dtype=dtype)
base64_image_embedding = encode_image_embedding_to_base64(image_embeds)
base64_image_embedding = tensor2base64(image_embeds)
chat_completion = await client_with_image_embeds.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},

View File

@ -3,12 +3,14 @@
from dataclasses import asdict
from typing import NamedTuple
import pytest
from PIL import Image
from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.config import KVTransferConfig
from vllm.multimodal.utils import encode_image_base64
from vllm.platforms import current_platform
MODEL_NAME = "RedHatAI/Qwen2.5-VL-3B-Instruct-quantized.w8a8"
@ -108,6 +110,13 @@ def process_prompt(processor, llm: LLM, question: str, image_urls: list[Image]):
print("-" * 50)
@pytest.mark.skipif(
current_platform.is_rocm(),
reason=(
"hipErrorLaunchFailure when running this test, see issue:"
"https://github.com/ROCm/pytorch/issues/2822"
),
)
def test_shared_storage_connector_hashes(tmp_path):
"""
Tests that SharedStorageConnector saves KV to the storage locations

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import sys
from typing import Any
import pytest
@ -10,7 +9,6 @@ from tests.utils import create_new_process_for_each_test
from tests.v1.logits_processors.utils import (
DUMMY_LOGITPROC_ARG,
DUMMY_LOGITPROC_FQCN,
DUMMY_LOGITPROC_MODULE,
MAX_TOKENS,
MODEL_NAME,
POOLING_MODEL_NAME,
@ -18,7 +16,6 @@ from tests.v1.logits_processors.utils import (
CustomLogitprocSource,
DummyLogitsProcessor,
WrappedPerReqLogitsProcessor,
dummy_module,
prompts,
)
from tests.v1.logits_processors.utils import entry_points as fake_entry_points
@ -162,8 +159,6 @@ def test_custom_logitsprocs(monkeypatch, logitproc_source: CustomLogitprocSource
kwargs: dict[str, list[str | type[LogitsProcessor]]] = {}
if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN:
# Scenario: load logitproc based on fully-qualified class name (FQCN)
# Inject dummy module which defines logitproc
sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module
kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN]
elif logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_CLASS:
# Scenario: load logitproc from provided class object

View File

@ -14,11 +14,9 @@ from tests.utils import RemoteOpenAIServerCustom, create_new_process_for_each_te
from tests.v1.logits_processors.utils import (
DUMMY_LOGITPROC_ARG,
DUMMY_LOGITPROC_FQCN,
DUMMY_LOGITPROC_MODULE,
MAX_TOKENS,
MODEL_NAME,
TEMP_GREEDY,
dummy_module,
prompts,
)
from tests.v1.logits_processors.utils import entry_points as fake_entry_points
@ -47,20 +45,14 @@ def _server_with_logitproc_entrypoint(
main.main()
def _server_with_logitproc_module(
def _server_with_logitproc_fqcn(
env_dict: dict[str, str] | None,
model: str,
vllm_serve_args: list[str],
) -> None:
"""Start vLLM server, inject module with dummy logitproc"""
# Patch `modules` to inject dummy logitproc module
from vllm.entrypoints.cli import main
sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module
# fork is required for workers to see entrypoint patch
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "fork"
if env_dict is not None:
os.environ.update(env_dict)
@ -99,7 +91,7 @@ def server(default_server_args, request, monkeypatch):
if request.param:
# Launch server, append FQCN argument, inject dummy logitproc module
args = default_server_args + request.param
_server_fxn = _server_with_logitproc_module
_server_fxn = _server_with_logitproc_fqcn
else:
# Launch server, inject dummy logitproc entrypoint
args = default_server_args

View File

@ -27,7 +27,7 @@ DUMMY_LOGITPROC_ARG = "target_token"
TEMP_GREEDY = 0.0
MAX_TOKENS = 20
DUMMY_LOGITPROC_ENTRYPOINT = "dummy_logitproc"
DUMMY_LOGITPROC_MODULE = "DummyModule"
DUMMY_LOGITPROC_MODULE = "tests.v1.logits_processors.utils"
DUMMY_LOGITPROC_FQCN = f"{DUMMY_LOGITPROC_MODULE}:DummyLogitsProcessor"

View File

@ -5,6 +5,7 @@ import torch
from vllm.config import SpeculativeConfig
from vllm.model_executor.models.interfaces import supports_eagle3
from vllm.platforms import current_platform
@pytest.mark.parametrize(
@ -21,6 +22,10 @@ from vllm.model_executor.models.interfaces import supports_eagle3
pytest.param(
"nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized-w4a16",
id="qwen3-eagle3-speculator-w4a16-verifier",
marks=pytest.mark.skipif(
current_platform.is_rocm(),
reason="The tests are skipped on rocm platform.",
),
),
],
)

View File

@ -761,6 +761,10 @@ def test_init_kv_cache_with_kv_sharing_valid():
assert kv_cache_config_after_init.kv_cache_groups[0].layer_names[1] == layer_1
@pytest.mark.skipif(
current_platform.is_rocm(),
reason="Attention backend FLASHINFER is not supported on ROCm.",
)
def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
"""
The GPU model runner creates different views into the

View File

@ -283,6 +283,28 @@ def _rocm_aiter_grouped_topk_fake(
pass
# Cache whether aiter supports FP8 MLA parameters
_AITER_MLA_SUPPORTS_FP8: bool | None = None


def _check_aiter_mla_fp8_support() -> bool:
    """Check if aiter.mla.mla_decode_fwd supports q_scale and kv_scale parameters."""
    global _AITER_MLA_SUPPORTS_FP8

    # Return the memoized answer if the probe already ran.
    if _AITER_MLA_SUPPORTS_FP8 is not None:
        return _AITER_MLA_SUPPORTS_FP8

    try:
        import inspect

        from aiter.mla import mla_decode_fwd

        # Probe the signature once; any failure (missing aiter, signature
        # introspection error) is treated as "not supported".
        params = inspect.signature(mla_decode_fwd).parameters
        _AITER_MLA_SUPPORTS_FP8 = "q_scale" in params and "kv_scale" in params
    except Exception:
        _AITER_MLA_SUPPORTS_FP8 = False
    return _AITER_MLA_SUPPORTS_FP8
def _rocm_aiter_mla_decode_fwd_impl(
q: torch.Tensor,
kv_buffer: torch.Tensor,
@ -299,6 +321,16 @@ def _rocm_aiter_mla_decode_fwd_impl(
) -> None:
from aiter.mla import mla_decode_fwd
kwargs = {
"sm_scale": sm_scale,
"logit_cap": logit_cap,
}
# Only pass q_scale and kv_scale if the aiter library supports them
if _check_aiter_mla_fp8_support():
kwargs["q_scale"] = q_scale
kwargs["kv_scale"] = kv_scale
mla_decode_fwd(
q,
kv_buffer.view(-1, 1, 1, q.shape[-1]),
@ -308,10 +340,7 @@ def _rocm_aiter_mla_decode_fwd_impl(
kv_indices,
kv_last_page_lens,
max_seqlen_qo,
sm_scale=sm_scale,
logit_cap=logit_cap,
q_scale=q_scale,
kv_scale=kv_scale,
**kwargs,
)

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses
from collections import Counter
from collections.abc import Callable
from contextlib import ExitStack
from typing import Any
@ -22,6 +23,99 @@ from vllm.utils.torch_utils import weak_ref_tensors
logger = init_logger(__name__)
@dataclasses.dataclass(frozen=True)
class CUDAGraphStat:
    """One observation of a model step's cudagraph dispatch.

    Frozen so instances are hashable and can be counted with Counter.
    """

    # Number of tokens in the batch before padding.
    num_unpadded_tokens: int
    # Number of tokens after padding to a capture size.
    num_padded_tokens: int
    # How many padding tokens were added.
    num_paddings: int
    # String name of the runtime cudagraph dispatch mode used.
    runtime_mode: str
class CUDAGraphLogging:
    """Aggregate and log cudagraph metrics.

    Collects CUDAGraphStat observations and renders them as a markdown-style
    table (one row per distinct stat, with its observed count) prefixed by the
    static cudagraph config settings.
    """

    COLUMN_HEADERS = [
        "Unpadded Tokens",
        "Padded Tokens",
        "Num Paddings",
        "Runtime Mode",
        "Count",
    ]

    def __init__(self, cg_mode: CUDAGraphMode, cg_capture_sizes: list[int] | None):
        self.reset()
        # Stringify the config once; the header is constant for the lifetime
        # of this logger.
        self.cg_mode = str(cg_mode)
        self.cg_capture_sizes = str(cg_capture_sizes or [])
        self.settings_header = (
            "**CUDAGraph Config Settings:**\n\n"
            f"- Mode: {self.cg_mode}\n"
            f"- Capture sizes: {self.cg_capture_sizes}\n\n"
            "**CUDAGraph Stats:**\n\n"
        )

    def reset(self):
        # Drop all accumulated observations.
        self.stats = []

    def observe(self, cudagraph_stat: CUDAGraphStat):
        # Record one dispatch observation; duplicates are aggregated at
        # render time via Counter.
        self.stats.append(cudagraph_stat)

    def generate_metric_table(self) -> str:
        """Render the accumulated stats as a settings header plus a
        column-aligned markdown table, most frequent rows first."""
        stats_counts = Counter(self.stats)

        # Convert stats to rows of strings, in descending order of observed frequencies
        rows = []
        for stat, count in sorted(
            stats_counts.items(), key=lambda item: item[1], reverse=True
        ):
            rows.append(
                [
                    str(stat.num_unpadded_tokens),
                    str(stat.num_padded_tokens),
                    str(stat.num_paddings),
                    stat.runtime_mode,
                    str(count),
                ]
            )

        # Calculate column widths (max of header and data)
        col_widths = []
        for i, header_text in enumerate(self.COLUMN_HEADERS):
            max_width = len(header_text)
            for row in rows:
                max_width = max(max_width, len(row[i]))
            col_widths.append(max_width)

        table_header_list = [
            h.ljust(w) for h, w in zip(self.COLUMN_HEADERS, col_widths)
        ]
        table_header = "| " + " | ".join(table_header_list) + " |\n"
        table_separator = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|\n"

        # Create data rows with proper alignment
        data_rows = []
        for row in rows:
            formatted_row = [
                str(val).ljust(width) for val, width in zip(row, col_widths)
            ]
            data_rows.append("| " + " | ".join(formatted_row) + " |")

        return (
            self.settings_header
            + table_header
            + table_separator
            + "\n".join(data_rows)
            + "\n"
        )

    def log(self, log_fn=logger.info):
        # Emit the table (if anything was observed) and start a fresh window.
        if not self.stats:
            return
        log_fn(self.generate_metric_table())
        self.reset()
@dataclasses.dataclass
class CUDAGraphEntry:
batch_descriptor: BatchDescriptor

View File

@ -104,7 +104,8 @@ class FixFunctionalizationPass(VllmInductorPass):
mutated_args = {1: "result"}
self.defunctionalize(graph, node, mutated_args)
elif (
at_target
hasattr(torch.ops.vllm, "flashinfer_trtllm_fused_allreduce_norm")
and at_target
== torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default
):
mutated_args = {

View File

@ -30,7 +30,7 @@ CacheDType = Literal[
"fp8_ds_mla",
]
MambaDType = Literal["auto", "float32"]
PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor"]
PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor", "xxhash", "xxhash_cbor"]
KVOffloadingBackend = Literal["native", "lmcache"]
@ -77,9 +77,21 @@ class CacheConfig:
"""Whether to enable prefix caching."""
prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
"""Set the hash algorithm for prefix caching:\n
- "sha256" uses Pickle for object serialization before hashing.\n
- "sha256" uses Pickle for object serialization before hashing. This is the
current default, as SHA256 is the most secure choice to avoid potential
hash collisions.\n
- "sha256_cbor" provides a reproducible, cross-language compatible hash. It
serializes objects using canonical CBOR and hashes them with SHA-256."""
serializes objects using canonical CBOR and hashes them with SHA-256.\n
- "xxhash" uses Pickle serialization with xxHash (128-bit) for faster,
non-cryptographic hashing. Requires the optional ``xxhash`` package.
IMPORTANT: Use of a hashing algorithm that is not considered
cryptographically secure theoretically increases the risk of hash collisions,
which can cause undefined behavior or even leak private information in
multi-tenant environments. Even if collisions are still very unlikely, it is
important to consider your security risk tolerance against the performance
benefits before turning this on.\n
- "xxhash_cbor" combines canonical CBOR serialization with xxHash for
reproducible hashing. Requires the optional ``xxhash`` package."""
cpu_offload_gb: float = Field(default=0, ge=0)
"""The space in GiB to offload to CPU, per GPU. Default is 0, which means
no offloading. Intuitively, this argument can be seen as a virtual way to

View File

@ -4,7 +4,7 @@
import enum
from collections import Counter
from collections.abc import Callable
from dataclasses import asdict, field
from dataclasses import field
from pathlib import Path
from typing import TYPE_CHECKING, Any, ClassVar, Literal
@ -13,7 +13,7 @@ from pydantic.dataclasses import dataclass
import vllm.envs as envs
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
from vllm.config.utils import config, handle_deprecated
from vllm.config.utils import config, get_hash_factors, handle_deprecated, hash_factors
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils.import_utils import resolve_obj_by_qualname
@ -196,7 +196,16 @@ class PassConfig:
Any new fields that affect compilation should be added to the hash.
Any future fields that don't affect compilation should be excluded.
"""
return InductorPass.hash_dict(asdict(self))
ignored_fields = [
"enable_fusion",
"enable_attn_fusion",
"enable_noop",
"enable_sequence_parallelism",
"enable_async_tp",
"enable_fi_allreduce_fusion",
]
return hash_factors(get_hash_factors(self, ignored_factors=ignored_fields))
@field_validator(
"fuse_norm_quant",
@ -267,14 +276,6 @@ class PassConfig:
"v0.13.0 or v1.0.0, whichever is sooner",
)
# Force old flags to None to ensure they are not used
self.enable_fusion = None
self.enable_attn_fusion = None
self.enable_noop = None
self.enable_sequence_parallelism = None
self.enable_async_tp = None
self.enable_fi_allreduce_fusion = None
if not self.eliminate_noops:
if self.fuse_norm_quant or self.fuse_act_quant:
logger.warning_once(

View File

@ -84,7 +84,7 @@ TaskOption = Literal[
"transcription",
"draft",
]
TokenizerMode = Literal["auto", "hf", "slow", "mistral"]
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
LogprobsMode = Literal[
"raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
@ -141,6 +141,7 @@ class ModelConfig:
- "hf" will use the fast tokenizer if available.\n
- "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
- Other custom values can be supported via plugins."""
trust_remote_code: bool = False
"""Trust remote code (e.g., from HuggingFace) when downloading the model
@ -1779,20 +1780,22 @@ class ModelConfig:
return False
elif attn_type == "decoder":
pooling_type = self.pooler_config.pooling_type.lower()
if pooling_type in ["all", "mean", "step", "cls"]:
if pooling_type in ["mean", "step", "cls"]:
logger.debug(
"Pooling models with %s pooling does not "
"support chunked prefill.",
pooling_type,
)
return False
else:
# pooling_type == "last"
elif pooling_type in ["all", "last"]:
logger.debug(
"Pooling models with causal attn and last pooling support "
"chunked prefill."
"Pooling models with causal attn and %s pooling support "
"chunked prefill.",
pooling_type,
)
return True
else:
raise ValueError(f"{pooling_type=} not supported.")
# vllm currently does not have pooling models using hybrid,
# attention_free or encoder_decoder attn types.
return attn_type != "encoder_decoder"
@ -1816,20 +1819,22 @@ class ModelConfig:
return False
elif attn_type == "decoder":
pooling_type = self.pooler_config.pooling_type.lower()
if pooling_type in ["all", "mean", "step", "cls"]:
if pooling_type in ["mean", "step", "cls"]:
logger.debug(
"Pooling models with %s pooling does not "
"support prefix caching.",
pooling_type,
)
return False
else:
# pooling_type == "last"
elif pooling_type in ["all", "last"]:
logger.debug(
"Pooling models with causal attn and last pooling support "
"prefix caching."
"Pooling models with causal attn and %s pooling support "
"prefix caching.",
pooling_type,
)
return True
else:
raise ValueError(f"{pooling_type=} not supported.")
# vllm currently does not have pooling models using hybrid,
# attention_free or encoder_decoder attn types.
return False

View File

@ -55,6 +55,10 @@ class ObservabilityConfig:
kv_cache_metrics_sample: float = Field(default=0.01, gt=0, le=1)
"""Sampling rate for KV cache metrics (0.0, 1.0]. Default 0.01 = 1% of blocks."""
cudagraph_metrics: bool = False
"""Enable CUDA graph metrics (number of padded/unpadded tokens, runtime cudagraph
dispatch modes, and their observed frequencies at every logging interval)."""
@cached_property
def collect_model_forward_time(self) -> bool:
"""Whether to collect model forward time for the request."""

View File

@ -593,10 +593,14 @@ class ParallelConfig:
"max_parallel_loading_workers is currently "
"not supported and will be ignored."
)
if self.distributed_executor_backend not in ("mp", "uni") and self.nnodes > 1:
allowed_backends = ("mp", "uni", "external_launcher")
if (
self.distributed_executor_backend not in allowed_backends
and self.nnodes > 1
):
raise ValueError(
"nnodes > 1 can only be set when distributed executor "
"backend is mp or uni."
"backend is mp, uni or external_launcher."
)
@property

Some files were not shown because too many files have changed in this diff Show More