mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-24 06:03:40 +08:00
Merge branch 'main' into mlm-full-lora-support
This commit is contained in:
commit
c94cdf1c50
@ -1,46 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
# HTML index listing the x86_64 and aarch64 wheels side by side. The
# *_html_escaped fields carry the URL-escaped file names used in the hrefs.
template = """<!DOCTYPE html>
<html>
<body>
<h1>Links for vLLM</h1>
<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
</body>
</html>
"""

parser = argparse.ArgumentParser()
parser.add_argument("--wheel", help="The wheel path.", required=True)
args = parser.parse_args()

filename = os.path.basename(args.wheel)

# Work out both wheel names *before* opening index.html: opening with "w"
# truncates the file, so an unsupported wheel must fail first.
# sync the abi tag with .buildkite/scripts/upload-wheels.sh
if "x86_64" in filename:
    x86_wheel = filename
    arm_wheel = filename.replace("x86_64", "aarch64").replace(
        "manylinux1", "manylinux2014"
    )
elif "aarch64" in filename:
    x86_wheel = filename.replace("aarch64", "x86_64").replace(
        "manylinux2014", "manylinux1"
    )
    arm_wheel = filename
else:
    raise ValueError(f"Unsupported wheel: {filename}")

with open("index.html", "w") as f:
    # cloudfront requires escaping the '+' character
    f.write(
        template.format(
            x86_wheel=x86_wheel,
            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
            arm_wheel=arm_wheel,
            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
        )
    )

# Report success only once the file has actually been written.
print(f"Generated index.html for {args.wheel}")
|
||||
@ -7,13 +7,14 @@
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from urllib.parse import quote
|
||||
|
||||
import regex as re
|
||||
|
||||
if not sys.version_info >= (3, 12):
|
||||
raise RuntimeError("This script requires Python 3.12 or higher.")
|
||||
|
||||
|
||||
@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}
|
||||
|
||||
# Define environments
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV SOC_VERSION="ascend910b1"
|
||||
|
||||
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
|
||||
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
|
||||
|
||||
@ -0,0 +1,73 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euxo pipefail
|
||||
|
||||
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
|
||||
THRESHOLD=${1:-0.25}
|
||||
NUM_Q=${2:-1319}
|
||||
PORT=${3:-8030}
|
||||
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
|
||||
mkdir -p "${OUT_DIR}"
|
||||
|
||||
# Poll the /health endpoint on 127.0.0.1:<port> once per second until curl
# succeeds. The whole wait is wrapped in `timeout 600`, so the function fails
# if the endpoint has not answered within 600 seconds.
#   $1 - port to poll
wait_for_server() {
  local port=$1
  local health_url="http://127.0.0.1:${port}/health"
  # The URL is handed to the child shell as $1 instead of being spliced into
  # the script text.
  timeout 600 bash -c '
    until curl -sf "$1" > /dev/null; do
      sleep 1
    done' _ "${health_url}"
}
|
||||
|
||||
MODEL="deepseek-ai/DeepSeek-V2-lite"
|
||||
|
||||
# Set BACKENDS based on platform
|
||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
||||
# ROCm platform
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
fi
|
||||
|
||||
# Stop the background server recorded in SERVER_PID, if it is still running:
# send the default kill signal, wait up to 20 x 0.5s for it to exit, then
# send kill -9 unconditionally (errors ignored).
cleanup() {
  local pid="${SERVER_PID:-}"

  # Nothing to stop when no PID is recorded or the process is already gone.
  if [[ -z "${pid}" ]] || ! kill -0 "${pid}" 2>/dev/null; then
    return 0
  fi

  kill "${pid}" 2>/dev/null || true

  local attempt
  for attempt in {1..20}; do
    if ! kill -0 "${pid}" 2>/dev/null; then
      break
    fi
    sleep 0.5
  done

  kill -9 "${pid}" 2>/dev/null || true
}
|
||||
trap cleanup EXIT
|
||||
|
||||
# For every all2all backend: start a server in the background, wait for it to
# become healthy, run the GSM8K eval against it, and fail the script when the
# reported accuracy is below THRESHOLD. Each backend gets its own port.
for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --port "$PORT" &
  SERVER_PID=$!
  wait_for_server "$PORT"

  # Build a filename-safe tag from the model name. The set is spelled without
  # any backslash escape: inside single quotes tr reads '\\n' as a backslash
  # followed by a literal "n", which would also rewrite every "n" in the name.
  TAG=$(printf '%s' "$MODEL" | tr '/: ' '___')
  OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
  # The heredoc is unquoted on purpose so the shell substitutes OUT, MODEL,
  # BACK and THRESHOLD into the Python snippet.
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  # Clear the PID so the EXIT trap does not act on an already-stopped server.
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
|
||||
@ -50,6 +50,7 @@ for BACK in "${BACKENDS[@]}"; do
|
||||
--data-parallel-size 2 \
|
||||
--enable-expert-parallel \
|
||||
--enable-eplb \
|
||||
--eplb-config '{"window_size":200,"step_interval":600}' \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--port $PORT &
|
||||
|
||||
@ -0,0 +1,74 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euxo pipefail
|
||||
|
||||
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
|
||||
THRESHOLD=${1:-0.25}
|
||||
NUM_Q=${2:-1319}
|
||||
PORT=${3:-8040}
|
||||
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
|
||||
mkdir -p "${OUT_DIR}"
|
||||
|
||||
# Poll the /health endpoint on 127.0.0.1:<port> once per second until curl
# succeeds. The whole wait is wrapped in `timeout 600`, so the function fails
# if the endpoint has not answered within 600 seconds.
#   $1 - port to poll
wait_for_server() {
  local port=$1
  local health_url="http://127.0.0.1:${port}/health"
  # The URL is handed to the child shell as $1 instead of being spliced into
  # the script text.
  timeout 600 bash -c '
    until curl -sf "$1" > /dev/null; do
      sleep 1
    done' _ "${health_url}"
}
|
||||
|
||||
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
|
||||
|
||||
# Set BACKENDS based on platform
|
||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
||||
# ROCm platform
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
fi
|
||||
|
||||
# Stop the background server recorded in SERVER_PID, if it is still running:
# send the default kill signal, wait up to 20 x 0.5s for it to exit, then
# send kill -9 unconditionally (errors ignored).
cleanup() {
  local pid="${SERVER_PID:-}"

  # Nothing to stop when no PID is recorded or the process is already gone.
  if [[ -z "${pid}" ]] || ! kill -0 "${pid}" 2>/dev/null; then
    return 0
  fi

  kill "${pid}" 2>/dev/null || true

  local attempt
  for attempt in {1..20}; do
    if ! kill -0 "${pid}" 2>/dev/null; then
      break
    fi
    sleep 0.5
  done

  kill -9 "${pid}" 2>/dev/null || true
}
|
||||
trap cleanup EXIT
|
||||
|
||||
# For every all2all backend: start a server in the background, wait for it to
# become healthy, run the GSM8K eval against it, and fail the script when the
# reported accuracy is below THRESHOLD. Each backend gets its own port.
for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 4 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
    --port "$PORT" &
  SERVER_PID=$!
  wait_for_server "$PORT"

  # Build a filename-safe tag from the model name. The set is spelled without
  # any backslash escape: inside single quotes tr reads '\\n' as a backslash
  # followed by a literal "n", which would turn "Qwen" into "Qwe_".
  TAG=$(printf '%s' "$MODEL" | tr '/: ' '___')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
  # The heredoc is unquoted on purpose so the shell substitutes OUT, MODEL,
  # BACK and THRESHOLD into the Python snippet.
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  # Clear the PID so the EXIT trap does not act on an already-stopped server.
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
|
||||
@ -81,7 +81,7 @@ else
|
||||
alias_arg=""
|
||||
fi
|
||||
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
|
||||
# pip has to be invoked as a module (`-m pip`); `$PYTHON pip ...` would try to
# execute a file named "pip". The generator is also run through $PYTHON so it
# uses the same interpreter that just received the `regex` package.
$PYTHON -m pip install regex && $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
|
||||
|
||||
# copy indices to /<commit>/ unconditionally
|
||||
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
||||
|
||||
@ -987,7 +987,8 @@ steps:
|
||||
commands:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 1
|
||||
- label: Multi-Modal Models Test (Extended) 1 # 60min
|
||||
timeout_in_minutes: 120
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
@ -1011,7 +1012,8 @@ steps:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 3
|
||||
- label: Multi-Modal Models Test (Extended) 3 # 75min
|
||||
timeout_in_minutes: 150
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
|
||||
@ -387,6 +387,7 @@ steps:
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
source_file_dependencies:
|
||||
- vllm/entrypoints
|
||||
- vllm/multimodal
|
||||
- examples/
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
@ -1373,4 +1374,22 @@ steps:
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||
|
||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||
|
||||
@ -137,6 +137,7 @@ Compute Resources:
|
||||
- Alibaba Cloud
|
||||
- AMD
|
||||
- Anyscale
|
||||
- Arm
|
||||
- AWS
|
||||
- Crusoe Cloud
|
||||
- Databricks
|
||||
|
||||
120
benchmarks/benchmark_hash.py
Normal file
120
benchmarks/benchmark_hash.py
Normal file
@ -0,0 +1,120 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
|
||||
|
||||
This focuses on a single test payload shaped like the prefix-cache hash input:
|
||||
(32-byte bytes object, 32-int tuple)
|
||||
|
||||
Usage:
|
||||
python benchmarks/benchmark_hash.py --iterations 20000
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import random
|
||||
import statistics
|
||||
import time
|
||||
from collections.abc import Callable, Iterable
|
||||
|
||||
from vllm.utils.hashing import sha256, xxhash
|
||||
|
||||
|
||||
def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
|
||||
"""Generate a deterministic test payload."""
|
||||
random.seed(seed)
|
||||
bytes_data = bytes(random.getrandbits(8) for _ in range(32))
|
||||
int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
|
||||
return (bytes_data, int_tuple)
|
||||
|
||||
|
||||
def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
|
||||
"""Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
|
||||
times: list[float] = []
|
||||
|
||||
# Warm-up to avoid first-run noise.
|
||||
for _ in range(200):
|
||||
func(data)
|
||||
|
||||
for _ in range(iterations):
|
||||
start = time.perf_counter()
|
||||
func(data)
|
||||
end = time.perf_counter()
|
||||
times.append(end - start)
|
||||
|
||||
avg = statistics.mean(times)
|
||||
std = statistics.stdev(times) if len(times) > 1 else 0.0
|
||||
return avg, std
|
||||
|
||||
|
||||
def _run_benchmarks(
|
||||
benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
|
||||
data: tuple,
|
||||
iterations: int,
|
||||
):
|
||||
"""Yield (name, avg, std) for each benchmark, skipping unavailable ones."""
|
||||
for name, func in benchmarks:
|
||||
try:
|
||||
avg, std = _benchmark_func(func, data, iterations)
|
||||
except ModuleNotFoundError as exc:
|
||||
print(f"Skipping {name}: {exc}")
|
||||
continue
|
||||
yield name, avg, std
|
||||
|
||||
|
||||
def builtin_hash(data: tuple) -> int:
    """Hash `data` with Python's built-in hash() and return the result."""
    digest = hash(data)
    return digest
|
||||
|
||||
|
||||
def main() -> None:
    """Parse CLI options, time each hash function, and print a comparison.

    Prints a header, the mean and standard deviation per hash function in
    microseconds, and a summary of each function's mean relative to the
    built-in hash() mean when that baseline was measured.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--iterations",
        type=int,
        default=10_000,
        help="Number of measured iterations per hash function.",
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed for test payload."
    )
    args = parser.parse_args()

    baseline_label = "built-in hash()"
    rule = "=" * 60

    payload = _generate_test_data(args.seed)
    candidates = (
        ("SHA256 (pickle)", sha256),
        ("xxHash (pickle)", xxhash),
        (baseline_label, builtin_hash),
    )

    header_lines = (
        rule,
        "HASH FUNCTION MICRO BENCHMARK",
        rule,
        "Test data: (32-byte bytes object, 32-int tuple)",
        f"Iterations: {args.iterations:,}",
        rule,
    )
    for line in header_lines:
        print(line)

    results = list(_run_benchmarks(candidates, payload, args.iterations))

    print("\nResults:")
    for label, mean_s, std_s in results:
        print(f" {label:16s}: {mean_s * 1e6:8.2f} ± {std_s * 1e6:6.2f} μs")

    # The baseline row is absent only if its benchmark was skipped.
    baseline = next((row for row in results if row[0] == baseline_label), None)
    if not baseline:
        print("\nBuilt-in hash() result missing; cannot compute speed ratios.")
        return

    baseline_mean = baseline[1]
    print("\n" + rule)
    print("SUMMARY (relative to built-in hash())")
    print(rule)
    for label, mean_s, _ in results:
        if label != baseline_label:
            print(
                f"• {label} is {mean_s / baseline_mean:.1f}x slower than built-in hash()"
            )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
110
benchmarks/benchmark_prefix_block_hash.py
Normal file
110
benchmarks/benchmark_prefix_block_hash.py
Normal file
@ -0,0 +1,110 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Simple benchmark to compare prefix-cache block hashing algorithms.
|
||||
|
||||
Example:
|
||||
python benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import random
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from collections.abc import Callable, Iterable, Sequence
|
||||
|
||||
from vllm.utils.hashing import get_hash_fn_by_name
|
||||
from vllm.v1.core.kv_cache_utils import BlockHash, hash_block_tokens, init_none_hash
|
||||
|
||||
SUPPORTED_ALGOS = ("sha256", "sha256_cbor", "xxhash", "xxhash_cbor")
|
||||
|
||||
|
||||
def _generate_blocks(
|
||||
num_blocks: int, block_size: int, vocab_size: int, seed: int
|
||||
) -> list[list[int]]:
|
||||
rng = random.Random(seed)
|
||||
return [
|
||||
[rng.randrange(vocab_size) for _ in range(block_size)]
|
||||
for _ in range(num_blocks)
|
||||
]
|
||||
|
||||
|
||||
def _hash_all_blocks(
|
||||
hash_fn: Callable[[object], bytes],
|
||||
blocks: Iterable[Sequence[int]],
|
||||
) -> float:
|
||||
parent_hash: BlockHash | None = None
|
||||
start = time.perf_counter()
|
||||
for block in blocks:
|
||||
parent_hash = hash_block_tokens(hash_fn, parent_hash, block, extra_keys=None)
|
||||
end = time.perf_counter()
|
||||
return end - start
|
||||
|
||||
|
||||
def _benchmark(
|
||||
hash_algo: str,
|
||||
blocks: list[list[int]],
|
||||
trials: int,
|
||||
) -> tuple[float, float, float] | None:
|
||||
try:
|
||||
hash_fn = get_hash_fn_by_name(hash_algo)
|
||||
init_none_hash(hash_fn)
|
||||
timings = [_hash_all_blocks(hash_fn, blocks) for _ in range(trials)]
|
||||
except ModuleNotFoundError as exc:
|
||||
print(f"Skipping {hash_algo}: {exc}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
avg = statistics.mean(timings)
|
||||
best = min(timings)
|
||||
# throughput: tokens / second
|
||||
tokens_hashed = len(blocks) * len(blocks[0])
|
||||
throughput = tokens_hashed / best
|
||||
return avg, best, throughput
|
||||
|
||||
|
||||
def main() -> None:
    """Parse CLI options, generate the blocks, and print timings per algorithm.

    Algorithms for which `_benchmark` returns None are skipped silently here.
    """
    parser = argparse.ArgumentParser(description=__doc__)

    # (flag, default, help) for every integer option, in help-output order.
    int_options = (
        ("--num-blocks", 10000, "Block count."),
        ("--block-size", 32, "Tokens per block."),
        ("--vocab-size", 32000, "Token id range [0, vocab_size)."),
        ("--seed", 0, "Random seed."),
        ("--trials", 5, "Number of timed trials per algorithm."),
    )
    for flag, default, help_text in int_options:
        parser.add_argument(flag, type=int, default=default, help=help_text)
    parser.add_argument(
        "--algorithms",
        nargs="+",
        default=SUPPORTED_ALGOS,
        choices=SUPPORTED_ALGOS,
        help="Hash algorithms to benchmark.",
    )
    args = parser.parse_args()

    blocks = _generate_blocks(
        args.num_blocks, args.block_size, args.vocab_size, args.seed
    )
    print(
        f"Benchmarking {len(args.algorithms)} algorithms on "
        f"{args.num_blocks} blocks (block size={args.block_size})."
    )

    for algo in args.algorithms:
        outcome = _benchmark(algo, blocks, args.trials)
        if outcome is not None:
            mean_s, best_s, tokens_per_s = outcome
            print(
                f"{algo:14s} avg: {mean_s:.6f}s best: {best_s:.6f}s "
                f"throughput: {tokens_per_s / 1e6:.2f}M tokens/s"
            )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
244
benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
Normal file
244
benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
Normal file
@ -0,0 +1,244 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from itertools import product
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
from torch.utils.benchmark import Measurement as TMeasurement
|
||||
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
_per_token_group_quant_fp8_colmajor,
|
||||
silu_mul_per_token_group_quant_fp8_colmajor,
|
||||
)
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
|
||||
|
||||
from .utils import ArgPool, Bench, CudaGraphBenchParams
|
||||
|
||||
GROUP_SIZE = 128
|
||||
FLOAT8_T = torch.float8_e4m3fn
|
||||
|
||||
|
||||
def print_timers(timers: list[TMeasurement], cuda_graph_nops: int):
    """Print the per-invocation note, then the comparison table for `timers`.

    Args:
        timers: Measurements to render through `TBenchmark.Compare`.
        cuda_graph_nops: Number of consecutive invocations each timing covers;
            only used in the printed note.
    """
    note = (
        f"Note : The timings reported above is for {cuda_graph_nops} "
        "consecutive invocations of the benchmarking functions. "
        f"Please divide by {cuda_graph_nops} for single invocation "
        "timings."
    )
    print(note)
    TBenchmark.Compare(timers).print()
|
||||
|
||||
|
||||
class ImplType(Enum):
    """The two implementations this benchmark compares."""

    SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR = 1
    REFERENCE = 2

    def get_impl(self):
        """Return the callable that implements this member.

        Raises:
            ValueError: If the member has no associated implementation.
        """
        if self is ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
            return silu_mul_per_token_group_quant_fp8_colmajor
        if self is ImplType.REFERENCE:
            return reference
        raise ValueError(f"Unrecognized ImplType {self}")
|
||||
|
||||
|
||||
@dataclass
class BenchmarkTensors:
    """Input tensor plus pre-allocated output buffers for one invocation."""

    # Input handed to both implementations.
    input: torch.Tensor
    # Output buffer for silu_mul_per_token_group_quant_fp8_colmajor.
    output: torch.Tensor

    # Reference act output tensor
    ref_act_out: torch.Tensor
    # Output buffer for the reference quantization step.
    ref_quant_out: torch.Tensor

    @staticmethod
    def make(T: int, N: int) -> "BenchmarkTensors":
        """Allocate CUDA tensors for a (T, N) input.

        T must be a multiple of GROUP_SIZE and N a multiple of 2 * GROUP_SIZE.
        All output buffers have shape (T, N // 2).
        """
        assert T % GROUP_SIZE == 0
        assert N % (GROUP_SIZE * 2) == 0

        out_shape = (T, N // 2)

        act_input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")

        # silu_mul_per_token_group_quant_fp8_colmajor output.
        fused_out = torch.rand(out_shape, dtype=torch.bfloat16, device="cuda")
        fused_out = fused_out.to(FLOAT8_T)

        # reference output.
        ref_act = torch.empty(out_shape, dtype=torch.bfloat16, device="cuda")
        ref_quant = torch.empty(out_shape, dtype=torch.bfloat16, device="cuda")
        ref_quant = ref_quant.to(FLOAT8_T)

        return BenchmarkTensors(
            input=act_input,
            output=fused_out,
            ref_act_out=ref_act,
            ref_quant_out=ref_quant,
        )

    @property
    def T(self):
        """Size of the input's first dimension."""
        return self.input.size(0)

    @property
    def N(self):
        """Size of the input's second dimension."""
        return self.input.size(1)

    def make_impl_kwargs(self, impl_type: ImplType) -> dict[str, Any]:
        """Return the keyword arguments for calling `impl_type`'s implementation.

        Raises:
            ValueError: If `impl_type` is not a known ImplType member.
        """
        if impl_type == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
            kwargs: dict[str, Any] = {
                "input": self.input,
                "output": self.output,
            }
        elif impl_type == ImplType.REFERENCE:
            kwargs = {
                "input": self.input,
                "act_out": self.ref_act_out,
                "quant_out": self.ref_quant_out,
            }
        else:
            raise ValueError(f"Unrecognized impl_type {impl_type}")

        # Both implementations take the same scale-format flag.
        kwargs["use_ue8m0"] = is_deep_gemm_e8m0_used()
        return kwargs
|
||||
|
||||
|
||||
def reference_quant(x: torch.Tensor, quant_out: torch.Tensor, use_ue8m0: bool):
    """
    Reference triton quant kernel from,
    vllm.model_executor.layers.quantization.utils.fp8_utils

    Quantizes `x` into `quant_out` in groups of GROUP_SIZE elements and
    returns ``(quant_out, scales)``. `quant_out` must have the same size as
    `x`; `use_ue8m0` is forwarded to the kernel unchanged.

    NOTE(review): the launch reads x.shape[1] and x.stride(0), so `x` is
    presumably 2-D -- confirm against callers.
    """
    assert quant_out.size() == x.size()
    # Allocate the scale tensor column-major format.
    # The buffer is created with the group dimension first and then permuted,
    # so the returned view has the group dimension last.
    shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
    x_q = quant_out
    x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)

    # One kernel program per group of GROUP_SIZE elements.
    M = x.numel() // GROUP_SIZE
    N = GROUP_SIZE
    BLOCK = triton.next_power_of_2(N)
    # heuristics for number of warps
    num_warps = min(max(BLOCK // 256, 1), 8)
    num_stages = 1

    # Representable range of the FP8 output dtype, passed to the kernel.
    finfo = torch.finfo(FLOAT8_T)
    fp8_min = finfo.min
    fp8_max = finfo.max

    # Positional arguments are order-sensitive; keep them in this order.
    _per_token_group_quant_fp8_colmajor[(M,)](
        x,
        x_q,
        x_s,
        GROUP_SIZE,
        x.shape[1],
        x.stride(0),
        x_s.stride(1),
        eps=1e-10,
        fp8_min=fp8_min,
        fp8_max=fp8_max,
        use_ue8m0=use_ue8m0,
        BLOCK=BLOCK,
        num_warps=num_warps,
        num_stages=num_stages,
    )
    return x_q, x_s
|
||||
|
||||
|
||||
def reference(
    input: torch.Tensor,
    act_out: torch.Tensor,
    quant_out: torch.Tensor,
    use_ue8m0: bool,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Unfused baseline: silu_and_mul into `act_out`, then group-quantize it.

    Returns the pair produced by `reference_quant` for `act_out`.
    """
    # Step 1: activation, written into the caller-provided buffer.
    torch.ops._C.silu_and_mul(act_out, input)
    # Step 2: quantize the activation output.
    quantized, scales = reference_quant(act_out, quant_out, use_ue8m0)
    return quantized, scales
|
||||
|
||||
|
||||
def bench_impl(
    bench_tensors: list[BenchmarkTensors], impl_type: ImplType
) -> TMeasurement:
    """Benchmark `impl_type` over a pool of tensor sets and return the timing.

    Every tensor set is run once as warm-up; the per-set keyword arguments are
    then merged into one ArgPool per argument name and handed to `Bench`.
    """
    T = bench_tensors[0].T
    N = bench_tensors[0].N

    impl_fn = impl_type.get_impl()
    per_call_kwargs = [bt.make_impl_kwargs(impl_type) for bt in bench_tensors]

    # warmup
    for call_kwargs in per_call_kwargs:
        impl_fn(**call_kwargs)
    torch.cuda.synchronize()

    # Merge into a single kwargs and qualify arguments as ArgPool
    pooled_kwargs = {key: ArgPool([]) for key in per_call_kwargs[0]}
    for call_kwargs in per_call_kwargs:
        for key, value in call_kwargs.items():
            pooled_kwargs[key].values.append(value)

    with Bench(
        CudaGraphBenchParams(len(bench_tensors)),
        "silu-mul-quant",
        f"num_tokens={T}, N={N}",
        impl_type.name,
        impl_fn,
        **pooled_kwargs,
    ) as bench:
        return bench.run()
|
||||
|
||||
|
||||
def test_correctness(T: int, N: int):
    """Check that the fused kernel matches the reference for a (T, N) input.

    Both the quantized outputs (compared as float32) and the scales must pass
    `torch.testing.assert_close`.
    """
    print(f"Testing num_tokens={T}, N={N} ...")

    tensors = BenchmarkTensors.make(T, N)

    # reference output
    ref_impl = ImplType.REFERENCE
    ref_out_q, ref_out_s = ref_impl.get_impl()(**tensors.make_impl_kwargs(ref_impl))

    # test output
    fused_impl = ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
    out_q, out_s = fused_impl.get_impl()(**tensors.make_impl_kwargs(fused_impl))

    torch.testing.assert_close(ref_out_q.to(torch.float32), out_q.to(torch.float32))
    torch.testing.assert_close(ref_out_s, out_s)
|
||||
|
||||
|
||||
def run(Ts: list[int], Ns: list[int], arg_pool_size: int) -> list[TMeasurement]:
    """Check and benchmark every (N, T) combination; return all measurements.

    For each shape the fused kernel is timed first, then the reference. A
    comparison is printed per shape and once more for everything at the end.
    """
    all_timers: list[TMeasurement] = []

    # N varies slowest, T fastest.
    for N in Ns:
        for T in Ts:
            test_correctness(T, N)

            tensor_pool: list[BenchmarkTensors] = []
            for _ in range(arg_pool_size):
                tensor_pool.append(BenchmarkTensors.make(T, N))

            fused_timer = bench_impl(
                tensor_pool, ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
            )
            all_timers.append(fused_timer)
            baseline_timer = bench_impl(tensor_pool, ImplType.REFERENCE)
            all_timers.append(baseline_timer)

            print_timers(
                [fused_timer, baseline_timer], cuda_graph_nops=arg_pool_size
            )

    print_timers(all_timers, cuda_graph_nops=arg_pool_size)

    return all_timers
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Token counts: multiples of 128 from 128 to 1920, followed by multiples
    # of 2048 from 2048 to 131072.
    T = [128 * i for i in range(1, 16)] + [2048 * i for i in range(1, 65)]
    # Input widths; each output buffer is half as wide.
    N = [2048, 4096, 8192]

    print(f"T = {T}, N = {N}")
    # Eight independent tensor sets are benchmarked per shape.
    run(T, N, arg_pool_size=8)
|
||||
@ -150,6 +150,97 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
|
||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||
#################### BASE BUILD IMAGE ####################
|
||||
|
||||
#################### CSRC BUILD IMAGE ####################
|
||||
FROM base AS csrc-build
|
||||
ARG TARGETPLATFORM
|
||||
|
||||
ARG PIP_INDEX_URL UV_INDEX_URL
|
||||
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||
ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
|
||||
# install build dependencies
|
||||
COPY requirements/build.txt requirements/build.txt
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
COPY pyproject.toml setup.py CMakeLists.txt ./
|
||||
COPY cmake cmake/
|
||||
COPY csrc csrc/
|
||||
COPY vllm/envs.py vllm/envs.py
|
||||
COPY vllm/__init__.py vllm/__init__.py
|
||||
|
||||
# max jobs used by Ninja to build extensions
|
||||
ARG max_jobs=2
|
||||
ENV MAX_JOBS=${max_jobs}
|
||||
# number of threads used by nvcc
|
||||
ARG nvcc_threads=8
|
||||
ENV NVCC_THREADS=$nvcc_threads
|
||||
|
||||
ARG USE_SCCACHE
|
||||
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
|
||||
ARG SCCACHE_ENDPOINT
|
||||
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
|
||||
ARG SCCACHE_REGION_NAME=us-west-2
|
||||
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||
|
||||
# Flag to control whether to use pre-built vLLM wheels
|
||||
ARG VLLM_USE_PRECOMPILED=""
|
||||
ARG VLLM_MERGE_BASE_COMMIT=""
|
||||
ARG VLLM_MAIN_CUDA_VERSION=""
|
||||
|
||||
# Use dummy version for csrc-build wheel (only .so files are extracted, version doesn't matter)
|
||||
ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
|
||||
|
||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
echo "Installing sccache..." \
|
||||
&& curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
|
||||
&& tar -xzf sccache.tar.gz \
|
||||
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
|
||||
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
|
||||
&& if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
|
||||
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
|
||||
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
|
||||
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
|
||||
&& export SCCACHE_IDLE_TIMEOUT=0 \
|
||||
&& export CMAKE_BUILD_TYPE=Release \
|
||||
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
|
||||
&& export VLLM_PRECOMPILED_WHEEL_COMMIT="${VLLM_MERGE_BASE_COMMIT}" \
|
||||
&& export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
|
||||
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
|
||||
&& sccache --show-stats \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
|
||||
&& sccache --show-stats; \
|
||||
fi
|
||||
|
||||
ARG vllm_target_device="cuda"
|
||||
ENV VLLM_TARGET_DEVICE=${vllm_target_device}
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$USE_SCCACHE" != "1" ]; then \
|
||||
# Clean any existing CMake artifacts
|
||||
rm -rf .deps && \
|
||||
mkdir -p .deps && \
|
||||
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
|
||||
export VLLM_PRECOMPILED_WHEEL_COMMIT="${VLLM_MERGE_BASE_COMMIT}" && \
|
||||
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
||||
fi
|
||||
#################### CSRC BUILD IMAGE ####################
|
||||
|
||||
#################### WHEEL BUILD IMAGE ####################
|
||||
FROM base AS build
|
||||
ARG TARGETPLATFORM
|
||||
@ -172,66 +263,28 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
COPY --from=csrc-build /workspace/dist /precompiled-wheels
|
||||
|
||||
COPY . .
|
||||
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
# max jobs used by Ninja to build extensions
|
||||
ARG max_jobs=2
|
||||
ENV MAX_JOBS=${max_jobs}
|
||||
# number of threads used by nvcc
|
||||
ARG nvcc_threads=8
|
||||
ENV NVCC_THREADS=$nvcc_threads
|
||||
|
||||
ARG USE_SCCACHE
|
||||
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
|
||||
ARG SCCACHE_ENDPOINT
|
||||
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
|
||||
ARG SCCACHE_REGION_NAME=us-west-2
|
||||
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||
|
||||
# Flag to control whether to use pre-built vLLM wheels
|
||||
ARG VLLM_USE_PRECOMPILED=""
|
||||
ARG VLLM_MAIN_CUDA_VERSION=""
|
||||
|
||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
echo "Installing sccache..." \
|
||||
&& curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
|
||||
&& tar -xzf sccache.tar.gz \
|
||||
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
|
||||
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
|
||||
&& if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
|
||||
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
|
||||
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
|
||||
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
|
||||
&& export SCCACHE_IDLE_TIMEOUT=0 \
|
||||
&& export CMAKE_BUILD_TYPE=Release \
|
||||
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
|
||||
&& export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
|
||||
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
|
||||
&& sccache --show-stats \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
|
||||
&& sccache --show-stats; \
|
||||
fi
|
||||
|
||||
ARG vllm_target_device="cuda"
|
||||
ENV VLLM_TARGET_DEVICE=${vllm_target_device}
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
if [ "$USE_SCCACHE" != "1" ]; then \
|
||||
# Clean any existing CMake artifacts
|
||||
rm -rf .deps && \
|
||||
mkdir -p .deps && \
|
||||
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
|
||||
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
||||
fi
|
||||
|
||||
# Skip adding +precompiled suffix to version (preserves git-derived version)
|
||||
ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
if [ "${vllm_target_device}" = "cuda" ]; then \
|
||||
export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl); \
|
||||
fi && \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
|
||||
|
||||
# Install DeepGEMM from source
|
||||
ARG DEEPGEMM_GIT_REF
|
||||
@ -527,7 +580,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
else \
|
||||
BITSANDBYTES_VERSION="0.46.1"; \
|
||||
fi; \
|
||||
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.0'
|
||||
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.3'
|
||||
|
||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 146 KiB After Width: | Height: | Size: 174 KiB |
@ -670,6 +670,35 @@ vllm bench serve \
|
||||
|
||||
</details>
|
||||
|
||||
### 🧪 Hashing Benchmarks
|
||||
|
||||
<details class="admonition abstract" markdown="1">
|
||||
<summary>Show more</summary>
|
||||
|
||||
Two helper scripts live in `benchmarks/` to compare hashing options used by prefix caching and related utilities. They are standalone (no server required) and help choose a hash algorithm before enabling prefix caching in production.
|
||||
|
||||
- `benchmarks/benchmark_hash.py`: Micro-benchmark that measures per-call latency of three implementations on a representative `(bytes, tuple[int])` payload.
|
||||
|
||||
```bash
|
||||
python benchmarks/benchmark_hash.py --iterations 20000 --seed 42
|
||||
```
|
||||
|
||||
- `benchmarks/benchmark_prefix_block_hash.py`: End-to-end block hashing benchmark that runs the full prefix-cache hash pipeline (`hash_block_tokens`) across many fake blocks and reports throughput.
|
||||
|
||||
```bash
|
||||
python benchmarks/benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32 --trials 5
|
||||
```
|
||||
|
||||
Supported algorithms: `sha256`, `sha256_cbor`, `xxhash`, `xxhash_cbor`. Install optional deps to exercise all variants:
|
||||
|
||||
```bash
|
||||
uv pip install xxhash cbor2
|
||||
```
|
||||
|
||||
If an algorithm’s dependency is missing, the script will skip it and continue.
|
||||
|
||||
</details>
|
||||
|
||||
### ⚡ Request Prioritization Benchmark
|
||||
|
||||
<details class="admonition abstract" markdown="1">
|
||||
|
||||
@ -18,6 +18,7 @@ Compute Resources:
|
||||
- Alibaba Cloud
|
||||
- AMD
|
||||
- Anyscale
|
||||
- Arm
|
||||
- AWS
|
||||
- Crusoe Cloud
|
||||
- Databricks
|
||||
|
||||
@ -57,15 +57,15 @@ vLLM also provides [a reference example](../../examples/online_serving/prometheu
|
||||
The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:
|
||||
|
||||
- `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds.
|
||||
- `vllm:prompt_tokens_total` - Prompt tokens.
|
||||
- `vllm:generation_tokens_total` - Generation tokens.
|
||||
- `vllm:prompt_tokens` - Prompt tokens.
|
||||
- `vllm:generation_tokens` - Generation tokens.
|
||||
- `vllm:time_per_output_token_seconds` - Inter-token latency (Time Per Output Token, TPOT) in seconds.
|
||||
- `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds.
|
||||
- `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in the RUNNING, WAITING, and SWAPPED states.
|
||||
- `vllm:gpu_cache_usage_perc` - Percentage of used cache blocks by vLLM.
|
||||
- `vllm:request_prompt_tokens` - Request prompt length.
|
||||
- `vllm:request_generation_tokens` - Request generation length.
|
||||
- `vllm:request_success_total` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.
|
||||
- `vllm:request_success` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.
|
||||
- `vllm:request_queue_time_seconds` - Queue time.
|
||||
- `vllm:request_prefill_time_seconds` - Requests prefill time.
|
||||
- `vllm:request_decode_time_seconds` - Requests decode time.
|
||||
@ -571,9 +571,9 @@ model and then validate those tokens with the larger model.
|
||||
|
||||
- `vllm:spec_decode_draft_acceptance_rate` (Gauge)
|
||||
- `vllm:spec_decode_efficiency` (Gauge)
|
||||
- `vllm:spec_decode_num_accepted_tokens_total` (Counter)
|
||||
- `vllm:spec_decode_num_draft_tokens_total` (Counter)
|
||||
- `vllm:spec_decode_num_emitted_tokens_total` (Counter)
|
||||
- `vllm:spec_decode_num_accepted_tokens` (Counter)
|
||||
- `vllm:spec_decode_num_draft_tokens` (Counter)
|
||||
- `vllm:spec_decode_num_emitted_tokens` (Counter)
|
||||
|
||||
There is a PR under review (<https://github.com/vllm-project/vllm/pull/12193>) to add "prompt lookup (ngram)"
|
||||
speculative decoding to v1. Other techniques will follow. We should
|
||||
|
||||
@ -90,7 +90,6 @@ To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels
|
||||
| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],</br>[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlassBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
|
||||
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
|
||||
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
|
||||
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
|
||||
| marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
|
||||
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
|
||||
| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
|
||||
@ -114,5 +113,5 @@ The following table shows "families" of modular kernels that are intended to wor
|
||||
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|
||||
|---------|-----------------------------------------|----------------------------------------------|
|
||||
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
|
||||
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
|
||||
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
|
||||
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
|
||||
|
||||
@ -54,7 +54,7 @@ th:not(:first-child) {
|
||||
| beam-search | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/7968) | ❔ | ✅ | ✅ | |
|
||||
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
|
||||
|
||||
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
|
||||
\* Chunked prefill and prefix caching are only applicable to last-token or all pooling with causal attention.
|
||||
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
|
||||
|
||||
### Feature x Hardware
|
||||
|
||||
58
docs/features/mooncake_connector_usage.md
Normal file
58
docs/features/mooncake_connector_usage.md
Normal file
@ -0,0 +1,58 @@
|
||||
# MooncakeConnector Usage Guide
|
||||
|
||||
## About Mooncake
|
||||
|
||||
Mooncake aims to enhance the inference efficiency of large language models (LLMs), especially in slow object storage environments, by constructing a multi-level caching pool on high-speed interconnected DRAM/SSD resources. Compared to traditional caching systems, Mooncake utilizes (GPUDirect) RDMA technology to transfer data directly in a zero-copy manner, while maximizing the use of multi-NIC resources on a single machine.
|
||||
|
||||
For more details about Mooncake, please refer to [Mooncake project](https://github.com/kvcache-ai/Mooncake) and [Mooncake documents](https://kvcache-ai.github.io/Mooncake/).
|
||||
|
||||
## Prerequisites
|
||||
|
||||
### Installation
|
||||
|
||||
Install mooncake through pip: `uv pip install mooncake-transfer-engine`.
|
||||
|
||||
Refer to the [Mooncake official repository](https://github.com/kvcache-ai/Mooncake) for more installation instructions.
|
||||
|
||||
## Usage
|
||||
|
||||
### Prefiller Node (192.168.0.2)
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen2.5-7B-Instruct --port 8010 --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_producer"}'
|
||||
```
|
||||
|
||||
### Decoder Node (192.168.0.3)
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen2.5-7B-Instruct --port 8020 --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_consumer"}'
|
||||
```
|
||||
|
||||
### Proxy
|
||||
|
||||
```bash
|
||||
python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --prefiller-host 192.168.0.2 --prefiller-port 8010 --decoder-host 192.168.0.3 --decoder-port 8020
|
||||
```
|
||||
|
||||
> NOTE: The Mooncake Connector currently uses the proxy from nixl_integration. This will be replaced with a self-developed proxy in the future.
|
||||
|
||||
Now you can send requests to the proxy server through port 8000.
|
||||
|
||||
## Environment Variables
|
||||
|
||||
- `VLLM_MOONCAKE_BOOTSTRAP_PORT`: Port for Mooncake bootstrap server
|
||||
- Default: 8998
|
||||
- Required only for prefiller instances
|
||||
- Each vLLM worker needs a unique port on its host; using the same port number across different hosts is fine
|
||||
- For TP/DP deployments, each worker's port on a node is computed as: base_port + dp_rank * tp_size + tp_rank
|
||||
- Used for the decoder notifying the prefiller
|
||||
|
||||
- `VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller’s KV cache for a particular request. (Optional)
|
||||
- Default: 480
|
||||
- If a request is aborted and the decoder has not yet notified the prefiller, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.
|
||||
|
||||
## KV Role Options
|
||||
|
||||
- **kv_producer**: For prefiller instances that generate KV caches
|
||||
- **kv_consumer**: For decoder instances that consume KV caches from prefiller
|
||||
- **kv_both**: Enables symmetric functionality where the connector can act as both producer and consumer. This provides flexibility for experimental setups and scenarios where the role distinction is not predetermined.
|
||||
@ -795,14 +795,12 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
|
||||
??? code
|
||||
|
||||
```python
|
||||
from vllm.utils.serial_utils import tensor2base64
|
||||
|
||||
image_embedding = torch.load(...)
|
||||
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
|
||||
|
||||
buffer = io.BytesIO()
|
||||
torch.save(image_embedding, buffer)
|
||||
buffer.seek(0)
|
||||
binary_data = buffer.read()
|
||||
base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
|
||||
base64_image_embedding = tensor2base64(image_embedding)
|
||||
|
||||
client = OpenAI(
|
||||
# defaults to os.environ.get("OPENAI_API_KEY")
|
||||
|
||||
@ -4,9 +4,6 @@ vLLM has experimental support for macOS with Apple Silicon. For now, users must
|
||||
|
||||
Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
|
||||
|
||||
!!! warning
|
||||
There are no pre-built wheels or images for this device, so you must build vLLM from source.
|
||||
|
||||
# --8<-- [end:installation]
|
||||
# --8<-- [start:requirements]
|
||||
|
||||
@ -20,6 +17,8 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
|
||||
# --8<-- [end:set-up-using-python]
|
||||
# --8<-- [start:pre-built-wheels]
|
||||
|
||||
Currently, there are no pre-built Apple silicon CPU wheels.
|
||||
|
||||
# --8<-- [end:pre-built-wheels]
|
||||
# --8<-- [start:build-wheel-from-source]
|
||||
|
||||
@ -78,6 +77,8 @@ uv pip install -e .
|
||||
# --8<-- [end:build-wheel-from-source]
|
||||
# --8<-- [start:pre-built-images]
|
||||
|
||||
Currently, there are no pre-built Apple silicon CPU images.
|
||||
|
||||
# --8<-- [end:pre-built-images]
|
||||
# --8<-- [start:build-image-from-source]
|
||||
|
||||
|
||||
@ -1,11 +1,6 @@
|
||||
# --8<-- [start:installation]
|
||||
|
||||
vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform.
|
||||
|
||||
ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
|
||||
|
||||
!!! warning
|
||||
There are no pre-built wheels or images for this device, so you must build vLLM from source.
|
||||
vLLM offers basic model inferencing and serving on the Arm CPU platform, with NEON support and the FP32, FP16 and BF16 data types.
|
||||
|
||||
# --8<-- [end:installation]
|
||||
# --8<-- [start:requirements]
|
||||
@ -20,6 +15,23 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
|
||||
# --8<-- [end:set-up-using-python]
|
||||
# --8<-- [start:pre-built-wheels]
|
||||
|
||||
Pre-built vLLM wheels for Arm are available since version 0.11.2. These wheels contain pre-compiled C++ binaries.
|
||||
Please replace `<version>` in the commands below with a specific version string (e.g., `0.11.2`).
|
||||
|
||||
```bash
|
||||
uv pip install --pre vllm==<version>+cpu --extra-index-url https://wheels.vllm.ai/<version>%2Bcpu/
|
||||
```
|
||||
|
||||
??? console "pip"
|
||||
```bash
|
||||
pip install --pre vllm==<version>+cpu --extra-index-url https://wheels.vllm.ai/<version>%2Bcpu/
|
||||
```
|
||||
|
||||
The `uv` approach works for vLLM `v0.6.6` and later. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
|
||||
|
||||
!!! note
|
||||
Nightly wheels (e.g., for bisecting a behavior change or performance regression) are currently unsupported for this architecture.
|
||||
|
||||
# --8<-- [end:pre-built-wheels]
|
||||
# --8<-- [start:build-wheel-from-source]
|
||||
|
||||
@ -69,6 +81,8 @@ Testing has been conducted on AWS Graviton3 instances for compatibility.
|
||||
# --8<-- [end:build-wheel-from-source]
|
||||
# --8<-- [start:pre-built-images]
|
||||
|
||||
Currently, there are no pre-built Arm CPU images.
|
||||
|
||||
# --8<-- [end:pre-built-images]
|
||||
# --8<-- [start:build-image-from-source]
|
||||
```bash
|
||||
|
||||
@ -46,11 +46,25 @@ vLLM is a Python library that supports the following CPU variants. Select your C
|
||||
|
||||
### Pre-built wheels
|
||||
|
||||
Please refer to the instructions for [pre-built wheels on GPU](./gpu.md#pre-built-wheels).
|
||||
|
||||
When specifying the index URL, please make sure to use the `cpu` variant subdirectory.
|
||||
For example, the nightly build index is: `https://wheels.vllm.ai/nightly/cpu/`.
|
||||
|
||||
=== "Intel/AMD x86"
|
||||
|
||||
--8<-- "docs/getting_started/installation/cpu.x86.inc.md:pre-built-wheels"
|
||||
|
||||
=== "ARM AArch64"
|
||||
|
||||
--8<-- "docs/getting_started/installation/cpu.arm.inc.md:pre-built-wheels"
|
||||
|
||||
=== "Apple silicon"
|
||||
|
||||
--8<-- "docs/getting_started/installation/cpu.apple.inc.md:pre-built-wheels"
|
||||
|
||||
=== "IBM Z (S390X)"
|
||||
|
||||
--8<-- "docs/getting_started/installation/cpu.s390x.inc.md:pre-built-wheels"
|
||||
|
||||
### Build wheel from source
|
||||
|
||||
#### Set up using Python-only build (without compilation) {#python-only-build}
|
||||
@ -87,6 +101,18 @@ VLLM_USE_PRECOMPILED=1 VLLM_PRECOMPILED_WHEEL_VARIANT=cpu VLLM_TARGET_DEVICE=cpu
|
||||
|
||||
--8<-- "docs/getting_started/installation/cpu.x86.inc.md:pre-built-images"
|
||||
|
||||
=== "ARM AArch64"
|
||||
|
||||
--8<-- "docs/getting_started/installation/cpu.arm.inc.md:pre-built-images"
|
||||
|
||||
=== "Apple silicon"
|
||||
|
||||
--8<-- "docs/getting_started/installation/cpu.apple.inc.md:pre-built-images"
|
||||
|
||||
=== "IBM Z (S390X)"
|
||||
|
||||
--8<-- "docs/getting_started/installation/cpu.s390x.inc.md:pre-built-images"
|
||||
|
||||
### Build image from source
|
||||
|
||||
=== "Intel/AMD x86"
|
||||
|
||||
@ -4,9 +4,6 @@ vLLM has experimental support for s390x architecture on IBM Z platform. For now,
|
||||
|
||||
Currently, the CPU implementation for s390x architecture supports FP32 datatype only.
|
||||
|
||||
!!! warning
|
||||
There are no pre-built wheels or images for this device, so you must build vLLM from source.
|
||||
|
||||
# --8<-- [end:installation]
|
||||
# --8<-- [start:requirements]
|
||||
|
||||
@ -21,6 +18,8 @@ Currently, the CPU implementation for s390x architecture supports FP32 datatype
|
||||
# --8<-- [end:set-up-using-python]
|
||||
# --8<-- [start:pre-built-wheels]
|
||||
|
||||
Currently, there are no pre-built IBM Z CPU wheels.
|
||||
|
||||
# --8<-- [end:pre-built-wheels]
|
||||
# --8<-- [start:build-wheel-from-source]
|
||||
|
||||
@ -69,6 +68,8 @@ Execute the following commands to build and install vLLM from source.
|
||||
# --8<-- [end:build-wheel-from-source]
|
||||
# --8<-- [start:pre-built-images]
|
||||
|
||||
Currently, there are no pre-built IBM Z CPU images.
|
||||
|
||||
# --8<-- [end:pre-built-images]
|
||||
# --8<-- [start:build-image-from-source]
|
||||
|
||||
|
||||
@ -17,6 +17,8 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
|
||||
# --8<-- [end:set-up-using-python]
|
||||
# --8<-- [start:pre-built-wheels]
|
||||
|
||||
Currently, there are no pre-built x86 CPU wheels.
|
||||
|
||||
# --8<-- [end:pre-built-wheels]
|
||||
# --8<-- [start:build-wheel-from-source]
|
||||
|
||||
|
||||
@ -5,9 +5,6 @@ vLLM supports AMD GPUs with ROCm 6.3 or above, and torch 2.8.0 and above.
|
||||
!!! tip
|
||||
[Docker](#set-up-using-docker) is the recommended way to use vLLM on ROCm.
|
||||
|
||||
!!! warning
|
||||
There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
|
||||
|
||||
# --8<-- [end:installation]
|
||||
# --8<-- [start:requirements]
|
||||
|
||||
|
||||
@ -2,9 +2,6 @@
|
||||
|
||||
vLLM initially supports basic model inference and serving on Intel GPU platform.
|
||||
|
||||
!!! warning
|
||||
There are no pre-built wheels for this device, so you need build vLLM from source. Or you can use pre-built images which are based on vLLM released versions.
|
||||
|
||||
# --8<-- [end:installation]
|
||||
# --8<-- [start:requirements]
|
||||
|
||||
|
||||
@ -711,7 +711,6 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ |
|
||||
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
|
||||
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ |
|
||||
| `PixtralForConditionalGeneration` | Ministral 3 (Mistral format), Mistral 3 (Mistral format), Mistral Large 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ |
|
||||
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ |
|
||||
|
||||
@ -23,31 +23,23 @@ def create_test_prompts(
|
||||
# this is an example of using quantization without LoRA
|
||||
(
|
||||
"My name is",
|
||||
SamplingParams(
|
||||
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
|
||||
),
|
||||
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
|
||||
None,
|
||||
),
|
||||
# the next three examples use quantization with LoRA
|
||||
(
|
||||
"my name is",
|
||||
SamplingParams(
|
||||
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
|
||||
),
|
||||
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
|
||||
LoRARequest("lora-test-1", 1, lora_path),
|
||||
),
|
||||
(
|
||||
"The capital of USA is",
|
||||
SamplingParams(
|
||||
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
|
||||
),
|
||||
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
|
||||
LoRARequest("lora-test-2", 1, lora_path),
|
||||
),
|
||||
(
|
||||
"The capital of France is",
|
||||
SamplingParams(
|
||||
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
|
||||
),
|
||||
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
|
||||
LoRARequest("lora-test-3", 1, lora_path),
|
||||
),
|
||||
]
|
||||
|
||||
@ -27,9 +27,7 @@ def create_test_prompts(
|
||||
return [
|
||||
(
|
||||
"A robot may not injure a human being",
|
||||
SamplingParams(
|
||||
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
|
||||
),
|
||||
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
|
||||
None,
|
||||
),
|
||||
(
|
||||
@ -41,22 +39,12 @@ def create_test_prompts(
|
||||
),
|
||||
(
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
|
||||
SamplingParams(
|
||||
temperature=0.0,
|
||||
logprobs=1,
|
||||
prompt_logprobs=1,
|
||||
max_tokens=128,
|
||||
),
|
||||
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
|
||||
LoRARequest("sql-lora", 1, lora_path),
|
||||
),
|
||||
(
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
|
||||
SamplingParams(
|
||||
temperature=0.0,
|
||||
logprobs=1,
|
||||
prompt_logprobs=1,
|
||||
max_tokens=128,
|
||||
),
|
||||
SamplingParams(temperature=0.0, logprobs=1, max_tokens=128),
|
||||
LoRARequest("sql-lora2", 2, lora_path),
|
||||
),
|
||||
]
|
||||
|
||||
@ -309,6 +309,28 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# HunyuanOCR
|
||||
def load_hunyuan_vl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "tencent/HunyuanOCR"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholder = (
|
||||
"<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>" # noqa: E501
|
||||
) * len(image_urls)
|
||||
prompt = f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>"
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_hyperclovax_seed_vision(
|
||||
question: str, image_urls: list[str]
|
||||
) -> ModelRequestData:
|
||||
@ -1322,6 +1344,7 @@ model_example_map = {
|
||||
"deepseek_ocr": load_deepseek_ocr,
|
||||
"gemma3": load_gemma3,
|
||||
"h2ovl_chat": load_h2ovl,
|
||||
"hunyuan_vl": load_hunyuan_vl,
|
||||
"hyperclovax_seed_vision": load_hyperclovax_seed_vision,
|
||||
"idefics3": load_idefics3,
|
||||
"interns1": load_interns1,
|
||||
|
||||
@ -28,13 +28,11 @@ Dependencies:
|
||||
- openai
|
||||
"""
|
||||
|
||||
import base64
|
||||
import io
|
||||
|
||||
import torch
|
||||
import transformers
|
||||
from openai import OpenAI
|
||||
|
||||
from vllm.utils.serial_utils import tensor2base64
|
||||
|
||||
|
||||
def main():
|
||||
client = OpenAI(
|
||||
@ -58,11 +56,7 @@ def main():
|
||||
prompt_embeds = embedding_layer(token_ids).squeeze(0)
|
||||
|
||||
# Prompt embeddings
|
||||
buffer = io.BytesIO()
|
||||
torch.save(prompt_embeds, buffer)
|
||||
buffer.seek(0)
|
||||
binary_data = buffer.read()
|
||||
encoded_embeds = base64.b64encode(binary_data).decode("utf-8")
|
||||
encoded_embeds = tensor2base64(prompt_embeds)
|
||||
|
||||
completion = client.completions.create(
|
||||
model=model_name,
|
||||
|
||||
@ -150,7 +150,8 @@ def run_siglip(client: OpenAI, model: str):
|
||||
Start the server using:
|
||||
|
||||
vllm serve google/siglip-base-patch16-224 \
|
||||
--runner pooling
|
||||
--runner pooling \
|
||||
--chat-template template_basic.jinja
|
||||
"""
|
||||
|
||||
response = create_chat_embeddings(
|
||||
|
||||
@ -46,6 +46,7 @@ scipy # Required for phi-4-multimodal-instruct
|
||||
ninja # Required for xgrammar, rocm, tpu, xpu
|
||||
pybase64 # fast base64 implementation
|
||||
cbor2 # Required for cross-language serialization of hashable objects
|
||||
ijson # Required for mistral streaming tool parser
|
||||
setproctitle # Used to set process names for better debugging and monitoring
|
||||
openai-harmony >= 0.0.3 # Required for gpt-oss
|
||||
anthropic == 0.71.0
|
||||
|
||||
@ -3,7 +3,6 @@ ninja
|
||||
packaging>=24.2
|
||||
setuptools>=77.0.3,<81.0.0
|
||||
setuptools-scm>=8
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
|
||||
torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64"
|
||||
scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL)
|
||||
|
||||
@ -4,7 +4,6 @@
|
||||
numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding
|
||||
|
||||
# Dependencies for CPUs
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
|
||||
torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64"
|
||||
|
||||
|
||||
@ -42,6 +42,6 @@ tritonclient==2.51.0
|
||||
|
||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||
numpy
|
||||
runai-model-streamer[s3,gcs]==0.15.0
|
||||
runai-model-streamer[s3,gcs]==0.15.3
|
||||
fastsafetensors>=0.1.10
|
||||
pydantic>=2.12 # 2.11 leads to error on python 3.13
|
||||
|
||||
@ -12,7 +12,7 @@ tensorizer==2.10.1
|
||||
packaging>=24.2
|
||||
setuptools>=77.0.3,<80.0.0
|
||||
setuptools-scm>=8
|
||||
runai-model-streamer[s3,gcs]==0.15.0
|
||||
runai-model-streamer[s3,gcs]==0.15.3
|
||||
conch-triton-kernels==1.2.1
|
||||
timm>=1.0.17
|
||||
fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
|
||||
|
||||
@ -51,7 +51,7 @@ tritonclient==2.51.0
|
||||
arctic-inference == 0.1.1 # Required for suffix decoding test
|
||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||
numpy
|
||||
runai-model-streamer[s3,gcs]==0.15.0
|
||||
runai-model-streamer[s3,gcs]==0.15.3
|
||||
fastsafetensors>=0.1.10
|
||||
pydantic>=2.12 # 2.11 leads to error on python 3.13
|
||||
decord==0.6.0
|
||||
|
||||
@ -965,11 +965,11 @@ rsa==4.9.1
|
||||
# via google-auth
|
||||
rtree==1.4.0
|
||||
# via torchgeo
|
||||
runai-model-streamer==0.15.0
|
||||
runai-model-streamer==0.15.3
|
||||
# via -r requirements/test.in
|
||||
runai-model-streamer-gcs==0.15.0
|
||||
runai-model-streamer-gcs==0.15.3
|
||||
# via runai-model-streamer
|
||||
runai-model-streamer-s3==0.15.0
|
||||
runai-model-streamer-s3==0.15.3
|
||||
# via runai-model-streamer
|
||||
s3transfer==0.10.3
|
||||
# via boto3
|
||||
|
||||
41
setup.py
41
setup.py
@ -346,10 +346,13 @@ class precompiled_wheel_utils:
|
||||
The order of preference is:
|
||||
1. user-specified wheel location (can be either local or remote, via
|
||||
VLLM_PRECOMPILED_WHEEL_LOCATION)
|
||||
2. user-specified variant from nightly repo (current main commit via
|
||||
VLLM_PRECOMPILED_WHEEL_VARIANT)
|
||||
2. user-specified variant (VLLM_PRECOMPILED_WHEEL_VARIANT) from nightly repo
|
||||
3. the variant corresponding to VLLM_MAIN_CUDA_VERSION from nightly repo
|
||||
4. the default variant from nightly repo (current main commit)
|
||||
4. the default variant from nightly repo
|
||||
|
||||
If downloading from the nightly repo, the commit can be specified via
|
||||
VLLM_PRECOMPILED_WHEEL_COMMIT; otherwise, the head commit in the main branch
|
||||
is used.
|
||||
"""
|
||||
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
|
||||
if wheel_location is not None:
|
||||
@ -362,10 +365,13 @@ class precompiled_wheel_utils:
|
||||
# try to fetch the wheel metadata from the nightly wheel repo
|
||||
main_variant = "cu" + envs.VLLM_MAIN_CUDA_VERSION.replace(".", "")
|
||||
variant = os.getenv("VLLM_PRECOMPILED_WHEEL_VARIANT", main_variant)
|
||||
commit = os.getenv(
|
||||
"VLLM_PRECOMPILED_WHEEL_COMMIT",
|
||||
precompiled_wheel_utils.get_base_commit_in_main_branch(),
|
||||
)
|
||||
commit = os.getenv("VLLM_PRECOMPILED_WHEEL_COMMIT", "").lower()
|
||||
if not commit or len(commit) != 40:
|
||||
print(
|
||||
f"VLLM_PRECOMPILED_WHEEL_COMMIT not valid: {commit}"
|
||||
", trying to fetch base commit in main branch"
|
||||
)
|
||||
commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
|
||||
print(f"Using precompiled wheel commit {commit} with variant {variant}")
|
||||
try_default = False
|
||||
wheels, repo_url, download_filename = None, None, None
|
||||
@ -461,14 +467,22 @@ class precompiled_wheel_utils:
|
||||
"vllm/cumem_allocator.abi3.so",
|
||||
]
|
||||
|
||||
compiled_regex = re.compile(
|
||||
flash_attn_regex = re.compile(
|
||||
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
|
||||
)
|
||||
triton_kernels_regex = re.compile(
|
||||
r"vllm/third_party/triton_kernels/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
|
||||
)
|
||||
file_members = list(
|
||||
filter(lambda x: x.filename in files_to_copy, wheel.filelist)
|
||||
)
|
||||
file_members += list(
|
||||
filter(lambda x: compiled_regex.match(x.filename), wheel.filelist)
|
||||
filter(lambda x: flash_attn_regex.match(x.filename), wheel.filelist)
|
||||
)
|
||||
file_members += list(
|
||||
filter(
|
||||
lambda x: triton_kernels_regex.match(x.filename), wheel.filelist
|
||||
)
|
||||
)
|
||||
|
||||
for file in file_members:
|
||||
@ -494,10 +508,6 @@ class precompiled_wheel_utils:
|
||||
|
||||
@staticmethod
|
||||
def get_base_commit_in_main_branch() -> str:
|
||||
# Force to use the nightly wheel. This is mainly used for CI testing.
|
||||
if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
|
||||
return "nightly"
|
||||
|
||||
try:
|
||||
# Get the latest commit hash of the upstream main branch.
|
||||
resp_json = subprocess.check_output(
|
||||
@ -508,6 +518,7 @@ class precompiled_wheel_utils:
|
||||
]
|
||||
).decode("utf-8")
|
||||
upstream_main_commit = json.loads(resp_json)["sha"]
|
||||
print(f"Upstream main branch latest commit: {upstream_main_commit}")
|
||||
|
||||
# In Docker build context, .git may be immutable or missing.
|
||||
if envs.VLLM_DOCKER_BUILD_CONTEXT:
|
||||
@ -648,7 +659,7 @@ def get_vllm_version() -> str:
|
||||
if envs.VLLM_TARGET_DEVICE == "empty":
|
||||
version += f"{sep}empty"
|
||||
elif _is_cuda():
|
||||
if envs.VLLM_USE_PRECOMPILED:
|
||||
if envs.VLLM_USE_PRECOMPILED and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX:
|
||||
version += f"{sep}precompiled"
|
||||
else:
|
||||
cuda_version = str(get_nvcc_cuda_version())
|
||||
@ -786,7 +797,7 @@ setup(
|
||||
"bench": ["pandas", "matplotlib", "seaborn", "datasets"],
|
||||
"tensorizer": ["tensorizer==2.10.1"],
|
||||
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
|
||||
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.0"],
|
||||
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
|
||||
"audio": [
|
||||
"librosa",
|
||||
"soundfile",
|
||||
|
||||
@ -392,39 +392,48 @@ def test_pass_config_deprecation(caplog_vllm):
|
||||
assert "enable_fusion is deprecated" in caplog_vllm.text
|
||||
assert config.fuse_norm_quant is True
|
||||
assert config.fuse_act_quant is True
|
||||
assert config.enable_fusion is None
|
||||
assert config.enable_fusion is True
|
||||
|
||||
# Test enable_attn_fusion -> fuse_attn_quant
|
||||
caplog_vllm.clear()
|
||||
config = PassConfig(enable_attn_fusion=True)
|
||||
assert "enable_attn_fusion is deprecated" in caplog_vllm.text
|
||||
assert config.fuse_attn_quant is True
|
||||
assert config.enable_attn_fusion is None
|
||||
assert config.enable_attn_fusion is True
|
||||
|
||||
# Test enable_noop -> eliminate_noops
|
||||
caplog_vllm.clear()
|
||||
config = PassConfig(enable_noop=True)
|
||||
assert "enable_noop is deprecated" in caplog_vllm.text
|
||||
assert config.eliminate_noops is True
|
||||
assert config.enable_noop is None
|
||||
assert config.enable_noop is True
|
||||
|
||||
# Test enable_sequence_parallelism -> enable_sp
|
||||
caplog_vllm.clear()
|
||||
config = PassConfig(enable_sequence_parallelism=True)
|
||||
assert "enable_sequence_parallelism is deprecated" in caplog_vllm.text
|
||||
assert config.enable_sp is True
|
||||
assert config.enable_sequence_parallelism is None
|
||||
assert config.enable_sequence_parallelism is True
|
||||
|
||||
# Test enable_async_tp -> fuse_gemm_comms
|
||||
caplog_vllm.clear()
|
||||
config = PassConfig(enable_async_tp=True)
|
||||
assert "enable_async_tp is deprecated" in caplog_vllm.text
|
||||
assert config.fuse_gemm_comms is True
|
||||
assert config.enable_async_tp is None
|
||||
assert config.enable_async_tp is True
|
||||
|
||||
# Test enable_fi_allreduce_fusion -> fuse_allreduce_rms
|
||||
caplog_vllm.clear()
|
||||
config = PassConfig(enable_fi_allreduce_fusion=True)
|
||||
assert "enable_fi_allreduce_fusion is deprecated" in caplog_vllm.text
|
||||
assert config.fuse_allreduce_rms is True
|
||||
assert config.enable_fi_allreduce_fusion is None
|
||||
assert config.enable_fi_allreduce_fusion is True
|
||||
|
||||
# Test hash consistency
|
||||
config_old = PassConfig(enable_fusion=True)
|
||||
config_new = PassConfig(fuse_norm_quant=True, fuse_act_quant=True)
|
||||
assert config_old.compute_hash() == config_new.compute_hash()
|
||||
|
||||
config_old = PassConfig(enable_async_tp=True)
|
||||
config_new = PassConfig(fuse_gemm_comms=True)
|
||||
assert config_old.compute_hash() == config_new.compute_hash()
|
||||
|
||||
@ -6,6 +6,7 @@ import lm_eval
|
||||
import pytest
|
||||
|
||||
from tests.utils import large_gpu_mark
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def get_model_args(
|
||||
@ -45,6 +46,12 @@ def get_model_args(
|
||||
return model_args
|
||||
|
||||
|
||||
pytestmark = pytest.mark.skipif(
|
||||
current_platform.is_rocm(),
|
||||
reason="EPLB with Spec Decode is a work in progress on ROCm.",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model_setup",
|
||||
[
|
||||
|
||||
@ -232,7 +232,7 @@ async def test_server_load(server: RemoteOpenAIServer):
|
||||
@pytest.mark.asyncio
|
||||
async def test_health_check_engine_dead_error():
|
||||
# Import the health function directly to test it in isolation
|
||||
from vllm.entrypoints.openai.api_server import health
|
||||
from vllm.entrypoints.serve.instrumentator.health import health
|
||||
|
||||
# Create a mock request that simulates what FastAPI would provide
|
||||
mock_request = Mock(spec=Request)
|
||||
|
||||
@ -69,9 +69,20 @@ async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):
|
||||
stream=True,
|
||||
)
|
||||
|
||||
first_chunk = None
|
||||
chunk_count = 0
|
||||
async for chunk in resp:
|
||||
chunk_count += 1
|
||||
if first_chunk is None and chunk.type == "message_start":
|
||||
first_chunk = chunk
|
||||
print(chunk.model_dump_json())
|
||||
|
||||
assert chunk_count > 0
|
||||
assert first_chunk is not None, "message_start chunk was never observed"
|
||||
assert first_chunk.usage is not None, "first chunk should include usage stats"
|
||||
assert first_chunk.usage["output_tokens"] == 0
|
||||
assert first_chunk.usage["input_tokens"] > 5
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic):
|
||||
|
||||
@ -42,6 +42,24 @@ async def test_basic(client: OpenAI, model_name: str):
|
||||
assert response.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_enable_response_messages(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input="Hello?",
|
||||
extra_body={"enable_response_messages": True},
|
||||
)
|
||||
assert response.status == "completed"
|
||||
assert response.input_messages[0]["type"] == "raw_message_tokens"
|
||||
assert type(response.input_messages[0]["message"]) is str
|
||||
assert len(response.input_messages[0]["message"]) > 10
|
||||
assert type(response.input_messages[0]["tokens"][0]) is int
|
||||
assert type(response.output_messages[0]["message"]) is str
|
||||
assert len(response.output_messages[0]["message"]) > 10
|
||||
assert type(response.output_messages[0]["tokens"][0]) is int
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_reasoning_item(client: OpenAI, model_name: str):
|
||||
|
||||
@ -2,64 +2,47 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import base64
|
||||
import io
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import requests
|
||||
import torch
|
||||
|
||||
from vllm.utils.serial_utils import tensor2base64
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
|
||||
DTYPE = "float16"
|
||||
|
||||
|
||||
def _terratorch_dummy_inputs(model_name: str):
|
||||
def _terratorch_dummy_messages():
|
||||
pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
|
||||
location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
|
||||
|
||||
buffer_tiff = io.BytesIO()
|
||||
torch.save(pixel_values, buffer_tiff)
|
||||
buffer_tiff.seek(0)
|
||||
binary_data = buffer_tiff.read()
|
||||
base64_tensor_embedding = base64.b64encode(binary_data).decode("utf-8")
|
||||
|
||||
buffer_coord = io.BytesIO()
|
||||
torch.save(location_coords, buffer_coord)
|
||||
buffer_coord.seek(0)
|
||||
binary_data = buffer_coord.read()
|
||||
base64_coord_embedding = base64.b64encode(binary_data).decode("utf-8")
|
||||
|
||||
return {
|
||||
"model": model_name,
|
||||
"additional_data": {"prompt_token_ids": [1]},
|
||||
"encoding_format": "base64",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_embeds",
|
||||
"image_embeds": {
|
||||
"pixel_values": base64_tensor_embedding,
|
||||
"location_coords": base64_coord_embedding,
|
||||
},
|
||||
}
|
||||
],
|
||||
}
|
||||
],
|
||||
}
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_embeds",
|
||||
"image_embeds": {
|
||||
"pixel_values": tensor2base64(pixel_values),
|
||||
"location_coords": tensor2base64(location_coords),
|
||||
},
|
||||
}
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_single_request(model_name: str):
|
||||
@pytest.mark.parametrize(
|
||||
"model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
|
||||
)
|
||||
def test_single_request(model_name: str):
|
||||
args = [
|
||||
"--runner",
|
||||
"pooling",
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
DTYPE,
|
||||
"float16",
|
||||
"--enforce-eager",
|
||||
"--trust-remote-code",
|
||||
"--max-num-seqs",
|
||||
@ -70,11 +53,15 @@ async def test_single_request(model_name: str):
|
||||
"--enable-mm-embeds",
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as server:
|
||||
prompt = _terratorch_dummy_inputs(model_name)
|
||||
|
||||
# test single pooling
|
||||
response = requests.post(server.url_for("pooling"), json=prompt)
|
||||
with RemoteOpenAIServer(model_name, args) as server:
|
||||
response = requests.post(
|
||||
server.url_for("pooling"),
|
||||
json={
|
||||
"model": model_name,
|
||||
"messages": _terratorch_dummy_messages(),
|
||||
"encoding_format": "base64",
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
output = response.json()["data"][0]["data"]
|
||||
|
||||
@ -61,11 +61,8 @@ def test_pooling_params(llm: LLM):
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_encode_api(llm: LLM):
|
||||
# chunked prefill does not support all pooling
|
||||
err_msg = "pooling_task must be one of.+"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)
|
||||
def test_token_classify(llm: LLM):
|
||||
llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)
|
||||
|
||||
|
||||
def test_score_api(llm: LLM):
|
||||
|
||||
@ -255,21 +255,21 @@ async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str):
|
||||
# token_classify uses ALL pooling, which does not support chunked prefill.
|
||||
task = "token_classify"
|
||||
input_text = ["This product was excellent and exceeded my expectations"]
|
||||
response = requests.post(
|
||||
server.url_for("pooling"),
|
||||
json={
|
||||
"model": model_name,
|
||||
"input": "test",
|
||||
"input": input_text,
|
||||
"encoding_format": "float",
|
||||
"task": task,
|
||||
},
|
||||
)
|
||||
assert response.json()["error"]["type"] == "BadRequestError"
|
||||
assert response.json()["error"]["message"].startswith(
|
||||
f"Task {task} is not supported"
|
||||
)
|
||||
poolings = PoolingResponse.model_validate(response.json())
|
||||
assert len(poolings.data) == 1
|
||||
assert len(poolings.data[0].data) == 8
|
||||
assert len(poolings.data[0].data[0]) == 2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@ -42,7 +42,7 @@ def llm():
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_encode_api(llm: LLM):
|
||||
def test_token_embed(llm: LLM):
|
||||
outputs = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False)
|
||||
multi_vector = outputs[0].outputs.data
|
||||
assert multi_vector.shape == (11, 384)
|
||||
|
||||
@ -36,6 +36,13 @@ def llm():
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_config(llm: LLM):
|
||||
vllm_config = llm.llm_engine.vllm_config
|
||||
assert vllm_config.cache_config.enable_prefix_caching
|
||||
assert vllm_config.scheduler_config.enable_chunked_prefill
|
||||
|
||||
|
||||
def test_pooling_params(llm: LLM):
|
||||
def get_outputs(use_activation):
|
||||
outputs = llm.reward(
|
||||
|
||||
@ -29,6 +29,7 @@ from vllm.multimodal.utils import (
|
||||
encode_video_base64,
|
||||
)
|
||||
from vllm.tokenizers import MistralTokenizer, get_tokenizer
|
||||
from vllm.utils.serial_utils import tensor2base64
|
||||
|
||||
from ..models.registry import HF_EXAMPLE_MODELS
|
||||
from ..utils import VLLM_PATH
|
||||
@ -85,11 +86,6 @@ def phi3v_model_config_image_embeds():
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def phi3v_tokenizer():
|
||||
return get_tokenizer(PHI3V_MODEL_ID)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def qwen2_audio_model_config():
|
||||
return ModelConfig(
|
||||
@ -115,11 +111,6 @@ def audio_embeds_model_config():
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def qwen2_audio_tokenizer():
|
||||
return get_tokenizer(QWEN2AUDIO_MODEL_ID)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def qwen25omni_model_config_mm_interleaved():
|
||||
return ModelConfig(
|
||||
@ -134,11 +125,6 @@ def qwen25omni_model_config_mm_interleaved():
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def qwen25omni_tokenizer():
|
||||
return get_tokenizer(QWEN25OMNI_MODEL_ID)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def mistral_model_config():
|
||||
return ModelConfig(
|
||||
@ -150,11 +136,6 @@ def mistral_model_config():
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def mistral_tokenizer():
|
||||
return get_tokenizer(MISTRAL_MODEL_ID)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def image_url():
|
||||
image = ImageAsset("cherry_blossom")
|
||||
@ -239,7 +220,6 @@ def _assert_mm_data_inputs(
|
||||
|
||||
def test_parse_chat_messages_single_image(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
@ -253,7 +233,6 @@ def test_parse_chat_messages_single_image(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -266,7 +245,6 @@ def test_parse_chat_messages_single_image(
|
||||
|
||||
def test_parse_chat_messages_single_image_with_uuid(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid = str(hash(image_url))
|
||||
@ -287,7 +265,6 @@ def test_parse_chat_messages_single_image_with_uuid(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -300,7 +277,6 @@ def test_parse_chat_messages_single_image_with_uuid(
|
||||
|
||||
def test_parse_chat_messages_single_empty_image_with_uuid(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid = str(hash(image_url))
|
||||
@ -319,7 +295,6 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -332,7 +307,6 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
|
||||
|
||||
def test_parse_chat_messages_single_image_with_bad_uuid_format(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid = str(hash(image_url))
|
||||
@ -354,7 +328,6 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -367,7 +340,6 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
|
||||
|
||||
def test_parse_chat_messages_multiple_images_with_uuids(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid1 = "my_uuid_1"
|
||||
@ -397,7 +369,6 @@ def test_parse_chat_messages_multiple_images_with_uuids(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -413,7 +384,6 @@ def test_parse_chat_messages_multiple_images_with_uuids(
|
||||
|
||||
def test_parse_chat_messages_multiple_empty_images_with_uuids(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid1 = "my_uuid_1"
|
||||
@ -439,7 +409,6 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -455,7 +424,6 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
|
||||
|
||||
def test_parse_chat_messages_mixed_empty_images_with_uuids(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid1 = "my_uuid_1"
|
||||
@ -483,7 +451,6 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -500,7 +467,6 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_single_image_with_uuid_async(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid = str(hash(image_url))
|
||||
@ -519,7 +485,6 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -533,7 +498,6 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_empty_image_with_uuid_async(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid = str(hash(image_url))
|
||||
@ -552,7 +516,6 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -566,7 +529,6 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_multiple_images_with_uuids_async(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid1 = "my_uuid_1"
|
||||
@ -592,7 +554,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -609,7 +570,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid1 = "my_uuid_1"
|
||||
@ -635,7 +595,6 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -652,7 +611,6 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid2 = "my_uuid_2"
|
||||
@ -676,7 +634,6 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -692,7 +649,6 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
|
||||
|
||||
def test_parse_chat_messages_empty_system(
|
||||
mistral_model_config,
|
||||
mistral_tokenizer,
|
||||
):
|
||||
# Test string format
|
||||
conversation, _, _ = parse_chat_messages(
|
||||
@ -704,7 +660,6 @@ def test_parse_chat_messages_empty_system(
|
||||
},
|
||||
],
|
||||
mistral_model_config,
|
||||
mistral_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
assert conversation == [
|
||||
@ -722,7 +677,6 @@ def test_parse_chat_messages_empty_system(
|
||||
},
|
||||
],
|
||||
mistral_model_config,
|
||||
mistral_tokenizer,
|
||||
content_format="openai",
|
||||
)
|
||||
assert conversation == [
|
||||
@ -734,7 +688,6 @@ def test_parse_chat_messages_empty_system(
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_single_image_async(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||
@ -748,7 +701,6 @@ async def test_parse_chat_messages_single_image_async(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -761,7 +713,6 @@ async def test_parse_chat_messages_single_image_async(
|
||||
|
||||
def test_parse_chat_messages_multiple_images(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
@ -779,7 +730,6 @@ def test_parse_chat_messages_multiple_images(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -795,7 +745,6 @@ def test_parse_chat_messages_multiple_images(
|
||||
|
||||
def test_parse_chat_messages_empty_pil_image_with_uuid(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
):
|
||||
uuid = "abcd"
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
@ -809,7 +758,6 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -825,7 +773,6 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
|
||||
|
||||
def test_parse_chat_messages_empty_image_embeds_with_uuid(
|
||||
phi3v_model_config_image_embeds,
|
||||
phi3v_tokenizer,
|
||||
):
|
||||
uuid = "abcd"
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
@ -839,7 +786,6 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
|
||||
}
|
||||
],
|
||||
phi3v_model_config_image_embeds,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -857,7 +803,6 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
|
||||
|
||||
def test_parse_chat_messages_empty_audio_embeds_with_uuid(
|
||||
audio_embeds_model_config,
|
||||
qwen2_audio_tokenizer,
|
||||
):
|
||||
"""Test audio_embeds with UUID (no actual embeds data)."""
|
||||
uuid = "test-audio-uuid-123"
|
||||
@ -873,7 +818,6 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
|
||||
}
|
||||
],
|
||||
audio_embeds_model_config,
|
||||
qwen2_audio_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -889,11 +833,8 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
|
||||
|
||||
def test_parse_chat_messages_audio_embeds_with_string(
|
||||
audio_embeds_model_config,
|
||||
qwen2_audio_tokenizer,
|
||||
):
|
||||
"""Test audio_embeds with base64 string embedding data."""
|
||||
import base64
|
||||
import io
|
||||
|
||||
import torch
|
||||
|
||||
@ -901,11 +842,7 @@ def test_parse_chat_messages_audio_embeds_with_string(
|
||||
audio_embedding = torch.randn(1, 128, 768)
|
||||
|
||||
# Encode it as base64
|
||||
buffer = io.BytesIO()
|
||||
torch.save(audio_embedding, buffer)
|
||||
buffer.seek(0)
|
||||
binary_data = buffer.read()
|
||||
base64_audio_embedding = base64.b64encode(binary_data).decode("utf-8")
|
||||
base64_audio_embedding = tensor2base64(audio_embedding)
|
||||
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[
|
||||
@ -921,7 +858,6 @@ def test_parse_chat_messages_audio_embeds_with_string(
|
||||
}
|
||||
],
|
||||
audio_embeds_model_config,
|
||||
qwen2_audio_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -939,11 +875,8 @@ def test_parse_chat_messages_audio_embeds_with_string(
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_audio_embeds_async(
|
||||
audio_embeds_model_config,
|
||||
qwen2_audio_tokenizer,
|
||||
):
|
||||
"""Test audio_embeds with async futures."""
|
||||
import base64
|
||||
import io
|
||||
|
||||
import torch
|
||||
|
||||
@ -951,11 +884,7 @@ async def test_parse_chat_messages_audio_embeds_async(
|
||||
audio_embedding = torch.randn(1, 128, 768)
|
||||
|
||||
# Encode it as base64
|
||||
buffer = io.BytesIO()
|
||||
torch.save(audio_embedding, buffer)
|
||||
buffer.seek(0)
|
||||
binary_data = buffer.read()
|
||||
base64_audio_embedding = base64.b64encode(binary_data).decode("utf-8")
|
||||
base64_audio_embedding = tensor2base64(audio_embedding)
|
||||
|
||||
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||
[
|
||||
@ -971,7 +900,6 @@ async def test_parse_chat_messages_audio_embeds_async(
|
||||
}
|
||||
],
|
||||
audio_embeds_model_config,
|
||||
qwen2_audio_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -990,7 +918,6 @@ async def test_parse_chat_messages_audio_embeds_async(
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
|
||||
phi3v_model_config_image_embeds,
|
||||
phi3v_tokenizer,
|
||||
):
|
||||
uuid = "abcd"
|
||||
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||
@ -1004,7 +931,6 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
|
||||
}
|
||||
],
|
||||
phi3v_model_config_image_embeds,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1024,7 +950,6 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_multiple_images_async(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||
@ -1042,7 +967,6 @@ async def test_parse_chat_messages_multiple_images_async(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1058,7 +982,6 @@ async def test_parse_chat_messages_multiple_images_async(
|
||||
|
||||
def test_parse_chat_messages_placeholder_already_in_prompt(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
@ -1076,7 +999,6 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
assert conversation == [
|
||||
@ -1091,7 +1013,6 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
|
||||
|
||||
def test_parse_chat_messages_placeholder_one_already_in_prompt(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
@ -1110,7 +1031,6 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1127,7 +1047,6 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
|
||||
|
||||
def test_parse_chat_messages_multiple_images_across_messages(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
@ -1149,7 +1068,6 @@ def test_parse_chat_messages_multiple_images_across_messages(
|
||||
},
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1164,7 +1082,6 @@ def test_parse_chat_messages_multiple_images_across_messages(
|
||||
|
||||
def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid = str(hash(image_url))
|
||||
@ -1195,7 +1112,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
|
||||
},
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1210,7 +1126,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
|
||||
|
||||
def test_parse_chat_messages_context_text_format(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
):
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[
|
||||
@ -1222,7 +1137,6 @@ def test_parse_chat_messages_context_text_format(
|
||||
{"role": "user", "content": "What about this one?"},
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="openai",
|
||||
)
|
||||
|
||||
@ -1246,7 +1160,6 @@ def test_parse_chat_messages_context_text_format(
|
||||
|
||||
def test_parse_chat_messages_rejects_too_many_images_in_one_message(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
with warnings.catch_warnings():
|
||||
@ -1277,14 +1190,12 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
|
||||
def test_parse_chat_messages_rejects_too_many_images_across_messages(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
with warnings.catch_warnings():
|
||||
@ -1322,14 +1233,12 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages(
|
||||
},
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images_uncommon_input(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
@ -1344,7 +1253,6 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
|
||||
}
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1360,7 +1268,6 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
|
||||
|
||||
def test_parse_chat_messages_multiple_images_interleave(
|
||||
phi3v_model_config_mm_interleaved,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
@ -1380,7 +1287,6 @@ def test_parse_chat_messages_multiple_images_interleave(
|
||||
}
|
||||
],
|
||||
phi3v_model_config_mm_interleaved,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1398,7 +1304,6 @@ def test_parse_chat_messages_multiple_images_interleave(
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_multiple_images_interleave_async(
|
||||
phi3v_model_config_mm_interleaved,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages_futures(
|
||||
@ -1418,7 +1323,6 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
|
||||
}
|
||||
],
|
||||
phi3v_model_config_mm_interleaved,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1436,7 +1340,6 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
|
||||
phi3v_model_config_mm_interleaved,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid = str(hash(image_url))
|
||||
@ -1465,7 +1368,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
|
||||
}
|
||||
],
|
||||
phi3v_model_config_mm_interleaved,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1482,7 +1384,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
|
||||
|
||||
def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
|
||||
phi3v_model_config_mm_interleaved,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
@ -1505,7 +1406,6 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
|
||||
},
|
||||
],
|
||||
phi3v_model_config_mm_interleaved,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1523,7 +1423,6 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
|
||||
|
||||
def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave(
|
||||
phi3v_model_config_mm_interleaved,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid = str(hash(image_url))
|
||||
@ -1555,7 +1454,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
|
||||
},
|
||||
],
|
||||
phi3v_model_config_mm_interleaved,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1573,7 +1471,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
|
||||
|
||||
def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
|
||||
qwen25omni_model_config_mm_interleaved,
|
||||
qwen25omni_tokenizer,
|
||||
image_url,
|
||||
video_url,
|
||||
audio_url,
|
||||
@ -1601,7 +1498,6 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
|
||||
},
|
||||
],
|
||||
qwen25omni_model_config_mm_interleaved,
|
||||
qwen25omni_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1627,7 +1523,6 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
|
||||
|
||||
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave(
|
||||
qwen25omni_model_config_mm_interleaved,
|
||||
qwen25omni_tokenizer,
|
||||
image_url,
|
||||
video_url,
|
||||
audio_url,
|
||||
@ -1671,7 +1566,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
|
||||
},
|
||||
],
|
||||
qwen25omni_model_config_mm_interleaved,
|
||||
qwen25omni_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1699,7 +1593,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
|
||||
|
||||
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave( # noqa: E501
|
||||
qwen25omni_model_config_mm_interleaved,
|
||||
qwen25omni_tokenizer,
|
||||
image_url,
|
||||
video_url,
|
||||
audio_url,
|
||||
@ -1743,7 +1636,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
|
||||
},
|
||||
],
|
||||
qwen25omni_model_config_mm_interleaved,
|
||||
qwen25omni_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1775,7 +1667,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
|
||||
|
||||
def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave( # noqa: E501
|
||||
qwen25omni_model_config_mm_interleaved,
|
||||
qwen25omni_tokenizer,
|
||||
image_url,
|
||||
video_url,
|
||||
audio_url,
|
||||
@ -1811,7 +1702,6 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
|
||||
},
|
||||
],
|
||||
qwen25omni_model_config_mm_interleaved,
|
||||
qwen25omni_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1837,7 +1727,6 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
|
||||
|
||||
def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
|
||||
phi3v_model_config_mm_interleaved,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
with pytest.raises(
|
||||
@ -1861,7 +1750,6 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
|
||||
}
|
||||
],
|
||||
phi3v_model_config_mm_interleaved,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -2237,9 +2125,7 @@ def test_resolve_content_format_examples(template_path, expected_format):
|
||||
assert resolved_format == expected_format
|
||||
|
||||
|
||||
def test_parse_chat_messages_include_thinking_chunk(
|
||||
mistral_model_config, mistral_tokenizer
|
||||
):
|
||||
def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
@ -2269,7 +2155,6 @@ def test_parse_chat_messages_include_thinking_chunk(
|
||||
conversation_with_thinking, _, _ = parse_chat_messages(
|
||||
messages,
|
||||
mistral_model_config,
|
||||
mistral_tokenizer,
|
||||
content_format="openai",
|
||||
)
|
||||
|
||||
@ -2353,7 +2238,6 @@ def test_apply_mistral_chat_template_thinking_chunk():
|
||||
|
||||
def test_parse_chat_messages_single_empty_audio_with_uuid(
|
||||
qwen2_audio_model_config,
|
||||
qwen2_audio_tokenizer,
|
||||
):
|
||||
audio_uuid = "abcd"
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
@ -2371,7 +2255,6 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
|
||||
}
|
||||
],
|
||||
qwen2_audio_model_config,
|
||||
qwen2_audio_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -2389,7 +2272,6 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
|
||||
qwen2_audio_model_config,
|
||||
qwen2_audio_tokenizer,
|
||||
):
|
||||
audio_uuid = "abcd"
|
||||
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||
@ -2407,7 +2289,6 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
|
||||
}
|
||||
],
|
||||
qwen2_audio_model_config,
|
||||
qwen2_audio_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
|
||||
@ -13,9 +13,6 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import (
|
||||
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
|
||||
BatchedDeepGemmExperts,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (
|
||||
BatchedTritonOrDeepGemmExperts,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
FusedMoEConfig,
|
||||
FusedMoEQuantConfig,
|
||||
@ -286,16 +283,6 @@ if has_deep_gemm() and is_deep_gemm_supported():
|
||||
needs_matching_quant=False,
|
||||
needs_deep_gemm=True,
|
||||
)
|
||||
register_experts(
|
||||
BatchedTritonOrDeepGemmExperts,
|
||||
batched_format,
|
||||
common_float_and_int_types,
|
||||
blocked_quantization_support=True,
|
||||
supports_chunking=False,
|
||||
supports_expert_map=False,
|
||||
needs_matching_quant=True,
|
||||
needs_deep_gemm=True,
|
||||
)
|
||||
register_experts(
|
||||
TritonOrDeepGemmExperts,
|
||||
standard_format,
|
||||
@ -457,10 +444,6 @@ def make_fused_experts(
|
||||
kwargs = batch_kwargs | quant_kwargs
|
||||
print(f"Making BatchedTritonExperts {kwargs} ...")
|
||||
experts = BatchedTritonExperts(**kwargs)
|
||||
elif fused_experts_type == BatchedTritonOrDeepGemmExperts:
|
||||
kwargs = batch_kwargs | quant_kwargs | deepgemm_kwargs
|
||||
print(f"Making BatchedTritonOrDeepGemmExperts {kwargs} ...")
|
||||
experts = BatchedTritonOrDeepGemmExperts(**kwargs)
|
||||
elif fused_experts_type == DeepGemmExperts:
|
||||
print(f"Making DeepGemmExperts {quant_config} ...")
|
||||
experts = DeepGemmExperts(quant_config)
|
||||
|
||||
@ -0,0 +1,86 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
_per_token_group_quant_fp8_colmajor,
|
||||
silu_mul_per_token_group_quant_fp8_colmajor,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
|
||||
|
||||
FLOAT8_DTYPE = torch.float8_e4m3fn
|
||||
GROUP_SIZE = 128
|
||||
|
||||
|
||||
def reference_quant(x: torch.Tensor, use_ue8m0: bool):
    """
    Quantize ``x`` by launching the triton kernel
    ``_per_token_group_quant_fp8_colmajor`` imported from
    vllm.model_executor.layers.quantization.utils.fp8_utils.

    Returns a ``(quantized, scales)`` pair: ``quantized`` has the shape of
    ``x`` with dtype ``FLOAT8_DTYPE``; ``scales`` is a float32 tensor with
    one entry per group of ``GROUP_SIZE`` elements along the last dim.
    """
    # NOTE(review): the launch below reads x.shape[1] and x.stride(0), so x
    # is presumably 2-D -- confirm against callers.
    fp8_info = torch.finfo(FLOAT8_DTYPE)

    quantized = torch.empty_like(x, device=x.device, dtype=FLOAT8_DTYPE)

    # Allocate the scale tensor in column-major format: build it with the
    # group dimension first, then swap the last two dims into place.
    scales_shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
    scales = torch.empty(scales_shape, device=x.device, dtype=torch.float32)
    scales = scales.permute(-1, -2)

    # The launch grid has one program per group of GROUP_SIZE elements.
    num_groups = x.numel() // GROUP_SIZE
    block = triton.next_power_of_2(GROUP_SIZE)

    _per_token_group_quant_fp8_colmajor[(num_groups,)](
        x,
        quantized,
        scales,
        GROUP_SIZE,
        x.shape[1],
        x.stride(0),
        scales.stride(1),
        eps=1e-10,
        fp8_min=fp8_info.min,
        fp8_max=fp8_info.max,
        use_ue8m0=use_ue8m0,
        BLOCK=block,
        # heuristics for number of warps
        num_warps=min(max(block // 256, 1), 8),
        num_stages=1,
    )
    return quantized, scales
|
||||
|
||||
|
||||
def reference(x: torch.Tensor, use_ue8m0: bool) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Reference path: apply ``silu_and_mul`` to ``x``, then quantize the
    result with ``reference_quant``.

    ``x`` must be 2-D with size ``(T, N)``; the activation output has size
    ``(T, N // 2)``. Returns the ``(quantized, scales)`` pair produced by
    ``reference_quant``.
    """
    T, N = x.size()
    # Allocate the activation buffer on the input's own device rather than
    # the default "cuda" device, so both operands of the op always live on
    # the same GPU.
    ref_act_out = torch.empty((T, N // 2), dtype=torch.bfloat16, device=x.device)
    torch.ops._C.silu_and_mul(ref_act_out, x)
    return reference_quant(ref_act_out, use_ue8m0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("T", [128, 256, 512])
@pytest.mark.parametrize("N", [128 * 2, 256 * 2, 768 * 2, 2048 * 2, 7168 * 2])
def test_silu_mul_fp8_quant_deep_gemm(T: int, N: int):
    """
    Compare silu_mul_per_token_group_quant_fp8_colmajor against the
    reference path (silu_and_mul followed by the triton quant kernel).
    """
    current_platform.seed_everything(42)

    x = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
    use_ue8m0 = is_deep_gemm_e8m0_used()

    # Implementation under test
    actual_q, actual_scales = silu_mul_per_token_group_quant_fp8_colmajor(
        x, use_ue8m0=use_ue8m0
    )

    # Reference
    expected_q, expected_scales = reference(x, use_ue8m0)

    # Both quantized outputs are cast to float32 before comparison.
    torch.testing.assert_close(
        actual_q.to(torch.float32), expected_q.to(torch.float32)
    )
    torch.testing.assert_close(actual_scales, expected_scales)
|
||||
@ -0,0 +1,39 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port
|
||||
from vllm.v1.executor import UniProcExecutor
|
||||
from vllm.v1.worker.worker_base import WorkerWrapperBase
|
||||
|
||||
|
||||
# This is a dummy executor for patching in test_runai_model_streamer_s3.py.
|
||||
# We cannot use the vllm_runner fixture here because it spawns a worker process.
|
||||
# The worker process re-imports the patched entities, so the patch is not applied.
|
||||
class RunaiDummyExecutor(UniProcExecutor):
    """Dummy single-process executor used by the Run:ai streamer tests."""

    def _init_executor(self) -> None:
        """Create the driver worker and initialize it and its device."""
        init_method = get_distributed_init_method(get_ip(), get_open_port())

        # The device string may carry an index after a colon; when it does,
        # that index is used as the local rank, otherwise rank 0 is assumed.
        device_parts = str(self.vllm_config.device_config.device).split(":")
        local_rank = int(device_parts[1]) if len(device_parts) > 1 else 0

        init_kwargs = {
            "vllm_config": self.vllm_config,
            "local_rank": local_rank,
            "rank": 0,
            "distributed_init_method": init_method,
            "is_driver_worker": True,
        }

        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config)

        self.collective_rpc("init_worker", args=([init_kwargs],))
        self.collective_rpc("init_device")
|
||||
@ -0,0 +1,52 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
from runai_model_streamer.safetensors_streamer.streamer_mock import StreamerPatcher
|
||||
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
|
||||
from .conftest import RunaiDummyExecutor
|
||||
|
||||
load_format = "runai_streamer"
|
||||
test_model = "openai-community/gpt2"
|
||||
|
||||
|
||||
def test_runai_model_loader_download_files_s3_mocked_with_patch(
    vllm_runner,
    tmp_path: Path,
    monkeypatch,
):
    """
    Load a model through the runai_streamer load format from a mock S3 URL,
    with the streamer entry points replaced by StreamerPatcher shims.
    """
    patcher = StreamerPatcher(str(tmp_path))

    # Download model from HF
    snapshot_download(repo_id=test_model, local_dir=f"{tmp_path}/gpt2")

    # Dotted patch target -> replacement supplied by the patcher.
    shims = {
        "vllm.transformers_utils.runai_utils.runai_list_safetensors": (
            patcher.shim_list_safetensors
        ),
        "vllm.transformers_utils.runai_utils.runai_pull_files": (
            patcher.shim_pull_files
        ),
        "vllm.model_executor.model_loader.weight_utils.SafetensorsStreamer": (
            patcher.create_mock_streamer
        ),
    }
    for target, replacement in shims.items():
        monkeypatch.setattr(target, replacement)

    vllm_config = EngineArgs(
        model="s3://my-mock-bucket/gpt2/",
        load_format=load_format,
        tensor_parallel_size=1,
    ).create_engine_config()

    executor = RunaiDummyExecutor(vllm_config)
    executor.driver_worker.load_model()
|
||||
@ -0,0 +1,53 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModel
|
||||
|
||||
from tests.models.utils import check_embeddings_close
|
||||
from vllm import TokensPrompt
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    ["Qwen/Qwen3-Embedding-0.6B"],
)
@torch.inference_mode
def test_embed_models(hf_runner, vllm_runner, model: str):
    """
    Check that vLLM token embeddings computed with chunked prefill and
    prefix caching enabled match the HF model's last hidden states.

    Prompts are longer than ``max_num_batched_tokens`` so each one has to be
    processed in several chunks.
    """
    chunk_size = 10
    n_prompt_tokens = [55, 56, 57]
    token_prompts = [[1024 + i for i in range(n)] for n in n_prompt_tokens]

    with vllm_runner(
        model,
        runner="pooling",
        max_model_len=128,
        max_num_batched_tokens=chunk_size,
        enforce_eager=True,
        # `enable_chunked_prefill`: Set to `False` instead of `None` in VllmRunner
        enable_chunked_prefill=True,
        enable_prefix_caching=True,
    ) as vllm_model:
        vllm_outputs = vllm_model.token_embed(
            [TokensPrompt(prompt_token_ids=t) for t in token_prompts],
        )

    with hf_runner(
        model,
        auto_cls=AutoModel,
    ) as hf_model:
        hf_outputs = []
        for token_prompt in token_prompts:
            inputs = hf_model.wrap_device({"input_ids": torch.tensor([token_prompt])})
            input_ids = inputs["input_ids"]
            output = hf_model.model(input_ids)
            # Keep the single batch entry as a float32 CPU tensor.
            hf_outputs.append(output.last_hidden_state.cpu().float()[0])

    # strict=True: a mismatch in the number of outputs must fail the test
    # instead of silently skipping the unmatched prompts.
    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs, strict=True):
        check_embeddings_close(
            embeddings_0_lst=hf_output,
            embeddings_1_lst=vllm_output,
            name_0="hf",
            name_1="vllm",
            tol=1e-2,
        )
|
||||
@ -20,7 +20,6 @@ def test_extract_hidden_states(hf_runner, vllm_runner, model: str):
|
||||
max_model_len=128,
|
||||
enforce_eager=True,
|
||||
runner="pooling",
|
||||
enable_chunked_prefill=False,
|
||||
enable_prefix_caching=True,
|
||||
) as vllm_model:
|
||||
pooling_outputs = vllm_model.llm.encode(
|
||||
|
||||
@ -2,6 +2,8 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Pytest configuration for vLLM tests."""
|
||||
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
@ -14,6 +16,20 @@ def pytest_configure(config):
|
||||
if not current_platform.is_rocm():
|
||||
return
|
||||
|
||||
skip_patterns = ["test_granite_speech.py"]
|
||||
if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
|
||||
# Skip disabling SDP for Granite Speech tests on ROCm
|
||||
return
|
||||
|
||||
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
|
||||
# accuracy issues
|
||||
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
|
||||
torch.backends.cuda.enable_flash_sdp(False)
|
||||
torch.backends.cuda.enable_mem_efficient_sdp(False)
|
||||
torch.backends.cuda.enable_math_sdp(True)
|
||||
warnings.warn(
|
||||
"ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
|
||||
"to avoid HuggingFace Transformers accuracy issues",
|
||||
UserWarning,
|
||||
stacklevel=1,
|
||||
)
|
||||
|
||||
@ -137,7 +137,7 @@ VLM_TEST_SETTINGS = {
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
|
||||
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
"qwen2_5_omni": VLMTestInfo(
|
||||
@ -152,7 +152,7 @@ VLM_TEST_SETTINGS = {
|
||||
auto_cls=AutoModelForTextToWaveform,
|
||||
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
|
||||
patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
|
||||
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
"qwen3_vl": VLMTestInfo(
|
||||
@ -173,7 +173,7 @@ VLM_TEST_SETTINGS = {
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
|
||||
patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner,
|
||||
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
marks=[
|
||||
pytest.mark.core_model,
|
||||
],
|
||||
@ -350,7 +350,7 @@ VLM_TEST_SETTINGS = {
|
||||
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
|
||||
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
|
||||
stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"],
|
||||
image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
|
||||
image_size_factors=[(1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
|
||||
),
|
||||
"fuyu": VLMTestInfo(
|
||||
models=["adept/fuyu-8b"],
|
||||
@ -403,12 +403,13 @@ VLM_TEST_SETTINGS = {
|
||||
# So, we need to reduce the number of tokens for the test to pass.
|
||||
max_tokens=8,
|
||||
num_logprobs=10,
|
||||
auto_cls=AutoModelForCausalLM,
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"glm4_1v": VLMTestInfo(
|
||||
models=["zai-org/GLM-4.1V-9B-Thinking"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
|
||||
prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
|
||||
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
|
||||
max_model_len=2048,
|
||||
@ -423,6 +424,7 @@ VLM_TEST_SETTINGS = {
|
||||
models=["zai-org/GLM-4.1V-9B-Thinking"],
|
||||
# GLM4.1V require include video metadata for input
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
prompt_formatter=lambda vid_prompt: f"[gMASK]<|user|>\n{vid_prompt}<|assistant|>\n", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
@ -707,7 +709,7 @@ VLM_TEST_SETTINGS = {
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForCausalLM,
|
||||
image_size_factors=[(), (0.25,)],
|
||||
image_size_factors=[(0.25,)],
|
||||
marks=[
|
||||
pytest.mark.skipif(
|
||||
Version(TRANSFORMERS_VERSION) == Version("4.57.3"),
|
||||
@ -737,7 +739,13 @@ VLM_TEST_SETTINGS = {
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
marks=[large_gpu_mark(min_gb=48)],
|
||||
marks=[
|
||||
large_gpu_mark(min_gb=48),
|
||||
pytest.mark.skipif(
|
||||
current_platform.is_rocm(),
|
||||
reason="Model produces a vector of <UNK> output in HF on ROCm",
|
||||
),
|
||||
],
|
||||
),
|
||||
"qwen_vl": VLMTestInfo(
|
||||
models=["Qwen/Qwen-VL"],
|
||||
@ -760,7 +768,7 @@ VLM_TEST_SETTINGS = {
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
|
||||
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
marks=[pytest.mark.cpu_model],
|
||||
),
|
||||
"skywork_r1v": VLMTestInfo(
|
||||
@ -812,7 +820,7 @@ VLM_TEST_SETTINGS = {
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
marks=[pytest.mark.skip("Model initialization hangs")],
|
||||
),
|
||||
### Tensor parallel / multi-gpu broadcast tests
|
||||
|
||||
@ -8,6 +8,7 @@ from transformers import AutoModelForSpeechSeq2Seq
|
||||
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
|
||||
from ...registry import HF_EXAMPLE_MODELS
|
||||
@ -34,6 +35,12 @@ audio_lora_path = MODEL_NAME
|
||||
models = [MODEL_NAME]
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def set_attention_backend_for_rocm(monkeypatch):
    """Set VLLM_ATTENTION_BACKEND to TRITON_ATTN when running on ROCm."""
    if not current_platform.is_rocm():
        return
    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
@ -111,8 +118,12 @@ def run_test(
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_model_len", [2048])
|
||||
@pytest.mark.parametrize(
|
||||
"dtype", ["float16"] if current_platform.is_rocm() else ["bfloat16"]
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"max_model_len", [512] if current_platform.is_rocm() else [2048]
|
||||
)
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_models(
|
||||
|
||||
@ -1,281 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
from collections.abc import Sequence
|
||||
|
||||
import librosa
|
||||
import pytest
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
|
||||
from ....conftest import (
|
||||
IMAGE_ASSETS,
|
||||
HfRunner,
|
||||
PromptAudioInput,
|
||||
PromptImageInput,
|
||||
VllmRunner,
|
||||
)
|
||||
from ....utils import large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
"cherry_blossom": "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
}
|
||||
)
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = (
|
||||
"<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
|
||||
)
|
||||
|
||||
model_path = snapshot_download(
|
||||
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
|
||||
)
|
||||
# Since the vision-lora and speech-lora co-exist with the base model,
|
||||
# we have to manually specify the path of the lora weights.
|
||||
vision_lora_path = os.path.join(model_path, "vision-lora")
|
||||
speech_question = os.path.join(
|
||||
model_path, "examples", "what_is_shown_in_this_image.wav"
|
||||
)
|
||||
models = [model_path]
|
||||
|
||||
target_dtype = "half"
|
||||
|
||||
|
||||
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
):
    """Check that HF and vLLM give matching inference results.

    Every entry of ``inputs`` is a ``(prompts, images, audios)`` case. Each
    case is run greedily through vLLM (with the vision LoRA attached to the
    request) and through HF (with the vision adapter loaded), and the
    resulting logprobs are compared case by case.

    All the image fixtures for the test are from IMAGE_ASSETS.
    The huggingface runner receives PIL images as input; the vllm runner
    receives MultiModalDataDict objects and the corresponding
    MultiModalConfig. The text input is adjusted to abide by the vllm
    contract, and the text output is sanitized so it can be compared with hf.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    # max_model_len should be greater than image_feature_size
    vllm_outputs_per_case = []
    with vllm_runner(
        model,
        task="generate",
        max_model_len=max_model_len,
        max_num_seqs=2,
        dtype=dtype,
        limit_mm_per_prompt={"image": mm_limit},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enable_lora=True,
        max_lora_rank=320,
        gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
        enforce_eager=True,
        trust_remote_code=False,
    ) as vllm_model:
        lora_request = LoRARequest("vision", 1, vision_lora_path)
        for prompts, images, audios in inputs:
            vllm_outputs_per_case.append(
                vllm_model.generate_greedy_logprobs(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=images,
                    audios=audios,
                    lora_request=lora_request,
                )
            )

    hf_outputs_per_case = []
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_model.model.load_adapter(
            vision_lora_path,
            adapter_name="vision",
        )
        eos_token_id = hf_model.processor.tokenizer.eos_token_id
        for prompts, images, audios in inputs:
            hf_outputs_per_case.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=images,
                    audios=audios,
                    eos_token_id=eos_token_id,
                )
            )

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
[],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_model_len", [12800])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_image = [
|
||||
(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
None,
|
||||
)
|
||||
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
|
||||
]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs_per_image,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=1,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
# [],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_model_len", [25600])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_multi_images_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case = [
|
||||
(
|
||||
[HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
||||
[
|
||||
[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors
|
||||
],
|
||||
None,
|
||||
),
|
||||
]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs_per_case,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=2,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_model_len", [12800])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_vision_speech_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
# use the example speech question so that the model outputs are reasonable
|
||||
audio = librosa.load(speech_question, sr=16000)
|
||||
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
|
||||
|
||||
inputs_vision_speech = [
|
||||
(
|
||||
["<|user|><|image|><|audio|><|end|><|assistant|>"],
|
||||
[image],
|
||||
[audio],
|
||||
),
|
||||
]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs_vision_speech,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=1,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
@ -15,6 +15,7 @@ from transformers import AutoProcessor
|
||||
from vllm import SamplingParams, TextPrompt, TokensPrompt
|
||||
from vllm.logprobs import Logprob, SampleLogprobs
|
||||
from vllm.multimodal import MultiModalDataBuiltins
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....utils import VLLM_PATH, large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
@ -165,6 +166,15 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
|
||||
def test_chat(
|
||||
vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
|
||||
) -> None:
|
||||
if (
|
||||
model == MISTRAL_SMALL_3_1_ID
|
||||
and max_model_len == 65536
|
||||
and current_platform.is_rocm()
|
||||
):
|
||||
pytest.skip(
|
||||
"OOM on ROCm: 24B model with 65536 context length exceeds GPU memory"
|
||||
)
|
||||
|
||||
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])
|
||||
with vllm_runner(
|
||||
model,
|
||||
|
||||
@ -62,6 +62,65 @@ def get_filtered_test_settings(
|
||||
return matching_tests
|
||||
|
||||
|
||||
def get_model_type_cases(
    model_type: str,
    test_info: VLMTestInfo,
    test_type: VLMTestType,
):
    """Expand one model type's settings into individual pytest parameters.

    Args:
        model_type: Key identifying the model type; passed through unchanged as
            the first value of every generated ``pytest.param``.
        test_info: Settings for this model type. Scalar settings are wrapped
            into 1-tuples so that every setting can be iterated uniformly.
        test_type: Kind of test being parametrized; decides which optional
            dimensions (video options, sizes, custom inputs) are added.

    Returns:
        A list with one ``pytest.param(model_type, ExpandableVLMTestArgs(...))``
        per element of the cartesian product of all collected settings,
        carrying ``test_info.marks`` (or no marks when that is ``None``).

    Raises:
        ValueError: If sizes are required for ``test_type`` but unset, or if
            ``test_type`` is ``CUSTOM_INPUTS`` and no custom options are given.
    """

    def ensure_wrapped(e):
        # Ensure that something is wrapped as an iterable if it's not already
        return e if isinstance(e, (list, tuple)) else (e,)

    # This is essentially the same as nesting a bunch of mark.parametrize
    # decorators, but we do it programmatically to allow overrides on
    # a per-model basis, while still being able to execute each of these
    # as individual test cases in pytest.
    iter_kwargs = OrderedDict(
        [
            ("model", ensure_wrapped(test_info.models)),
            ("max_tokens", ensure_wrapped(test_info.max_tokens)),
            ("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
            ("dtype", ensure_wrapped(test_info.dtype)),
            (
                "distributed_executor_backend",
                ensure_wrapped(test_info.distributed_executor_backend),
            ),
        ]
    )

    # num_frames is video only
    if test_type == VLMTestType.VIDEO:
        iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
        iter_kwargs["needs_video_metadata"] = ensure_wrapped(
            test_info.needs_video_metadata
        )

    # No sizes passed for custom inputs, since inputs are directly provided
    if test_type not in (
        VLMTestType.CUSTOM_INPUTS,
        VLMTestType.AUDIO,
    ):
        wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
        if wrapped_sizes is None:
            raise ValueError(f"Sizes must be set for test type {test_type}")
        iter_kwargs["size_wrapper"] = wrapped_sizes

    # Otherwise expand the custom test options instead
    elif test_type == VLMTestType.CUSTOM_INPUTS:
        if test_info.custom_test_opts is None:
            raise ValueError("Test has type CUSTOM_INPUTS, but none given")
        # Used as-is (not wrapped) -- presumably already an iterable of options.
        iter_kwargs["custom_test_opts"] = test_info.custom_test_opts

    # Wrap all model cases in a pytest parameter & pass marks through
    arg_names = tuple(iter_kwargs)
    marks = test_info.marks if test_info.marks is not None else []
    return [
        pytest.param(
            model_type,
            ExpandableVLMTestArgs(**dict(zip(arg_names, case))),
            marks=marks,
        )
        for case in itertools.product(*iter_kwargs.values())
    ]
|
||||
|
||||
|
||||
def get_parametrized_options(
|
||||
test_settings: dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
@ -76,64 +135,11 @@ def get_parametrized_options(
|
||||
test_settings, test_type, create_new_process_for_each_test
|
||||
)
|
||||
|
||||
# Ensure that something is wrapped as an iterable it's not already
|
||||
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)
|
||||
|
||||
def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
|
||||
# This is essentially the same as nesting a bunch of mark.parametrize
|
||||
# decorators, but we do it programmatically to allow overrides for on
|
||||
# a per-model basis, while still being able to execute each of these
|
||||
# as individual test cases in pytest.
|
||||
iter_kwargs = OrderedDict(
|
||||
[
|
||||
("model", ensure_wrapped(test_info.models)),
|
||||
("max_tokens", ensure_wrapped(test_info.max_tokens)),
|
||||
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
|
||||
("dtype", ensure_wrapped(test_info.dtype)),
|
||||
(
|
||||
"distributed_executor_backend",
|
||||
ensure_wrapped(test_info.distributed_executor_backend),
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
# num_frames is video only
|
||||
if test_type == VLMTestType.VIDEO:
|
||||
iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
|
||||
iter_kwargs["needs_video_metadata"] = ensure_wrapped(
|
||||
test_info.needs_video_metadata
|
||||
)
|
||||
|
||||
# No sizes passed for custom inputs, since inputs are directly provided
|
||||
if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO):
|
||||
wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
|
||||
if wrapped_sizes is None:
|
||||
raise ValueError(f"Sizes must be set for test type {test_type}")
|
||||
iter_kwargs["size_wrapper"] = wrapped_sizes
|
||||
|
||||
# Otherwise expand the custom test options instead
|
||||
elif test_type == VLMTestType.CUSTOM_INPUTS:
|
||||
if test_info.custom_test_opts is None:
|
||||
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
|
||||
iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
|
||||
|
||||
# Wrap all model cases in a pytest parameter & pass marks through
|
||||
return [
|
||||
pytest.param(
|
||||
model_type,
|
||||
ExpandableVLMTestArgs(
|
||||
**{k: v for k, v in zip(iter_kwargs.keys(), case)}
|
||||
),
|
||||
marks=test_info.marks if test_info.marks is not None else [],
|
||||
)
|
||||
for case in list(itertools.product(*iter_kwargs.values()))
|
||||
]
|
||||
|
||||
# Get a list per model type, where each entry contains a tuple of all of
|
||||
# that model type's cases, then flatten them into the top level so that
|
||||
# we can consume them in one mark.parametrize call.
|
||||
cases_by_model_type = [
|
||||
get_model_type_cases(model_type, test_info)
|
||||
get_model_type_cases(model_type, test_info, test_type)
|
||||
for model_type, test_info in matching_tests.items()
|
||||
]
|
||||
return list(itertools.chain(*cases_by_model_type))
|
||||
|
||||
@ -140,7 +140,7 @@ def video_with_metadata_glm4_1v():
|
||||
metadata = VIDEO_ASSETS[0].metadata
|
||||
question = "Describe the video."
|
||||
video_prompt = "<|begin_of_video|><|video|><|end_of_video|>"
|
||||
formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"
|
||||
formatted_prompt = f"[gMASK]<|user|>\n{video_prompt}{question}<|assistant|>\n"
|
||||
|
||||
scales = [0.1, 0.2, 0.25]
|
||||
video_input = [
|
||||
|
||||
@ -25,6 +25,7 @@ from transformers import (
|
||||
from transformers.video_utils import VideoMetadata
|
||||
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.collection_utils import is_list_of
|
||||
|
||||
from .....conftest import HfRunner, ImageAsset, ImageTestAssets
|
||||
@ -366,6 +367,40 @@ def gemma3_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOut
|
||||
|
||||
def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for GLM4V."""
|
||||
if current_platform.is_rocm():
|
||||
import types
|
||||
|
||||
config = hf_model.model.config
|
||||
if hasattr(config, "num_layers") and not hasattr(config, "num_hidden_layers"):
|
||||
config.num_hidden_layers = config.num_layers
|
||||
config.output_hidden_states = True
|
||||
|
||||
def patched_prepare_cache(
|
||||
self, generation_config, model_kwargs, *args, **kwargs
|
||||
):
|
||||
model_kwargs["past_key_values"] = None
|
||||
model_kwargs["use_cache"] = False
|
||||
return model_kwargs
|
||||
|
||||
hf_model.model._prepare_cache_for_generation = types.MethodType(
|
||||
patched_prepare_cache, hf_model.model
|
||||
)
|
||||
original_generate = hf_model.model.generate
|
||||
|
||||
def patched_generate(*args, **kwargs):
|
||||
kwargs["output_hidden_states"] = True
|
||||
kwargs["return_dict_in_generate"] = True
|
||||
return original_generate(*args, **kwargs)
|
||||
|
||||
hf_model.model.generate = patched_generate
|
||||
original_forward = hf_model.model.forward
|
||||
|
||||
def patched_forward(*args, **kwargs):
|
||||
kwargs["output_hidden_states"] = True
|
||||
return original_forward(*args, **kwargs)
|
||||
|
||||
hf_model.model.forward = patched_forward
|
||||
|
||||
hf_processor = hf_model.processor
|
||||
|
||||
def processor(*args, text="", images=None, **kwargs):
|
||||
@ -406,7 +441,15 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
if videos is not None and is_list_of(videos, tuple):
|
||||
# If videos is a list of tuples, we assume each tuple contains
|
||||
# (video_array, metadata) as in the case of GLM4.1V.
|
||||
video_metadata = [[VideoMetadata(**video[1])] for video in videos]
|
||||
# Filter out 'do_sample_frames' as it's not a valid VideoMetadata arg
|
||||
video_metadata = [
|
||||
[
|
||||
VideoMetadata(
|
||||
**{k: v for k, v in video[1].items() if k != "do_sample_frames"}
|
||||
)
|
||||
]
|
||||
for video in videos
|
||||
]
|
||||
videos = [[video[0]] for video in videos]
|
||||
else:
|
||||
video_metadata = None
|
||||
|
||||
@ -50,8 +50,8 @@ MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PL
|
||||
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
|
||||
|
||||
|
||||
IMAGE_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
|
||||
EMBEDDING_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0)]
|
||||
IMAGE_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
|
||||
EMBEDDING_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0)]
|
||||
RunnerOutput = tuple[list[int], str, SampleLogprobs | None]
|
||||
|
||||
|
||||
|
||||
24
tests/models/multimodal/pooling/conftest.py
Normal file
24
tests/models/multimodal/pooling/conftest.py
Normal file
@ -0,0 +1,24 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Pytest configuration for vLLM pooling tests."""
|
||||
|
||||
import os
|
||||
import warnings
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(config, items):
    """Select the FLEX_ATTENTION backend on ROCm when SigLIP tests are collected.

    Does nothing on other platforms, or when no collected item has
    ``test_siglip`` in its node id. Otherwise sets the
    ``VLLM_ATTENTION_BACKEND`` environment variable and emits a warning.
    """
    if not current_platform.is_rocm():
        return

    has_siglip_tests = any("test_siglip" in item.nodeid for item in items)
    if not has_siglip_tests:
        return

    # The variable is set process-wide, not only for the SigLIP items.
    os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"
    warnings.warn(
        "ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests",
        UserWarning,
        stacklevel=1,
    )
|
||||
@ -396,28 +396,6 @@ def test_processing_correctness(
|
||||
)
|
||||
|
||||
|
||||
# Phi4MultimodalForCausalLM share same model repo with original format
|
||||
# Phi4MMForCausalLM, so we add it as a separate test case
|
||||
# Remove this test after conversion PR merged:
|
||||
# https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/70
|
||||
@pytest.mark.parametrize("model_arch", ["Phi4MultimodalForCausalLM"])
|
||||
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
|
||||
@pytest.mark.parametrize("num_batches", [32])
|
||||
@pytest.mark.parametrize("simplify_rate", [1.0])
|
||||
def test_processing_correctness_phi4_multimodal(
|
||||
model_arch: str,
|
||||
hit_rate: float,
|
||||
num_batches: int,
|
||||
simplify_rate: float,
|
||||
):
|
||||
_test_processing_correctness(
|
||||
model_arch,
|
||||
hit_rate=hit_rate,
|
||||
num_batches=num_batches,
|
||||
simplify_rate=simplify_rate,
|
||||
)
|
||||
|
||||
|
||||
def _assert_inputs_equal(
|
||||
a: MultiModalInputs,
|
||||
b: MultiModalInputs,
|
||||
|
||||
@ -47,6 +47,12 @@ QWEN2_CONFIG = GGUFTestConfig(
|
||||
gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf",
|
||||
)
|
||||
|
||||
QWEN3_CONFIG = GGUFTestConfig(
|
||||
original_model="Qwen/Qwen3-0.6B",
|
||||
gguf_repo="unsloth/Qwen3-0.6B-GGUF",
|
||||
gguf_filename="Qwen3-0.6B-BF16.gguf",
|
||||
)
|
||||
|
||||
PHI3_CONFIG = GGUFTestConfig(
|
||||
original_model="microsoft/Phi-3.5-mini-instruct",
|
||||
gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF",
|
||||
@ -87,6 +93,7 @@ GEMMA3_CONFIG = GGUFTestConfig(
|
||||
MODELS = [
|
||||
# LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
|
||||
QWEN2_CONFIG,
|
||||
QWEN3_CONFIG,
|
||||
PHI3_CONFIG,
|
||||
GPT2_CONFIG,
|
||||
STABLELM_CONFIG,
|
||||
|
||||
@ -667,6 +667,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"moonshotai/Kimi-VL-A3B-Instruct",
|
||||
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},
|
||||
trust_remote_code=True,
|
||||
max_transformers_version="4.53.3",
|
||||
transformers_version_reason="HF model uses deprecated transformers API "
|
||||
"(PytorchGELUTanh, DynamicCache.seen_tokens, and more). See: "
|
||||
"https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31",
|
||||
),
|
||||
"LightOnOCRForConditionalGeneration": _HfExamplesInfo(
|
||||
"lightonai/LightOnOCR-1B",
|
||||
@ -767,10 +771,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"Phi4MMForCausalLM": _HfExamplesInfo(
|
||||
"microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
|
||||
),
|
||||
"Phi4MultimodalForCausalLM": _HfExamplesInfo(
|
||||
"microsoft/Phi-4-multimodal-instruct",
|
||||
revision="refs/pr/70",
|
||||
),
|
||||
"PixtralForConditionalGeneration": _HfExamplesInfo(
|
||||
"mistralai/Pixtral-12B-2409",
|
||||
extras={
|
||||
|
||||
@ -112,7 +112,7 @@ class TestBaseThinkingReasoningParserMethods:
|
||||
"""Test the is_reasoning_end method."""
|
||||
parser = TestThinkingReasoningParser(test_tokenizer)
|
||||
end_token_id = parser.end_token_id
|
||||
|
||||
start_token_id = parser.start_token_id
|
||||
# Test with end token present
|
||||
assert parser.is_reasoning_end([1, 2, end_token_id, 4]) is True
|
||||
|
||||
@ -122,6 +122,16 @@ class TestBaseThinkingReasoningParserMethods:
|
||||
# Test with empty list
|
||||
assert parser.is_reasoning_end([]) is False
|
||||
|
||||
# Test with interleaved thinking
|
||||
assert parser.is_reasoning_end([1, start_token_id, 2, end_token_id]) is True
|
||||
assert parser.is_reasoning_end([1, start_token_id, 2, 3]) is False
|
||||
assert (
|
||||
parser.is_reasoning_end(
|
||||
[1, start_token_id, 2, end_token_id, 2, 2, start_token_id]
|
||||
)
|
||||
is False
|
||||
)
|
||||
|
||||
def test_extract_content_ids(self, test_tokenizer):
|
||||
"""Test the extract_content_ids method."""
|
||||
parser = TestThinkingReasoningParser(test_tokenizer)
|
||||
|
||||
@ -5,6 +5,10 @@
|
||||
set -e
|
||||
set -x
|
||||
|
||||
merge_base_commit=$(git merge-base HEAD origin/main)
|
||||
echo "Current merge base commit with main: $merge_base_commit"
|
||||
git show --oneline -s $merge_base_commit
|
||||
|
||||
cd /vllm-workspace/
|
||||
|
||||
# uninstall vllm
|
||||
@ -18,7 +22,7 @@ apt autoremove -y
|
||||
|
||||
echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py
|
||||
|
||||
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL=1 VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e .
|
||||
VLLM_PRECOMPILED_WHEEL_COMMIT=$merge_base_commit VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e .
|
||||
|
||||
# Run the script
|
||||
python3 -c 'import vllm'
|
||||
|
||||
@ -629,8 +629,8 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files):
|
||||
(
|
||||
"internlm/internlm2-1_8b-reward",
|
||||
"decoder",
|
||||
False,
|
||||
"Pooling models with all pooling does not support chunked prefill.",
|
||||
True,
|
||||
"Pooling models with causal attn and all pooling support chunked prefill.",
|
||||
),
|
||||
(
|
||||
"BAAI/bge-base-en",
|
||||
@ -748,8 +748,8 @@ def test_is_chunked_prefill_supported(
|
||||
(
|
||||
"internlm/internlm2-1_8b-reward",
|
||||
"decoder",
|
||||
False,
|
||||
"Pooling models with all pooling does not support prefix caching.",
|
||||
True,
|
||||
"Pooling models with causal attn and all pooling support prefix caching.",
|
||||
),
|
||||
(
|
||||
"BAAI/bge-base-en",
|
||||
|
||||
@ -365,3 +365,54 @@ class TestEnvSetWithChoices:
|
||||
with patch.dict(os.environ, {"TEST_ENV": "option1,option1,option2"}):
|
||||
env_func = env_set_with_choices("TEST_ENV", [], ["option1", "option2"])
|
||||
assert env_func() == {"option1", "option2"}
|
||||
|
||||
|
||||
class TestVllmConfigureLogging:
    """Test cases for VLLM_CONFIGURE_LOGGING environment variable."""

    @staticmethod
    def _clear_envs_cache() -> None:
        """Clear the lookup cache on ``envs.__getattr__`` if it exposes one."""
        if hasattr(envs.__getattr__, "cache_clear"):
            envs.__getattr__.cache_clear()

    def test_configure_logging_defaults_to_true(self):
        """Test that VLLM_CONFIGURE_LOGGING defaults to True when not set."""
        # patch.dict restores the original environment on exit, so removing
        # the variable inside the block does not leak to other tests.
        with patch.dict(os.environ, {}, clear=False):
            os.environ.pop("VLLM_CONFIGURE_LOGGING", None)
            self._clear_envs_cache()

            result = envs.VLLM_CONFIGURE_LOGGING
            assert result is True
            assert isinstance(result, bool)

    def test_configure_logging_with_zero_string(self):
        """Test that VLLM_CONFIGURE_LOGGING='0' evaluates to False."""
        with patch.dict(os.environ, {"VLLM_CONFIGURE_LOGGING": "0"}):
            self._clear_envs_cache()

            result = envs.VLLM_CONFIGURE_LOGGING
            assert result is False
            assert isinstance(result, bool)

    def test_configure_logging_with_one_string(self):
        """Test that VLLM_CONFIGURE_LOGGING='1' evaluates to True."""
        with patch.dict(os.environ, {"VLLM_CONFIGURE_LOGGING": "1"}):
            self._clear_envs_cache()

            result = envs.VLLM_CONFIGURE_LOGGING
            assert result is True
            assert isinstance(result, bool)

    def test_configure_logging_with_invalid_value_raises_error(self):
        """Test that invalid VLLM_CONFIGURE_LOGGING value raises ValueError."""
        with patch.dict(os.environ, {"VLLM_CONFIGURE_LOGGING": "invalid"}):
            self._clear_envs_cache()

            with pytest.raises(ValueError, match="invalid literal for int"):
                _ = envs.VLLM_CONFIGURE_LOGGING
|
||||
|
||||
847
tests/tool_use/test_mistral_tool_parser.py
Normal file
847
tests/tool_use/test_mistral_tool_parser.py
Normal file
@ -0,0 +1,847 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
from collections.abc import Generator
|
||||
|
||||
import partial_json_parser
|
||||
import pytest
|
||||
from mistral_common.protocol.instruct.messages import AssistantMessage
|
||||
from mistral_common.protocol.instruct.request import InstructRequest
|
||||
from mistral_common.protocol.instruct.tool_calls import FunctionCall, ToolCall
|
||||
from partial_json_parser.core.options import Allow
|
||||
|
||||
from vllm.entrypoints.openai.protocol import DeltaMessage, DeltaToolCall
|
||||
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolParser
|
||||
from vllm.tokenizers import (
|
||||
MistralTokenizer,
|
||||
TokenizerLike,
|
||||
get_tokenizer,
|
||||
)
|
||||
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def mistral_pre_v11_tokenizer():
    """Module-scoped tokenizer for Mistral-7B-Instruct-v0.3 (default mode)."""
    return get_tokenizer(tokenizer_name="mistralai/Mistral-7B-Instruct-v0.3")
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def mistral_tokenizer():
    """Module-scoped Mistral-Small-3.2 tokenizer loaded in "mistral" mode."""
    model_name = "mistralai/Mistral-Small-3.2-24B-Instruct-2506"
    return get_tokenizer(tokenizer_name=model_name, tokenizer_mode="mistral")
|
||||
|
||||
|
||||
@pytest.fixture
def mistral_pre_v11_tool_parser(mistral_pre_v11_tokenizer):
    """Fresh MistralToolParser per test, built on the pre-v11 tokenizer fixture."""
    parser = MistralToolParser(mistral_pre_v11_tokenizer)
    return parser
|
||||
|
||||
|
||||
@pytest.fixture
def mistral_tool_parser(mistral_tokenizer):
    """Fresh MistralToolParser per test, built on the mistral-mode tokenizer."""
    parser = MistralToolParser(mistral_tokenizer)
    return parser
|
||||
|
||||
|
||||
def assert_tool_calls(
    actual_tool_calls: list[ToolCall] | list[DeltaToolCall],
    expected_tool_calls: list[ToolCall],
):
    """Assert that the extracted tool calls match the expected ones pairwise.

    Checks that both lists have the same length, that every actual call has a
    9-character string id, and that function names and argument strings are
    equal to the expected call at the same position.
    """
    assert len(actual_tool_calls) == len(expected_tool_calls)

    for actual_tool_call, expected_tool_call in zip(
        actual_tool_calls, expected_tool_calls
    ):
        assert isinstance(actual_tool_call.id, str)
        assert len(actual_tool_call.id) == 9

        if isinstance(actual_tool_call, ToolCall):
            assert actual_tool_call.type == "function"
        elif isinstance(actual_tool_call, DeltaToolCall):
            # Streaming deltas have optional fields; they must all be populated
            # before they can be compared below.
            assert actual_tool_call.function is not None
            assert actual_tool_call.function.name is not None
            assert actual_tool_call.function.arguments is not None
        assert actual_tool_call.function is not None
        assert actual_tool_call.function.name == expected_tool_call.function.name, (
            f"got wrong function name: {actual_tool_call.function.name}"
        )
        assert (
            actual_tool_call.function.arguments == expected_tool_call.function.arguments
        ), f"got wrong function argument: {actual_tool_call.function.arguments}"
|
||||
|
||||
|
||||
def fix_tool_call_tokenization(
    tokens: list[int],
    mistral_tool_parser: MistralToolParser,
    mistral_tokenizer: TokenizerLike,
):
    """
    Replaces the textual token sequence for [TOOL_CALLS]
    with its single special token ID.

    Args:
        tokens: Token ids to scan.
        mistral_tool_parser: Provides ``bot_token`` (the marker text) and
            ``bot_token_id`` (the id that replaces each match).
        mistral_tokenizer: Used to encode ``bot_token`` into the id sequence
            that is searched for.

    Returns:
        A new list with every non-overlapping occurrence of the encoded marker
        replaced by ``bot_token_id``. ``tokens`` itself is returned unchanged
        when it is empty, shorter than the marker, or the marker encodes to
        nothing.
    """
    # Encoded without special tokens so that no bos/eos ids end up in the pattern.
    textual_tool_call_token_ids = mistral_tokenizer.encode(
        text=mistral_tool_parser.bot_token,
        add_special_tokens=False,
    )
    special_tool_call_token_id = mistral_tool_parser.bot_token_id
    target_len = len(textual_tool_call_token_ids)

    # No replacement is possible if the input is too short to contain the
    # sequence. An empty pattern must also bail out here: an empty slice would
    # match at every index and the scan below would never advance.
    if not tokens or target_len == 0 or len(tokens) < target_len:
        return tokens

    result_tokens: list[int] = []
    i = 0
    while i < len(tokens):
        if tokens[i : i + target_len] == textual_tool_call_token_ids:
            # Match: emit the special id and jump past the whole sequence.
            result_tokens.append(special_tool_call_token_id)
            i += target_len
        else:
            # No match here: copy the current token and move on by one.
            result_tokens.append(tokens[i])
            i += 1

    return result_tokens
|
||||
|
||||
|
||||
def stream_delta_message_generator(
    mistral_tool_parser: MistralToolParser,
    mistral_tokenizer: TokenizerLike,
    model_output: str | None,
    tools: list[tuple[str, str]] | None,
) -> Generator[DeltaMessage, None, None]:
    """Feed a tool-call output to the streaming parser one token at a time.

    The token ids come from ``tools`` (as an assistant message) when the
    tokenizer is a ``MistralTokenizer`` with version >= 11, and from
    ``model_output`` (as free text) otherwise. Each id is detokenized
    incrementally and passed to ``extract_tool_calls_streaming``; every truthy
    delta message returned by the parser is yielded.
    """
    is_mistral_tokenizer = isinstance(mistral_tokenizer, MistralTokenizer)

    if is_mistral_tokenizer and mistral_tokenizer.version >= 11:
        # With the newer versions of the tokenizer, we cannot tokenize free
        # text, so we need to create a list of messages to get tokenized
        assert tools is not None
        tool_calls = [
            ToolCall(function=FunctionCall(name=name, arguments=arg))
            for (name, arg) in tools
        ]
        request = InstructRequest(
            messages=[AssistantMessage(tool_calls=tool_calls)],
        )
        all_token_ids = mistral_tokenizer.instruct.encode_instruct(request).tokens
    else:
        # Older versions of the tokenizer are able to encode the model's
        # output (free text) directly into tokens
        assert model_output is not None
        all_token_ids = mistral_tokenizer.encode(model_output, add_special_tokens=False)

    all_token_ids = fix_tool_call_tokenization(
        all_token_ids, mistral_tool_parser, mistral_tokenizer
    )

    # Incremental detokenization state carried from one step to the next.
    previous_text = ""
    previous_tokens = None
    prefix_offset = 0
    read_offset = 0

    for position, delta_token in enumerate(all_token_ids):
        previous_token_ids = all_token_ids[:position]
        current_token_ids = all_token_ids[: position + 1]

        step = detokenize_incrementally(
            tokenizer=mistral_tokenizer,
            all_input_ids=current_token_ids,
            prev_tokens=previous_tokens,
            prefix_offset=prefix_offset,
            read_offset=read_offset,
            skip_special_tokens=is_mistral_tokenizer,
            spaces_between_special_tokens=True,
        )
        new_tokens, delta_text, new_prefix_offset, new_read_offset = step

        current_text = previous_text + delta_text

        delta_message = mistral_tool_parser.extract_tool_calls_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            [delta_token],
            request=None,  # type: ignore[arg-type]
        )
        if delta_message:
            yield delta_message

        previous_text = current_text
        if previous_tokens:
            previous_tokens = previous_tokens + new_tokens
        else:
            previous_tokens = new_tokens
        prefix_offset = new_prefix_offset
        read_offset = new_read_offset
|
||||
|
||||
|
||||
def test_extract_tool_calls_no_tools(mistral_pre_v11_tool_parser):
    """Plain text without a tool-call marker is returned untouched as content."""
    plain_text = "This is a test"
    result = mistral_pre_v11_tool_parser.extract_tool_calls(
        plain_text, request=None
    )  # type: ignore[arg-type]

    assert not result.tools_called
    assert result.tool_calls == []
    assert result.content == plain_text
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        (
            """[TOOL_CALLS][{"name": "add", "arguments":{"a": 3.5, "b": 4}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add", arguments=json.dumps({"a": 3.5, "b": 4})
                    )
                )
            ],
            None,
        ),
        (
            """[TOOL_CALLS] [{"name": "get_current_weather", "arguments":{"city": "San Francisco", "state": "CA", "unit": "celsius"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            None,
        ),
        (
            """[TOOL_CALLS] [{"arguments":{"city": "San Francisco", "state": "CA", "unit": "celsius"}, "name": "get_current_weather"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            None,
        ),
        (
            """[TOOL_CALLS] [{"arguments":{"name": "John Doe"}, "name": "get_age"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_age",
                        arguments=json.dumps(
                            {
                                "name": "John Doe",
                            }
                        ),
                    )
                )
            ],
            None,
        ),
    ],
    ids=[
        "single_tool_add",
        "single_tool_weather",
        "argument_before_name",
        "argument_before_name_and_name_in_argument",
    ],
)
def test_extract_tool_calls_pre_v11_tokenizer(
    mistral_pre_v11_tool_parser, model_output, expected_tool_calls, expected_content
):
    """Non-streaming extraction of JSON-list tool calls with the pre-v11 parser."""
    result = mistral_pre_v11_tool_parser.extract_tool_calls(
        model_output, request=None
    )  # type: ignore[arg-type]

    assert result.tools_called
    assert_tool_calls(result.tool_calls, expected_tool_calls)
    assert result.content == expected_content
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        pytest.param(
            """[TOOL_CALLS]add_this_and_that{"a": 3.5, "b": 4}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add_this_and_that",
                        arguments=json.dumps({"a": 3.5, "b": 4}),
                    )
                )
            ],
            None,
            id="single_tool_add",
        ),
        pytest.param(
            """[TOOL_CALLS]get_current_weather{"city": "San Francisco", "state": "CA", "unit": "celsius"}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            None,
            id="single_tool_weather",
        ),
        pytest.param(
            """[TOOL_CALLS]add{"a": 3.5, "b": 4}[TOOL_CALLS]multiply{"a": 3, "b": 6}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add",
                        arguments=json.dumps({"a": 3.5, "b": 4}),
                    )
                ),
                ToolCall(
                    function=FunctionCall(
                        name="multiply",
                        arguments=json.dumps({"a": 3, "b": 6}),
                    )
                ),
            ],
            None,
            id="multiple_tool_calls",
        ),
    ],
)
def test_extract_tool_calls(
    mistral_tool_parser, model_output, expected_tool_calls, expected_content
):
    """Non-streaming extraction of ``[TOOL_CALLS]name{args}`` style tool calls."""
    result = mistral_tool_parser.extract_tool_calls(
        model_output, request=None
    )  # type: ignore[arg-type]

    # Every case in this table contains at least one tool call.
    assert result.tools_called
    assert_tool_calls(result.tool_calls, expected_tool_calls)
    assert result.content == expected_content
|
||||
|
||||
|
||||
def _test_extract_tool_calls_streaming(
    tool_parser, tokenizer, model_output, tools, expected_tool_calls, expected_content
):
    """Stream the output through ``tool_parser`` and check the reassembled result.

    Each delta from ``stream_delta_message_generator`` is folded into running
    buffers: plain content, one function name per tool call, one argument
    string per tool call and one ID per tool call. At the end the buffers are
    turned back into ``ToolCall`` objects and compared with
    ``expected_tool_calls``; the accumulated content is compared with
    ``expected_content``.
    """
    streamed_content: str = ""
    names: list[str] = []
    args_buffers: list[str] = []
    call_ids: list[str | None] = []
    current_idx: int = -1

    for delta in stream_delta_message_generator(
        tool_parser, tokenizer, model_output, tools
    ):
        # role should never be streamed from tool parser
        assert not delta.role

        if delta.content:
            streamed_content += delta.content

        call_deltas = delta.tool_calls
        if not call_deltas:
            continue

        # make sure only one diff is present - correct even for parallel
        assert len(call_deltas) == 1
        call = call_deltas[0]

        assert len(tool_parser.prev_tool_call_arr) > 0

        # a new index means a new tool call: open fresh slots for it
        if call.index != current_idx:
            current_idx = call.index
            args_buffers.append("")
            call_ids.append(None)

        # keep the first ID streamed for this call; later ones are ignored
        if call.id and not call_ids[call.index]:
            call_ids[call.index] = call.id

        fn = call.function
        if not fn:
            continue

        # the function name is expected to be streamed IN ENTIRETY,
        # exactly one time
        if fn.name:
            assert isinstance(fn.name, str)
            names.append(fn.name)

        # argument fragments are strings appended to this call's buffer
        if fn.arguments:
            assert isinstance(fn.arguments, str)
            args_buffers[call.index] += fn.arguments

    assert streamed_content == expected_content

    actual_tool_calls = []
    for call_id, name, raw_args in zip(call_ids, names, args_buffers):
        actual_tool_calls.append(
            ToolCall(
                id=call_id,
                function=FunctionCall(
                    name=name,
                    arguments=partial_json_parser.ensure_json(
                        raw_args, Allow.OBJ | Allow.STR
                    ),
                ),
            )
        )
    assert_tool_calls(actual_tool_calls, expected_tool_calls)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        pytest.param(
            """This is a test""",
            [],
            """This is a test""",
            id="no_tools",
        ),
        pytest.param(
            """[TOOL_CALLS] [ {"name":"add" , "arguments" : {"a": 3, "b": 4} } ]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add",
                        arguments=json.dumps({"a": 3, "b": 4}),
                    )
                )
            ],
            "",
            id="single_tool_add",
        ),
        pytest.param(
            """[TOOL_CALLS] [{"name": "add", "arguments":{"a": "3", "b": "4"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add",
                        arguments=json.dumps({"a": "3", "b": "4"}),
                    )
                )
            ],
            "",
            id="single_tool_add_strings",
        ),
        pytest.param(
            """[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"city": "San Francisco", "state": "CA", "unit": "celsius"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            "",
            id="single_tool_weather",
        ),
        pytest.param(
            """[TOOL_CALLS] [{"arguments": {"city": "San Francisco", "state": "CA", "unit": "celsius"}, "name": "get_current_weather"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            "",
            id="argument_before_name",
        ),
        pytest.param(
            """[TOOL_CALLS] [{"arguments": {"name": "John Doe"}, "name": "get_age"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_age",
                        arguments=json.dumps({"name": "John Doe"}),
                    )
                )
            ],
            "",
            id="argument_before_name_and_name_in_argument",
        ),
        pytest.param(
            """[TOOL_CALLS] [{"name": "add", "arguments": {"a": 3.5, "b": 4}}, {"name": "get_current_weather", "arguments":{"city": "San Francisco", "state": "CA", "unit": "celsius"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add",
                        arguments=json.dumps({"a": 3.5, "b": 4}),
                    )
                ),
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                ),
            ],
            "",
            id="multiple_tools",
        ),
    ],
)
def test_extract_tool_calls_streaming_pre_v11_tokenizer(
    mistral_pre_v11_tool_parser,
    mistral_pre_v11_tokenizer,
    model_output,
    expected_tool_calls,
    expected_content,
):
    """Streaming extraction with the pre-v11 parser, driven from raw model text."""
    # The shared helper is fed the text directly, so no tools list is passed.
    _test_extract_tool_calls_streaming(
        tool_parser=mistral_pre_v11_tool_parser,
        tokenizer=mistral_pre_v11_tokenizer,
        model_output=model_output,
        tools=None,
        expected_tool_calls=expected_tool_calls,
        expected_content=expected_content,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    argnames=["tools", "expected_tool_calls", "expected_content"],
    argvalues=[
        pytest.param(
            [("add", '{"a": 3, "b": 4}')],
            # [TOOL_CALLS]add{"a": 3, "b": 4}
            [
                ToolCall(
                    function=FunctionCall(
                        name="add",
                        arguments=json.dumps({"a": 3, "b": 4}),
                    )
                )
            ],
            "",
            id="single_tool_add",
        ),
        pytest.param(
            [("add_two_strings", '{"a": "3", "b": "4"}')],
            # [TOOL_CALLS]add_two_strings{"a": "3", "b": "4"}
            [
                ToolCall(
                    function=FunctionCall(
                        name="add_two_strings",
                        arguments=json.dumps({"a": "3", "b": "4"}),
                    )
                )
            ],
            "",
            id="single_tool_add_strings",
        ),
        pytest.param(
            [
                ("add", '{"a": 3.5, "b": 4}'),
                (
                    "get_current_weather",
                    '{"city": "San Francisco", "state": "CA", "unit": "celsius"}',  # noqa: E501
                ),
            ],
            # [TOOL_CALLS]add{"a": 3.5, "b": 4}[TOOL_CALLS]get_current_weather{"city": "San Francisco", "state": "CA", "unit": "celsius"} # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add",
                        arguments=json.dumps({"a": 3.5, "b": 4}),
                    )
                ),
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                ),
            ],
            "",
            id="multiple_tools",
        ),
    ],
)
def test_extract_tool_calls_streaming(
    mistral_tool_parser,
    mistral_tokenizer,
    tools,
    expected_tool_calls,
    expected_content,
):
    """Streaming extraction driven from (name, arguments) pairs, not raw text."""
    # No raw model output here: the shared helper receives the tools list only.
    _test_extract_tool_calls_streaming(
        tool_parser=mistral_tool_parser,
        tokenizer=mistral_tokenizer,
        model_output=None,
        tools=tools,
        expected_tool_calls=expected_tool_calls,
        expected_content=expected_content,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        pytest.param(
            """[TOOL_CALLS]add_this_and_that{"a": 3.5, "b": 4}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add_this_and_that",
                        arguments=json.dumps({"a": 3.5, "b": 4}),
                    )
                )
            ],
            "",
            id="single_tool_add",
        ),
        pytest.param(
            """[TOOL_CALLS]get_current_weather{"city": "San Francisco", "state": "CA", "unit": "celsius"}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            "",
            id="single_tool_weather",
        ),
        pytest.param(
            """[TOOL_CALLS]add{"a": 3.5, "b": 4}[TOOL_CALLS]multiply{"a": 3, "b": 6}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add",
                        arguments=json.dumps({"a": 3.5, "b": 4}),
                    )
                ),
                ToolCall(
                    function=FunctionCall(
                        name="multiply",
                        arguments=json.dumps({"a": 3, "b": 6}),
                    )
                ),
            ],
            "",
            id="multiple_tool_calls",
        ),
        pytest.param(
            # Additional content should not be after the tool calls
            """bla[TOOL_CALLS]add_this_and_that{"a": 3.5, "b": 4}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add_this_and_that",
                        arguments=json.dumps({"a": 3.5, "b": 4}),
                    )
                )
            ],
            "bla",
            id="content_before_tool",
        ),
    ],
)
def test_extract_tool_calls_streaming_one_chunk(
    mistral_tool_parser,
    mistral_tokenizer,
    model_output,
    expected_tool_calls,
    expected_content,
):
    """Feed the whole output to the streaming parser as one single delta."""
    # Only the non-Mistral tokenizer branch passes add_special_tokens=False.
    if isinstance(mistral_tokenizer, MistralTokenizer):
        encode_kwargs = {}
    else:
        encode_kwargs = {"add_special_tokens": False}
    token_ids = mistral_tokenizer.encode(model_output, **encode_kwargs)
    token_ids = fix_tool_call_tokenization(
        token_ids, mistral_tool_parser, mistral_tokenizer
    )

    # Previous state is empty; current and delta are both the full output.
    delta_message = mistral_tool_parser.extract_tool_calls_streaming(
        previous_text="",
        current_text=model_output,
        delta_text=model_output,
        previous_token_ids=[],
        current_token_ids=token_ids,
        delta_token_ids=token_ids,
        request=None,
    )  # type: ignore[arg-type]
    assert isinstance(delta_message, DeltaMessage)
    assert len(delta_message.tool_calls) == len(expected_tool_calls)

    assert_tool_calls(delta_message.tool_calls, expected_tool_calls)

    # A missing content field is treated the same as empty content.
    streamed_content = "" if delta_message.content is None else delta_message.content
    assert streamed_content == expected_content
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
        pytest.param(
            """This is a test""",
            [],
            """This is a test""",
            id="no_tools",
        ),
        pytest.param(
            """[TOOL_CALLS] [ {"name":"add" , "arguments" : {"a": 3, "b": 4} } ]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add",
                        arguments=json.dumps({"a": 3, "b": 4}),
                    )
                )
            ],
            "",
            id="single_tool_add",
        ),
        pytest.param(
            """[TOOL_CALLS] [{"name": "add", "arguments":{"a": "3", "b": "4"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add",
                        arguments=json.dumps({"a": "3", "b": "4"}),
                    )
                )
            ],
            "",
            id="single_tool_add_strings",
        ),
        pytest.param(
            """[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"city": "San Francisco", "state": "CA", "unit": "celsius"}}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            "",
            id="single_tool_weather",
        ),
        pytest.param(
            """[TOOL_CALLS] [{"arguments": {"city": "San Francisco", "state": "CA", "unit": "celsius"}, "name": "get_current_weather"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                )
            ],
            "",
            id="argument_before_name",
        ),
        pytest.param(
            """[TOOL_CALLS] [{"arguments": {"name": "John Doe"}, "name": "get_age"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="get_age",
                        arguments=json.dumps({"name": "John Doe"}),
                    )
                )
            ],
            "",
            id="argument_before_name_and_name_in_argument",
        ),
        pytest.param(
            """[TOOL_CALLS] [{"arguments": {"a": 3.5, "b": 4}, "name": "add"}, {"arguments":{"city": "San Francisco", "state": "CA", "unit": "celsius"}, "name": "get_current_weather"}]""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="add",
                        arguments=json.dumps({"a": 3.5, "b": 4}),
                    )
                ),
                ToolCall(
                    function=FunctionCall(
                        name="get_current_weather",
                        arguments=json.dumps(
                            {"city": "San Francisco", "state": "CA", "unit": "celsius"}
                        ),
                    )
                ),
            ],
            "",
            id="multiple_tools",
        ),
    ],
)
def test_extract_tool_calls_streaming_pre_v11_tokenizer_one_chunk(
    mistral_pre_v11_tool_parser,
    mistral_pre_v11_tokenizer,
    model_output,
    expected_tool_calls,
    expected_content,
):
    """Feed the whole output to the pre-v11 streaming parser as one single delta."""
    # Only the non-Mistral tokenizer branch passes add_special_tokens=False.
    if isinstance(mistral_pre_v11_tokenizer, MistralTokenizer):
        encode_kwargs = {}
    else:
        encode_kwargs = {"add_special_tokens": False}
    token_ids = mistral_pre_v11_tokenizer.encode(model_output, **encode_kwargs)
    token_ids = fix_tool_call_tokenization(
        token_ids, mistral_pre_v11_tool_parser, mistral_pre_v11_tokenizer
    )

    # Previous state is empty; current and delta are both the full output.
    delta_message = mistral_pre_v11_tool_parser.extract_tool_calls_streaming(
        previous_text="",
        current_text=model_output,
        delta_text=model_output,
        previous_token_ids=[],
        current_token_ids=token_ids,
        delta_token_ids=token_ids,
        request=None,
    )  # type: ignore[arg-type]
    assert isinstance(delta_message, DeltaMessage)
    assert len(delta_message.tool_calls) == len(expected_tool_calls)

    assert_tool_calls(delta_message.tool_calls, expected_tool_calls)

    # A missing content field is treated the same as empty content.
    streamed_content = "" if delta_message.content is None else delta_message.content
    assert streamed_content == expected_content
|
||||
@ -123,7 +123,7 @@ CONFIGS: dict[str, ServerConfig] = {
|
||||
"supports_parallel": True,
|
||||
"extended": True,
|
||||
},
|
||||
"mistral": {
|
||||
"mistral-7b": {
|
||||
"model": "mistralai/Mistral-7B-Instruct-v0.3",
|
||||
"arguments": [
|
||||
"--enforce-eager",
|
||||
@ -145,6 +145,32 @@ CONFIGS: dict[str, ServerConfig] = {
|
||||
"call the tool. Otherwise, answer the user's query directly "
|
||||
"without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
|
||||
"to the user's question - just respond to it normally.",
|
||||
"supports_parallel": True,
|
||||
},
|
||||
"mistral-small-3.2": {
|
||||
"model": "mistralai/Mistral-Small-3.2-24B-Instruct-2506",
|
||||
"arguments": [
|
||||
"--enforce-eager",
|
||||
"--no-enable-prefix-caching",
|
||||
"--tool-call-parser",
|
||||
"mistral",
|
||||
"--tokenizer-mode",
|
||||
"mistral",
|
||||
"--config-format",
|
||||
"mistral",
|
||||
"--load-format",
|
||||
"mistral",
|
||||
"--tensor-parallel-size",
|
||||
"4",
|
||||
'--ignore-patterns="consolidated.safetensors"',
|
||||
],
|
||||
"system_prompt": "You are a helpful assistant with access to tools. If a tool"
|
||||
" that you have would be helpful to answer a user query, "
|
||||
"call the tool. Otherwise, answer the user's query directly "
|
||||
"without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
|
||||
"to the user's question - just respond to it normally.",
|
||||
"supports_parallel": True,
|
||||
"extended": True,
|
||||
},
|
||||
# FIXME: This test currently fails, need to debug why.
|
||||
# "granite20b": {
|
||||
|
||||
@ -11,7 +11,9 @@ PROMPTS = [
|
||||
]
|
||||
|
||||
|
||||
def test_reset_prefix_cache_e2e():
|
||||
def test_reset_prefix_cache_e2e(monkeypatch):
|
||||
# "spawn" is required for test to be deterministic
|
||||
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
|
||||
engine_args = EngineArgs(
|
||||
model="Qwen/Qwen3-0.6B",
|
||||
gpu_memory_utilization=0.2,
|
||||
|
||||
@ -9,6 +9,7 @@ from vllm.config import VllmConfig
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.hashing import _xxhash
|
||||
|
||||
|
||||
def test_prefix_caching_from_cli():
|
||||
@ -48,6 +49,21 @@ def test_prefix_caching_from_cli():
|
||||
args = parser.parse_args(["--prefix-caching-hash-algo", "invalid"])
|
||||
|
||||
|
||||
@pytest.mark.skipif(_xxhash is None, reason="xxhash not installed")
def test_prefix_caching_xxhash_from_cli():
    """Both xxhash prefix-caching algorithms pass from the CLI into the cache config."""
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())

    # "xxhash" is the pickle-based variant; "xxhash_cbor" is checked the same way.
    for algo in ("xxhash", "xxhash_cbor"):
        cli_args = parser.parse_args(["--prefix-caching-hash-algo", algo])
        engine_config = EngineArgs.from_cli_args(args=cli_args).create_engine_config()
        assert engine_config.cache_config.prefix_caching_hash_algo == algo
|
||||
|
||||
|
||||
def test_defaults_with_usage_context():
|
||||
engine_args = EngineArgs(model="facebook/opt-125m")
|
||||
vllm_config: VllmConfig = engine_args.create_engine_config(UsageContext.LLM_CLASS)
|
||||
|
||||
@ -1,8 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
@ -13,6 +11,7 @@ from transformers import AutoConfig
|
||||
|
||||
from tests.conftest import ImageTestAssets
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.utils.serial_utils import tensor2base64
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
|
||||
@ -50,18 +49,6 @@ async def client_with_image_embeds(server_with_image_embeds):
|
||||
yield async_client
|
||||
|
||||
|
||||
def encode_image_embedding_to_base64(image_embedding) -> str:
    """
    Serialize an image embedding with ``torch.save`` and return the bytes
    as a base64-encoded UTF-8 string.
    """
    with io.BytesIO() as stream:
        torch.save(image_embedding, stream)
        # getvalue() returns the full buffer regardless of the cursor position
        serialized = stream.getvalue()
    return base64.b64encode(serialized).decode("utf-8")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("dtype", [torch.half, torch.float16, torch.float32])
|
||||
@ -73,7 +60,7 @@ async def test_completions_with_image_embeds(
|
||||
):
|
||||
# Test case: Single image embeds input
|
||||
image_embeds = image_assets[0].image_embeds.to(dtype=dtype)
|
||||
base64_image_embedding = encode_image_embedding_to_base64(image_embeds)
|
||||
base64_image_embedding = tensor2base64(image_embeds)
|
||||
chat_completion = await client_with_image_embeds.chat.completions.create(
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
|
||||
@ -3,12 +3,14 @@
|
||||
from dataclasses import asdict
|
||||
from typing import NamedTuple
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.config import KVTransferConfig
|
||||
from vllm.multimodal.utils import encode_image_base64
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODEL_NAME = "RedHatAI/Qwen2.5-VL-3B-Instruct-quantized.w8a8"
|
||||
|
||||
@ -108,6 +110,13 @@ def process_prompt(processor, llm: LLM, question: str, image_urls: list[Image]):
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_rocm(),
|
||||
reason=(
|
||||
"hipErrorLaunchFailure when running this test, see issue:"
|
||||
"https://github.com/ROCm/pytorch/issues/2822"
|
||||
),
|
||||
)
|
||||
def test_shared_storage_connector_hashes(tmp_path):
|
||||
"""
|
||||
Tests that SharedStorageConnector saves KV to the storage locations
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import random
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
@ -10,7 +9,6 @@ from tests.utils import create_new_process_for_each_test
|
||||
from tests.v1.logits_processors.utils import (
|
||||
DUMMY_LOGITPROC_ARG,
|
||||
DUMMY_LOGITPROC_FQCN,
|
||||
DUMMY_LOGITPROC_MODULE,
|
||||
MAX_TOKENS,
|
||||
MODEL_NAME,
|
||||
POOLING_MODEL_NAME,
|
||||
@ -18,7 +16,6 @@ from tests.v1.logits_processors.utils import (
|
||||
CustomLogitprocSource,
|
||||
DummyLogitsProcessor,
|
||||
WrappedPerReqLogitsProcessor,
|
||||
dummy_module,
|
||||
prompts,
|
||||
)
|
||||
from tests.v1.logits_processors.utils import entry_points as fake_entry_points
|
||||
@ -162,8 +159,6 @@ def test_custom_logitsprocs(monkeypatch, logitproc_source: CustomLogitprocSource
|
||||
kwargs: dict[str, list[str | type[LogitsProcessor]]] = {}
|
||||
if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN:
|
||||
# Scenario: load logitproc based on fully-qualified class name (FQCN)
|
||||
# Inject dummy module which defines logitproc
|
||||
sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module
|
||||
kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN]
|
||||
elif logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_CLASS:
|
||||
# Scenario: load logitproc from provided class object
|
||||
|
||||
@ -14,11 +14,9 @@ from tests.utils import RemoteOpenAIServerCustom, create_new_process_for_each_te
|
||||
from tests.v1.logits_processors.utils import (
|
||||
DUMMY_LOGITPROC_ARG,
|
||||
DUMMY_LOGITPROC_FQCN,
|
||||
DUMMY_LOGITPROC_MODULE,
|
||||
MAX_TOKENS,
|
||||
MODEL_NAME,
|
||||
TEMP_GREEDY,
|
||||
dummy_module,
|
||||
prompts,
|
||||
)
|
||||
from tests.v1.logits_processors.utils import entry_points as fake_entry_points
|
||||
@ -47,20 +45,14 @@ def _server_with_logitproc_entrypoint(
|
||||
main.main()
|
||||
|
||||
|
||||
def _server_with_logitproc_module(
|
||||
def _server_with_logitproc_fqcn(
|
||||
env_dict: dict[str, str] | None,
|
||||
model: str,
|
||||
vllm_serve_args: list[str],
|
||||
) -> None:
|
||||
"""Start vLLM server, inject module with dummy logitproc"""
|
||||
|
||||
# Patch `modules` to inject dummy logitproc module
|
||||
from vllm.entrypoints.cli import main
|
||||
|
||||
sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module
|
||||
|
||||
# fork is required for workers to see entrypoint patch
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "fork"
|
||||
if env_dict is not None:
|
||||
os.environ.update(env_dict)
|
||||
|
||||
@ -99,7 +91,7 @@ def server(default_server_args, request, monkeypatch):
|
||||
if request.param:
|
||||
# Launch server, append FQCN argument, inject dummy logitproc module
|
||||
args = default_server_args + request.param
|
||||
_server_fxn = _server_with_logitproc_module
|
||||
_server_fxn = _server_with_logitproc_fqcn
|
||||
else:
|
||||
# Launch server, inject dummy logitproc entrypoint
|
||||
args = default_server_args
|
||||
|
||||
@ -27,7 +27,7 @@ DUMMY_LOGITPROC_ARG = "target_token"
|
||||
TEMP_GREEDY = 0.0
|
||||
MAX_TOKENS = 20
|
||||
DUMMY_LOGITPROC_ENTRYPOINT = "dummy_logitproc"
|
||||
DUMMY_LOGITPROC_MODULE = "DummyModule"
|
||||
DUMMY_LOGITPROC_MODULE = "tests.v1.logits_processors.utils"
|
||||
DUMMY_LOGITPROC_FQCN = f"{DUMMY_LOGITPROC_MODULE}:DummyLogitsProcessor"
|
||||
|
||||
|
||||
|
||||
@ -5,6 +5,7 @@ import torch
|
||||
|
||||
from vllm.config import SpeculativeConfig
|
||||
from vllm.model_executor.models.interfaces import supports_eagle3
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -21,6 +22,10 @@ from vllm.model_executor.models.interfaces import supports_eagle3
|
||||
pytest.param(
|
||||
"nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized-w4a16",
|
||||
id="qwen3-eagle3-speculator-w4a16-verifier",
|
||||
marks=pytest.mark.skipif(
|
||||
current_platform.is_rocm(),
|
||||
reason="The tests are skipped on rocm platform.",
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
@ -761,6 +761,10 @@ def test_init_kv_cache_with_kv_sharing_valid():
|
||||
assert kv_cache_config_after_init.kv_cache_groups[0].layer_names[1] == layer_1
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_rocm(),
|
||||
reason="Attention backend FLASHINFER is not supported on ROCm.",
|
||||
)
|
||||
def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
|
||||
"""
|
||||
The GPU model runner creates different views into the
|
||||
|
||||
@ -283,6 +283,28 @@ def _rocm_aiter_grouped_topk_fake(
|
||||
pass
|
||||
|
||||
|
||||
# Cache whether aiter supports FP8 MLA parameters
|
||||
_AITER_MLA_SUPPORTS_FP8: bool | None = None
|
||||
|
||||
|
||||
def _check_aiter_mla_fp8_support() -> bool:
|
||||
"""Check if aiter.mla.mla_decode_fwd supports q_scale and kv_scale parameters."""
|
||||
global _AITER_MLA_SUPPORTS_FP8
|
||||
if _AITER_MLA_SUPPORTS_FP8 is None:
|
||||
try:
|
||||
import inspect
|
||||
|
||||
from aiter.mla import mla_decode_fwd
|
||||
|
||||
sig = inspect.signature(mla_decode_fwd)
|
||||
_AITER_MLA_SUPPORTS_FP8 = (
|
||||
"q_scale" in sig.parameters and "kv_scale" in sig.parameters
|
||||
)
|
||||
except Exception:
|
||||
_AITER_MLA_SUPPORTS_FP8 = False
|
||||
return _AITER_MLA_SUPPORTS_FP8
|
||||
|
||||
|
||||
def _rocm_aiter_mla_decode_fwd_impl(
|
||||
q: torch.Tensor,
|
||||
kv_buffer: torch.Tensor,
|
||||
@ -299,6 +321,16 @@ def _rocm_aiter_mla_decode_fwd_impl(
|
||||
) -> None:
|
||||
from aiter.mla import mla_decode_fwd
|
||||
|
||||
kwargs = {
|
||||
"sm_scale": sm_scale,
|
||||
"logit_cap": logit_cap,
|
||||
}
|
||||
|
||||
# Only pass q_scale and kv_scale if the aiter library supports them
|
||||
if _check_aiter_mla_fp8_support():
|
||||
kwargs["q_scale"] = q_scale
|
||||
kwargs["kv_scale"] = kv_scale
|
||||
|
||||
mla_decode_fwd(
|
||||
q,
|
||||
kv_buffer.view(-1, 1, 1, q.shape[-1]),
|
||||
@ -308,10 +340,7 @@ def _rocm_aiter_mla_decode_fwd_impl(
|
||||
kv_indices,
|
||||
kv_last_page_lens,
|
||||
max_seqlen_qo,
|
||||
sm_scale=sm_scale,
|
||||
logit_cap=logit_cap,
|
||||
q_scale=q_scale,
|
||||
kv_scale=kv_scale,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import dataclasses
|
||||
from collections import Counter
|
||||
from collections.abc import Callable
|
||||
from contextlib import ExitStack
|
||||
from typing import Any
|
||||
@ -22,6 +23,99 @@ from vllm.utils.torch_utils import weak_ref_tensors
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
class CUDAGraphStat:
    """One cudagraph observation.

    Frozen, and therefore hashable, so identical observations can be
    counted together.
    """

    # Token count before padding.
    num_unpadded_tokens: int
    # Token count after padding.
    num_padded_tokens: int
    # Padding amount recorded for this observation.
    num_paddings: int
    # Runtime mode label, stored as a string.
    runtime_mode: str
|
||||
|
||||
|
||||
class CUDAGraphLogging:
|
||||
"""Aggregate and log cudagraph metrics"""
|
||||
|
||||
COLUMN_HEADERS = [
|
||||
"Unpadded Tokens",
|
||||
"Padded Tokens",
|
||||
"Num Paddings",
|
||||
"Runtime Mode",
|
||||
"Count",
|
||||
]
|
||||
|
||||
def __init__(self, cg_mode: CUDAGraphMode, cg_capture_sizes: list[int] | None):
|
||||
self.reset()
|
||||
self.cg_mode = str(cg_mode)
|
||||
self.cg_capture_sizes = str(cg_capture_sizes or [])
|
||||
|
||||
self.settings_header = (
|
||||
"**CUDAGraph Config Settings:**\n\n"
|
||||
f"- Mode: {self.cg_mode}\n"
|
||||
f"- Capture sizes: {self.cg_capture_sizes}\n\n"
|
||||
"**CUDAGraph Stats:**\n\n"
|
||||
)
|
||||
|
||||
def reset(self):
|
||||
self.stats = []
|
||||
|
||||
def observe(self, cudagraph_stat: CUDAGraphStat):
|
||||
self.stats.append(cudagraph_stat)
|
||||
|
||||
def generate_metric_table(self) -> str:
|
||||
stats_counts = Counter(self.stats)
|
||||
|
||||
# Convert stats to rows of strings, in descending order of observed frequencies
|
||||
rows = []
|
||||
for stat, count in sorted(
|
||||
stats_counts.items(), key=lambda item: item[1], reverse=True
|
||||
):
|
||||
rows.append(
|
||||
[
|
||||
str(stat.num_unpadded_tokens),
|
||||
str(stat.num_padded_tokens),
|
||||
str(stat.num_paddings),
|
||||
stat.runtime_mode,
|
||||
str(count),
|
||||
]
|
||||
)
|
||||
|
||||
# Calculate column widths (max of header and data)
|
||||
col_widths = []
|
||||
for i, header_text in enumerate(self.COLUMN_HEADERS):
|
||||
max_width = len(header_text)
|
||||
for row in rows:
|
||||
max_width = max(max_width, len(row[i]))
|
||||
col_widths.append(max_width)
|
||||
|
||||
table_header_list = [
|
||||
h.ljust(w) for h, w in zip(self.COLUMN_HEADERS, col_widths)
|
||||
]
|
||||
table_header = "| " + " | ".join(table_header_list) + " |\n"
|
||||
|
||||
table_separator = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|\n"
|
||||
|
||||
# Create data rows with proper alignment
|
||||
data_rows = []
|
||||
for row in rows:
|
||||
formatted_row = [
|
||||
str(val).ljust(width) for val, width in zip(row, col_widths)
|
||||
]
|
||||
data_rows.append("| " + " | ".join(formatted_row) + " |")
|
||||
|
||||
return (
|
||||
self.settings_header
|
||||
+ table_header
|
||||
+ table_separator
|
||||
+ "\n".join(data_rows)
|
||||
+ "\n"
|
||||
)
|
||||
|
||||
def log(self, log_fn=logger.info):
    """Emit the stats table through ``log_fn`` and then clear the stats.

    Does nothing when no stats have been collected.
    """
    if self.stats:
        table = self.generate_metric_table()
        log_fn(table)
        self.reset()
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class CUDAGraphEntry:
|
||||
batch_descriptor: BatchDescriptor
|
||||
|
||||
@ -104,7 +104,8 @@ class FixFunctionalizationPass(VllmInductorPass):
|
||||
mutated_args = {1: "result"}
|
||||
self.defunctionalize(graph, node, mutated_args)
|
||||
elif (
|
||||
at_target
|
||||
hasattr(torch.ops.vllm, "flashinfer_trtllm_fused_allreduce_norm")
|
||||
and at_target
|
||||
== torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default
|
||||
):
|
||||
mutated_args = {
|
||||
|
||||
@ -30,7 +30,7 @@ CacheDType = Literal[
|
||||
"fp8_ds_mla",
|
||||
]
|
||||
MambaDType = Literal["auto", "float32"]
|
||||
PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor"]
|
||||
PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor", "xxhash", "xxhash_cbor"]
|
||||
KVOffloadingBackend = Literal["native", "lmcache"]
|
||||
|
||||
|
||||
@ -77,9 +77,21 @@ class CacheConfig:
|
||||
"""Whether to enable prefix caching."""
|
||||
prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
|
||||
"""Set the hash algorithm for prefix caching:\n
|
||||
- "sha256" uses Pickle for object serialization before hashing.\n
|
||||
- "sha256" uses Pickle for object serialization before hashing. This is the
|
||||
current default, as SHA256 is the most secure choice to avoid potential
|
||||
hash collisions.\n
|
||||
- "sha256_cbor" provides a reproducible, cross-language compatible hash. It
|
||||
serializes objects using canonical CBOR and hashes them with SHA-256."""
|
||||
serializes objects using canonical CBOR and hashes them with SHA-256.\n
|
||||
- "xxhash" uses Pickle serialization with xxHash (128-bit) for faster,
|
||||
non-cryptographic hashing. Requires the optional ``xxhash`` package.
|
||||
IMPORTANT: Use of a hashing algorithm that is not considered
|
||||
cryptographically secure theoretically increases the risk of hash collisions,
|
||||
which can cause undefined behavior or even leak private information in
|
||||
multi-tenant environments. Even if collisions are still very unlikely, it is
|
||||
important to consider your security risk tolerance against the performance
|
||||
benefits before turning this on.\n
|
||||
- "xxhash_cbor" combines canonical CBOR serialization with xxHash for
|
||||
reproducible hashing. Requires the optional ``xxhash`` package."""
|
||||
cpu_offload_gb: float = Field(default=0, ge=0)
|
||||
"""The space in GiB to offload to CPU, per GPU. Default is 0, which means
|
||||
no offloading. Intuitively, this argument can be seen as a virtual way to
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
import enum
|
||||
from collections import Counter
|
||||
from collections.abc import Callable
|
||||
from dataclasses import asdict, field
|
||||
from dataclasses import field
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, ClassVar, Literal
|
||||
|
||||
@ -13,7 +13,7 @@ from pydantic.dataclasses import dataclass
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
|
||||
from vllm.config.utils import config, handle_deprecated
|
||||
from vllm.config.utils import config, get_hash_factors, handle_deprecated, hash_factors
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.import_utils import resolve_obj_by_qualname
|
||||
@ -196,7 +196,16 @@ class PassConfig:
|
||||
Any new fields that affect compilation should be added to the hash.
|
||||
Any future fields that don't affect compilation should be excluded.
|
||||
"""
|
||||
return InductorPass.hash_dict(asdict(self))
|
||||
|
||||
ignored_fields = [
|
||||
"enable_fusion",
|
||||
"enable_attn_fusion",
|
||||
"enable_noop",
|
||||
"enable_sequence_parallelism",
|
||||
"enable_async_tp",
|
||||
"enable_fi_allreduce_fusion",
|
||||
]
|
||||
return hash_factors(get_hash_factors(self, ignored_factors=ignored_fields))
|
||||
|
||||
@field_validator(
|
||||
"fuse_norm_quant",
|
||||
@ -267,14 +276,6 @@ class PassConfig:
|
||||
"v0.13.0 or v1.0.0, whichever is sooner",
|
||||
)
|
||||
|
||||
# Force old flags to None to ensure they are not used
|
||||
self.enable_fusion = None
|
||||
self.enable_attn_fusion = None
|
||||
self.enable_noop = None
|
||||
self.enable_sequence_parallelism = None
|
||||
self.enable_async_tp = None
|
||||
self.enable_fi_allreduce_fusion = None
|
||||
|
||||
if not self.eliminate_noops:
|
||||
if self.fuse_norm_quant or self.fuse_act_quant:
|
||||
logger.warning_once(
|
||||
|
||||
@ -84,7 +84,7 @@ TaskOption = Literal[
|
||||
"transcription",
|
||||
"draft",
|
||||
]
|
||||
TokenizerMode = Literal["auto", "hf", "slow", "mistral"]
|
||||
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
|
||||
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
|
||||
LogprobsMode = Literal[
|
||||
"raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
|
||||
@ -141,6 +141,7 @@ class ModelConfig:
|
||||
- "hf" will use the fast tokenizer if available.\n
|
||||
- "slow" will always use the slow tokenizer.\n
|
||||
- "mistral" will always use the tokenizer from `mistral_common`.\n
|
||||
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
|
||||
- Other custom values can be supported via plugins."""
|
||||
trust_remote_code: bool = False
|
||||
"""Trust remote code (e.g., from HuggingFace) when downloading the model
|
||||
@ -1779,20 +1780,22 @@ class ModelConfig:
|
||||
return False
|
||||
elif attn_type == "decoder":
|
||||
pooling_type = self.pooler_config.pooling_type.lower()
|
||||
if pooling_type in ["all", "mean", "step", "cls"]:
|
||||
if pooling_type in ["mean", "step", "cls"]:
|
||||
logger.debug(
|
||||
"Pooling models with %s pooling does not "
|
||||
"support chunked prefill.",
|
||||
pooling_type,
|
||||
)
|
||||
return False
|
||||
else:
|
||||
# pooling_type == "last"
|
||||
elif pooling_type in ["all", "last"]:
|
||||
logger.debug(
|
||||
"Pooling models with causal attn and last pooling support "
|
||||
"chunked prefill."
|
||||
"Pooling models with causal attn and %s pooling support "
|
||||
"chunked prefill.",
|
||||
pooling_type,
|
||||
)
|
||||
return True
|
||||
else:
|
||||
raise ValueError(f"{pooling_type=} not supported.")
|
||||
# vllm currently does not have pooling models using hybrid,
|
||||
# attention_free or encoder_decoder attn types.
|
||||
return attn_type != "encoder_decoder"
|
||||
@ -1816,20 +1819,22 @@ class ModelConfig:
|
||||
return False
|
||||
elif attn_type == "decoder":
|
||||
pooling_type = self.pooler_config.pooling_type.lower()
|
||||
if pooling_type in ["all", "mean", "step", "cls"]:
|
||||
if pooling_type in ["mean", "step", "cls"]:
|
||||
logger.debug(
|
||||
"Pooling models with %s pooling does not "
|
||||
"support prefix caching.",
|
||||
pooling_type,
|
||||
)
|
||||
return False
|
||||
else:
|
||||
# pooling_type == "last"
|
||||
elif pooling_type in ["all", "last"]:
|
||||
logger.debug(
|
||||
"Pooling models with causal attn and last pooling support "
|
||||
"prefix caching."
|
||||
"Pooling models with causal attn and %s pooling support "
|
||||
"prefix caching.",
|
||||
pooling_type,
|
||||
)
|
||||
return True
|
||||
else:
|
||||
raise ValueError(f"{pooling_type=} not supported.")
|
||||
# vllm currently does not have pooling models using hybrid,
|
||||
# attention_free or encoder_decoder attn types.
|
||||
return False
|
||||
|
||||
@ -55,6 +55,10 @@ class ObservabilityConfig:
|
||||
kv_cache_metrics_sample: float = Field(default=0.01, gt=0, le=1)
|
||||
"""Sampling rate for KV cache metrics (0.0, 1.0]. Default 0.01 = 1% of blocks."""
|
||||
|
||||
cudagraph_metrics: bool = False
|
||||
"""Enable CUDA graph metrics (number of padded/unpadded tokens, runtime cudagraph
|
||||
dispatch modes, and their observed frequencies at every logging interval)."""
|
||||
|
||||
@cached_property
|
||||
def collect_model_forward_time(self) -> bool:
|
||||
"""Whether to collect model forward time for the request."""
|
||||
|
||||
@ -593,10 +593,14 @@ class ParallelConfig:
|
||||
"max_parallel_loading_workers is currently "
|
||||
"not supported and will be ignored."
|
||||
)
|
||||
if self.distributed_executor_backend not in ("mp", "uni") and self.nnodes > 1:
|
||||
allowed_backends = ("mp", "uni", "external_launcher")
|
||||
if (
|
||||
self.distributed_executor_backend not in allowed_backends
|
||||
and self.nnodes > 1
|
||||
):
|
||||
raise ValueError(
|
||||
"nnodes > 1 can only be set when distributed executor "
|
||||
"backend is mp or uni."
|
||||
"backend is mp, uni or external_launcher."
|
||||
)
|
||||
|
||||
@property
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user