Merge branch 'main' into woosuk/test-router

This commit is contained in:
Woosuk Kwon 2025-10-16 00:32:13 +00:00
commit 8935ca208d
248 changed files with 4643 additions and 2419 deletions

View File

@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.419
- name: "exact_match,flexible-extract"
value: 0.416
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# For vllm-vlm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
value: 0.90
limit: 100
num_fewshot: 0

View File

@ -0,0 +1,11 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "mmlu_pro"
metrics:
- name: "exact_match,custom-extract"
value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5

View File

@ -1,4 +1,5 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1 # For vllm script, with -t option (tensor parallel size)
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic" model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks: tasks:
- name: "gsm8k" - name: "gsm8k"

View File

@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
value: 0.855
limit: 2500
num_fewshot: 0

View File

@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml

View File

@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml

View File

@ -0,0 +1 @@
Qwen2.5-VL-7B-Instruct.yaml

View File

@ -0,0 +1,44 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.9
# Print the command-line help text for this script to stdout.
usage() {
    echo
    echo "Runs lm eval harness on ChartQA using multimodal vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our correctness tests in vllm's CI."
    echo
    echo "usage: ${0} <options>"
    echo
    echo " -m - huggingface stub or local directory of the model"
    echo " -l - limit number of samples to run"
    echo " -t - tensor parallel size to run at"
    echo
}

# Parse command-line options; any unknown flag prints the help and aborts.
while getopts "m:l:t:" OPT; do
    case ${OPT} in
        m )
            MODEL="$OPTARG"
            ;;
        l )
            LIMIT="$OPTARG"
            ;;
        t )
            TP_SIZE="$OPTARG"
            ;;
        \? )
            usage
            exit 1
            ;;
    esac
done

# Fail early with a clear message instead of handing lm_eval empty values
# (an empty "pretrained=", "tensor_parallel_size=" or a bare "--limit").
if [[ -z "${MODEL:-}" || -z "${LIMIT:-}" || -z "${TP_SIZE:-}" ]]; then
    echo "error: -m, -l and -t are all required" >&2
    usage
    exit 1
fi

# Quote every expansion so values containing spaces or globs stay one argument.
lm_eval --model vllm-vlm \
    --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
    --tasks chartqa \
    --batch_size auto \
    --apply_chat_template \
    --limit "$LIMIT"

View File

View File

@ -0,0 +1,50 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
# Print the command-line help text for this script to stdout.
usage() {
    echo
    echo "Runs lm eval harness on MMLU Pro using vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo " -m - huggingface stub or local directory of the model"
    echo " -b - batch size to run the evaluation at (default: auto)"
    echo " -l - limit number of samples to run"
    echo " -f - number of fewshot samples to use"
    echo " -t - tensor parallel size to run at"
    echo
}

# Parse command-line options; any unknown flag prints the help and aborts.
while getopts "m:b:l:f:t:" OPT; do
    case ${OPT} in
        m )
            MODEL="$OPTARG"
            ;;
        b )
            BATCH_SIZE="$OPTARG"
            ;;
        l )
            LIMIT="$OPTARG"
            ;;
        f )
            FEWSHOT="$OPTARG"
            ;;
        t )
            TP_SIZE="$OPTARG"
            ;;
        \? )
            usage
            exit 1
            ;;
    esac
done

# Fail early with a clear message instead of handing lm_eval empty values.
if [[ -z "${MODEL:-}" || -z "${LIMIT:-}" || -z "${FEWSHOT:-}" || -z "${TP_SIZE:-}" ]]; then
    echo "error: -m, -l, -f and -t are all required" >&2
    usage
    exit 1
fi

# -b was previously parsed but ignored; honor it and fall back to "auto"
# (the former hard-coded value) when it is not given.
lm_eval --model vllm \
    --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
    --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
    --batch_size "${BATCH_SIZE:-auto}"

View File

@ -19,21 +19,27 @@ RTOL = 0.08
def launch_lm_eval(eval_config, tp_size): def launch_lm_eval(eval_config, tp_size):
trust_remote_code = eval_config.get("trust_remote_code", False) trust_remote_code = eval_config.get("trust_remote_code", False)
max_model_len = eval_config.get("max_model_len", 4096) max_model_len = eval_config.get("max_model_len", 4096)
batch_size = eval_config.get("batch_size", "auto")
backend = eval_config.get("backend", "vllm")
model_args = ( model_args = (
f"pretrained={eval_config['model_name']}," f"pretrained={eval_config['model_name']},"
f"tensor_parallel_size={tp_size}," f"tensor_parallel_size={tp_size},"
f"enforce_eager=true," f"enforce_eager=true,"
f"add_bos_token=true," f"add_bos_token=true,"
f"trust_remote_code={trust_remote_code}," f"trust_remote_code={trust_remote_code},"
f"max_model_len={max_model_len}" f"max_model_len={max_model_len},"
) )
results = lm_eval.simple_evaluate( results = lm_eval.simple_evaluate(
model="vllm", model=backend,
model_args=model_args, model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]], tasks=[task["name"] for task in eval_config["tasks"]],
num_fewshot=eval_config["num_fewshot"], num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"], limit=eval_config["limit"],
batch_size="auto", # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
# text models. however, this is regressing measured strict-match for
# existing text models in CI, so only apply it for mm.
apply_chat_template=backend == "vllm-vlm",
batch_size=batch_size,
) )
return results return results

View File

@ -63,7 +63,7 @@ steps:
- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins - label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
timeout_in_minutes: 10 timeout_in_minutes: 10
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
@ -353,7 +353,7 @@ steps:
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: V1 Test others (CPU) # 5 mins - label: V1 Test others (CPU) # 5 mins
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
@ -459,6 +459,7 @@ steps:
- pytest -v -s compile/test_fusion_all_reduce.py - pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py - pytest -v -s compile/test_decorator.py
- pytest -v -s compile/test_noop_elimination.py - pytest -v -s compile/test_noop_elimination.py
- pytest -v -s compile/test_aot_compile.py
- label: PyTorch Fullgraph Smoke Test # 15min - label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30 timeout_in_minutes: 30
@ -487,14 +488,14 @@ steps:
- label: Kernels Core Operation Test # 48min - label: Kernels Core Operation Test # 48min
timeout_in_minutes: 75 timeout_in_minutes: 75
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- tests/kernels/core - tests/kernels/core
commands: commands:
- pytest -v -s kernels/core - pytest -v -s kernels/core kernels/test_top_k_per_row.py
- label: Kernels Attention Test %N # 23min - label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35 timeout_in_minutes: 35
@ -603,7 +604,8 @@ steps:
# since torchao nightly is only compatible with torch nightly currently # since torchao nightly is only compatible with torch nightly currently
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved # we can only upgrade after this is resolved
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 # TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
- label: LM Eval Small Models # 53min - label: LM Eval Small Models # 53min
@ -631,7 +633,7 @@ steps:
- label: OpenAI-Compatible Tool Use # 23 min - label: OpenAI-Compatible Tool Use # 23 min
timeout_in_minutes: 35 timeout_in_minutes: 35
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
fast_check: false fast_check: false

View File

@ -527,7 +527,8 @@ steps:
# since torchao nightly is only compatible with torch nightly currently # since torchao nightly is only compatible with torch nightly currently
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved # we can only upgrade after this is resolved
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 # TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
- label: LM Eval Small Models # 53min - label: LM Eval Small Models # 53min
@ -733,6 +734,16 @@ steps:
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
timeout_in_minutes: 70
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
- vllm/inputs/
- vllm/v1/core/
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models Test (Extended) 1 - label: Multi-Modal Models Test (Extended) 1
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
optional: true optional: true

3
.github/CODEOWNERS vendored
View File

@ -5,9 +5,7 @@
/vllm/attention @LucasWilkinson /vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin /vllm/model_executor/layers/fused_moe @mgoin
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep /vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn /vllm/model_executor/model_loader @22quinn
@ -26,7 +24,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345 /vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1 # vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/attention @LucasWilkinson /vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backends/flashinfer.py @mgoin /vllm/v1/attention/backends/flashinfer.py @mgoin
/vllm/v1/attention/backends/triton_attn.py @tdoublep /vllm/v1/attention/backends/triton_attn.py @tdoublep

View File

@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
else: else:
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size") ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
shard_intermediate_size = 2 * intermediate_size // args.tp_size shard_intermediate_size = 2 * intermediate_size // args.tp_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16" use_int8_w8a16 = args.dtype == "int8_w8a16"
block_quant_shape = get_weight_block_size_safety(config) block_quant_shape = get_weight_block_size_safety(config)

View File

@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok topk = config.num_experts_per_tok
hidden_size = config.hidden_size hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16" use_int8_w8a16 = args.dtype == "int8_w8a16"
use_customized_permute = args.use_customized_permute use_customized_permute = args.use_customized_permute

View File

@ -22,10 +22,10 @@ else()
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
BUILD_COMMAND "" BUILD_COMMAND ""
) )
FetchContent_Populate(qutlass)
set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
endif() endif()
FetchContent_Populate(qutlass)
if(NOT qutlass_SOURCE_DIR) if(NOT qutlass_SOURCE_DIR)
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.") message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
endif() endif()

View File

@ -2,6 +2,7 @@
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "cub_helpers.h" #include "cub_helpers.h"
#include "core/batch_invariant.hpp" #include "core/batch_invariant.hpp"
#include "quantization/vectorization_utils.cuh"
#include <torch/cuda.h> #include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
@ -18,11 +19,22 @@ __global__ void rms_norm_kernel(
const float epsilon, const int num_tokens, const int hidden_size) { const float epsilon, const int num_tokens, const int hidden_size) {
__shared__ float s_variance; __shared__ float s_variance;
float variance = 0.0f; float variance = 0.0f;
const scalar_t* input_row = input + blockIdx.x * input_stride;
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { constexpr int VEC_SIZE = 8;
const float x = (float)input[blockIdx.x * input_stride + idx]; auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
float x = static_cast<float>(vec.val[i]);
variance += x * x;
}
};
auto scalar_op = [&variance](const scalar_t& val) {
float x = static_cast<float>(val);
variance += x * x; variance += x * x;
} };
vllm::vectorize_read_with_alignment<VEC_SIZE>(
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
using BlockReduce = cub::BlockReduce<float, 1024>; using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore; __shared__ typename BlockReduce::TempStorage reduceStore;

View File

@ -10,6 +10,7 @@
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "cub_helpers.h" #include "cub_helpers.h"
#include "core/batch_invariant.hpp" #include "core/batch_invariant.hpp"
#include "quantization/vectorization_utils.cuh"
#include <torch/cuda.h> #include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
@ -28,10 +29,22 @@ __global__ void rms_norm_static_fp8_quant_kernel(
__shared__ float s_variance; __shared__ float s_variance;
float variance = 0.0f; float variance = 0.0f;
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { const scalar_t* input_row = input + blockIdx.x * input_stride;
const float x = (float)input[blockIdx.x * input_stride + idx];
constexpr int VEC_SIZE = 8;
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
float x = static_cast<float>(vec.val[i]);
variance += x * x;
}
};
auto scalar_op = [&variance](const scalar_t& val) {
float x = static_cast<float>(val);
variance += x * x; variance += x * x;
} };
vllm::vectorize_read_with_alignment<VEC_SIZE>(
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
using BlockReduce = cub::BlockReduce<float, 1024>; using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore; __shared__ typename BlockReduce::TempStorage reduceStore;

View File

@ -58,12 +58,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc
```python ```python
from vllm import LLM from vllm import LLM
from vllm.config import CompilationConfig, CompilationLevel from vllm.config import CompilationConfig, CompilationMode
llm = LLM( llm = LLM(
model="meta-llama/Llama-3.1-8B-Instruct", model="meta-llama/Llama-3.1-8B-Instruct",
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
# By default, it goes up to max_num_seqs # By default, it goes up to max_num_seqs
cudagraph_capture_sizes=[1, 2, 4, 8, 16], cudagraph_capture_sizes=[1, 2, 4, 8, 16],
), ),

View File

@ -167,7 +167,7 @@ class AttentionCGSupport(enum.Enum):
"""NO CUDA Graphs support""" """NO CUDA Graphs support"""
``` ```
Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture]. Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
The following table lists backends that support full CUDA Graphs at the time of writing. The following table lists backends that support full CUDA Graphs at the time of writing.
@ -202,7 +202,7 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG")
import vllm import vllm
from vllm.config import CUDAGraphMode from vllm.config import CUDAGraphMode
compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
model = vllm.LLM( model = vllm.LLM(
model="meta-llama/Llama-3.1-8B-Instruct", model="meta-llama/Llama-3.1-8B-Instruct",
dtype="auto", dtype="auto",

View File

@ -22,13 +22,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
from awq import AutoAWQForCausalLM from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer from transformers import AutoTokenizer
model_path = 'mistralai/Mistral-7B-Instruct-v0.2' model_path = "mistralai/Mistral-7B-Instruct-v0.2"
quant_path = 'mistral-instruct-v0.2-awq' quant_path = "mistral-instruct-v0.2-awq"
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
# Load model # Load model
model = AutoAWQForCausalLM.from_pretrained( model = AutoAWQForCausalLM.from_pretrained(
model_path, **{"low_cpu_mem_usage": True, "use_cache": False} model_path,
low_cpu_mem_usage=True,
use_cache=False,
) )
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

View File

@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound from auto_round import AutoRound
model_name = "Qwen/Qwen3-0.6B" model_name = "Qwen/Qwen3-0.6B"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
bits, group_size, sym = 4, 128, True bits, group_size, sym = 4, 128, True

View File

@ -34,7 +34,7 @@ llm = LLM(
model=model_id, model=model_id,
dtype=torch.bfloat16, dtype=torch.bfloat16,
trust_remote_code=True, trust_remote_code=True,
quantization="bitblas" quantization="bitblas",
) )
``` ```
@ -53,6 +53,6 @@ llm = LLM(
dtype=torch.float16, dtype=torch.float16,
trust_remote_code=True, trust_remote_code=True,
quantization="bitblas", quantization="bitblas",
max_model_len=1024 max_model_len=1024,
) )
``` ```

View File

@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit"
llm = LLM( llm = LLM(
model=model_id, model=model_id,
dtype=torch.bfloat16, dtype=torch.bfloat16,
trust_remote_code=True trust_remote_code=True,
) )
``` ```
@ -43,7 +43,7 @@ llm = LLM(
model=model_id, model=model_id,
dtype=torch.bfloat16, dtype=torch.bfloat16,
trust_remote_code=True, trust_remote_code=True,
quantization="bitsandbytes" quantization="bitsandbytes",
) )
``` ```

View File

@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto", MODEL_ID,
device_map="auto",
dtype="auto",
) )
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
``` ```
@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio
# Configure the simple PTQ quantization # Configure the simple PTQ quantization
recipe = QuantizationModifier( recipe = QuantizationModifier(
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) targets="Linear",
scheme="FP8_DYNAMIC",
ignore=["lm_head"],
)
# Apply the quantization algorithm. # Apply the quantization algorithm.
oneshot(model=model, recipe=recipe) oneshot(model=model, recipe=recipe)

View File

@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint:
conversation = [ conversation = [
{ {
"role": "system", "role": "system",
"content": "You are a helpful assistant" "content": "You are a helpful assistant",
}, },
{ {
"role": "user", "role": "user",
"content": "Hello" "content": "Hello",
}, },
{ {
"role": "assistant", "role": "assistant",
"content": "Hello! How can I assist you today?" "content": "Hello! How can I assist you today?",
}, },
{ {
"role": "user", "role": "user",
@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM. # Create an LLM.
llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", llm = LLM(
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects # Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information. # that contain the prompt, generated text, and other information.
outputs = llm.chat(conversation, sampling_params) outputs = llm.chat(conversation, sampling_params)

View File

@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
calibration_dataset = load_dataset( calibration_dataset = load_dataset(
"allenai/c4", "allenai/c4",
data_files="en/c4-train.00001-of-01024.json.gz", data_files="en/c4-train.00001-of-01024.json.gz",
split="train" split="train",
).select(range(1024))["text"] ).select(range(1024))["text"]
quant_config = QuantizeConfig(bits=4, group_size=128) quant_config = QuantizeConfig(bits=4, group_size=128)

View File

@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto", MODEL_ID,
device_map="auto",
dtype="auto",
) )
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
``` ```
@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y
}, },
ignore=["lm_head"], ignore=["lm_head"],
update_size=NUM_CALIBRATION_SAMPLES, update_size=NUM_CALIBRATION_SAMPLES,
dampening_frac=0.01 dampening_frac=0.01,
) )
``` ```

View File

@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto", MODEL_ID,
device_map="auto",
dtype="auto",
) )
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
``` ```

View File

@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
def main(): def main():
model_id = "nvidia/Llama-3.1-8B-Instruct-FP8" model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
# Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True) llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
sampling_params = SamplingParams(temperature=0.8, top_p=0.9) sampling_params = SamplingParams(temperature=0.8, top_p=0.9)

View File

@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization:
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
sampling_params = SamplingParams(temperature=0.7, top_p=0.8) sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", llm = LLM(
kv_cache_dtype="fp8", model="meta-llama/Llama-2-7b-chat-hf",
calculate_kv_scales=True) kv_cache_dtype="fp8",
calculate_kv_scales=True,
)
prompt = "London is the capital of" prompt = "London is the capital of"
out = llm.generate(prompt, sampling_params)[0].outputs[0].text out = llm.generate(prompt, sampling_params)[0].outputs[0].text
print(out) print(out)
@ -80,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models
# Select model and load it # Select model and load it
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto") model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Select calibration dataset # Select calibration dataset

View File

@ -48,7 +48,9 @@ to fetch model and tokenizer.
MAX_SEQ_LEN = 512 MAX_SEQ_LEN = 512
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto", MODEL_ID,
device_map="auto",
dtype="auto",
) )
model.eval() model.eval()
@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
text_data = dataset["text"][:NUM_CALIBRATION_DATA] text_data = dataset["text"][:NUM_CALIBRATION_DATA]
tokenized_outputs = tokenizer(text_data, return_tensors="pt", tokenized_outputs = tokenizer(
padding=True, truncation=True, max_length=MAX_SEQ_LEN) text_data,
calib_dataloader = DataLoader(tokenized_outputs['input_ids'], return_tensors="pt",
batch_size=BATCH_SIZE, drop_last=True) padding=True,
truncation=True,
max_length=MAX_SEQ_LEN,
)
calib_dataloader = DataLoader(
tokenized_outputs['input_ids'],
batch_size=BATCH_SIZE,
drop_last=True,
)
``` ```
### 3. Set the Quantization Configuration ### 3. Set the Quantization Configuration
@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
load_quant_algo_config_from_file) load_quant_algo_config_from_file)
# Define fp8/per-tensor/static spec. # Define fp8/per-tensor/static spec.
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max", FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
is_dynamic=False).to_quantization_spec() observer_method="min_max",
is_dynamic=False,
).to_quantization_spec()
# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC. # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC, global_quant_config = QuantizationConfig(
weight=FP8_PER_TENSOR_SPEC) input_tensors=FP8_PER_TENSOR_SPEC,
weight=FP8_PER_TENSOR_SPEC,
)
# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC. # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"] kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
kv_cache_quant_config = {name : kv_cache_quant_config = {
QuantizationConfig(input_tensors=global_quant_config.input_tensors, name: QuantizationConfig(
weight=global_quant_config.weight, input_tensors=global_quant_config.input_tensors,
output_tensors=KV_CACHE_SPEC) weight=global_quant_config.weight,
for name in kv_cache_layer_names_for_llama} output_tensors=KV_CACHE_SPEC,
)
for name in kv_cache_layer_names_for_llama
}
layer_quant_config = kv_cache_quant_config.copy() layer_quant_config = kv_cache_quant_config.copy()
# Define algorithm config by config file. # Define algorithm config by config file.
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json"
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE) algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
EXCLUDE_LAYERS = ["lm_head"] EXCLUDE_LAYERS = ["lm_head"]
@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
layer_quant_config=layer_quant_config, layer_quant_config=layer_quant_config,
kv_cache_quant_config=kv_cache_quant_config, kv_cache_quant_config=kv_cache_quant_config,
exclude=EXCLUDE_LAYERS, exclude=EXCLUDE_LAYERS,
algo_config=algo_config) algo_config=algo_config,
)
``` ```
### 4. Quantize the Model and Export ### 4. Quantize the Model and Export
@ -165,8 +182,11 @@ for more exporting format details.
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant" EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR) exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
with torch.no_grad(): with torch.no_grad():
exporter.export_safetensors_model(freezed_model, exporter.export_safetensors_model(
quant_config=quant_config, tokenizer=tokenizer) freezed_model,
quant_config=quant_config,
tokenizer=tokenizer,
)
``` ```
### 5. Evaluation in vLLM ### 5. Evaluation in vLLM
@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM. # Create an LLM.
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant", llm = LLM(
kv_cache_dtype='fp8',quantization='quark') model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
kv_cache_dtype="fp8",
quantization="quark",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects # Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information. # that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)

View File

@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
quantization_config = TorchAoConfig(Int8WeightOnlyConfig()) quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained( quantized_model = AutoModelForCausalLM.from_pretrained(
model_name, model_name,
torch_dtype="auto", dtype="auto",
device_map="auto", device_map="auto",
quantization_config=quantization_config quantization_config=quantization_config
) )

View File

@ -11,6 +11,7 @@ vLLM currently supports the following reasoning models:
| Model Series | Parser Name | Structured Output Support | Tool Calling | | Model Series | Parser Name | Structured Output Support | Tool Calling |
|--------------|-------------|------------------|-------------| |--------------|-------------|------------------|-------------|
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ | | [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
| [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ |
| [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ | | [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
| [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ | | [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ | | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
@ -20,8 +21,9 @@ vLLM currently supports the following reasoning models:
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ | | [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
!!! note !!! note
IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`. The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
DeepSeek-V3.1 tool calling is supported in non-thinking mode.
## Quickstart ## Quickstart

View File

@ -352,6 +352,16 @@ Supported models:
Flags: `--tool-call-parser qwen3_xml` Flags: `--tool-call-parser qwen3_xml`
### Olmo 3 Models (`olmo3`)
Olmo 3 models output tool calls in a format that is very similar to the one expected by the `pythonic` parser (see below), with a few differences. Each tool call is a pythonic string, but parallel tool calls are newline-delimited, and the calls are wrapped within XML tags as `<function_calls>..</function_calls>`. The parser also accepts JSON boolean and null literals (`true`, `false`, and `null`) in addition to the pythonic ones (`True`, `False`, and `None`).
Supported models:
* TODO (will be updated after Olmo 3 release)
Flags: `--tool-call-parser olmo3`
### Models with Pythonic Tool Calls (`pythonic`) ### Models with Pythonic Tool Calls (`pythonic`)
A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models. A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.

View File

@ -23,7 +23,46 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
# --8<-- [end:pre-built-wheels] # --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source] # --8<-- [start:build-wheel-from-source]
--8<-- "docs/getting_started/installation/cpu/build.inc.md:extra-information" First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:
```bash
sudo apt-get update -y
sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
```
Second, clone the vLLM project:
```bash
git clone https://github.com/vllm-project/vllm.git vllm_source
cd vllm_source
```
Third, install required dependencies:
```bash
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
uv pip install -r requirements/cpu.txt --torch-backend cpu
```
??? console "pip"
```bash
pip install --upgrade pip
pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
```
Finally, build and install vLLM:
```bash
VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
```
If you want to develop vLLM, install it in editable mode instead.
```bash
VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
```
Testing has been conducted on AWS Graviton3 instances for compatibility. Testing has been conducted on AWS Graviton3 instances for compatibility.

View File

@ -1,44 +0,0 @@
# --8<-- [start:extra-information]
First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
```bash
sudo apt-get update -y
sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
```
Second, clone the vLLM project:
```bash
git clone https://github.com/vllm-project/vllm.git vllm_source
cd vllm_source
```
Third, install required dependencies:
```bash
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
uv pip install -r requirements/cpu.txt --torch-backend cpu
```
??? console "pip"
```bash
pip install --upgrade pip
pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
```
Finally, build and install vLLM:
```bash
VLLM_TARGET_DEVICE=cpu python setup.py install
```
If you want to develop vLLM, install it in editable mode instead.
```bash
VLLM_TARGET_DEVICE=cpu python setup.py develop
```
# --8<-- [end:extra-information]

View File

@ -194,8 +194,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
api_key=openai_api_key, api_key=openai_api_key,
base_url=openai_api_base, base_url=openai_api_base,
) )
completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", completion = client.completions.create(
prompt="San Francisco is a") model="Qwen/Qwen2.5-1.5B-Instruct",
prompt="San Francisco is a",
)
print("Completion result:", completion) print("Completion result:", completion)
``` ```
@ -239,7 +241,7 @@ Alternatively, you can use the `openai` Python package:
messages=[ messages=[
{"role": "system", "content": "You are a helpful assistant."}, {"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Tell me a joke."}, {"role": "user", "content": "Tell me a joke."},
] ],
) )
print("Chat response:", chat_response) print("Chat response:", chat_response)
``` ```

View File

@ -22,6 +22,11 @@ sys.modules["vllm._C"] = MagicMock()
class PydanticMagicMock(MagicMock): class PydanticMagicMock(MagicMock):
"""`MagicMock` that's able to generate pydantic-core schemas.""" """`MagicMock` that's able to generate pydantic-core schemas."""
def __init__(self, *args, **kwargs):
name = kwargs.pop("name", None)
super().__init__(*args, **kwargs)
self.__spec__ = importlib.machinery.ModuleSpec(name, None)
def __get_pydantic_core_schema__(self, source_type, handler): def __get_pydantic_core_schema__(self, source_type, handler):
return core_schema.any_schema() return core_schema.any_schema()
@ -42,7 +47,9 @@ def auto_mock(module, attr, max_mocks=50):
raise e raise e
except ModuleNotFoundError as e: except ModuleNotFoundError as e:
logger.info("Mocking %s for argparse doc generation", e.name) logger.info("Mocking %s for argparse doc generation", e.name)
sys.modules[e.name] = PydanticMagicMock() sys.modules[e.name] = PydanticMagicMock(name=e.name)
except Exception as e:
logger.warning("Failed to import %s.%s: %s", module, attr, e)
raise ImportError( raise ImportError(
f"Failed to import {module}.{attr} after mocking {max_mocks} imports" f"Failed to import {module}.{attr} after mocking {max_mocks} imports"

View File

@ -60,7 +60,7 @@ from vllm import LLM
llm = LLM( llm = LLM(
"s3://my-bucket/vllm/facebook/opt-125m/v1", "s3://my-bucket/vllm/facebook/opt-125m/v1",
load_format="tensorizer", load_format="tensorizer",
enable_lora=True enable_lora=True,
) )
``` ```
@ -97,6 +97,6 @@ llm = LLM(
"s3://my-bucket/vllm/facebook/opt-125m/v1", "s3://my-bucket/vllm/facebook/opt-125m/v1",
load_format="tensorizer", load_format="tensorizer",
enable_lora=True, enable_lora=True,
model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}} model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}},
) )
``` ```

View File

@ -98,15 +98,15 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
conversation = [ conversation = [
{ {
"role": "system", "role": "system",
"content": "You are a helpful assistant" "content": "You are a helpful assistant",
}, },
{ {
"role": "user", "role": "user",
"content": "Hello" "content": "Hello",
}, },
{ {
"role": "assistant", "role": "assistant",
"content": "Hello! How can I assist you today?" "content": "Hello! How can I assist you today?",
}, },
{ {
"role": "user", "role": "user",

View File

@ -130,8 +130,10 @@ It is designed for embedding models and cross-encoder models. Embedding models u
from vllm import LLM from vllm import LLM
llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling") llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling")
(output,) = llm.score("What is the capital of France?", (output,) = llm.score(
"The capital of Brazil is Brasilia.") "What is the capital of France?",
"The capital of Brazil is Brasilia.",
)
score = output.outputs.score score = output.outputs.score
print(f"Score: {score}") print(f"Score: {score}")
@ -209,7 +211,7 @@ For models that support Matryoshka Embeddings but not recognized by vLLM, please
Here is an example to serve a model with Matryoshka Embeddings enabled. Here is an example to serve a model with Matryoshka Embeddings enabled.
```text ```bash
vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}' vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}'
``` ```
@ -220,11 +222,15 @@ You can change the output dimensions of embedding models that support Matryoshka
```python ```python
from vllm import LLM, PoolingParams from vllm import LLM, PoolingParams
llm = LLM(model="jinaai/jina-embeddings-v3", llm = LLM(
runner="pooling", model="jinaai/jina-embeddings-v3",
trust_remote_code=True) runner="pooling",
outputs = llm.embed(["Follow the white rabbit."], trust_remote_code=True,
pooling_params=PoolingParams(dimensions=32)) )
outputs = llm.embed(
["Follow the white rabbit."],
pooling_params=PoolingParams(dimensions=32),
)
print(outputs[0].outputs) print(outputs[0].outputs)
``` ```
@ -234,13 +240,13 @@ A code example can be found here: <gh-file:examples/offline_inference/pooling/em
Use the following command to start vllm server. Use the following command to start vllm server.
```text ```bash
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
``` ```
You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter. You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter.
```text ```bash
curl http://127.0.0.1:8000/v1/embeddings \ curl http://127.0.0.1:8000/v1/embeddings \
-H 'accept: application/json' \ -H 'accept: application/json' \
-H 'Content-Type: application/json' \ -H 'Content-Type: application/json' \

View File

@ -278,8 +278,8 @@ https_proxy=http://your.proxy.server:port vllm serve <model_name>
```python ```python
import os import os
os.environ['http_proxy'] = 'http://your.proxy.server:port' os.environ["http_proxy"] = "http://your.proxy.server:port"
os.environ['https_proxy'] = 'http://your.proxy.server:port' os.environ["https_proxy"] = "http://your.proxy.server:port"
``` ```
### ModelScope ### ModelScope

View File

@ -0,0 +1,47 @@
# Context Parallel Deployment
Context parallel mainly solves the problem of serving long context requests. As prefill and decode present quite different characteristics and have quite different SLO (service level objectives), we need to implement context parallel separately for them. The major considerations are:
- For long context prefill, we need to control the TTFT (time to first token) by amortizing the computation time of the prefill across query tokens.
- For long context decode, we need more space for KV cache to increase the batch size (and hence the throughput).
## Prefill Context Parallel
During prefill, for a long request with `T` new tokens, we need to compute query/key/value tensors for these new tokens. Say we have `N` GPUs, we can split the request into `N` chunks, and each GPU computes one chunk of the query/key/value tensors.
Depending on the use case, there are two possible strategies:
1. Partial query, full key/value: If the request token length is moderately long (we can afford holding the full key/value tensors), and the goal is to accelerate the prefill (and amortize the computation time of the prefill across query tokens), then we can gather the key/value tensors from all GPUs and let each GPU compute the attention output corresponding to the query tokens of its chunk.
2. Partial query, partial key/value: If the request token length is so long that we can no longer afford to hold the full key/value tensors, then each GPU can only compute its own chunk of the query/key/value tensors, and we use techniques like [ring-attention](http://arxiv.org/abs/2310.01889) to send/recv key/value tensors chunk by chunk.
Both approaches are under active development.
## Decode Context Parallel
Due to the auto-regressive nature of decoding, every decoding step needs to compute a small number of query tokens w.r.t. a large number of key/value tokens stored in the paged KV cache. The core of decode context parallel is how to shard the KV cache across GPUs.
For a model with `H` kv-heads, a request with `T` tokens in the context needs to store `H * T` key/value tensors in the KV cache.
1. If one GPU can hold them all, and the performance is good enough, then no parallelization is needed.
2. If one GPU cannot hold them all, or we want to hold more requests in the KV cache, we can first shard the KV cache along the `H` dimension, that's the plain tensor parallel sharding. It's as simple as adding `-tp <num_gpus>` to the command line.
3. Since `H` is limited (determined by the model architecture), when we continue to increase the tensor parallel size, the KV cache for each GPU will be duplicated `tp_size / H` times. Of course, duplication is not good for efficiency. Then we need to add decode context parallel to further shard the KV cache along the `T` dimension. This is as simple as adding `-dcp <size>` to the command line. Note that `size` does not increase the number of GPUs we need to launch, but just reduces the KV cache duplication. The dcp size should lie in the range of `[1, tp_size/H]`. With larger dcp size, the KV cache duplication is reduced, but the communication overhead increases.
Theoretically, it is possible to extend the dcp size beyond `tp_size / H` to further shard the KV cache and accelerate the decoding phase. However, since the number of query tokens is limited in decoding, it's unclear what we should do with the remaining `dcp_size - tp_size / H` GPUs for non-attention layers. For the sake of simplicity, dcp size is upper bounded by `tp_size / H`. If you want to further accelerate the decoding phase, you can consider increasing the `tp_size` first, and then increasing the dcp size.
Note that the KV cache can grow during decoding, and the sharding strategy needs to be carefully implemented. We use an interleaving strategy to shard the KV cache along the `T` dimension, so that the KV cache for future tokens can be naturally sharded along the `T` dimension. This is proposed by [Chao Hong from Moonshot](https://github.com/youzhedian), and also explained in detail in [this paper](http://arxiv.org/abs/2507.07120).
Case study:
For DeepSeek-R1, we have 1 kv-head when MLA is enabled. The typical single-node deployment with `-tp 8` causes 8x KV cache duplication. We can consider adding `-dcp 8` to reduce the KV cache duplication.
For Kimi-K2, the architecture is similar to DeepSeek-R1, but with more parameters. When we deploy it with `-tp 16`, the KV cache duplication is 16x. We can add `-dcp 16` to completely remove the KV cache duplication, at the cost of more communication overhead. We can also add `-dcp 8` to reduce the KV cache duplication to 2x. Although it still duplicates the KV cache twice, the communication overhead is smaller since the DCP communication only happens inside one node.
For Qwen3-235B-A22B, we have 4 kv-heads. When we deploy it with `-tp 8`, the KV cache duplication is 2x. Then we can add `-dcp 2` to remove the KV cache duplication.
In short, for decode context parallel, try to increase `-tp` size until you get satisfactory performance, and then add `-dcp` to reduce the KV cache duplication.
Decode context parallel is supported in vLLM, for both MLA and GQA models. Some attention backends also support the combination of decode context parallel and MTP (multi-token prediction) to further accelerate the decoding phase.
## Technical Discussions
The main discussions happen in the `#sig-context-parallel` channel of [vLLM Slack](https://slack.vllm.ai/).

View File

@ -243,10 +243,10 @@ try:
"remote_engine_id": None, # Will be populated by vLLM "remote_engine_id": None, # Will be populated by vLLM
"remote_block_ids": None, # Will be populated by vLLM "remote_block_ids": None, # Will be populated by vLLM
"remote_host": None, # Will be populated by vLLM "remote_host": None, # Will be populated by vLLM
"remote_port": None # Will be populated by vLLM "remote_port": None, # Will be populated by vLLM
} }
}, },
extra_headers={"X-Request-Id": request_id} extra_headers={"X-Request-Id": request_id},
) )
print("-" * 50) print("-" * 50)
@ -262,7 +262,7 @@ try:
extra_body={ extra_body={
"kv_transfer_params": prefill_response.kv_transfer_params # Pass KV cache info "kv_transfer_params": prefill_response.kv_transfer_params # Pass KV cache info
}, },
extra_headers={"X-Request-Id": request_id} # Same request ID extra_headers={"X-Request-Id": request_id}, # Same request ID
) )
print("-" * 50) print("-" * 50)

View File

@ -15,13 +15,15 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`
```python ```python
from langchain_community.llms import VLLM from langchain_community.llms import VLLM
llm = VLLM(model="mosaicml/mpt-7b", llm = VLLM(
trust_remote_code=True, # mandatory for hf models model="mosaicml/mpt-7b",
max_new_tokens=128, trust_remote_code=True, # mandatory for hf models
top_k=10, max_new_tokens=128,
top_p=0.95, top_k=10,
temperature=0.8, top_p=0.95,
# tensor_parallel_size=... # for distributed inference temperature=0.8,
# for distributed inference
# tensor_parallel_size=...,
) )
print(llm("What is the capital of France ?")) print(llm("What is the capital of France ?"))

View File

@ -24,8 +24,8 @@ To call the server, in your preferred text editor, create a script that uses an
completion = client.chat.completions.create( completion = client.chat.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct", model="NousResearch/Meta-Llama-3-8B-Instruct",
messages=[ messages=[
{"role": "user", "content": "Hello!"} {"role": "user", "content": "Hello!"},
] ],
) )
print(completion.choices[0].message) print(completion.choices[0].message)
@ -101,8 +101,13 @@ both a `type` and a `text` field. An example is provided below:
completion = client.chat.completions.create( completion = client.chat.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct", model="NousResearch/Meta-Llama-3-8B-Instruct",
messages=[ messages=[
{"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]} {
] "role": "user",
"content": [
{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"},
],
},
],
) )
``` ```
@ -130,11 +135,11 @@ Or directly merge them into the JSON payload if you are using HTTP call directly
completion = client.chat.completions.create( completion = client.chat.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct", model="NousResearch/Meta-Llama-3-8B-Instruct",
messages=[ messages=[
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"},
], ],
extra_body={ extra_body={
"structured_outputs": {"choice": ["positive", "negative"]} "structured_outputs": {"choice": ["positive", "negative"]},
} },
) )
``` ```
@ -149,11 +154,11 @@ with `--enable-request-id-headers`.
completion = client.chat.completions.create( completion = client.chat.completions.create(
model="NousResearch/Meta-Llama-3-8B-Instruct", model="NousResearch/Meta-Llama-3-8B-Instruct",
messages=[ messages=[
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"},
], ],
extra_headers={ extra_headers={
"x-request-id": "sentiment-classification-00001", "x-request-id": "sentiment-classification-00001",
} },
) )
print(completion._request_id) print(completion._request_id)
@ -162,7 +167,7 @@ with `--enable-request-id-headers`.
prompt="A robot may not injure a human being", prompt="A robot may not injure a human being",
extra_headers={ extra_headers={
"x-request-id": "completion-test", "x-request-id": "completion-test",
} },
) )
print(completion._request_id) print(completion._request_id)
``` ```
@ -403,7 +408,7 @@ The Transcriptions API supports uploading audio files in various formats includi
model="openai/whisper-large-v3-turbo", model="openai/whisper-large-v3-turbo",
file=audio_file, file=audio_file,
language="en", language="en",
response_format="verbose_json" response_format="verbose_json",
) )
print(transcription.text) print(transcription.text)
@ -812,22 +817,22 @@ You can pass multi-modal inputs to scoring models by passing `content` including
"model": "jinaai/jina-reranker-m0", "model": "jinaai/jina-reranker-m0",
"text_1": "slm markdown", "text_1": "slm markdown",
"text_2": { "text_2": {
"content": [ "content": [
{ {
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
}, },
}, },
{ {
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
}, },
}, },
] ],
}
}, },
},
) )
response.raise_for_status() response.raise_for_status()
response_json = response.json() response_json = response.json()

View File

@ -95,7 +95,7 @@ def parse_args():
parser.add_argument( parser.add_argument(
"--compilation-config", "--compilation-config",
type=int, type=int,
help=("Compilation optimization (O) level 0-3."), help=("Compilation optimization (O) mode 0-3."),
) )
parser.add_argument( parser.add_argument(
"--quantization", "--quantization",

View File

@ -152,7 +152,9 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
""" """
try: try:
url = s3_client.generate_presigned_url( url = s3_client.generate_presigned_url(
ClientMethod=client_method, Params=method_parameters, ExpiresIn=expires_in ClientMethod=client_method,
Params=method_parameters,
ExpiresIn=expires_in,
) )
except ClientError: except ClientError:
raise raise
@ -161,10 +163,16 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
s3_client = boto3.client("s3") s3_client = boto3.client("s3")
input_url = generate_presigned_url( input_url = generate_presigned_url(
s3_client, "get_object", {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, 3600 s3_client,
"get_object",
{"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"},
expires_in=3600,
) )
output_url = generate_presigned_url( output_url = generate_presigned_url(
s3_client, "put_object", {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, 3600 s3_client,
"put_object",
{"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"},
expires_in=3600,
) )
print(f"{input_url=}") print(f"{input_url=}")
print(f"{output_url=}") print(f"{output_url=}")

View File

@ -26,6 +26,12 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
python examples/offline_inference/pooling/embed_matryoshka_fy.py python examples/offline_inference/pooling/embed_matryoshka_fy.py
``` ```
## Multi vector retrieval usage
```bash
python examples/offline_inference/pooling/multi_vector_retrieval.py
```
## Named Entity Recognition (NER) usage ## Named Entity Recognition (NER) usage
```bash ```bash

View File

@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from argparse import Namespace
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def parse_args():
    """Build the command-line parser for this example and parse the arguments.

    The parser is a ``FlexibleArgumentParser`` populated with the engine
    options registered by ``EngineArgs.add_cli_args``; example-specific
    defaults are then applied on top via ``set_defaults``.

    Returns:
        The namespace produced by ``parse_args()``.
    """
    cli = EngineArgs.add_cli_args(FlexibleArgumentParser())

    # Set example specific arguments
    example_defaults = {
        "model": "BAAI/bge-m3",
        "runner": "pooling",
        "enforce_eager": True,
    }
    cli.set_defaults(**example_defaults)

    return cli.parse_args()
def main(args: Namespace):
    """Embed a few sample prompts twice and print the size of each result.

    First ``llm.embed`` is called and the length of every returned embedding
    is printed; then ``llm.encode`` is called with
    ``pooling_task="token_embed"`` and the ``shape`` of every returned
    ``data`` object is printed.

    Args:
        args: Parsed command-line namespace; its attributes are forwarded
            verbatim as keyword arguments to ``LLM``.
    """
    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    banner = "\nGenerated Outputs:\n" + "-" * 60

    # Create an LLM.
    # You should pass runner="pooling" for embedding models
    engine_kwargs = vars(args)
    llm = LLM(**engine_kwargs)

    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
    sentence_outputs = llm.embed(prompts)
    print(banner)
    for _prompt, result in zip(prompts, sentence_outputs):
        embedding = result.outputs.embedding
        print(len(embedding))

    # Generate embedding for each token. The output is a list of
    # PoolingRequestOutput.
    token_outputs = llm.encode(prompts, pooling_task="token_embed")
    print(banner)
    for _prompt, result in zip(prompts, token_outputs):
        per_token = result.outputs.data
        print(per_token.shape)
# Script entry point: parse the CLI and run the example.
if __name__ == "__main__":
    main(parse_args())

View File

@ -40,7 +40,7 @@ def main():
model_impl="terratorch", model_impl="terratorch",
) )
pooling_params = PoolingParams(task="encode", softmax=False) pooling_params = PoolingParams(task="token_classify", activation=False)
pooler_output = llm.encode( pooler_output = llm.encode(
img_prompt, img_prompt,
pooling_params=pooling_params, pooling_params=pooling_params,

View File

@ -18,6 +18,12 @@ python examples/online_serving/pooling/embedding_embed_dtype_client.py
python examples/online_serving/pooling/jinaai_rerank_client.py python examples/online_serving/pooling/jinaai_rerank_client.py
``` ```
## Multi vector retrieval usage
```bash
python examples/online_serving/pooling/multi_vector_retrieval_client.py
```
## Named Entity Recognition (NER) usage ## Named Entity Recognition (NER) usage
```bash ```bash

View File

@ -0,0 +1,54 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example online usage of Pooling API for multi vector retrieval.
Run `vllm serve <model> --runner pooling`
to start up the server in vLLM. e.g.
vllm serve BAAI/bge-m3
"""
import argparse
import requests
import torch
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
    """POST ``prompt`` as a JSON body to ``api_url``.

    Args:
        prompt: Request payload, sent via the ``json=`` argument.
        api_url: Full URL of the endpoint to call.

    Returns:
        The ``requests.Response`` exactly as returned by ``requests.post``;
        the status code is not inspected here.
    """
    return requests.post(
        api_url,
        headers={"User-Agent": "Test Client"},
        json=prompt,
    )
def parse_args():
    """Parse the client's command-line options.

    Recognised options and their defaults:
        --host   (str)  "localhost"
        --port   (int)  8000
        --model  (str)  "BAAI/bge-m3"

    Returns:
        The ``argparse.Namespace`` holding ``host``, ``port`` and ``model``.
    """
    option_table = (
        ("--host", str, "localhost"),
        ("--port", int, 8000),
        ("--model", str, "BAAI/bge-m3"),
    )
    cli = argparse.ArgumentParser()
    for flag, value_type, fallback in option_table:
        cli.add_argument(flag, type=value_type, default=fallback)
    return cli.parse_args()
def main(args):
    """Request pooled outputs for sample prompts and print each tensor shape.

    Builds the ``/pooling`` URL from ``args.host`` and ``args.port``, posts
    the sample prompts together with ``args.model``, converts every entry of
    the response's ``"data"`` list to a ``torch`` tensor and prints its shape.

    Args:
        args: Namespace providing ``host``, ``port`` and ``model``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    api_url = f"http://{args.host}:{args.port}/pooling"
    model_name = args.model

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    prompt = {"model": model_name, "input": prompts}

    pooling_response = post_http_request(prompt=prompt, api_url=api_url)
    # Surface HTTP errors directly: without this check an error reply is
    # parsed like a success and fails later with an unrelated error when
    # the "data" key is looked up.
    pooling_response.raise_for_status()

    for output in pooling_response.json()["data"]:
        multi_vector = torch.tensor(output["data"])
        print(multi_vector.shape)
# Script entry point: parse the CLI and query the running server.
if __name__ == "__main__":
    main(parse_args())

View File

@ -84,7 +84,7 @@ directly to load models:
from vllm import LLM from vllm import LLM
llm = LLM( llm = LLM(
"s3://my-bucket/vllm/facebook/opt-125m/v1", "s3://my-bucket/vllm/facebook/opt-125m/v1",
load_format="tensorizer" load_format="tensorizer",
) )
``` ```

View File

@ -107,7 +107,6 @@ markers = [
"distributed: run this test only in distributed GPU tests", "distributed: run this test only in distributed GPU tests",
"skip_v1: do not run this test with v1", "skip_v1: do not run this test with v1",
"optional: optional tests that are automatically skipped, include --optional to run them", "optional: optional tests that are automatically skipped, include --optional to run them",
"extra_server_args: extra arguments to pass to the server fixture",
] ]
[tool.ty.src] [tool.ty.src]

View File

@ -7,7 +7,7 @@ requests >= 2.26.0
tqdm tqdm
blake3 blake3
py-cpuinfo py-cpuinfo
transformers >= 4.55.2 transformers >= 4.56.0
tokenizers >= 0.21.1 # Required for fast incremental detokenization. tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer. protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.

View File

@ -11,6 +11,7 @@ from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig from vllm.config import CompilationConfig
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import is_torch_equal_or_newer
@contextlib.contextmanager @contextlib.contextmanager
@ -32,13 +33,13 @@ def temporary_environ(env_vars):
os.environ[k] = v os.environ[k] = v
test_params_full_cudagraph = [] model_backends_full_cudagraph = []
# deepseek-ai/DeepSeek-V2-Lite with MLA # deepseek-ai/DeepSeek-V2-Lite with MLA
MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"] MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"]
for mla_backend in MLA_backends: for mla_backend in MLA_backends:
test_params_full_cudagraph.append( model_backends_full_cudagraph.append(
pytest.param(("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend])) ("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend])
) )
# Qwen/Qwen2-1.5B-Instruct with other backends # Qwen/Qwen2-1.5B-Instruct with other backends
@ -46,14 +47,18 @@ other_backend_configs = [
backend_configs[c] for c in backend_configs if c not in MLA_backends backend_configs[c] for c in backend_configs if c not in MLA_backends
] ]
for backend_config in other_backend_configs: for backend_config in other_backend_configs:
test_params_full_cudagraph.append( model_backends_full_cudagraph.append(("Qwen/Qwen2-1.5B-Instruct", backend_config))
pytest.param(("Qwen/Qwen2-1.5B-Instruct", backend_config))
)
@pytest.fixture(scope="class") @pytest.fixture(scope="class")
def llm_pair(request): def llm_pair(request):
model, backend_config = request.param model, backend_config, use_inductor_graph_partition = request.param
backend_config.comp_config["use_inductor_graph_partition"] = (
use_inductor_graph_partition
)
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("Inductor graph partition only supported in torch>=2.9")
# Dynamically skip test if GPU capability is not met # Dynamically skip test if GPU capability is not met
if ( if (
@ -104,7 +109,15 @@ def llm_pair(request):
) )
@pytest.mark.parametrize("llm_pair", test_params_full_cudagraph, indirect=True) @pytest.mark.parametrize(
"llm_pair",
[
pytest.param((model, backend_config, use_inductor_graph_partition))
for model, backend_config in model_backends_full_cudagraph
for use_inductor_graph_partition in [True, False]
],
indirect=True,
)
class TestFullCUDAGraph: class TestFullCUDAGraph:
""" """
Use a class such that an llm pair is constructed once for all Use a class such that an llm pair is constructed once for all

View File

@ -5,6 +5,7 @@ Test (piecewise) compilation with a simple model where multiple submodules
are compiled and graph captured separately. are compiled and graph captured separately.
""" """
import pytest
import torch import torch
from torch import nn from torch import nn
@ -13,12 +14,13 @@ from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile
from vllm.config import ( from vllm.config import (
CompilationConfig, CompilationConfig,
CompilationLevel, CompilationMode,
CUDAGraphMode, CUDAGraphMode,
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
) )
from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils import is_torch_equal_or_newer
# This import automatically registers `torch.ops.silly.attention` # This import automatically registers `torch.ops.silly.attention`
from .. import silly_attention # noqa: F401 from .. import silly_attention # noqa: F401
@ -190,16 +192,21 @@ def run_model(
return output.cpu() return output.cpu()
def test_multi_graph_piecewise_compile_outputs_equal(): @pytest.mark.parametrize("use_inductor_graph_partition", [False, True])
def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
outputs = [] outputs = []
# piecewise compile # vllmcompile compile
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
use_cudagraph=True, use_cudagraph=True,
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
cudagraph_capture_sizes=[1, 2], cudagraph_capture_sizes=[1, 2],
use_inductor_graph_partition=use_inductor_graph_partition,
) )
) )
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
@ -220,23 +227,31 @@ def test_multi_graph_piecewise_compile_outputs_equal():
# static tensor addresses # static tensor addresses
inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda() inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda()
with compilation_counter.expect( if use_inductor_graph_partition:
num_graphs_seen=2, # two graphs for the model # Splitting happens at Inductor lowering level,
num_piecewise_graphs_seen=6, # total piecewise fx graphs is equal to total graphs
num_piecewise_fx = 2
num_piecewise_capturable_fx = 2
else:
# attn_one, attn_two each has 3 piecewise graphs # attn_one, attn_two each has 3 piecewise graphs
# (pre attn, post attn, silly_attention) each # (pre attn, post attn, silly_attention) each
num_piecewise_capturable_graphs_seen=4, num_piecewise_fx = 6
# attn_one, attn_two has pre attn and post attn each, total=4 # attn_one, attn_two has pre attn and post attn each, total=4
num_backend_compilations=4, # num_piecewise_capturable_graphs_seen num_piecewise_capturable_fx = 4
num_cudagraph_captured=8,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen with compilation_counter.expect(
num_graphs_seen=2, # two graphs for the model
num_piecewise_graphs_seen=num_piecewise_fx,
num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
num_backend_compilations=num_piecewise_capturable_fx,
num_cudagraph_captured=8, # num_cudagraph_sizes * num_partitions
): ):
outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
# no compile or cudagraph # no compile or cudagraph
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.NO_COMPILATION, mode=CompilationMode.NONE,
) )
) )
cudagraph_runtime_mode = CUDAGraphMode.NONE cudagraph_runtime_mode = CUDAGraphMode.NONE
@ -265,9 +280,10 @@ def test_multi_graph_piecewise_compile_outputs_equal():
# piecewise compile without CUDA graph # piecewise compile without CUDA graph
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
use_cudagraph=False, use_cudagraph=False,
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
use_inductor_graph_partition=use_inductor_graph_partition,
) )
) )
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
@ -286,9 +302,9 @@ def test_multi_graph_piecewise_compile_outputs_equal():
with compilation_counter.expect( with compilation_counter.expect(
num_graphs_seen=2, num_graphs_seen=2,
num_piecewise_graphs_seen=6, num_piecewise_graphs_seen=num_piecewise_fx,
num_piecewise_capturable_graphs_seen=4, num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
num_backend_compilations=4, num_backend_compilations=num_piecewise_capturable_fx,
num_cudagraph_captured=0, # no cudagraph captured num_cudagraph_captured=0, # no cudagraph captured
): ):
outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))

View File

@ -13,7 +13,7 @@ from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import ( from vllm.config import (
CompilationConfig, CompilationConfig,
CompilationLevel, CompilationMode,
CUDAGraphMode, CUDAGraphMode,
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
@ -61,7 +61,7 @@ def _run_simple_model(
): ):
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
use_cudagraph=True, use_cudagraph=True,
use_inductor=use_inductor, use_inductor=use_inductor,
splitting_ops=splitting_ops, splitting_ops=splitting_ops,

View File

@ -9,6 +9,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are
initialized randomly with a fixed seed. initialized randomly with a fixed seed.
""" """
from copy import deepcopy
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any from typing import Any
@ -20,12 +21,13 @@ from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import ( from vllm.config import (
CompilationConfig, CompilationConfig,
CompilationLevel, CompilationMode,
CUDAGraphMode, CUDAGraphMode,
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
) )
from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils import is_torch_equal_or_newer
# This import automatically registers `torch.ops.silly.attention` # This import automatically registers `torch.ops.silly.attention`
from .. import silly_attention # noqa: F401 from .. import silly_attention # noqa: F401
@ -257,27 +259,13 @@ def tractable_computation(
@torch.inference_mode @torch.inference_mode
def run_model( def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor:
llama_config, use_compile: bool, backend: str, split_attn: bool = False # Start with a fresh copy to make sure there's no cache dir sharing
) -> torch.Tensor: compile_config = deepcopy(compile_config)
if use_compile: cudagraph_runtime_mode = compile_config.cudagraph_mode
compilation_config = CompilationConfig(
level=CompilationLevel.PIECEWISE,
use_cudagraph=True,
backend=backend,
cudagraph_capture_sizes=[1, 2],
)
if split_attn:
compilation_config.splitting_ops = ["silly::attention"]
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
else:
compilation_config = CompilationConfig(
level=CompilationLevel.NO_COMPILATION,
)
cudagraph_runtime_mode = CUDAGraphMode.NONE
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=compilation_config, additional_config=llama_config compilation_config=compile_config, additional_config=llama_config
) )
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
model = ( model = (
@ -338,8 +326,25 @@ def run_model(
return output.cpu() return output.cpu()
@pytest.mark.parametrize("backend", ["inductor", "eager"]) @pytest.mark.parametrize(
def test_toy_llama(backend: str): "backend, use_inductor_graph_partition",
[
("eager", False), # No inductor
("inductor", False), # Inductor, Dynamo partition
("inductor", True), # Inductor, Inductor partition
],
)
def test_toy_llama(
backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path
):
# We disable the vLLM compile cache for 2 reasons:
# 1. To make sure we can properly track the number of Inductor compilations.
# 2. Inductor partitioning does not play nicely with Autograd cache (below)
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("Inductor graph partition only supported in torch>=2.9")
# compare output with and without piecewise compilation # compare output with and without piecewise compilation
llama_config = LlamaConfig( llama_config = LlamaConfig(
@ -350,6 +355,32 @@ def test_toy_llama(backend: str):
hidden_size=128, mlp_size=256, vocab_size=128, num_layers=2, tractable_init=True hidden_size=128, mlp_size=256, vocab_size=128, num_layers=2, tractable_init=True
) )
compile_config_no_compile = CompilationConfig(
level=CompilationMode.NONE,
cudagraph_mode=CUDAGraphMode.NONE,
backend="eager",
)
compile_config_no_split = CompilationConfig(
level=CompilationMode.VLLM_COMPILE,
use_inductor_graph_partition=use_inductor_graph_partition,
cudagraph_mode=CUDAGraphMode.PIECEWISE,
backend=backend,
cudagraph_capture_sizes=[1, 2],
)
# FIXME(luka/boyuan): the graph from the previous test case
# (no inductor partition) gets cached by AotAutograd so then the
# compilation with inductor partitioning incorrectly loads an unpartitioned
# graph and never partitions. I think this is a bug with custom inductor
# partitioning but does not affect vLLM more generally as vLLM uses its own
# cache (which takes inductor partitioning into account).
if use_inductor_graph_partition:
compile_config_no_split.inductor_compile_config["force_disable_caches"] = True
compile_config_split = deepcopy(compile_config_no_split)
compile_config_split.splitting_ops = ["silly::attention"]
outputs = [] outputs = []
with compilation_counter.expect( with compilation_counter.expect(
num_graphs_seen=0, num_graphs_seen=0,
@ -358,8 +389,9 @@ def test_toy_llama(backend: str):
num_backend_compilations=0, num_backend_compilations=0,
num_cudagraph_captured=0, num_cudagraph_captured=0,
): ):
outputs.append(run_model(llama_config, backend="eager", use_compile=False)) outputs.append(run_model(llama_config, compile_config_no_compile))
run_model(tractable_config, backend="eager", use_compile=False)
run_model(tractable_config, compile_config_no_compile)
if backend == "inductor": if backend == "inductor":
kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0} kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0}
@ -367,35 +399,34 @@ def test_toy_llama(backend: str):
kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0} kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
with compilation_counter.expect( with compilation_counter.expect(
# One graph for the model num_graphs_seen=1, # one graph for the model
num_graphs_seen=1,
num_piecewise_graphs_seen=1, num_piecewise_graphs_seen=1,
num_piecewise_capturable_graphs_seen=1, num_piecewise_capturable_graphs_seen=1,
# num_piecewise_capturable_graphs_seen num_backend_compilations=1, # num_piecewise_capturable_graphs_seen
num_backend_compilations=1,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
num_cudagraph_captured=2, num_cudagraph_captured=2,
**kwargs, **kwargs,
): ):
outputs.append(run_model(llama_config, backend=backend, use_compile=True)) outputs.append(run_model(llama_config, compile_config_no_split))
run_model(tractable_config, backend=backend, use_compile=True)
run_model(tractable_config, compile_config_no_split)
if use_inductor_graph_partition:
num_piecewise_fx = 1
num_piecewise_capturable_fx = 1
else:
num_piecewise_fx = 2 * llama_config.num_layers + 1
num_piecewise_capturable_fx = 1 + llama_config.num_layers
with compilation_counter.expect( with compilation_counter.expect(
num_graphs_seen=1, # one graph for the model num_graphs_seen=1, # one graph for the model
num_piecewise_graphs_seen=2 * llama_config.num_layers + 1, # 2 * num_layers + 1 num_piecewise_graphs_seen=num_piecewise_fx,
num_piecewise_capturable_graphs_seen=1 num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
+ llama_config.num_layers, # 1 + num_layers num_backend_compilations=num_piecewise_capturable_fx,
num_backend_compilations=1 # num_cudagraph_sizes * num_partitions
+ llama_config.num_layers, # num_piecewise_capturable_graphs_seen num_cudagraph_captured=2 * (1 + llama_config.num_layers),
num_cudagraph_captured=2
* (
1 + llama_config.num_layers
), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
): ):
outputs.append( outputs.append(run_model(llama_config, compile_config_split))
run_model(llama_config, backend=backend, use_compile=True, split_attn=True) run_model(tractable_config, compile_config_split)
)
run_model(tractable_config, backend=backend, use_compile=True, split_attn=True)
for i in range(1, len(outputs)): for i in range(1, len(outputs)):
assert torch.allclose(outputs[0], outputs[i]) assert torch.allclose(outputs[0], outputs[i])
@ -427,14 +458,14 @@ def benchmark():
for piecewise in [False, True]: for piecewise in [False, True]:
if piecewise: if piecewise:
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
use_cudagraph=True, use_cudagraph=True,
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
cudagraph_capture_sizes=cudagraph_sizes, cudagraph_capture_sizes=cudagraph_sizes,
) )
else: else:
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
cudagraph_capture_sizes=cudagraph_sizes, cudagraph_capture_sizes=cudagraph_sizes,
) )

View File

@ -62,5 +62,4 @@ direct_register_custom_op(
mutates_args=["out"], mutates_args=["out"],
fake_impl=silly_attention_fake, fake_impl=silly_attention_fake,
target_lib=silly_lib, target_lib=silly_lib,
tags=(torch._C.Tag.cudagraph_unsafe,),
) )

View File

@ -10,7 +10,7 @@ import torch
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import ( from vllm.config import (
CompilationConfig, CompilationConfig,
CompilationLevel, CompilationMode,
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
) )
@ -38,7 +38,7 @@ class CompiledMod(torch.nn.Module):
def make_vllm_config() -> VllmConfig: def make_vllm_config() -> VllmConfig:
return VllmConfig( return VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, level=CompilationMode.VLLM_COMPILE,
) )
) )

View File

@ -10,6 +10,7 @@ import vllm.envs as envs
from vllm.compilation.collective_fusion import AsyncTPPass from vllm.compilation.collective_fusion import AsyncTPPass
from vllm.config import ( from vllm.config import (
CompilationConfig, CompilationConfig,
CompilationMode,
DeviceConfig, DeviceConfig,
ModelConfig, ModelConfig,
PassConfig, PassConfig,
@ -400,7 +401,7 @@ def test_async_tp_pass_correctness(
common_args.append("--enforce-eager") common_args.append("--enforce-eager")
compilation_config = { compilation_config = {
"level": 3, "mode": CompilationMode.VLLM_COMPILE,
"compile_sizes": [2, 4, 8], "compile_sizes": [2, 4, 8],
"splitting_ops": [], "splitting_ops": [],
"pass_config": {"enable_async_tp": async_tp_enabled}, "pass_config": {"enable_async_tp": async_tp_enabled},

View File

@ -4,7 +4,7 @@ import dataclasses
import pytest import pytest
from vllm.config import CompilationLevel from vllm.config import CompilationMode
from vllm.utils import cuda_device_count_stateless from vllm.utils import cuda_device_count_stateless
from ..utils import compare_all_settings from ..utils import compare_all_settings
@ -21,7 +21,7 @@ class TestSetting:
# we cannot afford testing the full Cartesian product # we cannot afford testing the full Cartesian product
# of all models and all levels # of all models and all modes
@pytest.mark.parametrize( @pytest.mark.parametrize(
"test_setting", "test_setting",
[ [
@ -121,15 +121,13 @@ def test_compile_correctness(
all_args: list[list[str]] = [] all_args: list[list[str]] = []
all_envs: list[dict[str, str] | None] = [] all_envs: list[dict[str, str] | None] = []
for comp_level in [ for comp_mode in [
CompilationLevel.DYNAMO_AS_IS, CompilationMode.STOCK_TORCH_COMPILE,
CompilationLevel.DYNAMO_ONCE, CompilationMode.DYNAMO_TRACE_ONCE,
CompilationLevel.PIECEWISE, CompilationMode.VLLM_COMPILE,
]: ]:
for level in [CompilationLevel.NO_COMPILATION, comp_level]: for mode in [CompilationMode.NONE, comp_mode]:
all_args.append( all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=inductor"])
final_args + [f"-O.level={level}", "-O.backend=inductor"]
)
# inductor will change the output, so we only compare if the output # inductor will change the output, so we only compare if the output
# is close, not exactly the same. # is close, not exactly the same.
@ -142,13 +140,13 @@ def test_compile_correctness(
all_envs.clear() all_envs.clear()
all_args.clear() all_args.clear()
for level in [ for mode in [
CompilationLevel.NO_COMPILATION, CompilationMode.NONE,
CompilationLevel.DYNAMO_AS_IS, CompilationMode.STOCK_TORCH_COMPILE,
CompilationLevel.DYNAMO_ONCE, CompilationMode.DYNAMO_TRACE_ONCE,
CompilationLevel.PIECEWISE, CompilationMode.VLLM_COMPILE,
]: ]:
all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"]) all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=eager"])
all_envs.append({}) all_envs.append({})
all_envs.append({}) all_envs.append({})

View File

@ -4,7 +4,7 @@ import pytest
from vllm.compilation.counter import compilation_counter from vllm.compilation.counter import compilation_counter
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
from vllm.config.compilation import CompilationLevel from vllm.config.compilation import CompilationMode
from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer
@ -90,16 +90,16 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked @pytest.mark.forked
def test_dynamo_as_is(vllm_runner, monkeypatch): def test_stock_torch_compile(vllm_runner, monkeypatch):
# Disable multiprocessing so that the counter is in the same process # Disable multiprocessing so that the counter is in the same process
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
with ( with (
compilation_counter.expect(dynamo_as_is_count=1), compilation_counter.expect(stock_torch_compile_count=1),
# loading the model causes compilation (if enabled) to happen # loading the model causes compilation (if enabled) to happen
vllm_runner( vllm_runner(
"facebook/opt-125m", "facebook/opt-125m",
compilation_config={"level": 1}, compilation_config={"mode": CompilationMode.STOCK_TORCH_COMPILE},
gpu_memory_utilization=0.4, gpu_memory_utilization=0.4,
) as _, ) as _,
): ):
@ -112,11 +112,11 @@ def test_no_compilation(vllm_runner, monkeypatch):
# Disable multiprocessing so that the counter is in the same process # Disable multiprocessing so that the counter is in the same process
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
with ( with (
compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0), compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
# loading the model causes compilation (if enabled) to happen # loading the model causes compilation (if enabled) to happen
vllm_runner( vllm_runner(
"facebook/opt-125m", "facebook/opt-125m",
compilation_config={"level": 0}, compilation_config={"mode": CompilationMode.NONE},
gpu_memory_utilization=0.4, gpu_memory_utilization=0.4,
) as _, ) as _,
): ):
@ -130,7 +130,7 @@ def test_enforce_eager(vllm_runner, monkeypatch):
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
with ( with (
compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0), compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
# loading the model causes compilation (if enabled) to happen # loading the model causes compilation (if enabled) to happen
vllm_runner( vllm_runner(
"facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4 "facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4
@ -151,7 +151,7 @@ def test_splitting_ops_dynamic():
if is_torch_equal_or_newer("2.9.0.dev"): if is_torch_equal_or_newer("2.9.0.dev"):
config = VllmConfig( config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, level=CompilationMode.VLLM_COMPILE,
use_inductor_graph_partition=True, use_inductor_graph_partition=True,
splitting_ops=["vllm::unified_attention"], splitting_ops=["vllm::unified_attention"],
) )
@ -163,7 +163,7 @@ def test_splitting_ops_dynamic():
# When attn_fusion pass enabled, splitting_ops now default to attention ops. # When attn_fusion pass enabled, splitting_ops now default to attention ops.
config = VllmConfig( config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, level=CompilationMode.VLLM_COMPILE,
pass_config={"enable_attn_fusion": True, "enable_noop": True}, pass_config={"enable_attn_fusion": True, "enable_noop": True},
custom_ops=["+quant_fp8"], custom_ops=["+quant_fp8"],
cudagraph_mode=CUDAGraphMode.PIECEWISE, cudagraph_mode=CUDAGraphMode.PIECEWISE,
@ -178,7 +178,7 @@ def test_splitting_ops_dynamic():
if is_torch_equal_or_newer("2.9.0.dev"): if is_torch_equal_or_newer("2.9.0.dev"):
config = VllmConfig( config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, level=CompilationMode.VLLM_COMPILE,
use_inductor_graph_partition=True, use_inductor_graph_partition=True,
pass_config={"enable_attn_fusion": True, "enable_noop": True}, pass_config={"enable_attn_fusion": True, "enable_noop": True},
custom_ops=["+quant_fp8"], custom_ops=["+quant_fp8"],

View File

@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch import torch
from torch import nn from torch import nn
@ -8,12 +9,13 @@ from vllm.compilation.decorators import ignore_torch_compile, support_torch_comp
from vllm.config import ( from vllm.config import (
CacheConfig, CacheConfig,
CompilationConfig, CompilationConfig,
CompilationLevel, CompilationMode,
CUDAGraphMode, CUDAGraphMode,
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
) )
from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils import is_torch_equal_or_newer
# This import automatically registers `torch.ops.silly.attention` # This import automatically registers `torch.ops.silly.attention`
from . import silly_attention # noqa: F401 from . import silly_attention # noqa: F401
@ -65,18 +67,40 @@ def run_model(
return output.cpu() return output.cpu()
def test_ignore_torch_compile_decorator(): @pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
def test_ignore_torch_compile_decorator(use_inductor_graph_partition, monkeypatch):
# disable compile cache so that we can count the number of compilations
# appropriately
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
# piecewise # piecewise
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
use_cudagraph=True, use_cudagraph=True,
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
cudagraph_capture_sizes=[1, 2], cudagraph_capture_sizes=[1, 2],
use_inductor_graph_partition=use_inductor_graph_partition,
) )
) )
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
expected_num_graphs_seen = 1
expected_num_cudagraph_captured = (
4 # num_cudagraph_sizes * num cudagraphs to capture
)
if use_inductor_graph_partition:
expected_num_piecewise_graphs_seen = 1
expected_num_piecewise_capturable_graphs_seen = 1
expected_num_backend_compilations = 1
else:
expected_num_piecewise_graphs_seen = 3
expected_num_piecewise_capturable_graphs_seen = 2
expected_num_backend_compilations = 2
@support_torch_compile @support_torch_compile
class A(nn.Module): class A(nn.Module):
def __init__( def __init__(
@ -103,12 +127,11 @@ def test_ignore_torch_compile_decorator():
# A has support_torch_compile # A has support_torch_compile
with compilation_counter.expect( with compilation_counter.expect(
num_graphs_seen=1, num_graphs_seen=expected_num_graphs_seen,
num_piecewise_graphs_seen=3, num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
num_piecewise_capturable_graphs_seen=2, num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
num_backend_compilations=2, num_backend_compilations=expected_num_backend_compilations,
num_cudagraph_captured=4, num_cudagraph_captured=expected_num_cudagraph_captured,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
): ):
run_model(vllm_config, mod_A, cudagraph_runtime_mode) run_model(vllm_config, mod_A, cudagraph_runtime_mode)
@ -130,12 +153,11 @@ def test_ignore_torch_compile_decorator():
# C's support_torch_compile should override B's ignore_torch_compile # C's support_torch_compile should override B's ignore_torch_compile
with compilation_counter.expect( with compilation_counter.expect(
num_graphs_seen=1, num_graphs_seen=expected_num_graphs_seen,
num_piecewise_graphs_seen=3, num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
num_piecewise_capturable_graphs_seen=2, num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
num_backend_compilations=2, num_backend_compilations=expected_num_backend_compilations,
num_cudagraph_captured=4, num_cudagraph_captured=expected_num_cudagraph_captured,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
): ):
run_model(vllm_config, mod_C, cudagraph_runtime_mode) run_model(vllm_config, mod_C, cudagraph_runtime_mode)
@ -178,16 +200,25 @@ class A(nn.Module):
return x return x
def test_conditional_compile_enable_if(): @pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch):
# disable compile cache so that we can count the number of compilations
# appropriately
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
vllm_config = VllmConfig( vllm_config = VllmConfig(
cache_config=CacheConfig( cache_config=CacheConfig(
kv_sharing_fast_prefill=True, kv_sharing_fast_prefill=True,
), ),
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
use_cudagraph=True, use_cudagraph=True,
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
cudagraph_capture_sizes=[1, 2], cudagraph_capture_sizes=[1, 2],
use_inductor_graph_partition=use_inductor_graph_partition,
), ),
) )
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
@ -195,17 +226,26 @@ def test_conditional_compile_enable_if():
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda() mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda()
if use_inductor_graph_partition:
expected_num_piecewise_graphs_seen = 2
expected_num_piecewise_capturable_graphs_seen = 2
expected_num_backend_compilations = 2
else:
expected_num_piecewise_graphs_seen = 6
expected_num_piecewise_capturable_graphs_seen = 4
expected_num_backend_compilations = 4
# A has support_torch_compile but enable_if fn returns False # A has support_torch_compile but enable_if fn returns False
# enable_if will be True for B, so we expect mod1 and mod2 # enable_if will be True for B, so we expect mod1 and mod2
# to be compiled # to be compiled
with compilation_counter.expect( with compilation_counter.expect(
num_graphs_seen=2, num_graphs_seen=2,
num_piecewise_graphs_seen=6, num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
# 3 piecewise graphs per instance of B() # 3 piecewise graphs per instance of B()
num_piecewise_capturable_graphs_seen=4, num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
num_backend_compilations=4, num_backend_compilations=expected_num_backend_compilations,
num_cudagraph_captured=8, num_cudagraph_captured=8,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen # num_cudagraph_sizes * num cudagraphable graphs to capture
): ):
run_model(vllm_config, mod_A, cudagraph_runtime_mode) run_model(vllm_config, mod_A, cudagraph_runtime_mode)
@ -216,23 +256,34 @@ def test_conditional_compile_enable_if():
kv_sharing_fast_prefill=False, kv_sharing_fast_prefill=False,
), ),
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
use_cudagraph=True, use_cudagraph=True,
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
cudagraph_capture_sizes=[1, 2], cudagraph_capture_sizes=[1, 2],
use_inductor_graph_partition=use_inductor_graph_partition,
), ),
) )
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda() mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda()
if use_inductor_graph_partition:
expected_num_piecewise_graphs_seen = 1
expected_num_piecewise_capturable_graphs_seen = 1
expected_num_backend_compilations = 1
else:
# 3 attn ops and 4 non-attn ops
expected_num_piecewise_graphs_seen = 7
expected_num_piecewise_capturable_graphs_seen = 4
expected_num_backend_compilations = 4
with compilation_counter.expect( with compilation_counter.expect(
num_graphs_seen=1, num_graphs_seen=1,
num_piecewise_graphs_seen=7, num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
# 3 attn ops and 4 non-attn ops # 3 attn ops and 4 non-attn ops
num_piecewise_capturable_graphs_seen=4, num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
num_backend_compilations=4, num_backend_compilations=expected_num_backend_compilations,
num_cudagraph_captured=8, num_cudagraph_captured=8,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen # num_cudagraph_sizes * num cudagraphable graphs to capture
): ):
run_model(vllm_config, mod_A, cudagraph_runtime_mode) run_model(vllm_config, mod_A, cudagraph_runtime_mode)

View File

@ -12,7 +12,7 @@ from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.attention.backends.registry import _Backend from vllm.attention.backends.registry import _Backend
from vllm.attention.selector import global_force_attn_backend_context_manager from vllm.attention.selector import global_force_attn_backend_context_manager
from vllm.config import CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import is_torch_equal_or_newer from vllm.utils import is_torch_equal_or_newer
@ -80,22 +80,22 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"optimization_level", "compilation_mode",
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE], [CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE],
) )
@pytest.mark.parametrize("model_info", models_list(all=True)) @pytest.mark.parametrize("model_info", models_list(all=True))
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_full_graph( def test_full_graph(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
model_info: tuple[str, dict[str, Any]], model_info: tuple[str, dict[str, Any]],
optimization_level: int, compilation_mode: int,
): ):
model, model_kwargs = model_info model, model_kwargs = model_info
with monkeypatch.context(): with monkeypatch.context():
print(f"MODEL={model}") print(f"MODEL={model}")
run_model(optimization_level, model, model_kwargs) run_model(compilation_mode, model, model_kwargs)
# TODO(luka) add other supported compilation config scenarios here # TODO(luka) add other supported compilation config scenarios here
@ -104,7 +104,7 @@ def test_full_graph(
[ [
# additional compile sizes, only some of the models # additional compile sizes, only some of the models
( (
CompilationConfig(level=CompilationLevel.PIECEWISE, compile_sizes=[1, 2]), CompilationConfig(mode=CompilationMode.VLLM_COMPILE, compile_sizes=[1, 2]),
model, model,
) )
for model in models_list(all=False) for model in models_list(all=False)
@ -113,7 +113,7 @@ def test_full_graph(
# RMSNorm + quant fusion, only 8-bit quant models # RMSNorm + quant fusion, only 8-bit quant models
( (
CompilationConfig( CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
custom_ops=["+rms_norm"], custom_ops=["+rms_norm"],
pass_config=PassConfig(enable_fusion=True, enable_noop=True), pass_config=PassConfig(enable_fusion=True, enable_noop=True),
), ),
@ -125,7 +125,8 @@ def test_full_graph(
# Test depyf integration works # Test depyf integration works
( (
CompilationConfig( CompilationConfig(
level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir() mode=CompilationMode.VLLM_COMPILE,
debug_dump_path=tempfile.gettempdir(),
), ),
("facebook/opt-125m", {}), ("facebook/opt-125m", {}),
), ),
@ -134,7 +135,7 @@ def test_full_graph(
# graph inductor partition # graph inductor partition
( (
CompilationConfig( CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
# inductor graph partition uses # inductor graph partition uses
# torch._C.Tag.cudagraph_unsafe to specify splitting ops # torch._C.Tag.cudagraph_unsafe to specify splitting ops
use_inductor_graph_partition=True, use_inductor_graph_partition=True,
@ -164,10 +165,10 @@ def test_custom_compile_config(
@pytest.mark.parametrize( @pytest.mark.parametrize(
"optimization_level", "compilation_mode",
[CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE], [CompilationMode.NONE, CompilationMode.VLLM_COMPILE],
) )
def test_fp8_kv_scale_compile(optimization_level: int): def test_fp8_kv_scale_compile(compilation_mode: int):
model = "Qwen/Qwen2-0.5B" model = "Qwen/Qwen2-0.5B"
model_kwargs = { model_kwargs = {
"quantization": "fp8", "quantization": "fp8",
@ -175,7 +176,7 @@ def test_fp8_kv_scale_compile(optimization_level: int):
"calculate_kv_scales": True, "calculate_kv_scales": True,
"max_model_len": 512, "max_model_len": 512,
} }
run_model(optimization_level, model, model_kwargs) run_model(compilation_mode, model, model_kwargs)
def test_inductor_graph_partition_attn_fusion(caplog_vllm): def test_inductor_graph_partition_attn_fusion(caplog_vllm):
@ -184,7 +185,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm):
model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8" model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
use_inductor_graph_partition=True, use_inductor_graph_partition=True,
cudagraph_mode=CUDAGraphMode.PIECEWISE, cudagraph_mode=CUDAGraphMode.PIECEWISE,
custom_ops=["+quant_fp8"], custom_ops=["+quant_fp8"],

View File

@ -13,7 +13,7 @@ from vllm.compilation.fusion import (
) )
from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass from vllm.compilation.post_cleanup import PostCleanupPass
from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape, GroupShape,
@ -114,7 +114,7 @@ def test_fusion_rmsnorm_quant(
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
custom_ops=["+rms_norm", "+quant_fp8"], custom_ops=["+rms_norm", "+quant_fp8"],
pass_config=PassConfig(enable_fusion=True, enable_noop=True), pass_config=PassConfig(enable_fusion=True, enable_noop=True),
) )

View File

@ -12,7 +12,7 @@ from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass from vllm.compilation.post_cleanup import PostCleanupPass
from vllm.config import ( from vllm.config import (
CompilationConfig, CompilationConfig,
CompilationLevel, CompilationMode,
DeviceConfig, DeviceConfig,
ModelConfig, ModelConfig,
PassConfig, PassConfig,
@ -219,7 +219,7 @@ def all_reduce_fusion_pass_on_test_model(
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm", "+quant_fp8"] mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"]
) )
) )
vllm_config.compilation_config.pass_config = PassConfig( vllm_config.compilation_config.pass_config = PassConfig(

View File

@ -19,7 +19,7 @@ from vllm.compilation.post_cleanup import PostCleanupPass
from vllm.config import ( from vllm.config import (
CacheConfig, CacheConfig,
CompilationConfig, CompilationConfig,
CompilationLevel, CompilationMode,
ModelConfig, ModelConfig,
PassConfig, PassConfig,
SchedulerConfig, SchedulerConfig,
@ -321,7 +321,7 @@ def test_attention_quant_pattern(
), ),
scheduler_config=SchedulerConfig(max_num_seqs=1024), scheduler_config=SchedulerConfig(max_num_seqs=1024),
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
custom_ops=["+quant_fp8"], custom_ops=["+quant_fp8"],
use_inductor_graph_partition=use_inductor_graph_partition, use_inductor_graph_partition=use_inductor_graph_partition,
), ),
@ -421,7 +421,9 @@ def test_attention_quant_pattern(
] ]
if any(attn_fusion_supported): if any(attn_fusion_supported):
# Check quantization ops in the graph before and after fusion # Check quantization ops in the graph before and after fusion
test_backend.check_before_ops([QUANT_OPS[quant_key]], fully_replaced=True) # Note: fully_replaced=False because query quant ops remain in graph.
# Only output quant ops are fused into attention.
test_backend.check_before_ops([QUANT_OPS[quant_key]], fully_replaced=False)
# access the underlying `AttnFusionPass` on the `LazyInitPass` # access the underlying `AttnFusionPass` on the `LazyInitPass`
assert attn_pass.pass_.matched_count == sum(attn_fusion_supported) assert attn_pass.pass_.matched_count == sum(attn_fusion_supported)

View File

@ -6,7 +6,7 @@ import torch
import vllm import vllm
from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig
from .backend import TestBackend from .backend import TestBackend
@ -50,7 +50,7 @@ def test_noop_elimination(dtype, num_tokens, hidden_size, buffer_size):
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
pass_config=PassConfig(enable_noop=True), pass_config=PassConfig(enable_noop=True),
) )
) )
@ -98,7 +98,7 @@ def test_non_noop_slice_preserved():
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
level=CompilationLevel.PIECEWISE, mode=CompilationMode.VLLM_COMPILE,
pass_config=PassConfig(enable_noop=True), pass_config=PassConfig(enable_noop=True),
) )
) )

View File

@ -5,7 +5,7 @@
import torch import torch
from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
from vllm.config import CompilationLevel from vllm.config import CompilationMode
class MyMod(torch.nn.Module): class MyMod(torch.nn.Module):
@ -20,7 +20,7 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
self.model = model self.model = model
compiled_callable = torch.compile(self.forward, backend="eager") compiled_callable = torch.compile(self.forward, backend="eager")
super().__init__( super().__init__(
compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE
) )
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None): def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):

View File

@ -334,7 +334,7 @@ class HfRunner:
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
) )
self.device = self.get_default_device() self.device = self.get_default_device()
self.dtype = torch_dtype = _get_and_verify_dtype( self.dtype = dtype = _get_and_verify_dtype(
self.model_name, self.model_name,
self.config, self.config,
dtype=dtype, dtype=dtype,
@ -342,7 +342,7 @@ class HfRunner:
) )
model_kwargs = model_kwargs if model_kwargs is not None else {} model_kwargs = model_kwargs if model_kwargs is not None else {}
model_kwargs.setdefault("torch_dtype", torch_dtype) model_kwargs.setdefault("dtype", dtype)
if is_sentence_transformer: if is_sentence_transformer:
# Lazy init required for AMD CI # Lazy init required for AMD CI
@ -388,7 +388,7 @@ class HfRunner:
if not skip_tokenizer_init: if not skip_tokenizer_init:
self.tokenizer = AutoTokenizer.from_pretrained( self.tokenizer = AutoTokenizer.from_pretrained(
model_name, model_name,
torch_dtype=torch_dtype, dtype=dtype,
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
) )
@ -398,7 +398,7 @@ class HfRunner:
self.processor = AutoProcessor.from_pretrained( self.processor = AutoProcessor.from_pretrained(
model_name, model_name,
torch_dtype=torch_dtype, dtype=dtype,
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
) )
if skip_tokenizer_init: if skip_tokenizer_init:
@ -1011,8 +1011,12 @@ class VllmRunner:
req_outputs = self.llm.embed(inputs, *args, **kwargs) req_outputs = self.llm.embed(inputs, *args, **kwargs)
return [req_output.outputs.embedding for req_output in req_outputs] return [req_output.outputs.embedding for req_output in req_outputs]
def encode(self, prompts: list[str]) -> list[list[float]]: def token_embed(self, prompts: list[str]) -> list[list[float]]:
req_outputs = self.llm.encode(prompts) req_outputs = self.llm.encode(prompts, pooling_task="token_embed")
return [req_output.outputs.data for req_output in req_outputs]
def token_classify(self, prompts: list[str]) -> list[list[float]]:
req_outputs = self.llm.encode(prompts, pooling_task="token_classify")
return [req_output.outputs.data for req_output in req_outputs] return [req_output.outputs.data for req_output in req_outputs]
def reward(self, prompts: list[str]) -> list[list[float]]: def reward(self, prompts: list[str]) -> list[list[float]]:

View File

@ -15,6 +15,7 @@ from typing import Literal, NamedTuple
import pytest import pytest
from vllm.config.compilation import CompilationMode
from vllm.config.model import RunnerOption from vllm.config.model import RunnerOption
from vllm.logger import init_logger from vllm.logger import init_logger
@ -234,7 +235,7 @@ def _compare_sp(
common_args.append("--skip-tokenizer-init") common_args.append("--skip-tokenizer-init")
compilation_config = { compilation_config = {
"level": 3, "mode": CompilationMode.VLLM_COMPILE,
"custom_ops": ["+rms_norm"], "custom_ops": ["+rms_norm"],
"compile_sizes": [4, 8], "compile_sizes": [4, 8],
"pass_config": { "pass_config": {

View File

@ -226,30 +226,30 @@ def test_compilation_config():
# set to O0 # set to O0
args = parser.parse_args(["-O0"]) args = parser.parse_args(["-O0"])
assert args.compilation_config.level == 0 assert args.compilation_config.mode == 0
# set to O 1 (space) # set to O 1 (space)
args = parser.parse_args(["-O", "1"]) args = parser.parse_args(["-O", "1"])
assert args.compilation_config.level == 1 assert args.compilation_config.mode == 1
# set to O 2 (equals) # set to O 2 (equals)
args = parser.parse_args(["-O=2"]) args = parser.parse_args(["-O=2"])
assert args.compilation_config.level == 2 assert args.compilation_config.mode == 2
# set to O.level 3 # set to O.mode 3
args = parser.parse_args(["-O.level", "3"]) args = parser.parse_args(["-O.mode", "3"])
assert args.compilation_config.level == 3 assert args.compilation_config.mode == 3
# set to string form of a dict # set to string form of a dict
args = parser.parse_args( args = parser.parse_args(
[ [
"-O", "-O",
'{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
'"use_inductor": false}', '"use_inductor": false}',
] ]
) )
assert ( assert (
args.compilation_config.level == 3 args.compilation_config.mode == 3
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and not args.compilation_config.use_inductor and not args.compilation_config.use_inductor
) )
@ -258,12 +258,12 @@ def test_compilation_config():
args = parser.parse_args( args = parser.parse_args(
[ [
"--compilation-config=" "--compilation-config="
'{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
'"use_inductor": true}', '"use_inductor": true}',
] ]
) )
assert ( assert (
args.compilation_config.level == 3 args.compilation_config.mode == 3
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and args.compilation_config.use_inductor and args.compilation_config.use_inductor
) )

View File

@ -53,21 +53,34 @@ def base64_encoded_audio() -> dict[str, str]:
} }
def dummy_messages_from_audio_url(
audio_urls: str | list[str],
content_text: str = "What's happening in this audio?",
):
if isinstance(audio_urls, str):
audio_urls = [audio_urls]
return [
{
"role": "user",
"content": [
*(
{"type": "audio_url", "audio_url": {"url": audio_url}}
for audio_url in audio_urls
),
{"type": "text", "text": content_text},
],
}
]
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_single_chat_session_audio( async def test_single_chat_session_audio(
client: openai.AsyncOpenAI, model_name: str, audio_url: str client: openai.AsyncOpenAI, model_name: str, audio_url: str
): ):
messages = [ messages = dummy_messages_from_audio_url(audio_url)
{
"role": "user",
"content": [
{"type": "audio_url", "audio_url": {"url": audio_url}},
{"type": "text", "text": "What's happening in this audio?"},
],
}
]
# test single completion # test single completion
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
@ -138,20 +151,9 @@ async def test_single_chat_session_audio_base64encoded(
audio_url: str, audio_url: str,
base64_encoded_audio: dict[str, str], base64_encoded_audio: dict[str, str],
): ):
messages = [ messages = dummy_messages_from_audio_url(
{ f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
"role": "user", )
"content": [
{
"type": "audio_url",
"audio_url": {
"url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" # noqa: E501
},
},
{"type": "text", "text": "What's happening in this audio?"},
],
}
]
# test single completion # test single completion
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
@ -252,15 +254,7 @@ async def test_single_chat_session_input_audio(
async def test_chat_streaming_audio( async def test_chat_streaming_audio(
client: openai.AsyncOpenAI, model_name: str, audio_url: str client: openai.AsyncOpenAI, model_name: str, audio_url: str
): ):
messages = [ messages = dummy_messages_from_audio_url(audio_url)
{
"role": "user",
"content": [
{"type": "audio_url", "audio_url": {"url": audio_url}},
{"type": "text", "text": "What's happening in this audio?"},
],
}
]
# test single completion # test single completion
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
@ -365,18 +359,7 @@ async def test_chat_streaming_input_audio(
async def test_multi_audio_input( async def test_multi_audio_input(
client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str] client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str]
): ):
messages = [ messages = dummy_messages_from_audio_url(audio_urls)
{
"role": "user",
"content": [
*(
{"type": "audio_url", "audio_url": {"url": audio_url}}
for audio_url in audio_urls
),
{"type": "text", "text": "What's happening in this audio?"},
],
}
]
if len(audio_urls) > MAXIMUM_AUDIOS: if len(audio_urls) > MAXIMUM_AUDIOS:
with pytest.raises(openai.BadRequestError): # test multi-audio input with pytest.raises(openai.BadRequestError): # test multi-audio input

View File

@ -55,21 +55,34 @@ def base64_encoded_video() -> dict[str, str]:
} }
def dummy_messages_from_video_url(
video_urls: str | list[str],
content_text: str = "What's in this video?",
):
if isinstance(video_urls, str):
video_urls = [video_urls]
return [
{
"role": "user",
"content": [
*(
{"type": "video_url", "video_url": {"url": video_url}}
for video_url in video_urls
),
{"type": "text", "text": content_text},
],
}
]
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video( async def test_single_chat_session_video(
client: openai.AsyncOpenAI, model_name: str, video_url: str client: openai.AsyncOpenAI, model_name: str, video_url: str
): ):
messages = [ messages = dummy_messages_from_video_url(video_url)
{
"role": "user",
"content": [
{"type": "video_url", "video_url": {"url": video_url}},
{"type": "text", "text": "What's in this video?"},
],
}
]
# test single completion # test single completion
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
@ -137,15 +150,7 @@ async def test_error_on_invalid_video_url_type(
async def test_single_chat_session_video_beamsearch( async def test_single_chat_session_video_beamsearch(
client: openai.AsyncOpenAI, model_name: str, video_url: str client: openai.AsyncOpenAI, model_name: str, video_url: str
): ):
messages = [ messages = dummy_messages_from_video_url(video_url)
{
"role": "user",
"content": [
{"type": "video_url", "video_url": {"url": video_url}},
{"type": "text", "text": "What's in this video?"},
],
}
]
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,
@ -172,20 +177,9 @@ async def test_single_chat_session_video_base64encoded(
video_url: str, video_url: str,
base64_encoded_video: dict[str, str], base64_encoded_video: dict[str, str],
): ):
messages = [ messages = dummy_messages_from_video_url(
{ f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
"role": "user", )
"content": [
{
"type": "video_url",
"video_url": {
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501
},
},
{"type": "text", "text": "What's in this video?"},
],
}
]
# test single completion # test single completion
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
@ -231,20 +225,10 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
video_url: str, video_url: str,
base64_encoded_video: dict[str, str], base64_encoded_video: dict[str, str],
): ):
messages = [ messages = dummy_messages_from_video_url(
{ f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
"role": "user", )
"content": [
{
"type": "video_url",
"video_url": {
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501
},
},
{"type": "text", "text": "What's in this video?"},
],
}
]
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
@ -265,15 +249,7 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
async def test_chat_streaming_video( async def test_chat_streaming_video(
client: openai.AsyncOpenAI, model_name: str, video_url: str client: openai.AsyncOpenAI, model_name: str, video_url: str
): ):
messages = [ messages = dummy_messages_from_video_url(video_url)
{
"role": "user",
"content": [
{"type": "video_url", "video_url": {"url": video_url}},
{"type": "text", "text": "What's in this video?"},
],
}
]
# test single completion # test single completion
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
@ -318,18 +294,7 @@ async def test_chat_streaming_video(
async def test_multi_video_input( async def test_multi_video_input(
client: openai.AsyncOpenAI, model_name: str, video_urls: list[str] client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
): ):
messages = [ messages = dummy_messages_from_video_url(video_urls)
{
"role": "user",
"content": [
*(
{"type": "video_url", "video_url": {"url": video_url}}
for video_url in video_urls
),
{"type": "text", "text": "What's in this video?"},
],
}
]
if len(video_urls) > MAXIMUM_VIDEOS: if len(video_urls) > MAXIMUM_VIDEOS:
with pytest.raises(openai.BadRequestError): # test multi-video input with pytest.raises(openai.BadRequestError): # test multi-video input

View File

@ -78,6 +78,27 @@ def base64_encoded_image(local_asset_server) -> dict[str, str]:
} }
def dummy_messages_from_image_url(
image_urls: str | list[str],
content_text: str = "What's in this image?",
):
if isinstance(image_urls, str):
image_urls = [image_urls]
return [
{
"role": "user",
"content": [
*(
{"type": "image_url", "image_url": {"url": image_url}}
for image_url in image_urls
),
{"type": "text", "text": content_text},
],
}
]
def get_hf_prompt_tokens(model_name, content, image_url): def get_hf_prompt_tokens(model_name, content, image_url):
processor = AutoProcessor.from_pretrained( processor = AutoProcessor.from_pretrained(
model_name, trust_remote_code=True, num_crops=4 model_name, trust_remote_code=True, num_crops=4
@ -107,15 +128,7 @@ async def test_single_chat_session_image(
client: openai.AsyncOpenAI, model_name: str, image_url: str client: openai.AsyncOpenAI, model_name: str, image_url: str
): ):
content_text = "What's in this image?" content_text = "What's in this image?"
messages = [ messages = dummy_messages_from_image_url(image_url, content_text)
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "text", "text": content_text},
],
}
]
max_completion_tokens = 10 max_completion_tokens = 10
# test single completion # test single completion
@ -188,15 +201,8 @@ async def test_error_on_invalid_image_url_type(
async def test_single_chat_session_image_beamsearch( async def test_single_chat_session_image_beamsearch(
client: openai.AsyncOpenAI, model_name: str, image_url: str client: openai.AsyncOpenAI, model_name: str, image_url: str
): ):
messages = [ content_text = "What's in this image?"
{ messages = dummy_messages_from_image_url(image_url, content_text)
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "text", "text": "What's in this image?"},
],
}
]
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,
@ -226,20 +232,10 @@ async def test_single_chat_session_image_base64encoded(
base64_encoded_image: dict[str, str], base64_encoded_image: dict[str, str],
): ):
content_text = "What's in this image?" content_text = "What's in this image?"
messages = [ messages = dummy_messages_from_image_url(
{ f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
"role": "user", content_text,
"content": [ )
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501
},
},
{"type": "text", "text": content_text},
],
}
]
max_completion_tokens = 10 max_completion_tokens = 10
# test single completion # test single completion
@ -293,20 +289,10 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
raw_image_url = TEST_IMAGE_ASSETS[image_idx] raw_image_url = TEST_IMAGE_ASSETS[image_idx]
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx] expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
messages = [ messages = dummy_messages_from_image_url(
{ f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
"role": "user", )
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501
},
},
{"type": "text", "text": "What's in this image?"},
],
}
]
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
@ -326,15 +312,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
async def test_chat_streaming_image( async def test_chat_streaming_image(
client: openai.AsyncOpenAI, model_name: str, image_url: str client: openai.AsyncOpenAI, model_name: str, image_url: str
): ):
messages = [ messages = dummy_messages_from_image_url(image_url)
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "text", "text": "What's in this image?"},
],
}
]
# test single completion # test single completion
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
@ -381,18 +359,7 @@ async def test_chat_streaming_image(
async def test_multi_image_input( async def test_multi_image_input(
client: openai.AsyncOpenAI, model_name: str, image_urls: list[str] client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]
): ):
messages = [ messages = dummy_messages_from_image_url(image_urls)
{
"role": "user",
"content": [
*(
{"type": "image_url", "image_url": {"url": image_url}}
for image_url in image_urls
),
{"type": "text", "text": "What's in this image?"},
],
}
]
if len(image_urls) > MAXIMUM_IMAGES: if len(image_urls) > MAXIMUM_IMAGES:
with pytest.raises(openai.BadRequestError): # test multi-image input with pytest.raises(openai.BadRequestError): # test multi-image input

View File

@ -0,0 +1,243 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock, patch
import pytest
from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction,
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
# Fixtures come in pairs: *_FUNCTION_OUTPUT is the raw call text as the model
# would emit it (Python-call syntax), and *_FUNCTION_CALL is the FunctionCall
# the tests below expect the parser to produce, with JSON-encoded arguments.

# One call with two string keyword arguments.
SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"
SIMPLE_FUNCTION_CALL = FunctionCall(
name="get_weather",
arguments='{"city": "San Francisco", "metric": "celsius"}',
)
# One call mixing string, int, dict, None, bool and list arguments, written
# with Python literal spellings (None / True).
MORE_TYPES_FUNCTION_OUTPUT = (
"register_user(name='John Doe', "
"age=37, "
"address={'city': 'San Francisco', 'state': 'CA'}, "
"role=None, "
"passed_test=True, "
"aliases=['John', 'Johnny'])"
)
# Same call, but with JSON literal spellings (null / true); the test cases
# pair it with the same expected FunctionCall as the Python-literal variant.
MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS = (
"register_user(name='John Doe', "
"age=37, "
"address={'city': 'San Francisco', 'state': 'CA'}, "
"role=null, "
"passed_test=true, "
"aliases=['John', 'Johnny'])"
)
MORE_TYPES_FUNCTION_CALL = FunctionCall(
name="register_user",
arguments='{"name": "John Doe", '
'"age": 37, '
'"address": {"city": "San Francisco", "state": "CA"}, '
'"role": null, '
'"passed_test": true, '
'"aliases": ["John", "Johnny"]}',
)
# A call with no arguments; expected arguments are an empty JSON object.
PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()"
PARAMETERLESS_FUNCTION_CALL = FunctionCall(
name="get_weather",
arguments="{}",
)
# A call whose single argument is an empty dict.
EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})"
EMPTY_DICT_FUNCTION_CALL = FunctionCall(
name="do_something_cool",
arguments='{"additional_data": {}}',
)
# A call whose single argument is an empty list.
EMPTY_LIST_FUNCTION_OUTPUT = "do_something_cool(steps=[])"
EMPTY_LIST_FUNCTION_CALL = FunctionCall(
name="do_something_cool",
arguments='{"steps": []}',
)
# String arguments containing a backslash-escaped single quote and escaped
# double quotes; the raw string keeps the backslashes in the model output.
ESCAPED_STRING_FUNCTION_OUTPUT = (
r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')"
)
ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
name="get_weather",
arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
)
@pytest.mark.parametrize("streaming", [True, False])
def test_no_tool_call(streaming: bool):
    """Plain text without tool-call markup is returned unchanged as content."""
    parser_cls = ToolParserManager.get_tool_parser("olmo3")
    tool_parser: ToolParser = parser_cls(MagicMock())
    plain_text = "How can I help you today?"

    content, tool_calls = run_tool_extraction(
        tool_parser, plain_text, streaming=streaming
    )

    assert content == plain_text
    assert len(tool_calls) == 0
# One row per scenario: (pytest id template, call text placed inside the
# <function_calls> tag, expected parsed calls). The "{}" in the id template is
# filled with "streaming" / "nonstreaming".
_TOOL_CALL_SCENARIOS = [
    ("simple_{}", SIMPLE_FUNCTION_OUTPUT, (SIMPLE_FUNCTION_CALL,)),
    ("more_types_{}", MORE_TYPES_FUNCTION_OUTPUT, (MORE_TYPES_FUNCTION_CALL,)),
    (
        "more_types_{}_json_literals",
        MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS,
        (MORE_TYPES_FUNCTION_CALL,),
    ),
    ("parameterless_{}", PARAMETERLESS_FUNCTION_OUTPUT, (PARAMETERLESS_FUNCTION_CALL,)),
    ("empty_dict_{}", EMPTY_DICT_FUNCTION_OUTPUT, (EMPTY_DICT_FUNCTION_CALL,)),
    ("empty_list_{}", EMPTY_LIST_FUNCTION_OUTPUT, (EMPTY_LIST_FUNCTION_CALL,)),
    (
        "escaped_string_{}",
        ESCAPED_STRING_FUNCTION_OUTPUT,
        (ESCAPED_STRING_FUNCTION_CALL,),
    ),
    (
        "parallel_calls_{}",
        f"{SIMPLE_FUNCTION_OUTPUT}\n{MORE_TYPES_FUNCTION_OUTPUT}",
        (SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL),
    ),
]

# Every scenario is exercised twice, streaming first and then non-streaming,
# giving (streaming, model_output, expected_tool_calls) parameter sets.
TEST_CASES = [
    pytest.param(
        streaming,
        f"<function_calls>{call_text}</function_calls>",
        list(expected_calls),  # fresh list per parameter set
        id=id_template.format("streaming" if streaming else "nonstreaming"),
    )
    for id_template, call_text, expected_calls in _TOOL_CALL_SCENARIOS
    for streaming in (True, False)
]
@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES)
def test_tool_call(
    streaming: bool, model_output: str, expected_tool_calls: list[FunctionCall]
):
    """Each tagged model output yields exactly the expected function calls.

    Output consisting solely of a ``<function_calls>`` block must produce no
    free-text content, and every extracted call must be of type ``"function"``
    and equal its expected ``FunctionCall``, in order.
    """
    parser_cls = ToolParserManager.get_tool_parser("olmo3")
    tool_parser: ToolParser = parser_cls(MagicMock())

    content, tool_calls = run_tool_extraction(
        tool_parser, model_output, streaming=streaming
    )

    assert content is None
    assert len(tool_calls) == len(expected_tool_calls)
    # Lengths are known to match here, so positional lookup pairs every item.
    for position, expected in enumerate(expected_tool_calls):
        actual = tool_calls[position]
        assert actual.type == "function"
        assert actual.function == expected
def test_streaming_tool_call_with_large_steps():
    """Streaming copes with deltas that span several tool calls at once.

    The first delta stops in the middle of a string argument; the second
    finishes that call and carries two further complete calls plus the
    closing tag.
    """
    parser_cls = ToolParserManager.get_tool_parser("olmo3")
    tool_parser: ToolParser = parser_cls(MagicMock())

    opening_delta = "<function_calls>get_weather(city='San"
    closing_delta = (
        " Francisco', metric='celsius')\n"
        f"{PARAMETERLESS_FUNCTION_OUTPUT}\n"
        f"{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>"
    )

    reconstructor = run_tool_extraction_streaming(
        tool_parser,
        [opening_delta, closing_delta],
        assert_one_tool_per_delta=False,
    )

    assert reconstructor.other_content == ""
    assert len(reconstructor.tool_calls) == 3
    expected_in_order = (
        SIMPLE_FUNCTION_CALL,
        PARAMETERLESS_FUNCTION_CALL,
        EMPTY_LIST_FUNCTION_CALL,
    )
    for position, expected in enumerate(expected_in_order):
        assert reconstructor.tool_calls[position].function == expected
@pytest.mark.parametrize("streaming", [False])
def test_regex_timeout_handling(streaming: bool):
    """test regex timeout is handled gracefully"""
    parser_cls = ToolParserManager.get_tool_parser("olmo3")
    tool_parser: ToolParser = parser_cls(MagicMock())

    fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2

    # Stand-in for the parser's regex whose match() always raises TimeoutError.
    timing_out_regex = MagicMock(
        **{"match.side_effect": TimeoutError("Regex timeout")}
    )

    with patch.object(tool_parser, "TOOL_CALL_REGEX", timing_out_regex):
        content, tool_calls = run_tool_extraction(
            tool_parser, fake_problematic_input, streaming=streaming
        )

        # On timeout the input must be passed through as ordinary text.
        assert content == fake_problematic_input
        assert len(tool_calls) == 0
        timing_out_regex.match.assert_called_once()

View File

@ -63,7 +63,7 @@ def test_encode_api(llm: LLM):
# chunked prefill does not support all pooling # chunked prefill does not support all pooling
err_msg = "pooling_task must be one of.+" err_msg = "pooling_task must be one of.+"
with pytest.raises(ValueError, match=err_msg): with pytest.raises(ValueError, match=err_msg):
llm.encode(prompts, use_tqdm=False) llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)
def test_score_api(llm: LLM): def test_score_api(llm: LLM):

View File

@ -35,6 +35,13 @@ def llm():
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_encode_api(llm: LLM):
    """``encode`` with the ``token_embed`` task returns a 2-D tensor per prompt."""
    first_result = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False)[0]
    # presumably (num_tokens, hidden_size) for the first prompt — TODO confirm
    assert first_result.outputs.data.shape == (11, 384)
def test_pooling_params(llm: LLM): def test_pooling_params(llm: LLM):
def get_outputs(normalize): def get_outputs(normalize):
outputs = llm.embed( outputs = llm.embed(

View File

@ -57,20 +57,24 @@ def test_multiple_pooling_params(llm: LLM):
] ]
# Multiple PoolingParams should be matched with each prompt # Multiple PoolingParams should be matched with each prompt
outputs = llm.encode(PROMPTS, pooling_params=pooling_params) outputs = llm.encode(PROMPTS, pooling_params=pooling_params, pooling_task="embed")
assert len(PROMPTS) == len(outputs) assert len(PROMPTS) == len(outputs)
# Exception raised, if the size of params does not match the size of prompts # Exception raised, if the size of params does not match the size of prompts
with pytest.raises(ValueError): with pytest.raises(ValueError):
outputs = llm.encode(PROMPTS, pooling_params=pooling_params[:3]) outputs = llm.encode(
PROMPTS, pooling_params=pooling_params[:3], pooling_task="embed"
)
# Single PoolingParams should be applied to every prompt # Single PoolingParams should be applied to every prompt
single_pooling_params = PoolingParams() single_pooling_params = PoolingParams()
outputs = llm.encode(PROMPTS, pooling_params=single_pooling_params) outputs = llm.encode(
PROMPTS, pooling_params=single_pooling_params, pooling_task="embed"
)
assert len(PROMPTS) == len(outputs) assert len(PROMPTS) == len(outputs)
# pooling_params is None, default params should be applied # pooling_params is None, default params should be applied
outputs = llm.encode(PROMPTS, pooling_params=None) outputs = llm.encode(PROMPTS, pooling_params=None, pooling_task="embed")
assert len(PROMPTS) == len(outputs) assert len(PROMPTS) == len(outputs)

View File

@ -36,22 +36,23 @@ def llm():
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM): def test_pooling_params(llm: LLM):
def get_outputs(softmax): def get_outputs(activation):
outputs = llm.reward( outputs = llm.reward(
prompts, pooling_params=PoolingParams(softmax=softmax), use_tqdm=False prompts, pooling_params=PoolingParams(activation=activation), use_tqdm=False
) )
return torch.cat([x.outputs.data for x in outputs]) return torch.cat([x.outputs.data for x in outputs])
default = get_outputs(softmax=None) default = get_outputs(activation=None)
w_softmax = get_outputs(softmax=True) w_activation = get_outputs(activation=True)
wo_softmax = get_outputs(softmax=False) wo_activation = get_outputs(activation=False)
assert torch.allclose(default, w_softmax, atol=1e-2), "Default should use softmax." assert torch.allclose(default, w_activation, atol=1e-2), (
assert not torch.allclose(w_softmax, wo_softmax, atol=1e-2), ( "Default should use activation."
"wo_softmax should not use softmax."
) )
assert torch.allclose(softmax(wo_softmax), w_softmax, atol=1e-2), ( assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
"w_softmax should be close to softmax(wo_softmax)." "wo_activation should not use activation."
)
assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
) )

View File

@ -17,6 +17,7 @@ from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.protocol import (
EMBED_DTYPE_TO_TORCH_DTYPE, EMBED_DTYPE_TO_TORCH_DTYPE,
EmbeddingResponse, EmbeddingResponse,
PoolingResponse,
) )
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
@ -509,3 +510,20 @@ async def test_normalize(server: RemoteOpenAIServer, model_name: str):
assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), ( assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
"w_normal should be close to normal(wo_normal)." "w_normal should be close to normal(wo_normal)."
) )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling(server: RemoteOpenAIServer, model_name: str):
    """Check the shape of the raw ``pooling`` endpoint output for one prompt.

    Posts a single input string to the server's ``pooling`` route and
    validates the JSON reply as a ``PoolingResponse``.
    """
    input_text = ["The chef prepared a delicious meal."]

    response = requests.post(
        server.url_for("pooling"),
        json={"model": model_name, "input": input_text, "encoding_format": "float"},
    )
    # Fail on the HTTP status itself; otherwise a server-side error would only
    # surface as an opaque schema-validation failure on the error payload.
    response.raise_for_status()
    poolings = PoolingResponse.model_validate(response.json())

    # One result for the one input.
    assert len(poolings.data) == 1
    # presumably one vector per prompt token, each of the model's hidden
    # width — TODO confirm against the model config
    assert len(poolings.data[0].data) == 11
    assert len(poolings.data[0].data[0]) == 384

View File

@ -7,7 +7,7 @@ import torch
import torch.nn.functional as F import torch.nn.functional as F
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.openai.protocol import RerankResponse from vllm.entrypoints.openai.protocol import PoolingResponse, RerankResponse
MODEL_NAME = "BAAI/bge-reranker-base" MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE = "bfloat16" DTYPE = "bfloat16"
@ -159,3 +159,20 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str):
assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), ( assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)." "w_activation should be close to activation(wo_activation)."
) )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling(server: RemoteOpenAIServer, model_name: str):
    """Check the shape of the raw ``pooling`` endpoint output for one prompt.

    Posts a single input string to the server's ``pooling`` route and
    validates the JSON reply as a ``PoolingResponse``.
    """
    input_text = ["The chef prepared a delicious meal."]

    response = requests.post(
        server.url_for("pooling"),
        json={"model": model_name, "input": input_text, "encoding_format": "float"},
    )
    # Fail on the HTTP status itself; otherwise a server-side error would only
    # surface as an opaque schema-validation failure on the error payload.
    response.raise_for_status()
    poolings = PoolingResponse.model_validate(response.json())

    # One result for the one input.
    assert len(poolings.data) == 1
    # presumably one single-value entry per prompt token — TODO confirm
    assert len(poolings.data[0].data) == 11
    assert len(poolings.data[0].data[0]) == 1

View File

@ -6,7 +6,11 @@ from unittest.mock import MagicMock, patch
import pytest import pytest
from openai_harmony import Author, Message, Role, StreamState, TextContent from openai_harmony import Author, Message, Role, StreamState, TextContent
from vllm.entrypoints.context import HarmonyContext, StreamingHarmonyContext from vllm.entrypoints.context import (
HarmonyContext,
StreamingHarmonyContext,
TurnMetrics,
)
from vllm.outputs import CompletionOutput, RequestOutput from vllm.outputs import CompletionOutput, RequestOutput
@ -101,8 +105,12 @@ def test_single_turn_token_counting():
# Verify internal state tracking # Verify internal state tracking
assert not context.is_first_turn assert not context.is_first_turn
assert context.previous_turn.input_tokens == 5 assert len(context.all_turn_metrics) == 1
assert context.previous_turn.output_tokens == 3 previous_turn = context.all_turn_metrics[0]
assert previous_turn.input_tokens == 5
assert previous_turn.output_tokens == 3
assert previous_turn.cached_input_tokens == 2
assert previous_turn.tool_output_tokens == 0
@pytest.mark.asyncio @pytest.mark.asyncio
@ -156,6 +164,15 @@ async def test_multi_turn_token_counting():
assert context.num_tool_output_tokens == expected_tool_output assert context.num_tool_output_tokens == expected_tool_output
assert context.num_cached_tokens == 5 + 15 assert context.num_cached_tokens == 5 + 15
# Validate all turn metrics
assert len(context.all_turn_metrics) == 3
for i, turn in enumerate(context.all_turn_metrics):
assert turn.input_tokens == prompt_token_counts[i]
assert turn.output_tokens == output_token_counts[i]
assert turn.cached_input_tokens == cached_token_counts[i]
assert context.all_turn_metrics[1].tool_output_tokens == 7
assert context.all_turn_metrics[2].tool_output_tokens == 1
def test_empty_output_tokens(): def test_empty_output_tokens():
"""Test behavior when RequestOutput has empty output tokens.""" """Test behavior when RequestOutput has empty output tokens."""
@ -314,6 +331,10 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
# Create a streaming context # Create a streaming context
context = StreamingHarmonyContext(messages=[], available_tools=["browser"]) context = StreamingHarmonyContext(messages=[], available_tools=["browser"])
num_prompt_tokens = [3, 8, 13]
num_output_tokens = [3, 3, 2]
num_cached_tokens = [0, 3, 8]
# Simulate three turns of conversation: # Simulate three turns of conversation:
# Turn 1: stream tokens one by one, then finish the message # Turn 1: stream tokens one by one, then finish the message
# Turn 2: new prompt, stream more tokens with a reasoning segment # Turn 2: new prompt, stream more tokens with a reasoning segment
@ -325,7 +346,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
create_mock_request_output( create_mock_request_output(
prompt_token_ids=[1, 2, 3], # 3 prompt tokens prompt_token_ids=[1, 2, 3], # 3 prompt tokens
output_token_ids=[101], # Single token output_token_ids=[101], # Single token
num_cached_tokens=0, num_cached_tokens=num_cached_tokens[0],
finished=False, # Not end of message yet finished=False, # Not end of message yet
) )
) )
@ -370,7 +391,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
5, 5,
], # 8 tokens (includes previous) ], # 8 tokens (includes previous)
output_token_ids=[201], output_token_ids=[201],
num_cached_tokens=3, # Some tokens cached num_cached_tokens=num_cached_tokens[1], # Some tokens cached
finished=False, finished=False,
) )
) )
@ -422,7 +443,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
7, 7,
], # 13 tokens ], # 13 tokens
output_token_ids=[301], output_token_ids=[301],
num_cached_tokens=8, # More cached tokens num_cached_tokens=num_cached_tokens[2], # More cached tokens
finished=False, finished=False,
) )
) )
@ -435,10 +456,12 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
) )
# Final token counts check # Final token counts check
assert context.num_prompt_tokens == 3 + 8 + 13 # All prompts assert context.num_prompt_tokens == sum(num_prompt_tokens) # All prompts
assert context.num_output_tokens == 3 + 3 + 2 # All outputs assert context.num_output_tokens == sum(num_output_tokens) # All outputs
assert context.num_reasoning_tokens == 3 # Unchanged from second turn assert context.num_reasoning_tokens == 3 # Unchanged from second turn
assert context.num_cached_tokens == 3 + 8 # Accumulated cached tokens assert context.num_cached_tokens == sum(
num_cached_tokens
) # Accumulated cached tokens
# Additional tool tokens from third turn # Additional tool tokens from third turn
# Formula: this turn prompt - last turn prompt - last turn output # Formula: this turn prompt - last turn prompt - last turn output
@ -447,6 +470,15 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
context.num_tool_output_tokens == expected_tool_tokens + additional_tool_tokens context.num_tool_output_tokens == expected_tool_tokens + additional_tool_tokens
) )
# Validate all turn metrics
assert len(context.all_turn_metrics) == 3
for i, turn in enumerate(context.all_turn_metrics):
assert turn.input_tokens == num_prompt_tokens[i]
assert turn.output_tokens == num_output_tokens[i]
assert turn.cached_input_tokens == num_cached_tokens[i]
assert context.all_turn_metrics[1].tool_output_tokens == 2
assert context.all_turn_metrics[2].tool_output_tokens == 2
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_streaming_message_synchronization(mock_parser): async def test_streaming_message_synchronization(mock_parser):
@ -522,3 +554,46 @@ async def test_streaming_message_synchronization(mock_parser):
assert len(context._messages) == 3 assert len(context._messages) == 3
assert context.num_init_messages == 1 assert context.num_init_messages == 1
assert context._messages[2].content[0].text == "Response 4" assert context._messages[2].content[0].text == "Response 4"
def test_turn_metrics_copy_and_reset():
    """Test TurnMetrics copy and reset methods work correctly."""
    initial_values = {
        "input_tokens": 10,
        "output_tokens": 20,
        "cached_input_tokens": 5,
        "tool_output_tokens": 3,
    }
    original_metrics = TurnMetrics(**initial_values)

    # copy() must give an equal-valued but distinct object.
    copied_metrics = original_metrics.copy()
    for field_name, value in initial_values.items():
        assert getattr(copied_metrics, field_name) == value
    assert copied_metrics is not original_metrics

    # Writing to the copy must not leak into the original.
    copied_metrics.input_tokens = 999
    assert original_metrics.input_tokens == 10
    assert copied_metrics.input_tokens == 999

    # reset() zeroes every counter on the original...
    original_metrics.reset()
    for field_name in initial_values:
        assert getattr(original_metrics, field_name) == 0

    # ...and leaves the earlier copy exactly as it was.
    expected_copy_values = {**initial_values, "input_tokens": 999}
    for field_name, value in expected_copy_values.items():
        assert getattr(copied_metrics, field_name) == value

View File

@ -12,7 +12,7 @@ from vllm.entrypoints.openai.api_server import (
from vllm.inputs import TextPrompt from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.utils import merge_async_iterators from vllm.utils.async_utils import merge_async_iterators
MODEL_PATH = "zai-org/chatglm3-6b" MODEL_PATH = "zai-org/chatglm3-6b"
LORA_RANK = 64 LORA_RANK = 64

View File

@ -0,0 +1,45 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from transformers import AutoModel
from tests.models.utils import check_embeddings_close
@pytest.mark.parametrize(
    "model",
    ["BAAI/bge-m3"],
)
@pytest.mark.parametrize("dtype", ["half"])
@torch.inference_mode
def test_embed_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str):
    """Compare vLLM per-token embeddings with the HF model's last hidden state.

    NOTE(review): ``dtype`` is parametrized but is not forwarded to either
    runner in this test.
    """
    with vllm_runner(model, runner="pooling", max_model_len=None) as vllm_model:
        vllm_outputs = vllm_model.token_embed(example_prompts)

    def hf_token_embeddings(hf_model, prompt):
        # Run one prompt through the HF model and take the first (only) batch
        # row of the last hidden state as float32 on the CPU.
        encoded = hf_model.tokenizer([prompt], return_tensors="pt")
        encoded = hf_model.wrap_device(encoded)
        hidden = hf_model.model(**encoded).last_hidden_state[0]
        return hidden.float().cpu()

    with hf_runner(model, auto_cls=AutoModel) as hf_model:
        hf_outputs = [
            hf_token_embeddings(hf_model, prompt) for prompt in example_prompts
        ]

    # Prompts are compared pairwise, in order.
    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
        check_embeddings_close(
            embeddings_0_lst=hf_output,
            embeddings_1_lst=vllm_output,
            name_0="hf",
            name_1="vllm",
            tol=1e-2,
        )

View File

@ -93,7 +93,7 @@ def test_embed_models_using_normalize(
], ],
) )
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
def test_reward_models_using_softmax( def test_reward_models_using_activation(
hf_runner, hf_runner,
vllm_runner, vllm_runner,
example_prompts, example_prompts,
@ -104,22 +104,64 @@ def test_reward_models_using_softmax(
model, model,
max_model_len=1024, max_model_len=1024,
dtype=dtype, dtype=dtype,
pooler_config=PoolerConfig(softmax=False), pooler_config=PoolerConfig(activation=False),
) as vllm_model: ) as vllm_model:
wo_softmax = vllm_model.encode(example_prompts) wo_activation = vllm_model.reward(example_prompts)
with vllm_runner( with vllm_runner(
model, max_model_len=1024, dtype=dtype, pooler_config=PoolerConfig(softmax=True) model,
max_model_len=1024,
dtype=dtype,
pooler_config=PoolerConfig(activation=True),
) as vllm_model: ) as vllm_model:
w_softmax = vllm_model.encode(example_prompts) w_activation = vllm_model.reward(example_prompts)
for wo, w in zip(wo_softmax, w_softmax): for wo, w in zip(wo_activation, w_activation):
wo = torch.tensor(wo) wo = torch.tensor(wo)
w = torch.tensor(w) w = torch.tensor(w)
assert not torch.allclose(wo, w, atol=1e-2), ( assert not torch.allclose(wo, w, atol=1e-2), (
"pooler_config softmax is not working" "pooler_config activation is not working"
) )
assert torch.allclose(softmax(wo), w, atol=1e-2), ( assert torch.allclose(softmax(wo), w, atol=1e-2), (
"w_softmax should be close to softmax(wo_softmax)." "w_activation should be close to activation(wo_activation)."
)
@pytest.mark.parametrize(
"model",
[
"intfloat/multilingual-e5-small",
],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_multi_vector_retrieval_models_using_normalize(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
) -> None:
with vllm_runner(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=False),
) as vllm_model:
wo_normalize = vllm_model.token_embed(example_prompts)
with vllm_runner(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=True),
) as vllm_model:
w_normalize = vllm_model.token_embed(example_prompts)
for wo, w in zip(wo_normalize, w_normalize):
assert not torch.allclose(wo, w, atol=1e-2), (
"pooler_config normalize is not working"
)
assert torch.allclose(F.normalize(wo, p=2, dim=-1), w, atol=1e-2), (
"w_normal should be close to normal(wo_normal)."
) )

View File

@ -19,7 +19,7 @@ def test_bert_models(
dtype: str, dtype: str,
) -> None: ) -> None:
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model: with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts) vllm_outputs = vllm_model.token_classify(example_prompts)
with hf_runner( with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForTokenClassification model, dtype=dtype, auto_cls=AutoModelForTokenClassification
@ -50,7 +50,7 @@ def test_modernbert_models(
dtype: str, dtype: str,
) -> None: ) -> None:
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model: with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts) vllm_outputs = vllm_model.token_classify(example_prompts)
with hf_runner( with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForTokenClassification model, dtype=dtype, auto_cls=AutoModelForTokenClassification

View File

@ -17,7 +17,7 @@ from transformers import (
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import identity from vllm.utils.func import identity
from ....conftest import ( from ....conftest import (
IMAGE_ASSETS, IMAGE_ASSETS,

View File

@ -38,7 +38,7 @@ def run_intern_vit_test(
config.norm_type = "rms_norm" config.norm_type = "rms_norm"
hf_model = AutoModel.from_pretrained( hf_model = AutoModel.from_pretrained(
model, torch_dtype=torch_dtype, trust_remote_code=True model, dtype=torch_dtype, trust_remote_code=True
).to("cuda") ).to("cuda")
hf_outputs_per_image = [ hf_outputs_per_image = [
hf_model(pixel_value.to("cuda")).last_hidden_state hf_model(pixel_value.to("cuda")).last_hidden_state

View File

@ -39,7 +39,7 @@ def _run_test(
max_num_seqs=32, max_num_seqs=32,
default_torch_num_threads=1, default_torch_num_threads=1,
) as vllm_model: ) as vllm_model:
vllm_model.encode(prompt) vllm_model.llm.encode(prompt, pooling_task="token_classify")
MODELS = ["mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"] MODELS = ["mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]

View File

@ -45,7 +45,7 @@ def run_radio_test(
hf_model = AutoModel.from_pretrained( hf_model = AutoModel.from_pretrained(
model_id, model_id,
config=config, config=config,
torch_dtype=torch_dtype, dtype=torch_dtype,
trust_remote_code=True, trust_remote_code=True,
).to("cuda") ).to("cuda")
hf_model.eval() hf_model.eval()

View File

@ -30,7 +30,7 @@ class MyGemma2Embedding(nn.Module):
self.pooler = DispatchPooler( self.pooler = DispatchPooler(
{ {
"encode": Pooler.for_encode(pooler_config), "token_embed": Pooler.for_token_embed(pooler_config),
"embed": Pooler.for_embed(pooler_config), "embed": Pooler.for_embed(pooler_config),
} }
) )

View File

@ -93,7 +93,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
out_data_format="b64_json", out_data_format="b64_json",
) )
pooling_params = PoolingParams(task="encode", softmax=False) pooling_params = PoolingParams(activation=False)
with vllm_runner( with vllm_runner(
model_name, model_name,
@ -108,8 +108,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
io_processor_plugin="prithvi_to_tiff", io_processor_plugin="prithvi_to_tiff",
) as llm_runner: ) as llm_runner:
pooler_output = llm_runner.get_llm().encode( pooler_output = llm_runner.get_llm().encode(
img_prompt, img_prompt, pooling_params=pooling_params, pooling_task="token_classify"
pooling_params=pooling_params,
) )
output = pooler_output[0].outputs output = pooler_output[0].outputs

View File

@ -697,7 +697,8 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"args", "args",
[ [
("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4), # TODO: Enable once model is available again
# ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4),
("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4), ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4),
], ],
) )

Some files were not shown because too many files have changed in this diff Show More