mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-16 11:47:09 +08:00
Merge branch 'main' into woosuk/test-router
This commit is contained in:
commit
8935ca208d
12
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
Normal file
12
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
Normal file
@ -0,0 +1,12 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
|
||||
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.419
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.416
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
||||
@ -0,0 +1,11 @@
|
||||
# For hf script, without -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8
|
||||
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
|
||||
backend: "vllm-vlm"
|
||||
tasks:
|
||||
- name: "chartqa"
|
||||
metrics:
|
||||
- name: "relaxed_accuracy,none"
|
||||
value: 0.90
|
||||
limit: 100
|
||||
num_fewshot: 0
|
||||
@ -0,0 +1,11 @@
|
||||
# For hf script, without -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5
|
||||
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
|
||||
backend: "vllm-vlm"
|
||||
tasks:
|
||||
- name: "mmlu_pro"
|
||||
metrics:
|
||||
- name: "exact_match,custom-extract"
|
||||
value: 0.80
|
||||
limit: 250 # will run on 250 * 14 subjects = 3500 samples
|
||||
num_fewshot: 5
|
||||
@ -1,4 +1,5 @@
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
|
||||
# For vllm script, with -t option (tensor parallel size)
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
|
||||
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
|
||||
@ -0,0 +1,12 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
|
||||
|
||||
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
|
||||
backend: "vllm-vlm"
|
||||
tasks:
|
||||
- name: "chartqa"
|
||||
metrics:
|
||||
- name: "relaxed_accuracy,none"
|
||||
value: 0.855
|
||||
limit: 2500
|
||||
num_fewshot: 0
|
||||
1
.buildkite/lm-eval-harness/configs/models-large-h100.txt
Normal file
1
.buildkite/lm-eval-harness/configs/models-large-h100.txt
Normal file
@ -0,0 +1 @@
|
||||
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
|
||||
@ -0,0 +1 @@
|
||||
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
|
||||
1
.buildkite/lm-eval-harness/configs/models-mm-small.txt
Normal file
1
.buildkite/lm-eval-harness/configs/models-mm-small.txt
Normal file
@ -0,0 +1 @@
|
||||
Qwen2.5-VL-7B-Instruct.yaml
|
||||
44
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
Executable file
44
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
Executable file
@ -0,0 +1,44 @@
|
||||
#!/bin/bash
|
||||
# We can use this script to compute baseline accuracy on chartqa for vllm.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install lm-eval==0.4.9
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
echo "Runs lm eval harness on ChartQA using multimodal vllm."
|
||||
echo "This pathway is intended to be used to create baselines for "
|
||||
echo "our correctness tests in vllm's CI."
|
||||
echo
|
||||
echo "usage: ${0} <options>"
|
||||
echo
|
||||
echo " -m - huggingface stub or local directory of the model"
|
||||
echo " -l - limit number of samples to run"
|
||||
echo " -t - tensor parallel size to run at"
|
||||
echo
|
||||
}
|
||||
|
||||
while getopts "m:l:t:" OPT; do
|
||||
case ${OPT} in
|
||||
m )
|
||||
MODEL="$OPTARG"
|
||||
;;
|
||||
l )
|
||||
LIMIT="$OPTARG"
|
||||
;;
|
||||
t )
|
||||
TP_SIZE="$OPTARG"
|
||||
;;
|
||||
\? )
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
lm_eval --model vllm-vlm \
|
||||
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
|
||||
--tasks chartqa \
|
||||
--batch_size auto \
|
||||
--apply_chat_template \
|
||||
--limit $LIMIT
|
||||
0
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
Normal file → Executable file
0
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
Normal file → Executable file
@ -0,0 +1,50 @@
|
||||
#!/bin/bash
|
||||
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
|
||||
# We use this for fp8, which HF does not support.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
|
||||
echo "This pathway is intended to be used to create baselines for "
|
||||
echo "our automated nm-test-accuracy workflow"
|
||||
echo
|
||||
echo "usage: ${0} <options>"
|
||||
echo
|
||||
echo " -m - huggingface stub or local directory of the model"
|
||||
echo " -l - limit number of samples to run"
|
||||
echo " -f - number of fewshot samples to use"
|
||||
echo " -t - tensor parallel size to run at"
|
||||
echo
|
||||
}
|
||||
|
||||
while getopts "m:b:l:f:t:" OPT; do
|
||||
case ${OPT} in
|
||||
m )
|
||||
MODEL="$OPTARG"
|
||||
;;
|
||||
b )
|
||||
BATCH_SIZE="$OPTARG"
|
||||
;;
|
||||
l )
|
||||
LIMIT="$OPTARG"
|
||||
;;
|
||||
f )
|
||||
FEWSHOT="$OPTARG"
|
||||
;;
|
||||
t )
|
||||
TP_SIZE="$OPTARG"
|
||||
;;
|
||||
\? )
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
lm_eval --model vllm \
|
||||
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
|
||||
--tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
|
||||
--batch_size auto
|
||||
@ -19,21 +19,27 @@ RTOL = 0.08
|
||||
def launch_lm_eval(eval_config, tp_size):
|
||||
trust_remote_code = eval_config.get("trust_remote_code", False)
|
||||
max_model_len = eval_config.get("max_model_len", 4096)
|
||||
batch_size = eval_config.get("batch_size", "auto")
|
||||
backend = eval_config.get("backend", "vllm")
|
||||
model_args = (
|
||||
f"pretrained={eval_config['model_name']},"
|
||||
f"tensor_parallel_size={tp_size},"
|
||||
f"enforce_eager=true,"
|
||||
f"add_bos_token=true,"
|
||||
f"trust_remote_code={trust_remote_code},"
|
||||
f"max_model_len={max_model_len}"
|
||||
f"max_model_len={max_model_len},"
|
||||
)
|
||||
results = lm_eval.simple_evaluate(
|
||||
model="vllm",
|
||||
model=backend,
|
||||
model_args=model_args,
|
||||
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||
num_fewshot=eval_config["num_fewshot"],
|
||||
limit=eval_config["limit"],
|
||||
batch_size="auto",
|
||||
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
|
||||
# text models. however, this is regressing measured strict-match for
|
||||
# existing text models in CI, so only apply it for mm.
|
||||
apply_chat_template=backend == "vllm-vlm",
|
||||
batch_size=batch_size,
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
@ -63,7 +63,7 @@ steps:
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
|
||||
timeout_in_minutes: 10
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
@ -353,7 +353,7 @@ steps:
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
|
||||
- label: V1 Test others (CPU) # 5 mins
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
@ -459,6 +459,7 @@ steps:
|
||||
- pytest -v -s compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s compile/test_decorator.py
|
||||
- pytest -v -s compile/test_noop_elimination.py
|
||||
- pytest -v -s compile/test_aot_compile.py
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||
timeout_in_minutes: 30
|
||||
@ -487,14 +488,14 @@ steps:
|
||||
|
||||
- label: Kernels Core Operation Test # 48min
|
||||
timeout_in_minutes: 75
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- tests/kernels/core
|
||||
commands:
|
||||
- pytest -v -s kernels/core
|
||||
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
|
||||
|
||||
- label: Kernels Attention Test %N # 23min
|
||||
timeout_in_minutes: 35
|
||||
@ -603,7 +604,8 @@ steps:
|
||||
# since torchao nightly is only compatible with torch nightly currently
|
||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||
# we can only upgrade after this is resolved
|
||||
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.13.0
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
@ -631,7 +633,7 @@ steps:
|
||||
|
||||
- label: OpenAI-Compatible Tool Use # 23 min
|
||||
timeout_in_minutes: 35
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
fast_check: false
|
||||
|
||||
@ -527,7 +527,8 @@ steps:
|
||||
# since torchao nightly is only compatible with torch nightly currently
|
||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||
# we can only upgrade after this is resolved
|
||||
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.13.0
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
@ -733,6 +734,16 @@ steps:
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
|
||||
timeout_in_minutes: 70
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- vllm/multimodal/
|
||||
- vllm/inputs/
|
||||
- vllm/v1/core/
|
||||
commands:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 1
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
|
||||
3
.github/CODEOWNERS
vendored
3
.github/CODEOWNERS
vendored
@ -5,9 +5,7 @@
|
||||
/vllm/attention @LucasWilkinson
|
||||
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||
/vllm/model_executor/layers/fused_moe @mgoin
|
||||
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
|
||||
/vllm/model_executor/layers/mamba @tdoublep
|
||||
/vllm/model_executor/model_loader @22quinn
|
||||
@ -26,7 +24,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
|
||||
|
||||
# vLLM V1
|
||||
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
||||
/vllm/v1/attention @LucasWilkinson
|
||||
/vllm/v1/attention/backends/flashinfer.py @mgoin
|
||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||
|
||||
@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
|
||||
else:
|
||||
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||
block_quant_shape = get_weight_block_size_safety(config)
|
||||
|
||||
@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
|
||||
topk = config.num_experts_per_tok
|
||||
|
||||
hidden_size = config.hidden_size
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||
use_customized_permute = args.use_customized_permute
|
||||
|
||||
@ -22,10 +22,10 @@ else()
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
)
|
||||
FetchContent_Populate(qutlass)
|
||||
set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
|
||||
endif()
|
||||
|
||||
FetchContent_Populate(qutlass)
|
||||
|
||||
if(NOT qutlass_SOURCE_DIR)
|
||||
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
|
||||
endif()
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
#include "dispatch_utils.h"
|
||||
#include "cub_helpers.h"
|
||||
#include "core/batch_invariant.hpp"
|
||||
#include "quantization/vectorization_utils.cuh"
|
||||
|
||||
#include <torch/cuda.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
@ -18,11 +19,22 @@ __global__ void rms_norm_kernel(
|
||||
const float epsilon, const int num_tokens, const int hidden_size) {
|
||||
__shared__ float s_variance;
|
||||
float variance = 0.0f;
|
||||
const scalar_t* input_row = input + blockIdx.x * input_stride;
|
||||
|
||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||
const float x = (float)input[blockIdx.x * input_stride + idx];
|
||||
constexpr int VEC_SIZE = 8;
|
||||
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VEC_SIZE; ++i) {
|
||||
float x = static_cast<float>(vec.val[i]);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
auto scalar_op = [&variance](const scalar_t& val) {
|
||||
float x = static_cast<float>(val);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
vllm::vectorize_read_with_alignment<VEC_SIZE>(
|
||||
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
#include "dispatch_utils.h"
|
||||
#include "cub_helpers.h"
|
||||
#include "core/batch_invariant.hpp"
|
||||
#include "quantization/vectorization_utils.cuh"
|
||||
|
||||
#include <torch/cuda.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
@ -28,10 +29,22 @@ __global__ void rms_norm_static_fp8_quant_kernel(
|
||||
__shared__ float s_variance;
|
||||
float variance = 0.0f;
|
||||
|
||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||
const float x = (float)input[blockIdx.x * input_stride + idx];
|
||||
const scalar_t* input_row = input + blockIdx.x * input_stride;
|
||||
|
||||
constexpr int VEC_SIZE = 8;
|
||||
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VEC_SIZE; ++i) {
|
||||
float x = static_cast<float>(vec.val[i]);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
auto scalar_op = [&variance](const scalar_t& val) {
|
||||
float x = static_cast<float>(val);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
vllm::vectorize_read_with_alignment<VEC_SIZE>(
|
||||
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
|
||||
@ -58,12 +58,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
from vllm.config import CompilationConfig, CompilationLevel
|
||||
from vllm.config import CompilationConfig, CompilationMode
|
||||
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-3.1-8B-Instruct",
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
# By default, it goes up to max_num_seqs
|
||||
cudagraph_capture_sizes=[1, 2, 4, 8, 16],
|
||||
),
|
||||
|
||||
@ -167,7 +167,7 @@ class AttentionCGSupport(enum.Enum):
|
||||
"""NO CUDA Graphs support"""
|
||||
```
|
||||
|
||||
Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
|
||||
Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
|
||||
|
||||
The following table lists backends that support full CUDA Graphs at the time of writing.
|
||||
|
||||
@ -202,7 +202,7 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG")
|
||||
import vllm
|
||||
from vllm.config import CUDAGraphMode
|
||||
|
||||
compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
|
||||
compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
|
||||
model = vllm.LLM(
|
||||
model="meta-llama/Llama-3.1-8B-Instruct",
|
||||
dtype="auto",
|
||||
|
||||
@ -22,13 +22,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
|
||||
from awq import AutoAWQForCausalLM
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
|
||||
quant_path = 'mistral-instruct-v0.2-awq'
|
||||
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
|
||||
model_path = "mistralai/Mistral-7B-Instruct-v0.2"
|
||||
quant_path = "mistral-instruct-v0.2-awq"
|
||||
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
|
||||
|
||||
# Load model
|
||||
model = AutoAWQForCausalLM.from_pretrained(
|
||||
model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
|
||||
model_path,
|
||||
low_cpu_mem_usage=True,
|
||||
use_cache=False,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from auto_round import AutoRound
|
||||
|
||||
model_name = "Qwen/Qwen3-0.6B"
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto")
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
|
||||
bits, group_size, sym = 4, 128, True
|
||||
|
||||
@ -34,7 +34,7 @@ llm = LLM(
|
||||
model=model_id,
|
||||
dtype=torch.bfloat16,
|
||||
trust_remote_code=True,
|
||||
quantization="bitblas"
|
||||
quantization="bitblas",
|
||||
)
|
||||
```
|
||||
|
||||
@ -53,6 +53,6 @@ llm = LLM(
|
||||
dtype=torch.float16,
|
||||
trust_remote_code=True,
|
||||
quantization="bitblas",
|
||||
max_model_len=1024
|
||||
max_model_len=1024,
|
||||
)
|
||||
```
|
||||
|
||||
@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit"
|
||||
llm = LLM(
|
||||
model=model_id,
|
||||
dtype=torch.bfloat16,
|
||||
trust_remote_code=True
|
||||
trust_remote_code=True,
|
||||
)
|
||||
```
|
||||
|
||||
@ -43,7 +43,7 @@ llm = LLM(
|
||||
model=model_id,
|
||||
dtype=torch.bfloat16,
|
||||
trust_remote_code=True,
|
||||
quantization="bitsandbytes"
|
||||
quantization="bitsandbytes",
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_ID, device_map="auto", torch_dtype="auto",
|
||||
MODEL_ID,
|
||||
device_map="auto",
|
||||
dtype="auto",
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
```
|
||||
@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio
|
||||
|
||||
# Configure the simple PTQ quantization
|
||||
recipe = QuantizationModifier(
|
||||
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
|
||||
targets="Linear",
|
||||
scheme="FP8_DYNAMIC",
|
||||
ignore=["lm_head"],
|
||||
)
|
||||
|
||||
# Apply the quantization algorithm.
|
||||
oneshot(model=model, recipe=recipe)
|
||||
|
||||
@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint:
|
||||
conversation = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant"
|
||||
"content": "You are a helpful assistant",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello"
|
||||
"content": "Hello",
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Hello! How can I assist you today?"
|
||||
"content": "Hello! How can I assist you today?",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
|
||||
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
|
||||
llm = LLM(
|
||||
model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
|
||||
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
)
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.chat(conversation, sampling_params)
|
||||
|
||||
@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
|
||||
calibration_dataset = load_dataset(
|
||||
"allenai/c4",
|
||||
data_files="en/c4-train.00001-of-01024.json.gz",
|
||||
split="train"
|
||||
split="train",
|
||||
).select(range(1024))["text"]
|
||||
|
||||
quant_config = QuantizeConfig(bits=4, group_size=128)
|
||||
|
||||
@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_ID, device_map="auto", torch_dtype="auto",
|
||||
MODEL_ID,
|
||||
device_map="auto",
|
||||
dtype="auto",
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
```
|
||||
@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y
|
||||
},
|
||||
ignore=["lm_head"],
|
||||
update_size=NUM_CALIBRATION_SAMPLES,
|
||||
dampening_frac=0.01
|
||||
dampening_frac=0.01,
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_ID, device_map="auto", torch_dtype="auto",
|
||||
MODEL_ID,
|
||||
device_map="auto",
|
||||
dtype="auto",
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
```
|
||||
|
||||
@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
def main():
|
||||
|
||||
model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
|
||||
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
|
||||
|
||||
# Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
|
||||
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
|
||||
|
||||
@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization:
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
|
||||
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
|
||||
kv_cache_dtype="fp8",
|
||||
calculate_kv_scales=True)
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-2-7b-chat-hf",
|
||||
kv_cache_dtype="fp8",
|
||||
calculate_kv_scales=True,
|
||||
)
|
||||
prompt = "London is the capital of"
|
||||
out = llm.generate(prompt, sampling_params)[0].outputs[0].text
|
||||
print(out)
|
||||
@ -80,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models
|
||||
|
||||
# Select model and load it
|
||||
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
|
||||
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto")
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
|
||||
# Select calibration dataset
|
||||
|
||||
@ -48,7 +48,9 @@ to fetch model and tokenizer.
|
||||
MAX_SEQ_LEN = 512
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_ID, device_map="auto", torch_dtype="auto",
|
||||
MODEL_ID,
|
||||
device_map="auto",
|
||||
dtype="auto",
|
||||
)
|
||||
model.eval()
|
||||
|
||||
@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
|
||||
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
|
||||
text_data = dataset["text"][:NUM_CALIBRATION_DATA]
|
||||
|
||||
tokenized_outputs = tokenizer(text_data, return_tensors="pt",
|
||||
padding=True, truncation=True, max_length=MAX_SEQ_LEN)
|
||||
calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
|
||||
batch_size=BATCH_SIZE, drop_last=True)
|
||||
tokenized_outputs = tokenizer(
|
||||
text_data,
|
||||
return_tensors="pt",
|
||||
padding=True,
|
||||
truncation=True,
|
||||
max_length=MAX_SEQ_LEN,
|
||||
)
|
||||
calib_dataloader = DataLoader(
|
||||
tokenized_outputs['input_ids'],
|
||||
batch_size=BATCH_SIZE,
|
||||
drop_last=True,
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Set the Quantization Configuration
|
||||
@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
|
||||
load_quant_algo_config_from_file)
|
||||
|
||||
# Define fp8/per-tensor/static spec.
|
||||
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
|
||||
is_dynamic=False).to_quantization_spec()
|
||||
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
|
||||
observer_method="min_max",
|
||||
is_dynamic=False,
|
||||
).to_quantization_spec()
|
||||
|
||||
# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
|
||||
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
|
||||
weight=FP8_PER_TENSOR_SPEC)
|
||||
global_quant_config = QuantizationConfig(
|
||||
input_tensors=FP8_PER_TENSOR_SPEC,
|
||||
weight=FP8_PER_TENSOR_SPEC,
|
||||
)
|
||||
|
||||
# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
|
||||
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
|
||||
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
|
||||
kv_cache_quant_config = {name :
|
||||
QuantizationConfig(input_tensors=global_quant_config.input_tensors,
|
||||
weight=global_quant_config.weight,
|
||||
output_tensors=KV_CACHE_SPEC)
|
||||
for name in kv_cache_layer_names_for_llama}
|
||||
kv_cache_quant_config = {
|
||||
name: QuantizationConfig(
|
||||
input_tensors=global_quant_config.input_tensors,
|
||||
weight=global_quant_config.weight,
|
||||
output_tensors=KV_CACHE_SPEC,
|
||||
)
|
||||
for name in kv_cache_layer_names_for_llama
|
||||
}
|
||||
layer_quant_config = kv_cache_quant_config.copy()
|
||||
|
||||
# Define algorithm config by config file.
|
||||
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
|
||||
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
|
||||
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json"
|
||||
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
|
||||
|
||||
EXCLUDE_LAYERS = ["lm_head"]
|
||||
@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
|
||||
layer_quant_config=layer_quant_config,
|
||||
kv_cache_quant_config=kv_cache_quant_config,
|
||||
exclude=EXCLUDE_LAYERS,
|
||||
algo_config=algo_config)
|
||||
algo_config=algo_config,
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Quantize the Model and Export
|
||||
@ -165,8 +182,11 @@ for more exporting format details.
|
||||
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
|
||||
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
|
||||
with torch.no_grad():
|
||||
exporter.export_safetensors_model(freezed_model,
|
||||
quant_config=quant_config, tokenizer=tokenizer)
|
||||
exporter.export_safetensors_model(
|
||||
freezed_model,
|
||||
quant_config=quant_config,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
```
|
||||
|
||||
### 5. Evaluation in vLLM
|
||||
@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
|
||||
kv_cache_dtype='fp8',quantization='quark')
|
||||
llm = LLM(
|
||||
model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
|
||||
kv_cache_dtype="fp8",
|
||||
quantization="quark",
|
||||
)
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
|
||||
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype="auto",
|
||||
dtype="auto",
|
||||
device_map="auto",
|
||||
quantization_config=quantization_config
|
||||
)
|
||||
|
||||
@ -11,6 +11,7 @@ vLLM currently supports the following reasoning models:
|
||||
| Model Series | Parser Name | Structured Output Support | Tool Calling |
|
||||
|--------------|-------------|------------------|-------------|
|
||||
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
|
||||
| [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ |
|
||||
| [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
|
||||
| [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
|
||||
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
|
||||
@ -20,8 +21,9 @@ vLLM currently supports the following reasoning models:
|
||||
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
|
||||
|
||||
!!! note
|
||||
IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
|
||||
IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
|
||||
The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
|
||||
DeepSeek-V3.1 tool calling is supported in non-thinking mode.
|
||||
|
||||
## Quickstart
|
||||
|
||||
|
||||
@ -352,6 +352,16 @@ Supported models:
|
||||
|
||||
Flags: `--tool-call-parser qwen3_xml`
|
||||
|
||||
### Olmo 3 Models (`olmo3`)
|
||||
|
||||
Olmo 3 models output tool calls in a format that is very similar to the one expected by the `pythonic` parser (see below), with a few differences. Each tool call is a pythonic string, but the parallel tool calls are newline-delimited, and the calls are wrapped within XML tags as `<function_calls>..</function_calls>`. In addition, the parser also allows JSON boolean and null literals (`true`, `false`, and `null`) in addition to the pythonic ones (`True`, `False`, and `None`).
|
||||
|
||||
Supported models:
|
||||
|
||||
* TODO (will be updated after Olmo 3 release)
|
||||
|
||||
Flags: `--tool-call-parser olmo3`
|
||||
|
||||
### Models with Pythonic Tool Calls (`pythonic`)
|
||||
|
||||
A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
|
||||
|
||||
@ -23,7 +23,46 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
|
||||
# --8<-- [end:pre-built-wheels]
|
||||
# --8<-- [start:build-wheel-from-source]
|
||||
|
||||
--8<-- "docs/getting_started/installation/cpu/build.inc.md:extra-information"
|
||||
First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
|
||||
|
||||
```bash
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
||||
```
|
||||
|
||||
Second, clone the vLLM project:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/vllm-project/vllm.git vllm_source
|
||||
cd vllm_source
|
||||
```
|
||||
|
||||
Third, install required dependencies:
|
||||
|
||||
```bash
|
||||
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
|
||||
uv pip install -r requirements/cpu.txt --torch-backend cpu
|
||||
```
|
||||
|
||||
??? console "pip"
|
||||
```bash
|
||||
pip install --upgrade pip
|
||||
pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
```
|
||||
|
||||
Finally, build and install vLLM:
|
||||
|
||||
```bash
|
||||
VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
|
||||
```
|
||||
|
||||
If you want to develop vLLM, install it in editable mode instead.
|
||||
|
||||
```bash
|
||||
VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
|
||||
```
|
||||
|
||||
Testing has been conducted on AWS Graviton3 instances for compatibility.
|
||||
|
||||
|
||||
@ -1,44 +0,0 @@
|
||||
# --8<-- [start:extra-information]
|
||||
|
||||
First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
|
||||
|
||||
```bash
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
||||
```
|
||||
|
||||
Second, clone the vLLM project:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/vllm-project/vllm.git vllm_source
|
||||
cd vllm_source
|
||||
```
|
||||
|
||||
Third, install required dependencies:
|
||||
|
||||
```bash
|
||||
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
|
||||
uv pip install -r requirements/cpu.txt --torch-backend cpu
|
||||
```
|
||||
|
||||
??? console "pip"
|
||||
```bash
|
||||
pip install --upgrade pip
|
||||
pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
```
|
||||
|
||||
Finally, build and install vLLM:
|
||||
|
||||
```bash
|
||||
VLLM_TARGET_DEVICE=cpu python setup.py install
|
||||
```
|
||||
|
||||
If you want to develop vLLM, install it in editable mode instead.
|
||||
|
||||
```bash
|
||||
VLLM_TARGET_DEVICE=cpu python setup.py develop
|
||||
```
|
||||
|
||||
# --8<-- [end:extra-information]
|
||||
@ -194,8 +194,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
|
||||
api_key=openai_api_key,
|
||||
base_url=openai_api_base,
|
||||
)
|
||||
completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
prompt="San Francisco is a")
|
||||
completion = client.completions.create(
|
||||
model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
prompt="San Francisco is a",
|
||||
)
|
||||
print("Completion result:", completion)
|
||||
```
|
||||
|
||||
@ -239,7 +241,7 @@ Alternatively, you can use the `openai` Python package:
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "Tell me a joke."},
|
||||
]
|
||||
],
|
||||
)
|
||||
print("Chat response:", chat_response)
|
||||
```
|
||||
|
||||
@ -22,6 +22,11 @@ sys.modules["vllm._C"] = MagicMock()
|
||||
class PydanticMagicMock(MagicMock):
|
||||
"""`MagicMock` that's able to generate pydantic-core schemas."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
name = kwargs.pop("name", None)
|
||||
super().__init__(*args, **kwargs)
|
||||
self.__spec__ = importlib.machinery.ModuleSpec(name, None)
|
||||
|
||||
def __get_pydantic_core_schema__(self, source_type, handler):
|
||||
return core_schema.any_schema()
|
||||
|
||||
@ -42,7 +47,9 @@ def auto_mock(module, attr, max_mocks=50):
|
||||
raise e
|
||||
except ModuleNotFoundError as e:
|
||||
logger.info("Mocking %s for argparse doc generation", e.name)
|
||||
sys.modules[e.name] = PydanticMagicMock()
|
||||
sys.modules[e.name] = PydanticMagicMock(name=e.name)
|
||||
except Exception as e:
|
||||
logger.warning("Failed to import %s.%s: %s", module, attr, e)
|
||||
|
||||
raise ImportError(
|
||||
f"Failed to import {module}.{attr} after mocking {max_mocks} imports"
|
||||
|
||||
@ -60,7 +60,7 @@ from vllm import LLM
|
||||
llm = LLM(
|
||||
"s3://my-bucket/vllm/facebook/opt-125m/v1",
|
||||
load_format="tensorizer",
|
||||
enable_lora=True
|
||||
enable_lora=True,
|
||||
)
|
||||
```
|
||||
|
||||
@ -97,6 +97,6 @@ llm = LLM(
|
||||
"s3://my-bucket/vllm/facebook/opt-125m/v1",
|
||||
load_format="tensorizer",
|
||||
enable_lora=True,
|
||||
model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}}
|
||||
model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}},
|
||||
)
|
||||
```
|
||||
|
||||
@ -98,15 +98,15 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
|
||||
conversation = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant"
|
||||
"content": "You are a helpful assistant",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello"
|
||||
"content": "Hello",
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Hello! How can I assist you today?"
|
||||
"content": "Hello! How can I assist you today?",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
|
||||
@ -130,8 +130,10 @@ It is designed for embedding models and cross-encoder models. Embedding models u
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling")
|
||||
(output,) = llm.score("What is the capital of France?",
|
||||
"The capital of Brazil is Brasilia.")
|
||||
(output,) = llm.score(
|
||||
"What is the capital of France?",
|
||||
"The capital of Brazil is Brasilia.",
|
||||
)
|
||||
|
||||
score = output.outputs.score
|
||||
print(f"Score: {score}")
|
||||
@ -209,7 +211,7 @@ For models that support Matryoshka Embeddings but not recognized by vLLM, please
|
||||
|
||||
Here is an example to serve a model with Matryoshka Embeddings enabled.
|
||||
|
||||
```text
|
||||
```bash
|
||||
vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}'
|
||||
```
|
||||
|
||||
@ -220,11 +222,15 @@ You can change the output dimensions of embedding models that support Matryoshka
|
||||
```python
|
||||
from vllm import LLM, PoolingParams
|
||||
|
||||
llm = LLM(model="jinaai/jina-embeddings-v3",
|
||||
runner="pooling",
|
||||
trust_remote_code=True)
|
||||
outputs = llm.embed(["Follow the white rabbit."],
|
||||
pooling_params=PoolingParams(dimensions=32))
|
||||
llm = LLM(
|
||||
model="jinaai/jina-embeddings-v3",
|
||||
runner="pooling",
|
||||
trust_remote_code=True,
|
||||
)
|
||||
outputs = llm.embed(
|
||||
["Follow the white rabbit."],
|
||||
pooling_params=PoolingParams(dimensions=32),
|
||||
)
|
||||
print(outputs[0].outputs)
|
||||
```
|
||||
|
||||
@ -234,13 +240,13 @@ A code example can be found here: <gh-file:examples/offline_inference/pooling/em
|
||||
|
||||
Use the following command to start vllm server.
|
||||
|
||||
```text
|
||||
```bash
|
||||
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
|
||||
```
|
||||
|
||||
You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter.
|
||||
|
||||
```text
|
||||
```bash
|
||||
curl http://127.0.0.1:8000/v1/embeddings \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
|
||||
@ -278,8 +278,8 @@ https_proxy=http://your.proxy.server:port vllm serve <model_name>
|
||||
```python
|
||||
import os
|
||||
|
||||
os.environ['http_proxy'] = 'http://your.proxy.server:port'
|
||||
os.environ['https_proxy'] = 'http://your.proxy.server:port'
|
||||
os.environ["http_proxy"] = "http://your.proxy.server:port"
|
||||
os.environ["https_proxy"] = "http://your.proxy.server:port"
|
||||
```
|
||||
|
||||
### ModelScope
|
||||
|
||||
47
docs/serving/context_parallel_deployment.md
Normal file
47
docs/serving/context_parallel_deployment.md
Normal file
@ -0,0 +1,47 @@
|
||||
# Context Parallel Deployment
|
||||
|
||||
Context parallel mainly solves the problem of serving long context requests. As prefill and decode present quite different characteristics and have quite different SLO (service level objectives), we need to implement context parallel separately for them. The major considerations are:
|
||||
|
||||
- For long context prefill, we need to control the TTFT (time to first token) by amortizing the computation time of the prefill across query tokens.
|
||||
- For long context decode, we need more space for KV cache to increase the batchsize (and hence the throughput).
|
||||
|
||||
## Prefill Context Parallel
|
||||
|
||||
During prefill, for a long request with `T` new tokens, we need to compute query/key/value tensors for these new tokens. Say we have `N` GPUs, we can split the request into `N` chunks, and each GPU computes one chunk of the query/key/value tensors.
|
||||
|
||||
Depending on the use case, there're two possible strategies:
|
||||
|
||||
1. Partial query, full key/value: If the request token length is moderately long (we can afford holding the full key/value tensors), and the goal is to accelerate the prefill (and amortize the computation time of the prefill across query tokens), then we can gather the key/value tensors from all GPUs and let each GPU compute the attention output corresponding to the query tokens of its chunk.
|
||||
2. Partial query, partial key/value: If the request token length is too long, we cannot afford holding the full key/value tensors anymore, then we can only compute one chunk of query/key/value tensors for each GPU, and use techniques like [ring-attention](http://arxiv.org/abs/2310.01889) to send/recv key/value tensors chunk by chunk.
|
||||
|
||||
Both approaches are under active development.
|
||||
|
||||
## Decode Context Parallel
|
||||
|
||||
Due to the auto-regressive nature of decoding, every decoding step needs to compute a small amount of query tokens w.r.t. a large number of key/value tokens stored in the paged KV cache. The core of decode context parallel is how to shard the KV cache across GPUs.
|
||||
|
||||
For a model with `H` kv-heads, a request with `T` tokens in the context needs to store `H * T` key/value tensors in the KV cache.
|
||||
|
||||
1. If one GPU can hold them all, and the performance is good enough, then no parallelization is needed.
|
||||
2. If one GPU cannot hold them all, or we want to hold more requests in the KV cache, we can first shard the KV cache along the `H` dimension, that's the plain tensor parallel sharding. It's as simple as adding `-tp <num_gpus>` to the command line.
|
||||
3. Since `H` is limited (determined by the model architecture), when we continue to increase the tensor parallel size, the KV cache for each GPU will be duplicated for `tp_size / H` times. Of course, duplication is not good for efficiency. Then we need to add decode context parallel to further shard the KV cache along the `T` dimension. This is as simple as adding `-dcp <size>` to the command line. Note that `size` does not increase the number of GPUs we need to launch, but just reduces the KV cache duplication. The dcp size should lie in the range of `[1, tp_size/H]`. With larger dcp size, the KV cache duplication is reduced, but the communication overhead increases.
|
||||
|
||||
Theoretically, it is possible to extend the dcp size beyond `tp_size / H` to further shard the KV cache and accelerate the decoding phase. However, since the number of query tokens is limited in decoding, it's unclear what should we do for the remaining `dcp_size - tp_size / H` GPUs for non-attention layers. For the sake of simplicity, dcp size is upper bounded by `tp_size / H`. If you want to further accelerate the decoding phase, you can consider increasing the `tp_size` first, and then increasing the dcp size.
|
||||
|
||||
Note that kv cache can grow during decoding, and the sharding strategy needs to be carefully implemented. We use an interleaving strategy to shard the KV cache along the `T` dimension, so that kv cache for future tokens can be naturally sharded along the `T` dimension. This is proposed by [Chao Hong from Moonshot](https://github.com/youzhedian), and also explained in details in [this paper](http://arxiv.org/abs/2507.07120).
|
||||
|
||||
Case study:
|
||||
|
||||
For DeepSeek-R1, we have 1 kv-head when MLA is enabled. The typical single-node deployment with `-tp 8` causes 8x KV cache duplication. We can consider adding `-dcp 8` to reduce the KV cache duplication.
|
||||
|
||||
For Kimi-K2, the architecture is similar to DeepSeek-R1, but with more parameters. When we deploy it with `-tp 16`, the KV cache duplication is 16x. We can add `-dcp 16` to completely remove the KV cache duplication, at the cost of more communication overhead. We can also add `-dcp 8` to reduce the KV cache duplication to 2x. Although it still duplicates the KV cache twice, the communication overhead is smaller since the DCP communication only happens inside one node.
|
||||
|
||||
For Qwen3-235B-A22B, we have 4 kv-heads. When we deploy it with `-tp 8`, the KV cache duplication is 2x. Then we can add `-dcp 2` to remove the KV cache duplication.
|
||||
|
||||
In short, for decode context parallel, try to increase `-tp` size until you get satisfactory performance, and then add `-dcp` to reduce the KV cache duplication.
|
||||
|
||||
Decode context parallel is supported in vLLM, for both MLA and GQA models. Some attention backends also support the combination of decode context parallel and MTP (multi-token prediction) to further accelerate the decoding phase.
|
||||
|
||||
## Technical Discussions
|
||||
|
||||
The main discussions happen in the `#sig-context-parallel` channel of [vLLM Slack](https://slack.vllm.ai/).
|
||||
@ -243,10 +243,10 @@ try:
|
||||
"remote_engine_id": None, # Will be populated by vLLM
|
||||
"remote_block_ids": None, # Will be populated by vLLM
|
||||
"remote_host": None, # Will be populated by vLLM
|
||||
"remote_port": None # Will be populated by vLLM
|
||||
"remote_port": None, # Will be populated by vLLM
|
||||
}
|
||||
},
|
||||
extra_headers={"X-Request-Id": request_id}
|
||||
extra_headers={"X-Request-Id": request_id},
|
||||
)
|
||||
|
||||
print("-" * 50)
|
||||
@ -262,7 +262,7 @@ try:
|
||||
extra_body={
|
||||
"kv_transfer_params": prefill_response.kv_transfer_params # Pass KV cache info
|
||||
},
|
||||
extra_headers={"X-Request-Id": request_id} # Same request ID
|
||||
extra_headers={"X-Request-Id": request_id}, # Same request ID
|
||||
)
|
||||
|
||||
print("-" * 50)
|
||||
|
||||
@ -15,13 +15,15 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`
|
||||
```python
|
||||
from langchain_community.llms import VLLM
|
||||
|
||||
llm = VLLM(model="mosaicml/mpt-7b",
|
||||
trust_remote_code=True, # mandatory for hf models
|
||||
max_new_tokens=128,
|
||||
top_k=10,
|
||||
top_p=0.95,
|
||||
temperature=0.8,
|
||||
# tensor_parallel_size=... # for distributed inference
|
||||
llm = VLLM(
|
||||
model="mosaicml/mpt-7b",
|
||||
trust_remote_code=True, # mandatory for hf models
|
||||
max_new_tokens=128,
|
||||
top_k=10,
|
||||
top_p=0.95,
|
||||
temperature=0.8,
|
||||
# for distributed inference
|
||||
# tensor_parallel_size=...,
|
||||
)
|
||||
|
||||
print(llm("What is the capital of France ?"))
|
||||
|
||||
@ -24,8 +24,8 @@ To call the server, in your preferred text editor, create a script that uses an
|
||||
completion = client.chat.completions.create(
|
||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||
messages=[
|
||||
{"role": "user", "content": "Hello!"}
|
||||
]
|
||||
{"role": "user", "content": "Hello!"},
|
||||
],
|
||||
)
|
||||
|
||||
print(completion.choices[0].message)
|
||||
@ -101,8 +101,13 @@ both a `type` and a `text` field. An example is provided below:
|
||||
completion = client.chat.completions.create(
|
||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||
messages=[
|
||||
{"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]}
|
||||
]
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"},
|
||||
],
|
||||
},
|
||||
],
|
||||
)
|
||||
```
|
||||
|
||||
@ -130,11 +135,11 @@ Or directly merge them into the JSON payload if you are using HTTP call directly
|
||||
completion = client.chat.completions.create(
|
||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||
messages=[
|
||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"},
|
||||
],
|
||||
extra_body={
|
||||
"structured_outputs": {"choice": ["positive", "negative"]}
|
||||
}
|
||||
"structured_outputs": {"choice": ["positive", "negative"]},
|
||||
},
|
||||
)
|
||||
```
|
||||
|
||||
@ -149,11 +154,11 @@ with `--enable-request-id-headers`.
|
||||
completion = client.chat.completions.create(
|
||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||
messages=[
|
||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"},
|
||||
],
|
||||
extra_headers={
|
||||
"x-request-id": "sentiment-classification-00001",
|
||||
}
|
||||
},
|
||||
)
|
||||
print(completion._request_id)
|
||||
|
||||
@ -162,7 +167,7 @@ with `--enable-request-id-headers`.
|
||||
prompt="A robot may not injure a human being",
|
||||
extra_headers={
|
||||
"x-request-id": "completion-test",
|
||||
}
|
||||
},
|
||||
)
|
||||
print(completion._request_id)
|
||||
```
|
||||
@ -403,7 +408,7 @@ The Transcriptions API supports uploading audio files in various formats includi
|
||||
model="openai/whisper-large-v3-turbo",
|
||||
file=audio_file,
|
||||
language="en",
|
||||
response_format="verbose_json"
|
||||
response_format="verbose_json",
|
||||
)
|
||||
|
||||
print(transcription.text)
|
||||
@ -812,22 +817,22 @@ You can pass multi-modal inputs to scoring models by passing `content` including
|
||||
"model": "jinaai/jina-reranker-m0",
|
||||
"text_1": "slm markdown",
|
||||
"text_2": {
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
|
||||
},
|
||||
},
|
||||
]
|
||||
}
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
response_json = response.json()
|
||||
|
||||
@ -95,7 +95,7 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--compilation-config",
|
||||
type=int,
|
||||
help=("Compilation optimization (O) level 0-3."),
|
||||
help=("Compilation optimization (O) mode 0-3."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--quantization",
|
||||
|
||||
@ -152,7 +152,9 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
|
||||
"""
|
||||
try:
|
||||
url = s3_client.generate_presigned_url(
|
||||
ClientMethod=client_method, Params=method_parameters, ExpiresIn=expires_in
|
||||
ClientMethod=client_method,
|
||||
Params=method_parameters,
|
||||
ExpiresIn=expires_in,
|
||||
)
|
||||
except ClientError:
|
||||
raise
|
||||
@ -161,10 +163,16 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
|
||||
|
||||
s3_client = boto3.client("s3")
|
||||
input_url = generate_presigned_url(
|
||||
s3_client, "get_object", {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, 3600
|
||||
s3_client,
|
||||
"get_object",
|
||||
{"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"},
|
||||
expires_in=3600,
|
||||
)
|
||||
output_url = generate_presigned_url(
|
||||
s3_client, "put_object", {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, 3600
|
||||
s3_client,
|
||||
"put_object",
|
||||
{"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"},
|
||||
expires_in=3600,
|
||||
)
|
||||
print(f"{input_url=}")
|
||||
print(f"{output_url=}")
|
||||
|
||||
@ -26,6 +26,12 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
|
||||
python examples/offline_inference/pooling/embed_matryoshka_fy.py
|
||||
```
|
||||
|
||||
## Multi vector retrieval usage
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/pooling/multi_vector_retrieval.py
|
||||
```
|
||||
|
||||
## Named Entity Recognition (NER) usage
|
||||
|
||||
```bash
|
||||
|
||||
56
examples/offline_inference/pooling/multi_vector_retrieval.py
Normal file
56
examples/offline_inference/pooling/multi_vector_retrieval.py
Normal file
@ -0,0 +1,56 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from argparse import Namespace
|
||||
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = FlexibleArgumentParser()
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
# Set example specific arguments
|
||||
parser.set_defaults(
|
||||
model="BAAI/bge-m3",
|
||||
runner="pooling",
|
||||
enforce_eager=True,
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args: Namespace):
|
||||
# Sample prompts.
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
# Create an LLM.
|
||||
# You should pass runner="pooling" for embedding models
|
||||
llm = LLM(**vars(args))
|
||||
|
||||
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
|
||||
outputs = llm.embed(prompts)
|
||||
|
||||
# Print the outputs.
|
||||
print("\nGenerated Outputs:\n" + "-" * 60)
|
||||
for prompt, output in zip(prompts, outputs):
|
||||
embeds = output.outputs.embedding
|
||||
print(len(embeds))
|
||||
|
||||
# Generate embedding for each token. The output is a list of PoolingRequestOutput.
|
||||
outputs = llm.encode(prompts, pooling_task="token_embed")
|
||||
|
||||
# Print the outputs.
|
||||
print("\nGenerated Outputs:\n" + "-" * 60)
|
||||
for prompt, output in zip(prompts, outputs):
|
||||
multi_vector = output.outputs.data
|
||||
print(multi_vector.shape)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
@ -40,7 +40,7 @@ def main():
|
||||
model_impl="terratorch",
|
||||
)
|
||||
|
||||
pooling_params = PoolingParams(task="encode", softmax=False)
|
||||
pooling_params = PoolingParams(task="token_classify", activation=False)
|
||||
pooler_output = llm.encode(
|
||||
img_prompt,
|
||||
pooling_params=pooling_params,
|
||||
|
||||
@ -18,6 +18,12 @@ python examples/online_serving/pooling/embedding_embed_dtype_client.py
|
||||
python examples/online_serving/pooling/jinaai_rerank_client.py
|
||||
```
|
||||
|
||||
## Multi vector retrieval usage
|
||||
|
||||
```bash
|
||||
python examples/online_serving/pooling/multi_vector_retrieval_client.py
|
||||
```
|
||||
|
||||
## Named Entity Recognition (NER) usage
|
||||
|
||||
```bash
|
||||
|
||||
@ -0,0 +1,54 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Example online usage of Pooling API for multi vector retrieval.
|
||||
|
||||
Run `vllm serve <model> --runner pooling`
|
||||
to start up the server in vLLM. e.g.
|
||||
|
||||
vllm serve BAAI/bge-m3
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
||||
import requests
|
||||
import torch
|
||||
|
||||
|
||||
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
|
||||
headers = {"User-Agent": "Test Client"}
|
||||
response = requests.post(api_url, headers=headers, json=prompt)
|
||||
return response
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--host", type=str, default="localhost")
|
||||
parser.add_argument("--port", type=int, default=8000)
|
||||
parser.add_argument("--model", type=str, default="BAAI/bge-m3")
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args):
|
||||
api_url = f"http://{args.host}:{args.port}/pooling"
|
||||
model_name = args.model
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
prompt = {"model": model_name, "input": prompts}
|
||||
|
||||
pooling_response = post_http_request(prompt=prompt, api_url=api_url)
|
||||
for output in pooling_response.json()["data"]:
|
||||
multi_vector = torch.tensor(output["data"])
|
||||
print(multi_vector.shape)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
@ -84,7 +84,7 @@ directly to load models:
|
||||
from vllm import LLM
|
||||
llm = LLM(
|
||||
"s3://my-bucket/vllm/facebook/opt-125m/v1",
|
||||
load_format="tensorizer"
|
||||
load_format="tensorizer",
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
@ -107,7 +107,6 @@ markers = [
|
||||
"distributed: run this test only in distributed GPU tests",
|
||||
"skip_v1: do not run this test with v1",
|
||||
"optional: optional tests that are automatically skipped, include --optional to run them",
|
||||
"extra_server_args: extra arguments to pass to the server fixture",
|
||||
]
|
||||
|
||||
[tool.ty.src]
|
||||
|
||||
@ -7,7 +7,7 @@ requests >= 2.26.0
|
||||
tqdm
|
||||
blake3
|
||||
py-cpuinfo
|
||||
transformers >= 4.55.2
|
||||
transformers >= 4.56.0
|
||||
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
|
||||
protobuf # Required by LlamaTokenizer.
|
||||
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
|
||||
|
||||
@ -11,6 +11,7 @@ from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import CompilationConfig
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
@ -32,13 +33,13 @@ def temporary_environ(env_vars):
|
||||
os.environ[k] = v
|
||||
|
||||
|
||||
test_params_full_cudagraph = []
|
||||
model_backends_full_cudagraph = []
|
||||
|
||||
# deepseek-ai/DeepSeek-V2-Lite with MLA
|
||||
MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"]
|
||||
for mla_backend in MLA_backends:
|
||||
test_params_full_cudagraph.append(
|
||||
pytest.param(("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend]))
|
||||
model_backends_full_cudagraph.append(
|
||||
("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend])
|
||||
)
|
||||
|
||||
# Qwen/Qwen2-1.5B-Instruct with other backends
|
||||
@ -46,14 +47,18 @@ other_backend_configs = [
|
||||
backend_configs[c] for c in backend_configs if c not in MLA_backends
|
||||
]
|
||||
for backend_config in other_backend_configs:
|
||||
test_params_full_cudagraph.append(
|
||||
pytest.param(("Qwen/Qwen2-1.5B-Instruct", backend_config))
|
||||
)
|
||||
model_backends_full_cudagraph.append(("Qwen/Qwen2-1.5B-Instruct", backend_config))
|
||||
|
||||
|
||||
@pytest.fixture(scope="class")
|
||||
def llm_pair(request):
|
||||
model, backend_config = request.param
|
||||
model, backend_config, use_inductor_graph_partition = request.param
|
||||
backend_config.comp_config["use_inductor_graph_partition"] = (
|
||||
use_inductor_graph_partition
|
||||
)
|
||||
|
||||
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("Inductor graph partition only supported in torch>=2.9")
|
||||
|
||||
# Dynamically skip test if GPU capability is not met
|
||||
if (
|
||||
@ -104,7 +109,15 @@ def llm_pair(request):
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("llm_pair", test_params_full_cudagraph, indirect=True)
|
||||
@pytest.mark.parametrize(
|
||||
"llm_pair",
|
||||
[
|
||||
pytest.param((model, backend_config, use_inductor_graph_partition))
|
||||
for model, backend_config in model_backends_full_cudagraph
|
||||
for use_inductor_graph_partition in [True, False]
|
||||
],
|
||||
indirect=True,
|
||||
)
|
||||
class TestFullCUDAGraph:
|
||||
"""
|
||||
Use a class such that an llm pair is constructed once for all
|
||||
|
||||
@ -5,6 +5,7 @@ Test (piecewise) compilation with a simple model where multiple submodules
|
||||
are compiled and graph captured separately.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
@ -13,12 +14,13 @@ from vllm.compilation.counter import compilation_counter
|
||||
from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile
|
||||
from vllm.config import (
|
||||
CompilationConfig,
|
||||
CompilationLevel,
|
||||
CompilationMode,
|
||||
CUDAGraphMode,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
|
||||
# This import automatically registers `torch.ops.silly.attention`
|
||||
from .. import silly_attention # noqa: F401
|
||||
@ -190,16 +192,21 @@ def run_model(
|
||||
return output.cpu()
|
||||
|
||||
|
||||
def test_multi_graph_piecewise_compile_outputs_equal():
|
||||
@pytest.mark.parametrize("use_inductor_graph_partition", [False, True])
|
||||
def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
|
||||
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
||||
|
||||
outputs = []
|
||||
|
||||
# piecewise compile
|
||||
# vllmcompile compile
|
||||
vllm_config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
use_cudagraph=True,
|
||||
splitting_ops=["silly::attention"],
|
||||
cudagraph_capture_sizes=[1, 2],
|
||||
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||
)
|
||||
)
|
||||
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
||||
@ -220,23 +227,31 @@ def test_multi_graph_piecewise_compile_outputs_equal():
|
||||
# static tensor addresses
|
||||
inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda()
|
||||
|
||||
with compilation_counter.expect(
|
||||
num_graphs_seen=2, # two graphs for the model
|
||||
num_piecewise_graphs_seen=6,
|
||||
if use_inductor_graph_partition:
|
||||
# Splitting happens at Inductor lowering level,
|
||||
# total piecewise fx graphs is equal to total graphs
|
||||
num_piecewise_fx = 2
|
||||
num_piecewise_capturable_fx = 2
|
||||
else:
|
||||
# attn_one, attn_two each has 3 piecewise graphs
|
||||
# (pre attn, post attn, silly_attention) each
|
||||
num_piecewise_capturable_graphs_seen=4,
|
||||
num_piecewise_fx = 6
|
||||
# attn_one, attn_two has pre attn and post attn each, total=4
|
||||
num_backend_compilations=4, # num_piecewise_capturable_graphs_seen
|
||||
num_cudagraph_captured=8,
|
||||
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||
num_piecewise_capturable_fx = 4
|
||||
|
||||
with compilation_counter.expect(
|
||||
num_graphs_seen=2, # two graphs for the model
|
||||
num_piecewise_graphs_seen=num_piecewise_fx,
|
||||
num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
|
||||
num_backend_compilations=num_piecewise_capturable_fx,
|
||||
num_cudagraph_captured=8, # num_cudagraph_sizes * num_partitions
|
||||
):
|
||||
outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
|
||||
|
||||
# no compile or cudagraph
|
||||
vllm_config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.NO_COMPILATION,
|
||||
mode=CompilationMode.NONE,
|
||||
)
|
||||
)
|
||||
cudagraph_runtime_mode = CUDAGraphMode.NONE
|
||||
@ -265,9 +280,10 @@ def test_multi_graph_piecewise_compile_outputs_equal():
|
||||
# piecewise compile without CUDA graph
|
||||
vllm_config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
use_cudagraph=False,
|
||||
splitting_ops=["silly::attention"],
|
||||
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||
)
|
||||
)
|
||||
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
||||
@ -286,9 +302,9 @@ def test_multi_graph_piecewise_compile_outputs_equal():
|
||||
|
||||
with compilation_counter.expect(
|
||||
num_graphs_seen=2,
|
||||
num_piecewise_graphs_seen=6,
|
||||
num_piecewise_capturable_graphs_seen=4,
|
||||
num_backend_compilations=4,
|
||||
num_piecewise_graphs_seen=num_piecewise_fx,
|
||||
num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
|
||||
num_backend_compilations=num_piecewise_capturable_fx,
|
||||
num_cudagraph_captured=0, # no cudagraph captured
|
||||
):
|
||||
outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
|
||||
|
||||
@ -13,7 +13,7 @@ from vllm.compilation.counter import compilation_counter
|
||||
from vllm.compilation.decorators import support_torch_compile
|
||||
from vllm.config import (
|
||||
CompilationConfig,
|
||||
CompilationLevel,
|
||||
CompilationMode,
|
||||
CUDAGraphMode,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
@ -61,7 +61,7 @@ def _run_simple_model(
|
||||
):
|
||||
vllm_config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
use_cudagraph=True,
|
||||
use_inductor=use_inductor,
|
||||
splitting_ops=splitting_ops,
|
||||
|
||||
@ -9,6 +9,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are
|
||||
initialized randomly with a fixed seed.
|
||||
"""
|
||||
|
||||
from copy import deepcopy
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
@ -20,12 +21,13 @@ from vllm.compilation.counter import compilation_counter
|
||||
from vllm.compilation.decorators import support_torch_compile
|
||||
from vllm.config import (
|
||||
CompilationConfig,
|
||||
CompilationLevel,
|
||||
CompilationMode,
|
||||
CUDAGraphMode,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
|
||||
# This import automatically registers `torch.ops.silly.attention`
|
||||
from .. import silly_attention # noqa: F401
|
||||
@ -257,27 +259,13 @@ def tractable_computation(
|
||||
|
||||
|
||||
@torch.inference_mode
|
||||
def run_model(
|
||||
llama_config, use_compile: bool, backend: str, split_attn: bool = False
|
||||
) -> torch.Tensor:
|
||||
if use_compile:
|
||||
compilation_config = CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
use_cudagraph=True,
|
||||
backend=backend,
|
||||
cudagraph_capture_sizes=[1, 2],
|
||||
)
|
||||
if split_attn:
|
||||
compilation_config.splitting_ops = ["silly::attention"]
|
||||
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
||||
else:
|
||||
compilation_config = CompilationConfig(
|
||||
level=CompilationLevel.NO_COMPILATION,
|
||||
)
|
||||
cudagraph_runtime_mode = CUDAGraphMode.NONE
|
||||
def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor:
|
||||
# Start with a fresh copy to make sure there's no cache dir sharing
|
||||
compile_config = deepcopy(compile_config)
|
||||
cudagraph_runtime_mode = compile_config.cudagraph_mode
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
compilation_config=compilation_config, additional_config=llama_config
|
||||
compilation_config=compile_config, additional_config=llama_config
|
||||
)
|
||||
with set_current_vllm_config(vllm_config):
|
||||
model = (
|
||||
@ -338,8 +326,25 @@ def run_model(
|
||||
return output.cpu()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("backend", ["inductor", "eager"])
|
||||
def test_toy_llama(backend: str):
|
||||
@pytest.mark.parametrize(
|
||||
"backend, use_inductor_graph_partition",
|
||||
[
|
||||
("eager", False), # No inductor
|
||||
("inductor", False), # Inductor, Dynamo partition
|
||||
("inductor", True), # Inductor, Inductor partition
|
||||
],
|
||||
)
|
||||
def test_toy_llama(
|
||||
backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path
|
||||
):
|
||||
# We disable the vLLM compile cache into a new tmp dir for 2 reasons:
|
||||
# 1. To make sure we can properly track the number of Inductor compilations.
|
||||
# 2. Inductor partitioning does not play nicely with Autograd cache (below)
|
||||
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
|
||||
|
||||
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("Inductor graph partition only supported in torch>=2.9")
|
||||
|
||||
# compare output with and without piecewise compilation
|
||||
|
||||
llama_config = LlamaConfig(
|
||||
@ -350,6 +355,32 @@ def test_toy_llama(backend: str):
|
||||
hidden_size=128, mlp_size=256, vocab_size=128, num_layers=2, tractable_init=True
|
||||
)
|
||||
|
||||
compile_config_no_compile = CompilationConfig(
|
||||
level=CompilationMode.NONE,
|
||||
cudagraph_mode=CUDAGraphMode.NONE,
|
||||
backend="eager",
|
||||
)
|
||||
|
||||
compile_config_no_split = CompilationConfig(
|
||||
level=CompilationMode.VLLM_COMPILE,
|
||||
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||
backend=backend,
|
||||
cudagraph_capture_sizes=[1, 2],
|
||||
)
|
||||
|
||||
# FIXME(luka/boyuan): the graph from the previous test case
|
||||
# (no inductor partition) gets cached by AotAutograd so then the
|
||||
# compilation with inductor partitioning incorrectly loads an unpartitioned
|
||||
# graph and never partitions. I think this is a bug with custom inductor
|
||||
# partitioning but does not affect vLLM more generally as vLLM uses its own
|
||||
# cache (which takes inductor partitioning into account).
|
||||
if use_inductor_graph_partition:
|
||||
compile_config_no_split.inductor_compile_config["force_disable_caches"] = True
|
||||
|
||||
compile_config_split = deepcopy(compile_config_no_split)
|
||||
compile_config_split.splitting_ops = ["silly::attention"]
|
||||
|
||||
outputs = []
|
||||
with compilation_counter.expect(
|
||||
num_graphs_seen=0,
|
||||
@ -358,8 +389,9 @@ def test_toy_llama(backend: str):
|
||||
num_backend_compilations=0,
|
||||
num_cudagraph_captured=0,
|
||||
):
|
||||
outputs.append(run_model(llama_config, backend="eager", use_compile=False))
|
||||
run_model(tractable_config, backend="eager", use_compile=False)
|
||||
outputs.append(run_model(llama_config, compile_config_no_compile))
|
||||
|
||||
run_model(tractable_config, compile_config_no_compile)
|
||||
|
||||
if backend == "inductor":
|
||||
kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0}
|
||||
@ -367,35 +399,34 @@ def test_toy_llama(backend: str):
|
||||
kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
|
||||
|
||||
with compilation_counter.expect(
|
||||
# One graph for the model
|
||||
num_graphs_seen=1,
|
||||
num_graphs_seen=1, # one graph for the model
|
||||
num_piecewise_graphs_seen=1,
|
||||
num_piecewise_capturable_graphs_seen=1,
|
||||
# num_piecewise_capturable_graphs_seen
|
||||
num_backend_compilations=1,
|
||||
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||
num_backend_compilations=1, # num_piecewise_capturable_graphs_seen
|
||||
num_cudagraph_captured=2,
|
||||
**kwargs,
|
||||
):
|
||||
outputs.append(run_model(llama_config, backend=backend, use_compile=True))
|
||||
run_model(tractable_config, backend=backend, use_compile=True)
|
||||
outputs.append(run_model(llama_config, compile_config_no_split))
|
||||
|
||||
run_model(tractable_config, compile_config_no_split)
|
||||
|
||||
if use_inductor_graph_partition:
|
||||
num_piecewise_fx = 1
|
||||
num_piecewise_capturable_fx = 1
|
||||
else:
|
||||
num_piecewise_fx = 2 * llama_config.num_layers + 1
|
||||
num_piecewise_capturable_fx = 1 + llama_config.num_layers
|
||||
|
||||
with compilation_counter.expect(
|
||||
num_graphs_seen=1, # one graph for the model
|
||||
num_piecewise_graphs_seen=2 * llama_config.num_layers + 1, # 2 * num_layers + 1
|
||||
num_piecewise_capturable_graphs_seen=1
|
||||
+ llama_config.num_layers, # 1 + num_layers
|
||||
num_backend_compilations=1
|
||||
+ llama_config.num_layers, # num_piecewise_capturable_graphs_seen
|
||||
num_cudagraph_captured=2
|
||||
* (
|
||||
1 + llama_config.num_layers
|
||||
), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||
num_piecewise_graphs_seen=num_piecewise_fx,
|
||||
num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
|
||||
num_backend_compilations=num_piecewise_capturable_fx,
|
||||
# num_cudagraph_sizes * num_partitions
|
||||
num_cudagraph_captured=2 * (1 + llama_config.num_layers),
|
||||
):
|
||||
outputs.append(
|
||||
run_model(llama_config, backend=backend, use_compile=True, split_attn=True)
|
||||
)
|
||||
run_model(tractable_config, backend=backend, use_compile=True, split_attn=True)
|
||||
outputs.append(run_model(llama_config, compile_config_split))
|
||||
run_model(tractable_config, compile_config_split)
|
||||
|
||||
for i in range(1, len(outputs)):
|
||||
assert torch.allclose(outputs[0], outputs[i])
|
||||
@ -427,14 +458,14 @@ def benchmark():
|
||||
for piecewise in [False, True]:
|
||||
if piecewise:
|
||||
compilation_config = CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
use_cudagraph=True,
|
||||
splitting_ops=["silly::attention"],
|
||||
cudagraph_capture_sizes=cudagraph_sizes,
|
||||
)
|
||||
else:
|
||||
compilation_config = CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
cudagraph_capture_sizes=cudagraph_sizes,
|
||||
)
|
||||
|
||||
|
||||
@ -62,5 +62,4 @@ direct_register_custom_op(
|
||||
mutates_args=["out"],
|
||||
fake_impl=silly_attention_fake,
|
||||
target_lib=silly_lib,
|
||||
tags=(torch._C.Tag.cudagraph_unsafe,),
|
||||
)
|
||||
|
||||
@ -10,7 +10,7 @@ import torch
|
||||
from vllm.compilation.decorators import support_torch_compile
|
||||
from vllm.config import (
|
||||
CompilationConfig,
|
||||
CompilationLevel,
|
||||
CompilationMode,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
@ -38,7 +38,7 @@ class CompiledMod(torch.nn.Module):
|
||||
def make_vllm_config() -> VllmConfig:
|
||||
return VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
level=CompilationMode.VLLM_COMPILE,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@ -10,6 +10,7 @@ import vllm.envs as envs
|
||||
from vllm.compilation.collective_fusion import AsyncTPPass
|
||||
from vllm.config import (
|
||||
CompilationConfig,
|
||||
CompilationMode,
|
||||
DeviceConfig,
|
||||
ModelConfig,
|
||||
PassConfig,
|
||||
@ -400,7 +401,7 @@ def test_async_tp_pass_correctness(
|
||||
common_args.append("--enforce-eager")
|
||||
|
||||
compilation_config = {
|
||||
"level": 3,
|
||||
"mode": CompilationMode.VLLM_COMPILE,
|
||||
"compile_sizes": [2, 4, 8],
|
||||
"splitting_ops": [],
|
||||
"pass_config": {"enable_async_tp": async_tp_enabled},
|
||||
|
||||
@ -4,7 +4,7 @@ import dataclasses
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import CompilationLevel
|
||||
from vllm.config import CompilationMode
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
|
||||
from ..utils import compare_all_settings
|
||||
@ -21,7 +21,7 @@ class TestSetting:
|
||||
|
||||
|
||||
# we cannot afford testing the full Cartesian product
|
||||
# of all models and all levels
|
||||
# of all models and all modes
|
||||
@pytest.mark.parametrize(
|
||||
"test_setting",
|
||||
[
|
||||
@ -121,15 +121,13 @@ def test_compile_correctness(
|
||||
all_args: list[list[str]] = []
|
||||
all_envs: list[dict[str, str] | None] = []
|
||||
|
||||
for comp_level in [
|
||||
CompilationLevel.DYNAMO_AS_IS,
|
||||
CompilationLevel.DYNAMO_ONCE,
|
||||
CompilationLevel.PIECEWISE,
|
||||
for comp_mode in [
|
||||
CompilationMode.STOCK_TORCH_COMPILE,
|
||||
CompilationMode.DYNAMO_TRACE_ONCE,
|
||||
CompilationMode.VLLM_COMPILE,
|
||||
]:
|
||||
for level in [CompilationLevel.NO_COMPILATION, comp_level]:
|
||||
all_args.append(
|
||||
final_args + [f"-O.level={level}", "-O.backend=inductor"]
|
||||
)
|
||||
for mode in [CompilationMode.NONE, comp_mode]:
|
||||
all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=inductor"])
|
||||
|
||||
# inductor will change the output, so we only compare if the output
|
||||
# is close, not exactly the same.
|
||||
@ -142,13 +140,13 @@ def test_compile_correctness(
|
||||
all_envs.clear()
|
||||
all_args.clear()
|
||||
|
||||
for level in [
|
||||
CompilationLevel.NO_COMPILATION,
|
||||
CompilationLevel.DYNAMO_AS_IS,
|
||||
CompilationLevel.DYNAMO_ONCE,
|
||||
CompilationLevel.PIECEWISE,
|
||||
for mode in [
|
||||
CompilationMode.NONE,
|
||||
CompilationMode.STOCK_TORCH_COMPILE,
|
||||
CompilationMode.DYNAMO_TRACE_ONCE,
|
||||
CompilationMode.VLLM_COMPILE,
|
||||
]:
|
||||
all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"])
|
||||
all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=eager"])
|
||||
all_envs.append({})
|
||||
all_envs.append({})
|
||||
|
||||
|
||||
@ -4,7 +4,7 @@ import pytest
|
||||
|
||||
from vllm.compilation.counter import compilation_counter
|
||||
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
|
||||
from vllm.config.compilation import CompilationLevel
|
||||
from vllm.config.compilation import CompilationMode
|
||||
from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer
|
||||
|
||||
|
||||
@ -90,16 +90,16 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
|
||||
|
||||
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
|
||||
@pytest.mark.forked
|
||||
def test_dynamo_as_is(vllm_runner, monkeypatch):
|
||||
def test_stock_torch_compile(vllm_runner, monkeypatch):
|
||||
# Disable multiprocessing so that the counter is in the same process
|
||||
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||
|
||||
with (
|
||||
compilation_counter.expect(dynamo_as_is_count=1),
|
||||
compilation_counter.expect(stock_torch_compile_count=1),
|
||||
# loading the model causes compilation (if enabled) to happen
|
||||
vllm_runner(
|
||||
"facebook/opt-125m",
|
||||
compilation_config={"level": 1},
|
||||
compilation_config={"mode": CompilationMode.STOCK_TORCH_COMPILE},
|
||||
gpu_memory_utilization=0.4,
|
||||
) as _,
|
||||
):
|
||||
@ -112,11 +112,11 @@ def test_no_compilation(vllm_runner, monkeypatch):
|
||||
# Disable multiprocessing so that the counter is in the same process
|
||||
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||
with (
|
||||
compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0),
|
||||
compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
|
||||
# loading the model causes compilation (if enabled) to happen
|
||||
vllm_runner(
|
||||
"facebook/opt-125m",
|
||||
compilation_config={"level": 0},
|
||||
compilation_config={"mode": CompilationMode.NONE},
|
||||
gpu_memory_utilization=0.4,
|
||||
) as _,
|
||||
):
|
||||
@ -130,7 +130,7 @@ def test_enforce_eager(vllm_runner, monkeypatch):
|
||||
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||
|
||||
with (
|
||||
compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0),
|
||||
compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
|
||||
# loading the model causes compilation (if enabled) to happen
|
||||
vllm_runner(
|
||||
"facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4
|
||||
@ -151,7 +151,7 @@ def test_splitting_ops_dynamic():
|
||||
if is_torch_equal_or_newer("2.9.0.dev"):
|
||||
config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
level=CompilationMode.VLLM_COMPILE,
|
||||
use_inductor_graph_partition=True,
|
||||
splitting_ops=["vllm::unified_attention"],
|
||||
)
|
||||
@ -163,7 +163,7 @@ def test_splitting_ops_dynamic():
|
||||
# When attn_fusion pass enabled, splitting_ops now default to attention ops.
|
||||
config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
level=CompilationMode.VLLM_COMPILE,
|
||||
pass_config={"enable_attn_fusion": True, "enable_noop": True},
|
||||
custom_ops=["+quant_fp8"],
|
||||
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||
@ -178,7 +178,7 @@ def test_splitting_ops_dynamic():
|
||||
if is_torch_equal_or_newer("2.9.0.dev"):
|
||||
config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
level=CompilationMode.VLLM_COMPILE,
|
||||
use_inductor_graph_partition=True,
|
||||
pass_config={"enable_attn_fusion": True, "enable_noop": True},
|
||||
custom_ops=["+quant_fp8"],
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
@ -8,12 +9,13 @@ from vllm.compilation.decorators import ignore_torch_compile, support_torch_comp
|
||||
from vllm.config import (
|
||||
CacheConfig,
|
||||
CompilationConfig,
|
||||
CompilationLevel,
|
||||
CompilationMode,
|
||||
CUDAGraphMode,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
|
||||
# This import automatically registers `torch.ops.silly.attention`
|
||||
from . import silly_attention # noqa: F401
|
||||
@ -65,18 +67,40 @@ def run_model(
|
||||
return output.cpu()
|
||||
|
||||
|
||||
def test_ignore_torch_compile_decorator():
|
||||
@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
|
||||
def test_ignore_torch_compile_decorator(use_inductor_graph_partition, monkeypatch):
|
||||
# disable compile cache so that we can count the number of compilations
|
||||
# appropriately
|
||||
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
|
||||
|
||||
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
||||
|
||||
# piecewise
|
||||
vllm_config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
use_cudagraph=True,
|
||||
splitting_ops=["silly::attention"],
|
||||
cudagraph_capture_sizes=[1, 2],
|
||||
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||
)
|
||||
)
|
||||
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
||||
|
||||
expected_num_graphs_seen = 1
|
||||
expected_num_cudagraph_captured = (
|
||||
4 # num_cudagraph_sizes * num cudagraphs to capture
|
||||
)
|
||||
if use_inductor_graph_partition:
|
||||
expected_num_piecewise_graphs_seen = 1
|
||||
expected_num_piecewise_capturable_graphs_seen = 1
|
||||
expected_num_backend_compilations = 1
|
||||
else:
|
||||
expected_num_piecewise_graphs_seen = 3
|
||||
expected_num_piecewise_capturable_graphs_seen = 2
|
||||
expected_num_backend_compilations = 2
|
||||
|
||||
@support_torch_compile
|
||||
class A(nn.Module):
|
||||
def __init__(
|
||||
@ -103,12 +127,11 @@ def test_ignore_torch_compile_decorator():
|
||||
|
||||
# A has support_torch_compile
|
||||
with compilation_counter.expect(
|
||||
num_graphs_seen=1,
|
||||
num_piecewise_graphs_seen=3,
|
||||
num_piecewise_capturable_graphs_seen=2,
|
||||
num_backend_compilations=2,
|
||||
num_cudagraph_captured=4,
|
||||
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||
num_graphs_seen=expected_num_graphs_seen,
|
||||
num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
|
||||
num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
|
||||
num_backend_compilations=expected_num_backend_compilations,
|
||||
num_cudagraph_captured=expected_num_cudagraph_captured,
|
||||
):
|
||||
run_model(vllm_config, mod_A, cudagraph_runtime_mode)
|
||||
|
||||
@ -130,12 +153,11 @@ def test_ignore_torch_compile_decorator():
|
||||
|
||||
# C's support_torch_compile should override B's ignore_torch_compile
|
||||
with compilation_counter.expect(
|
||||
num_graphs_seen=1,
|
||||
num_piecewise_graphs_seen=3,
|
||||
num_piecewise_capturable_graphs_seen=2,
|
||||
num_backend_compilations=2,
|
||||
num_cudagraph_captured=4,
|
||||
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||
num_graphs_seen=expected_num_graphs_seen,
|
||||
num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
|
||||
num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
|
||||
num_backend_compilations=expected_num_backend_compilations,
|
||||
num_cudagraph_captured=expected_num_cudagraph_captured,
|
||||
):
|
||||
run_model(vllm_config, mod_C, cudagraph_runtime_mode)
|
||||
|
||||
@ -178,16 +200,25 @@ class A(nn.Module):
|
||||
return x
|
||||
|
||||
|
||||
def test_conditional_compile_enable_if():
|
||||
@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
|
||||
def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch):
|
||||
# disable compile cache so that we can count the number of compilations
|
||||
# appropriately
|
||||
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
|
||||
|
||||
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
cache_config=CacheConfig(
|
||||
kv_sharing_fast_prefill=True,
|
||||
),
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
use_cudagraph=True,
|
||||
splitting_ops=["silly::attention"],
|
||||
cudagraph_capture_sizes=[1, 2],
|
||||
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||
),
|
||||
)
|
||||
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
||||
@ -195,17 +226,26 @@ def test_conditional_compile_enable_if():
|
||||
with set_current_vllm_config(vllm_config):
|
||||
mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda()
|
||||
|
||||
if use_inductor_graph_partition:
|
||||
expected_num_piecewise_graphs_seen = 2
|
||||
expected_num_piecewise_capturable_graphs_seen = 2
|
||||
expected_num_backend_compilations = 2
|
||||
else:
|
||||
expected_num_piecewise_graphs_seen = 6
|
||||
expected_num_piecewise_capturable_graphs_seen = 4
|
||||
expected_num_backend_compilations = 4
|
||||
|
||||
# A has support_torch_compile but enable_if fn returns False
|
||||
# enalbe_if will be True for B, so we expect mod1 and mod2
|
||||
# to be compiled
|
||||
with compilation_counter.expect(
|
||||
num_graphs_seen=2,
|
||||
num_piecewise_graphs_seen=6,
|
||||
num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
|
||||
# 3 piecewise graphs per instance of B()
|
||||
num_piecewise_capturable_graphs_seen=4,
|
||||
num_backend_compilations=4,
|
||||
num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
|
||||
num_backend_compilations=expected_num_backend_compilations,
|
||||
num_cudagraph_captured=8,
|
||||
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||
# num_cudagraph_sizes * num cudagraphable graphs to capture
|
||||
):
|
||||
run_model(vllm_config, mod_A, cudagraph_runtime_mode)
|
||||
|
||||
@ -216,23 +256,34 @@ def test_conditional_compile_enable_if():
|
||||
kv_sharing_fast_prefill=False,
|
||||
),
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
use_cudagraph=True,
|
||||
splitting_ops=["silly::attention"],
|
||||
cudagraph_capture_sizes=[1, 2],
|
||||
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||
),
|
||||
)
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda()
|
||||
|
||||
if use_inductor_graph_partition:
|
||||
expected_num_piecewise_graphs_seen = 1
|
||||
expected_num_piecewise_capturable_graphs_seen = 1
|
||||
expected_num_backend_compilations = 1
|
||||
else:
|
||||
# 3 attn ops and 4 non-attn ops
|
||||
expected_num_piecewise_graphs_seen = 7
|
||||
expected_num_piecewise_capturable_graphs_seen = 4
|
||||
expected_num_backend_compilations = 4
|
||||
|
||||
with compilation_counter.expect(
|
||||
num_graphs_seen=1,
|
||||
num_piecewise_graphs_seen=7,
|
||||
num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
|
||||
# 3 attn ops and 4 non-attn ops
|
||||
num_piecewise_capturable_graphs_seen=4,
|
||||
num_backend_compilations=4,
|
||||
num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
|
||||
num_backend_compilations=expected_num_backend_compilations,
|
||||
num_cudagraph_captured=8,
|
||||
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||
# num_cudagraph_sizes * num cudagraphable graphs to capture
|
||||
):
|
||||
run_model(vllm_config, mod_A, cudagraph_runtime_mode)
|
||||
|
||||
@ -12,7 +12,7 @@ from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.attention.backends.registry import _Backend
|
||||
from vllm.attention.selector import global_force_attn_backend_context_manager
|
||||
from vllm.config import CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig
|
||||
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
|
||||
@ -80,22 +80,22 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"optimization_level",
|
||||
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
|
||||
"compilation_mode",
|
||||
[CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE],
|
||||
)
|
||||
@pytest.mark.parametrize("model_info", models_list(all=True))
|
||||
@create_new_process_for_each_test()
|
||||
def test_full_graph(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
model_info: tuple[str, dict[str, Any]],
|
||||
optimization_level: int,
|
||||
compilation_mode: int,
|
||||
):
|
||||
model, model_kwargs = model_info
|
||||
|
||||
with monkeypatch.context():
|
||||
print(f"MODEL={model}")
|
||||
|
||||
run_model(optimization_level, model, model_kwargs)
|
||||
run_model(compilation_mode, model, model_kwargs)
|
||||
|
||||
|
||||
# TODO(luka) add other supported compilation config scenarios here
|
||||
@ -104,7 +104,7 @@ def test_full_graph(
|
||||
[
|
||||
# additional compile sizes, only some of the models
|
||||
(
|
||||
CompilationConfig(level=CompilationLevel.PIECEWISE, compile_sizes=[1, 2]),
|
||||
CompilationConfig(mode=CompilationMode.VLLM_COMPILE, compile_sizes=[1, 2]),
|
||||
model,
|
||||
)
|
||||
for model in models_list(all=False)
|
||||
@ -113,7 +113,7 @@ def test_full_graph(
|
||||
# RMSNorm + quant fusion, only 8-bit quant models
|
||||
(
|
||||
CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
custom_ops=["+rms_norm"],
|
||||
pass_config=PassConfig(enable_fusion=True, enable_noop=True),
|
||||
),
|
||||
@ -125,7 +125,8 @@ def test_full_graph(
|
||||
# Test depyf integration works
|
||||
(
|
||||
CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir()
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
debug_dump_path=tempfile.gettempdir(),
|
||||
),
|
||||
("facebook/opt-125m", {}),
|
||||
),
|
||||
@ -134,7 +135,7 @@ def test_full_graph(
|
||||
# graph inductor partition
|
||||
(
|
||||
CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
# inductor graph partition uses
|
||||
# torch._C.Tag.cudagraph_unsafe to specify splitting ops
|
||||
use_inductor_graph_partition=True,
|
||||
@ -164,10 +165,10 @@ def test_custom_compile_config(
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"optimization_level",
|
||||
[CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE],
|
||||
"compilation_mode",
|
||||
[CompilationMode.NONE, CompilationMode.VLLM_COMPILE],
|
||||
)
|
||||
def test_fp8_kv_scale_compile(optimization_level: int):
|
||||
def test_fp8_kv_scale_compile(compilation_mode: int):
|
||||
model = "Qwen/Qwen2-0.5B"
|
||||
model_kwargs = {
|
||||
"quantization": "fp8",
|
||||
@ -175,7 +176,7 @@ def test_fp8_kv_scale_compile(optimization_level: int):
|
||||
"calculate_kv_scales": True,
|
||||
"max_model_len": 512,
|
||||
}
|
||||
run_model(optimization_level, model, model_kwargs)
|
||||
run_model(compilation_mode, model, model_kwargs)
|
||||
|
||||
|
||||
def test_inductor_graph_partition_attn_fusion(caplog_vllm):
|
||||
@ -184,7 +185,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm):
|
||||
|
||||
model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
|
||||
compilation_config = CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
use_inductor_graph_partition=True,
|
||||
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||
custom_ops=["+quant_fp8"],
|
||||
|
||||
@ -13,7 +13,7 @@ from vllm.compilation.fusion import (
|
||||
)
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||
from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig
|
||||
from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
GroupShape,
|
||||
@ -114,7 +114,7 @@ def test_fusion_rmsnorm_quant(
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
custom_ops=["+rms_norm", "+quant_fp8"],
|
||||
pass_config=PassConfig(enable_fusion=True, enable_noop=True),
|
||||
)
|
||||
|
||||
@ -12,7 +12,7 @@ from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||
from vllm.config import (
|
||||
CompilationConfig,
|
||||
CompilationLevel,
|
||||
CompilationMode,
|
||||
DeviceConfig,
|
||||
ModelConfig,
|
||||
PassConfig,
|
||||
@ -219,7 +219,7 @@ def all_reduce_fusion_pass_on_test_model(
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm", "+quant_fp8"]
|
||||
mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"]
|
||||
)
|
||||
)
|
||||
vllm_config.compilation_config.pass_config = PassConfig(
|
||||
|
||||
@ -19,7 +19,7 @@ from vllm.compilation.post_cleanup import PostCleanupPass
|
||||
from vllm.config import (
|
||||
CacheConfig,
|
||||
CompilationConfig,
|
||||
CompilationLevel,
|
||||
CompilationMode,
|
||||
ModelConfig,
|
||||
PassConfig,
|
||||
SchedulerConfig,
|
||||
@ -321,7 +321,7 @@ def test_attention_quant_pattern(
|
||||
),
|
||||
scheduler_config=SchedulerConfig(max_num_seqs=1024),
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
custom_ops=["+quant_fp8"],
|
||||
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||
),
|
||||
@ -421,7 +421,9 @@ def test_attention_quant_pattern(
|
||||
]
|
||||
if any(attn_fusion_supported):
|
||||
# Check quantization ops in the graph before and after fusion
|
||||
test_backend.check_before_ops([QUANT_OPS[quant_key]], fully_replaced=True)
|
||||
# Note: fully_replaced=False because query quant ops remain in graph.
|
||||
# Only output quant ops are fused into attention.
|
||||
test_backend.check_before_ops([QUANT_OPS[quant_key]], fully_replaced=False)
|
||||
|
||||
# access the underlying `AttnFusionPass` on the `LazyInitPass`
|
||||
assert attn_pass.pass_.matched_count == sum(attn_fusion_supported)
|
||||
|
||||
@ -6,7 +6,7 @@ import torch
|
||||
|
||||
import vllm
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig
|
||||
from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig
|
||||
|
||||
from .backend import TestBackend
|
||||
|
||||
@ -50,7 +50,7 @@ def test_noop_elimination(dtype, num_tokens, hidden_size, buffer_size):
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
pass_config=PassConfig(enable_noop=True),
|
||||
)
|
||||
)
|
||||
@ -98,7 +98,7 @@ def test_non_noop_slice_preserved():
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
pass_config=PassConfig(enable_noop=True),
|
||||
)
|
||||
)
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
import torch
|
||||
|
||||
from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
|
||||
from vllm.config import CompilationLevel
|
||||
from vllm.config import CompilationMode
|
||||
|
||||
|
||||
class MyMod(torch.nn.Module):
|
||||
@ -20,7 +20,7 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
|
||||
self.model = model
|
||||
compiled_callable = torch.compile(self.forward, backend="eager")
|
||||
super().__init__(
|
||||
compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE
|
||||
compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
|
||||
|
||||
@ -334,7 +334,7 @@ class HfRunner:
|
||||
trust_remote_code=trust_remote_code,
|
||||
)
|
||||
self.device = self.get_default_device()
|
||||
self.dtype = torch_dtype = _get_and_verify_dtype(
|
||||
self.dtype = dtype = _get_and_verify_dtype(
|
||||
self.model_name,
|
||||
self.config,
|
||||
dtype=dtype,
|
||||
@ -342,7 +342,7 @@ class HfRunner:
|
||||
)
|
||||
|
||||
model_kwargs = model_kwargs if model_kwargs is not None else {}
|
||||
model_kwargs.setdefault("torch_dtype", torch_dtype)
|
||||
model_kwargs.setdefault("dtype", dtype)
|
||||
|
||||
if is_sentence_transformer:
|
||||
# Lazy init required for AMD CI
|
||||
@ -388,7 +388,7 @@ class HfRunner:
|
||||
if not skip_tokenizer_init:
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype=torch_dtype,
|
||||
dtype=dtype,
|
||||
trust_remote_code=trust_remote_code,
|
||||
)
|
||||
|
||||
@ -398,7 +398,7 @@ class HfRunner:
|
||||
|
||||
self.processor = AutoProcessor.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype=torch_dtype,
|
||||
dtype=dtype,
|
||||
trust_remote_code=trust_remote_code,
|
||||
)
|
||||
if skip_tokenizer_init:
|
||||
@ -1011,8 +1011,12 @@ class VllmRunner:
|
||||
req_outputs = self.llm.embed(inputs, *args, **kwargs)
|
||||
return [req_output.outputs.embedding for req_output in req_outputs]
|
||||
|
||||
def encode(self, prompts: list[str]) -> list[list[float]]:
|
||||
req_outputs = self.llm.encode(prompts)
|
||||
def token_embed(self, prompts: list[str]) -> list[list[float]]:
|
||||
req_outputs = self.llm.encode(prompts, pooling_task="token_embed")
|
||||
return [req_output.outputs.data for req_output in req_outputs]
|
||||
|
||||
def token_classify(self, prompts: list[str]) -> list[list[float]]:
|
||||
req_outputs = self.llm.encode(prompts, pooling_task="token_classify")
|
||||
return [req_output.outputs.data for req_output in req_outputs]
|
||||
|
||||
def reward(self, prompts: list[str]) -> list[list[float]]:
|
||||
|
||||
@ -15,6 +15,7 @@ from typing import Literal, NamedTuple
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config.compilation import CompilationMode
|
||||
from vllm.config.model import RunnerOption
|
||||
from vllm.logger import init_logger
|
||||
|
||||
@ -234,7 +235,7 @@ def _compare_sp(
|
||||
common_args.append("--skip-tokenizer-init")
|
||||
|
||||
compilation_config = {
|
||||
"level": 3,
|
||||
"mode": CompilationMode.VLLM_COMPILE,
|
||||
"custom_ops": ["+rms_norm"],
|
||||
"compile_sizes": [4, 8],
|
||||
"pass_config": {
|
||||
|
||||
@ -226,30 +226,30 @@ def test_compilation_config():
|
||||
|
||||
# set to O3
|
||||
args = parser.parse_args(["-O0"])
|
||||
assert args.compilation_config.level == 0
|
||||
assert args.compilation_config.mode == 0
|
||||
|
||||
# set to O 3 (space)
|
||||
args = parser.parse_args(["-O", "1"])
|
||||
assert args.compilation_config.level == 1
|
||||
assert args.compilation_config.mode == 1
|
||||
|
||||
# set to O 3 (equals)
|
||||
args = parser.parse_args(["-O=2"])
|
||||
assert args.compilation_config.level == 2
|
||||
assert args.compilation_config.mode == 2
|
||||
|
||||
# set to O.level 3
|
||||
args = parser.parse_args(["-O.level", "3"])
|
||||
assert args.compilation_config.level == 3
|
||||
# set to O.mode 3
|
||||
args = parser.parse_args(["-O.mode", "3"])
|
||||
assert args.compilation_config.mode == 3
|
||||
|
||||
# set to string form of a dict
|
||||
args = parser.parse_args(
|
||||
[
|
||||
"-O",
|
||||
'{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
|
||||
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
|
||||
'"use_inductor": false}',
|
||||
]
|
||||
)
|
||||
assert (
|
||||
args.compilation_config.level == 3
|
||||
args.compilation_config.mode == 3
|
||||
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
|
||||
and not args.compilation_config.use_inductor
|
||||
)
|
||||
@ -258,12 +258,12 @@ def test_compilation_config():
|
||||
args = parser.parse_args(
|
||||
[
|
||||
"--compilation-config="
|
||||
'{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
|
||||
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
|
||||
'"use_inductor": true}',
|
||||
]
|
||||
)
|
||||
assert (
|
||||
args.compilation_config.level == 3
|
||||
args.compilation_config.mode == 3
|
||||
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
|
||||
and args.compilation_config.use_inductor
|
||||
)
|
||||
|
||||
@ -53,21 +53,34 @@ def base64_encoded_audio() -> dict[str, str]:
|
||||
}
|
||||
|
||||
|
||||
def dummy_messages_from_audio_url(
|
||||
audio_urls: str | list[str],
|
||||
content_text: str = "What's happening in this audio?",
|
||||
):
|
||||
if isinstance(audio_urls, str):
|
||||
audio_urls = [audio_urls]
|
||||
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*(
|
||||
{"type": "audio_url", "audio_url": {"url": audio_url}}
|
||||
for audio_url in audio_urls
|
||||
),
|
||||
{"type": "text", "text": content_text},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
|
||||
async def test_single_chat_session_audio(
|
||||
client: openai.AsyncOpenAI, model_name: str, audio_url: str
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "audio_url", "audio_url": {"url": audio_url}},
|
||||
{"type": "text", "text": "What's happening in this audio?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
messages = dummy_messages_from_audio_url(audio_url)
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
@ -138,20 +151,9 @@ async def test_single_chat_session_audio_base64encoded(
|
||||
audio_url: str,
|
||||
base64_encoded_audio: dict[str, str],
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "audio_url",
|
||||
"audio_url": {
|
||||
"url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" # noqa: E501
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "What's happening in this audio?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
messages = dummy_messages_from_audio_url(
|
||||
f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
|
||||
)
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
@ -252,15 +254,7 @@ async def test_single_chat_session_input_audio(
|
||||
async def test_chat_streaming_audio(
|
||||
client: openai.AsyncOpenAI, model_name: str, audio_url: str
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "audio_url", "audio_url": {"url": audio_url}},
|
||||
{"type": "text", "text": "What's happening in this audio?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
messages = dummy_messages_from_audio_url(audio_url)
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
@ -365,18 +359,7 @@ async def test_chat_streaming_input_audio(
|
||||
async def test_multi_audio_input(
|
||||
client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str]
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*(
|
||||
{"type": "audio_url", "audio_url": {"url": audio_url}}
|
||||
for audio_url in audio_urls
|
||||
),
|
||||
{"type": "text", "text": "What's happening in this audio?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
messages = dummy_messages_from_audio_url(audio_urls)
|
||||
|
||||
if len(audio_urls) > MAXIMUM_AUDIOS:
|
||||
with pytest.raises(openai.BadRequestError): # test multi-audio input
|
||||
|
||||
@ -55,21 +55,34 @@ def base64_encoded_video() -> dict[str, str]:
|
||||
}
|
||||
|
||||
|
||||
def dummy_messages_from_video_url(
|
||||
video_urls: str | list[str],
|
||||
content_text: str = "What's in this video?",
|
||||
):
|
||||
if isinstance(video_urls, str):
|
||||
video_urls = [video_urls]
|
||||
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*(
|
||||
{"type": "video_url", "video_url": {"url": video_url}}
|
||||
for video_url in video_urls
|
||||
),
|
||||
{"type": "text", "text": content_text},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
|
||||
async def test_single_chat_session_video(
|
||||
client: openai.AsyncOpenAI, model_name: str, video_url: str
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video_url", "video_url": {"url": video_url}},
|
||||
{"type": "text", "text": "What's in this video?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
messages = dummy_messages_from_video_url(video_url)
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
@ -137,15 +150,7 @@ async def test_error_on_invalid_video_url_type(
|
||||
async def test_single_chat_session_video_beamsearch(
|
||||
client: openai.AsyncOpenAI, model_name: str, video_url: str
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video_url", "video_url": {"url": video_url}},
|
||||
{"type": "text", "text": "What's in this video?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
messages = dummy_messages_from_video_url(video_url)
|
||||
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
@ -172,20 +177,9 @@ async def test_single_chat_session_video_base64encoded(
|
||||
video_url: str,
|
||||
base64_encoded_video: dict[str, str],
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {
|
||||
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "What's in this video?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
messages = dummy_messages_from_video_url(
|
||||
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
|
||||
)
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
@ -231,20 +225,10 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
|
||||
video_url: str,
|
||||
base64_encoded_video: dict[str, str],
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {
|
||||
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "What's in this video?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
messages = dummy_messages_from_video_url(
|
||||
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
|
||||
)
|
||||
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
@ -265,15 +249,7 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
|
||||
async def test_chat_streaming_video(
|
||||
client: openai.AsyncOpenAI, model_name: str, video_url: str
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video_url", "video_url": {"url": video_url}},
|
||||
{"type": "text", "text": "What's in this video?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
messages = dummy_messages_from_video_url(video_url)
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
@ -318,18 +294,7 @@ async def test_chat_streaming_video(
|
||||
async def test_multi_video_input(
|
||||
client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*(
|
||||
{"type": "video_url", "video_url": {"url": video_url}}
|
||||
for video_url in video_urls
|
||||
),
|
||||
{"type": "text", "text": "What's in this video?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
messages = dummy_messages_from_video_url(video_urls)
|
||||
|
||||
if len(video_urls) > MAXIMUM_VIDEOS:
|
||||
with pytest.raises(openai.BadRequestError): # test multi-video input
|
||||
|
||||
@ -78,6 +78,27 @@ def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
||||
}
|
||||
|
||||
|
||||
def dummy_messages_from_image_url(
|
||||
image_urls: str | list[str],
|
||||
content_text: str = "What's in this image?",
|
||||
):
|
||||
if isinstance(image_urls, str):
|
||||
image_urls = [image_urls]
|
||||
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*(
|
||||
{"type": "image_url", "image_url": {"url": image_url}}
|
||||
for image_url in image_urls
|
||||
),
|
||||
{"type": "text", "text": content_text},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def get_hf_prompt_tokens(model_name, content, image_url):
|
||||
processor = AutoProcessor.from_pretrained(
|
||||
model_name, trust_remote_code=True, num_crops=4
|
||||
@ -107,15 +128,7 @@ async def test_single_chat_session_image(
|
||||
client: openai.AsyncOpenAI, model_name: str, image_url: str
|
||||
):
|
||||
content_text = "What's in this image?"
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{"type": "text", "text": content_text},
|
||||
],
|
||||
}
|
||||
]
|
||||
messages = dummy_messages_from_image_url(image_url, content_text)
|
||||
|
||||
max_completion_tokens = 10
|
||||
# test single completion
|
||||
@ -188,15 +201,8 @@ async def test_error_on_invalid_image_url_type(
|
||||
async def test_single_chat_session_image_beamsearch(
|
||||
client: openai.AsyncOpenAI, model_name: str, image_url: str
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{"type": "text", "text": "What's in this image?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
content_text = "What's in this image?"
|
||||
messages = dummy_messages_from_image_url(image_url, content_text)
|
||||
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
@ -226,20 +232,10 @@ async def test_single_chat_session_image_base64encoded(
|
||||
base64_encoded_image: dict[str, str],
|
||||
):
|
||||
content_text = "What's in this image?"
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": content_text},
|
||||
],
|
||||
}
|
||||
]
|
||||
messages = dummy_messages_from_image_url(
|
||||
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
|
||||
content_text,
|
||||
)
|
||||
|
||||
max_completion_tokens = 10
|
||||
# test single completion
|
||||
@ -293,20 +289,10 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
|
||||
raw_image_url = TEST_IMAGE_ASSETS[image_idx]
|
||||
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "What's in this image?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
messages = dummy_messages_from_image_url(
|
||||
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
|
||||
)
|
||||
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
@ -326,15 +312,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
|
||||
async def test_chat_streaming_image(
|
||||
client: openai.AsyncOpenAI, model_name: str, image_url: str
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{"type": "text", "text": "What's in this image?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
messages = dummy_messages_from_image_url(image_url)
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
@ -381,18 +359,7 @@ async def test_chat_streaming_image(
|
||||
async def test_multi_image_input(
|
||||
client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*(
|
||||
{"type": "image_url", "image_url": {"url": image_url}}
|
||||
for image_url in image_urls
|
||||
),
|
||||
{"type": "text", "text": "What's in this image?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
messages = dummy_messages_from_image_url(image_urls)
|
||||
|
||||
if len(image_urls) > MAXIMUM_IMAGES:
|
||||
with pytest.raises(openai.BadRequestError): # test multi-image input
|
||||
|
||||
243
tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
Normal file
243
tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
Normal file
@ -0,0 +1,243 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.entrypoints.openai.tool_parsers.utils import (
|
||||
run_tool_extraction,
|
||||
run_tool_extraction_streaming,
|
||||
)
|
||||
from vllm.entrypoints.openai.protocol import FunctionCall
|
||||
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
|
||||
|
||||
# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
|
||||
SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"
|
||||
SIMPLE_FUNCTION_CALL = FunctionCall(
|
||||
name="get_weather",
|
||||
arguments='{"city": "San Francisco", "metric": "celsius"}',
|
||||
)
|
||||
MORE_TYPES_FUNCTION_OUTPUT = (
|
||||
"register_user(name='John Doe', "
|
||||
"age=37, "
|
||||
"address={'city': 'San Francisco', 'state': 'CA'}, "
|
||||
"role=None, "
|
||||
"passed_test=True, "
|
||||
"aliases=['John', 'Johnny'])"
|
||||
)
|
||||
MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS = (
|
||||
"register_user(name='John Doe', "
|
||||
"age=37, "
|
||||
"address={'city': 'San Francisco', 'state': 'CA'}, "
|
||||
"role=null, "
|
||||
"passed_test=true, "
|
||||
"aliases=['John', 'Johnny'])"
|
||||
)
|
||||
MORE_TYPES_FUNCTION_CALL = FunctionCall(
|
||||
name="register_user",
|
||||
arguments='{"name": "John Doe", '
|
||||
'"age": 37, '
|
||||
'"address": {"city": "San Francisco", "state": "CA"}, '
|
||||
'"role": null, '
|
||||
'"passed_test": true, '
|
||||
'"aliases": ["John", "Johnny"]}',
|
||||
)
|
||||
PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()"
|
||||
PARAMETERLESS_FUNCTION_CALL = FunctionCall(
|
||||
name="get_weather",
|
||||
arguments="{}",
|
||||
)
|
||||
EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})"
|
||||
EMPTY_DICT_FUNCTION_CALL = FunctionCall(
|
||||
name="do_something_cool",
|
||||
arguments='{"additional_data": {}}',
|
||||
)
|
||||
EMPTY_LIST_FUNCTION_OUTPUT = "do_something_cool(steps=[])"
|
||||
EMPTY_LIST_FUNCTION_CALL = FunctionCall(
|
||||
name="do_something_cool",
|
||||
arguments='{"steps": []}',
|
||||
)
|
||||
ESCAPED_STRING_FUNCTION_OUTPUT = (
|
||||
r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')"
|
||||
)
|
||||
ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
|
||||
name="get_weather",
|
||||
arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming", [True, False])
|
||||
def test_no_tool_call(streaming: bool):
|
||||
mock_tokenizer = MagicMock()
|
||||
tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(mock_tokenizer)
|
||||
model_output = "How can I help you today?"
|
||||
|
||||
content, tool_calls = run_tool_extraction(
|
||||
tool_parser, model_output, streaming=streaming
|
||||
)
|
||||
|
||||
assert content == model_output
|
||||
assert len(tool_calls) == 0
|
||||
|
||||
|
||||
TEST_CASES = [
|
||||
pytest.param(
|
||||
True,
|
||||
f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}</function_calls>",
|
||||
[SIMPLE_FUNCTION_CALL],
|
||||
id="simple_streaming",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}</function_calls>",
|
||||
[SIMPLE_FUNCTION_CALL],
|
||||
id="simple_nonstreaming",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
|
||||
[MORE_TYPES_FUNCTION_CALL],
|
||||
id="more_types_streaming",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
|
||||
[MORE_TYPES_FUNCTION_CALL],
|
||||
id="more_types_nonstreaming",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS}</function_calls>",
|
||||
[MORE_TYPES_FUNCTION_CALL],
|
||||
id="more_types_streaming_json_literals",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS}</function_calls>",
|
||||
[MORE_TYPES_FUNCTION_CALL],
|
||||
id="more_types_nonstreaming_json_literals",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
f"<function_calls>{PARAMETERLESS_FUNCTION_OUTPUT}</function_calls>",
|
||||
[PARAMETERLESS_FUNCTION_CALL],
|
||||
id="parameterless_streaming",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
f"<function_calls>{PARAMETERLESS_FUNCTION_OUTPUT}</function_calls>",
|
||||
[PARAMETERLESS_FUNCTION_CALL],
|
||||
id="parameterless_nonstreaming",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
f"<function_calls>{EMPTY_DICT_FUNCTION_OUTPUT}</function_calls>",
|
||||
[EMPTY_DICT_FUNCTION_CALL],
|
||||
id="empty_dict_streaming",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
f"<function_calls>{EMPTY_DICT_FUNCTION_OUTPUT}</function_calls>",
|
||||
[EMPTY_DICT_FUNCTION_CALL],
|
||||
id="empty_dict_nonstreaming",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
f"<function_calls>{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
|
||||
[EMPTY_LIST_FUNCTION_CALL],
|
||||
id="empty_list_streaming",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
f"<function_calls>{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
|
||||
[EMPTY_LIST_FUNCTION_CALL],
|
||||
id="empty_list_nonstreaming",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
f"<function_calls>{ESCAPED_STRING_FUNCTION_OUTPUT}</function_calls>",
|
||||
[ESCAPED_STRING_FUNCTION_CALL],
|
||||
id="escaped_string_streaming",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
f"<function_calls>{ESCAPED_STRING_FUNCTION_OUTPUT}</function_calls>",
|
||||
[ESCAPED_STRING_FUNCTION_CALL],
|
||||
id="escaped_string_nonstreaming",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}\n{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
|
||||
[SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
|
||||
id="parallel_calls_streaming",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}\n{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
|
||||
[SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
|
||||
id="parallel_calls_nonstreaming",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES)
|
||||
def test_tool_call(
|
||||
streaming: bool, model_output: str, expected_tool_calls: list[FunctionCall]
|
||||
):
|
||||
mock_tokenizer = MagicMock()
|
||||
tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(mock_tokenizer)
|
||||
|
||||
content, tool_calls = run_tool_extraction(
|
||||
tool_parser, model_output, streaming=streaming
|
||||
)
|
||||
|
||||
assert content is None
|
||||
assert len(tool_calls) == len(expected_tool_calls)
|
||||
for actual, expected in zip(tool_calls, expected_tool_calls):
|
||||
assert actual.type == "function"
|
||||
assert actual.function == expected
|
||||
|
||||
|
||||
def test_streaming_tool_call_with_large_steps():
|
||||
mock_tokenizer = MagicMock()
|
||||
tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(mock_tokenizer)
|
||||
model_output_deltas = [
|
||||
"<function_calls>get_weather(city='San",
|
||||
" Francisco', metric='celsius')\n"
|
||||
f"{PARAMETERLESS_FUNCTION_OUTPUT}\n"
|
||||
f"{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
|
||||
]
|
||||
|
||||
reconstructor = run_tool_extraction_streaming(
|
||||
tool_parser, model_output_deltas, assert_one_tool_per_delta=False
|
||||
)
|
||||
|
||||
assert reconstructor.other_content == ""
|
||||
assert len(reconstructor.tool_calls) == 3
|
||||
assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL
|
||||
assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL
|
||||
assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming", [False])
|
||||
def test_regex_timeout_handling(streaming: bool):
|
||||
"""test regex timeout is handled gracefully"""
|
||||
mock_tokenizer = MagicMock()
|
||||
tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(mock_tokenizer)
|
||||
|
||||
fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2
|
||||
|
||||
# create a mock regex that raises TimeoutError
|
||||
mock_regex = MagicMock()
|
||||
mock_regex.match.side_effect = TimeoutError("Regex timeout")
|
||||
|
||||
with patch.object(tool_parser, "TOOL_CALL_REGEX", mock_regex):
|
||||
content, tool_calls = run_tool_extraction(
|
||||
tool_parser, fake_problematic_input, streaming=streaming
|
||||
)
|
||||
|
||||
# should treat as regular text when regex times out
|
||||
assert content == fake_problematic_input
|
||||
assert len(tool_calls) == 0
|
||||
mock_regex.match.assert_called_once()
|
||||
@ -63,7 +63,7 @@ def test_encode_api(llm: LLM):
|
||||
# chunked prefill does not support all pooling
|
||||
err_msg = "pooling_task must be one of.+"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
llm.encode(prompts, use_tqdm=False)
|
||||
llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)
|
||||
|
||||
|
||||
def test_score_api(llm: LLM):
|
||||
|
||||
@ -35,6 +35,13 @@ def llm():
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_encode_api(llm: LLM):
|
||||
outputs = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False)
|
||||
multi_vector = outputs[0].outputs.data
|
||||
assert multi_vector.shape == (11, 384)
|
||||
|
||||
|
||||
def test_pooling_params(llm: LLM):
|
||||
def get_outputs(normalize):
|
||||
outputs = llm.embed(
|
||||
|
||||
@ -57,20 +57,24 @@ def test_multiple_pooling_params(llm: LLM):
|
||||
]
|
||||
|
||||
# Multiple PoolingParams should be matched with each prompt
|
||||
outputs = llm.encode(PROMPTS, pooling_params=pooling_params)
|
||||
outputs = llm.encode(PROMPTS, pooling_params=pooling_params, pooling_task="embed")
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
# Exception raised, if the size of params does not match the size of prompts
|
||||
with pytest.raises(ValueError):
|
||||
outputs = llm.encode(PROMPTS, pooling_params=pooling_params[:3])
|
||||
outputs = llm.encode(
|
||||
PROMPTS, pooling_params=pooling_params[:3], pooling_task="embed"
|
||||
)
|
||||
|
||||
# Single PoolingParams should be applied to every prompt
|
||||
single_pooling_params = PoolingParams()
|
||||
outputs = llm.encode(PROMPTS, pooling_params=single_pooling_params)
|
||||
outputs = llm.encode(
|
||||
PROMPTS, pooling_params=single_pooling_params, pooling_task="embed"
|
||||
)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
# pooling_params is None, default params should be applied
|
||||
outputs = llm.encode(PROMPTS, pooling_params=None)
|
||||
outputs = llm.encode(PROMPTS, pooling_params=None, pooling_task="embed")
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
|
||||
|
||||
@ -36,22 +36,23 @@ def llm():
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_pooling_params(llm: LLM):
|
||||
def get_outputs(softmax):
|
||||
def get_outputs(activation):
|
||||
outputs = llm.reward(
|
||||
prompts, pooling_params=PoolingParams(softmax=softmax), use_tqdm=False
|
||||
prompts, pooling_params=PoolingParams(activation=activation), use_tqdm=False
|
||||
)
|
||||
return torch.cat([x.outputs.data for x in outputs])
|
||||
|
||||
default = get_outputs(softmax=None)
|
||||
w_softmax = get_outputs(softmax=True)
|
||||
wo_softmax = get_outputs(softmax=False)
|
||||
default = get_outputs(activation=None)
|
||||
w_activation = get_outputs(activation=True)
|
||||
wo_activation = get_outputs(activation=False)
|
||||
|
||||
assert torch.allclose(default, w_softmax, atol=1e-2), "Default should use softmax."
|
||||
assert not torch.allclose(w_softmax, wo_softmax, atol=1e-2), (
|
||||
"wo_softmax should not use softmax."
|
||||
assert torch.allclose(default, w_activation, atol=1e-2), (
|
||||
"Default should use activation."
|
||||
)
|
||||
assert torch.allclose(softmax(wo_softmax), w_softmax, atol=1e-2), (
|
||||
"w_softmax should be close to softmax(wo_softmax)."
|
||||
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
|
||||
"wo_activation should not use activation."
|
||||
)
|
||||
assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
|
||||
"w_activation should be close to activation(wo_activation)."
|
||||
)
|
||||
|
||||
@ -17,6 +17,7 @@ from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
EMBED_DTYPE_TO_TORCH_DTYPE,
|
||||
EmbeddingResponse,
|
||||
PoolingResponse,
|
||||
)
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
@ -509,3 +510,20 @@ async def test_normalize(server: RemoteOpenAIServer, model_name: str):
|
||||
assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
|
||||
"w_normal should be close to normal(wo_normal)."
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_pooling(server: RemoteOpenAIServer, model_name: str):
|
||||
input_text = ["The chef prepared a delicious meal."]
|
||||
|
||||
response = requests.post(
|
||||
server.url_for("pooling"),
|
||||
json={"model": model_name, "input": input_text, "encoding_format": "float"},
|
||||
)
|
||||
|
||||
poolings = PoolingResponse.model_validate(response.json())
|
||||
|
||||
assert len(poolings.data) == 1
|
||||
assert len(poolings.data[0].data) == 11
|
||||
assert len(poolings.data[0].data[0]) == 384
|
||||
|
||||
@ -7,7 +7,7 @@ import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.openai.protocol import RerankResponse
|
||||
from vllm.entrypoints.openai.protocol import PoolingResponse, RerankResponse
|
||||
|
||||
MODEL_NAME = "BAAI/bge-reranker-base"
|
||||
DTYPE = "bfloat16"
|
||||
@ -159,3 +159,20 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str):
|
||||
assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
|
||||
"w_activation should be close to activation(wo_activation)."
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_pooling(server: RemoteOpenAIServer, model_name: str):
|
||||
input_text = ["The chef prepared a delicious meal."]
|
||||
|
||||
response = requests.post(
|
||||
server.url_for("pooling"),
|
||||
json={"model": model_name, "input": input_text, "encoding_format": "float"},
|
||||
)
|
||||
|
||||
poolings = PoolingResponse.model_validate(response.json())
|
||||
|
||||
assert len(poolings.data) == 1
|
||||
assert len(poolings.data[0].data) == 11
|
||||
assert len(poolings.data[0].data[0]) == 1
|
||||
|
||||
@ -6,7 +6,11 @@ from unittest.mock import MagicMock, patch
|
||||
import pytest
|
||||
from openai_harmony import Author, Message, Role, StreamState, TextContent
|
||||
|
||||
from vllm.entrypoints.context import HarmonyContext, StreamingHarmonyContext
|
||||
from vllm.entrypoints.context import (
|
||||
HarmonyContext,
|
||||
StreamingHarmonyContext,
|
||||
TurnMetrics,
|
||||
)
|
||||
from vllm.outputs import CompletionOutput, RequestOutput
|
||||
|
||||
|
||||
@ -101,8 +105,12 @@ def test_single_turn_token_counting():
|
||||
|
||||
# Verify internal state tracking
|
||||
assert not context.is_first_turn
|
||||
assert context.previous_turn.input_tokens == 5
|
||||
assert context.previous_turn.output_tokens == 3
|
||||
assert len(context.all_turn_metrics) == 1
|
||||
previous_turn = context.all_turn_metrics[0]
|
||||
assert previous_turn.input_tokens == 5
|
||||
assert previous_turn.output_tokens == 3
|
||||
assert previous_turn.cached_input_tokens == 2
|
||||
assert previous_turn.tool_output_tokens == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -156,6 +164,15 @@ async def test_multi_turn_token_counting():
|
||||
assert context.num_tool_output_tokens == expected_tool_output
|
||||
assert context.num_cached_tokens == 5 + 15
|
||||
|
||||
# Validate all turn metrics
|
||||
assert len(context.all_turn_metrics) == 3
|
||||
for i, turn in enumerate(context.all_turn_metrics):
|
||||
assert turn.input_tokens == prompt_token_counts[i]
|
||||
assert turn.output_tokens == output_token_counts[i]
|
||||
assert turn.cached_input_tokens == cached_token_counts[i]
|
||||
assert context.all_turn_metrics[1].tool_output_tokens == 7
|
||||
assert context.all_turn_metrics[2].tool_output_tokens == 1
|
||||
|
||||
|
||||
def test_empty_output_tokens():
|
||||
"""Test behavior when RequestOutput has empty output tokens."""
|
||||
@ -314,6 +331,10 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
|
||||
# Create a streaming context
|
||||
context = StreamingHarmonyContext(messages=[], available_tools=["browser"])
|
||||
|
||||
num_prompt_tokens = [3, 8, 13]
|
||||
num_output_tokens = [3, 3, 2]
|
||||
num_cached_tokens = [0, 3, 8]
|
||||
|
||||
# Simulate three turns of conversation:
|
||||
# Turn 1: stream tokens one by one, then finish the message
|
||||
# Turn 2: new prompt, stream more tokens with a reasoning segment
|
||||
@ -325,7 +346,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
|
||||
create_mock_request_output(
|
||||
prompt_token_ids=[1, 2, 3], # 3 prompt tokens
|
||||
output_token_ids=[101], # Single token
|
||||
num_cached_tokens=0,
|
||||
num_cached_tokens=num_cached_tokens[0],
|
||||
finished=False, # Not end of message yet
|
||||
)
|
||||
)
|
||||
@ -370,7 +391,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
|
||||
5,
|
||||
], # 8 tokens (includes previous)
|
||||
output_token_ids=[201],
|
||||
num_cached_tokens=3, # Some tokens cached
|
||||
num_cached_tokens=num_cached_tokens[1], # Some tokens cached
|
||||
finished=False,
|
||||
)
|
||||
)
|
||||
@ -422,7 +443,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
|
||||
7,
|
||||
], # 13 tokens
|
||||
output_token_ids=[301],
|
||||
num_cached_tokens=8, # More cached tokens
|
||||
num_cached_tokens=num_cached_tokens[2], # More cached tokens
|
||||
finished=False,
|
||||
)
|
||||
)
|
||||
@ -435,10 +456,12 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
|
||||
)
|
||||
|
||||
# Final token counts check
|
||||
assert context.num_prompt_tokens == 3 + 8 + 13 # All prompts
|
||||
assert context.num_output_tokens == 3 + 3 + 2 # All outputs
|
||||
assert context.num_prompt_tokens == sum(num_prompt_tokens) # All prompts
|
||||
assert context.num_output_tokens == sum(num_output_tokens) # All outputs
|
||||
assert context.num_reasoning_tokens == 3 # Unchanged from second turn
|
||||
assert context.num_cached_tokens == 3 + 8 # Accumulated cached tokens
|
||||
assert context.num_cached_tokens == sum(
|
||||
num_cached_tokens
|
||||
) # Accumulated cached tokens
|
||||
|
||||
# Additional tool tokens from third turn
|
||||
# Formula: this turn prompt - last turn prompt - last turn output
|
||||
@ -447,6 +470,15 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
|
||||
context.num_tool_output_tokens == expected_tool_tokens + additional_tool_tokens
|
||||
)
|
||||
|
||||
# Validate all turn metrics
|
||||
assert len(context.all_turn_metrics) == 3
|
||||
for i, turn in enumerate(context.all_turn_metrics):
|
||||
assert turn.input_tokens == num_prompt_tokens[i]
|
||||
assert turn.output_tokens == num_output_tokens[i]
|
||||
assert turn.cached_input_tokens == num_cached_tokens[i]
|
||||
assert context.all_turn_metrics[1].tool_output_tokens == 2
|
||||
assert context.all_turn_metrics[2].tool_output_tokens == 2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_streaming_message_synchronization(mock_parser):
|
||||
@ -522,3 +554,46 @@ async def test_streaming_message_synchronization(mock_parser):
|
||||
assert len(context._messages) == 3
|
||||
assert context.num_init_messages == 1
|
||||
assert context._messages[2].content[0].text == "Response 4"
|
||||
|
||||
|
||||
def test_turn_metrics_copy_and_reset():
|
||||
"""Test TurnMetrics copy and reset methods work correctly."""
|
||||
# Create a TurnMetrics with specific values
|
||||
original_metrics = TurnMetrics(
|
||||
input_tokens=10,
|
||||
output_tokens=20,
|
||||
cached_input_tokens=5,
|
||||
tool_output_tokens=3,
|
||||
)
|
||||
|
||||
# Test copy functionality
|
||||
copied_metrics = original_metrics.copy()
|
||||
|
||||
# Verify copy has same values
|
||||
assert copied_metrics.input_tokens == 10
|
||||
assert copied_metrics.output_tokens == 20
|
||||
assert copied_metrics.cached_input_tokens == 5
|
||||
assert copied_metrics.tool_output_tokens == 3
|
||||
|
||||
# Verify they are separate objects
|
||||
assert copied_metrics is not original_metrics
|
||||
|
||||
# Modify copy to ensure independence
|
||||
copied_metrics.input_tokens = 999
|
||||
assert original_metrics.input_tokens == 10 # Original unchanged
|
||||
assert copied_metrics.input_tokens == 999
|
||||
|
||||
# Test reset functionality
|
||||
original_metrics.reset()
|
||||
|
||||
# Verify all fields are reset to zero
|
||||
assert original_metrics.input_tokens == 0
|
||||
assert original_metrics.output_tokens == 0
|
||||
assert original_metrics.cached_input_tokens == 0
|
||||
assert original_metrics.tool_output_tokens == 0
|
||||
|
||||
# Verify copied metrics are unaffected by reset
|
||||
assert copied_metrics.input_tokens == 999
|
||||
assert copied_metrics.output_tokens == 20
|
||||
assert copied_metrics.cached_input_tokens == 5
|
||||
assert copied_metrics.tool_output_tokens == 3
|
||||
|
||||
@ -12,7 +12,7 @@ from vllm.entrypoints.openai.api_server import (
|
||||
from vllm.inputs import TextPrompt
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import merge_async_iterators
|
||||
from vllm.utils.async_utils import merge_async_iterators
|
||||
|
||||
MODEL_PATH = "zai-org/chatglm3-6b"
|
||||
LORA_RANK = 64
|
||||
|
||||
45
tests/models/language/pooling/test_multi_vector_retrieval.py
Normal file
45
tests/models/language/pooling/test_multi_vector_retrieval.py
Normal file
@ -0,0 +1,45 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModel
|
||||
|
||||
from tests.models.utils import check_embeddings_close
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
["BAAI/bge-m3"],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@torch.inference_mode
|
||||
def test_embed_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str):
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.token_embed(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model,
|
||||
auto_cls=AutoModel,
|
||||
) as hf_model:
|
||||
tokenizer = hf_model.tokenizer
|
||||
hf_outputs = []
|
||||
for prompt in example_prompts:
|
||||
inputs = tokenizer([prompt], return_tensors="pt")
|
||||
inputs = hf_model.wrap_device(inputs)
|
||||
output = hf_model.model(**inputs)
|
||||
embedding = output.last_hidden_state[0].float()
|
||||
# normal
|
||||
hf_outputs.append(embedding.cpu())
|
||||
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_output,
|
||||
embeddings_1_lst=vllm_output,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=1e-2,
|
||||
)
|
||||
@ -93,7 +93,7 @@ def test_embed_models_using_normalize(
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_reward_models_using_softmax(
|
||||
def test_reward_models_using_activation(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
@ -104,22 +104,64 @@ def test_reward_models_using_softmax(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(softmax=False),
|
||||
pooler_config=PoolerConfig(activation=False),
|
||||
) as vllm_model:
|
||||
wo_softmax = vllm_model.encode(example_prompts)
|
||||
wo_activation = vllm_model.reward(example_prompts)
|
||||
|
||||
with vllm_runner(
|
||||
model, max_model_len=1024, dtype=dtype, pooler_config=PoolerConfig(softmax=True)
|
||||
model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(activation=True),
|
||||
) as vllm_model:
|
||||
w_softmax = vllm_model.encode(example_prompts)
|
||||
w_activation = vllm_model.reward(example_prompts)
|
||||
|
||||
for wo, w in zip(wo_softmax, w_softmax):
|
||||
for wo, w in zip(wo_activation, w_activation):
|
||||
wo = torch.tensor(wo)
|
||||
w = torch.tensor(w)
|
||||
|
||||
assert not torch.allclose(wo, w, atol=1e-2), (
|
||||
"pooler_config softmax is not working"
|
||||
"pooler_config activation is not working"
|
||||
)
|
||||
assert torch.allclose(softmax(wo), w, atol=1e-2), (
|
||||
"w_softmax should be close to softmax(wo_softmax)."
|
||||
"w_activation should be close to activation(wo_activation)."
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
"intfloat/multilingual-e5-small",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_multi_vector_retrieval_models_using_normalize(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=False),
|
||||
) as vllm_model:
|
||||
wo_normalize = vllm_model.token_embed(example_prompts)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=True),
|
||||
) as vllm_model:
|
||||
w_normalize = vllm_model.token_embed(example_prompts)
|
||||
|
||||
for wo, w in zip(wo_normalize, w_normalize):
|
||||
assert not torch.allclose(wo, w, atol=1e-2), (
|
||||
"pooler_config normalize is not working"
|
||||
)
|
||||
assert torch.allclose(F.normalize(wo, p=2, dim=-1), w, atol=1e-2), (
|
||||
"w_normal should be close to normal(wo_normal)."
|
||||
)
|
||||
|
||||
@ -19,7 +19,7 @@ def test_bert_models(
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.encode(example_prompts)
|
||||
vllm_outputs = vllm_model.token_classify(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForTokenClassification
|
||||
@ -50,7 +50,7 @@ def test_modernbert_models(
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.encode(example_prompts)
|
||||
vllm_outputs = vllm_model.token_classify(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForTokenClassification
|
||||
|
||||
@ -17,7 +17,7 @@ from transformers import (
|
||||
)
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import identity
|
||||
from vllm.utils.func import identity
|
||||
|
||||
from ....conftest import (
|
||||
IMAGE_ASSETS,
|
||||
|
||||
@ -38,7 +38,7 @@ def run_intern_vit_test(
|
||||
config.norm_type = "rms_norm"
|
||||
|
||||
hf_model = AutoModel.from_pretrained(
|
||||
model, torch_dtype=torch_dtype, trust_remote_code=True
|
||||
model, dtype=torch_dtype, trust_remote_code=True
|
||||
).to("cuda")
|
||||
hf_outputs_per_image = [
|
||||
hf_model(pixel_value.to("cuda")).last_hidden_state
|
||||
|
||||
@ -39,7 +39,7 @@ def _run_test(
|
||||
max_num_seqs=32,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
vllm_model.encode(prompt)
|
||||
vllm_model.llm.encode(prompt, pooling_task="token_classify")
|
||||
|
||||
|
||||
MODELS = ["mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
|
||||
|
||||
@ -45,7 +45,7 @@ def run_radio_test(
|
||||
hf_model = AutoModel.from_pretrained(
|
||||
model_id,
|
||||
config=config,
|
||||
torch_dtype=torch_dtype,
|
||||
dtype=torch_dtype,
|
||||
trust_remote_code=True,
|
||||
).to("cuda")
|
||||
hf_model.eval()
|
||||
|
||||
@ -30,7 +30,7 @@ class MyGemma2Embedding(nn.Module):
|
||||
|
||||
self.pooler = DispatchPooler(
|
||||
{
|
||||
"encode": Pooler.for_encode(pooler_config),
|
||||
"token_embed": Pooler.for_token_embed(pooler_config),
|
||||
"embed": Pooler.for_embed(pooler_config),
|
||||
}
|
||||
)
|
||||
|
||||
@ -93,7 +93,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
|
||||
out_data_format="b64_json",
|
||||
)
|
||||
|
||||
pooling_params = PoolingParams(task="encode", softmax=False)
|
||||
pooling_params = PoolingParams(activation=False)
|
||||
|
||||
with vllm_runner(
|
||||
model_name,
|
||||
@ -108,8 +108,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
|
||||
io_processor_plugin="prithvi_to_tiff",
|
||||
) as llm_runner:
|
||||
pooler_output = llm_runner.get_llm().encode(
|
||||
img_prompt,
|
||||
pooling_params=pooling_params,
|
||||
img_prompt, pooling_params=pooling_params, pooling_task="token_classify"
|
||||
)
|
||||
output = pooler_output[0].outputs
|
||||
|
||||
|
||||
@ -697,7 +697,8 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
|
||||
@pytest.mark.parametrize(
|
||||
"args",
|
||||
[
|
||||
("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4),
|
||||
# TODO: Enable once model is available again
|
||||
# ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4),
|
||||
("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4),
|
||||
],
|
||||
)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user