diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml new file mode 100644 index 0000000000000..56ec933c9cc0e --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 +model_name: "HandH1998/QQQ-Llama-3-8b-g128" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.419 + - name: "exact_match,flexible-extract" + value: 0.416 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml new file mode 100644 index 0000000000000..f10b937249975 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml @@ -0,0 +1,11 @@ +# For hf script, without -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8 +model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" +backend: "vllm-vlm" +tasks: +- name: "chartqa" + metrics: + - name: "relaxed_accuracy,none" + value: 0.90 +limit: 100 +num_fewshot: 0 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml new file mode 100644 index 0000000000000..96eeed04a9dc0 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml @@ -0,0 +1,11 @@ +# For hf script, without -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5 +model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" +backend: "vllm-vlm" +tasks: +- name: "mmlu_pro" + metrics: + - name: "exact_match,custom-extract" + value: 0.80 +limit: 250 # will run on 250 * 14 subjects = 3500 samples +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml index a2f235f485815..aa4fb9fa03d6d 100644 --- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml +++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml @@ -1,4 +1,5 @@ -# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1 +# For vllm script, with -t option (tensor parallel size) +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic" tasks: - name: "gsm8k" diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml new file mode 100644 index 0000000000000..5f3c31743e75b --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). 
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1 + +model_name: "Qwen/Qwen2.5-VL-7B-Instruct" +backend: "vllm-vlm" +tasks: +- name: "chartqa" + metrics: + - name: "relaxed_accuracy,none" + value: 0.855 +limit: 2500 +num_fewshot: 0 diff --git a/.buildkite/lm-eval-harness/configs/models-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-large-h100.txt new file mode 100644 index 0000000000000..4fb0b84bc4d81 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-large-h100.txt @@ -0,0 +1 @@ +Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt new file mode 100644 index 0000000000000..91e22b6459c12 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt @@ -0,0 +1 @@ +Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-mm-small.txt b/.buildkite/lm-eval-harness/configs/models-mm-small.txt new file mode 100644 index 0000000000000..1097d220245fc --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-mm-small.txt @@ -0,0 +1 @@ +Qwen2.5-VL-7B-Instruct.yaml \ No newline at end of file diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh new file mode 100755 index 0000000000000..c8db951381b0b --- /dev/null +++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on chartqa for vllm. +# +# Make sure you have lm-eval-harness installed: +# pip install lm-eval==0.4.9 + +usage() { + echo`` + echo "Runs lm eval harness on ChartQA using multimodal vllm." + echo "This pathway is intended to be used to create baselines for " + echo "our correctness tests in vllm's CI." + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -l - limit number of samples to run" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:l:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm-vlm \ + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \ + --tasks chartqa \ + --batch_size auto \ + --apply_chat_template \ + --limit $LIMIT diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh old mode 100644 new mode 100755 diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh new file mode 100644 index 0000000000000..d85a1721db9a5 --- /dev/null +++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on MMLUPRO for vllm. +# We use this for fp8, which HF does not support. +# +# Make sure you have lm-eval-harness installed: +# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] + +usage() { + echo`` + echo "Runs lm eval harness on MMLU Pro using huggingface transformers." 
+ echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:b:l:f:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm \ + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \ + --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size auto diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index ceea01166b7f4..f10de82b1d8e8 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -19,21 +19,27 @@ RTOL = 0.08 def launch_lm_eval(eval_config, tp_size): trust_remote_code = eval_config.get("trust_remote_code", False) max_model_len = eval_config.get("max_model_len", 4096) + batch_size = eval_config.get("batch_size", "auto") + backend = eval_config.get("backend", "vllm") model_args = ( f"pretrained={eval_config['model_name']}," f"tensor_parallel_size={tp_size}," f"enforce_eager=true," f"add_bos_token=true," f"trust_remote_code={trust_remote_code}," - f"max_model_len={max_model_len}" + f"max_model_len={max_model_len}," ) results = lm_eval.simple_evaluate( - model="vllm", + model=backend, model_args=model_args, tasks=[task["name"] for task in eval_config["tasks"]], num_fewshot=eval_config["num_fewshot"], limit=eval_config["limit"], - batch_size="auto", + # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help + # text models. however, this is regressing measured strict-match for + # existing text models in CI, so only apply it for mm. 
+ apply_chat_template=backend == "vllm-vlm", + batch_size=batch_size, ) return results diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index b2a3a0a775baa..50b2b61124af0 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -63,7 +63,7 @@ steps: - label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking source_file_dependencies: @@ -353,7 +353,7 @@ steps: - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - label: V1 Test others (CPU) # 5 mins - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking source_file_dependencies: @@ -459,6 +459,7 @@ steps: - pytest -v -s compile/test_fusion_all_reduce.py - pytest -v -s compile/test_decorator.py - pytest -v -s compile/test_noop_elimination.py + - pytest -v -s compile/test_aot_compile.py - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -487,14 +488,14 @@ steps: - label: Kernels Core Operation Test # 48min timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking source_file_dependencies: - csrc/ - tests/kernels/core commands: - - pytest -v -s kernels/core + - pytest -v -s kernels/core kernels/test_top_k_per_row.py - label: Kernels Attention Test %N # 23min timeout_in_minutes: 35 @@ -603,7 +604,8 @@ steps: # since torchao nightly is only compatible with torch nightly currently # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # we can only upgrade after this is resolved - - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.13.0 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ - label: LM Eval Small Models # 53min @@ -631,7 +633,7 @@ steps: - label: OpenAI-Compatible Tool Use # 23 min timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking fast_check: false diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ebe0602a1b5db..a8a5bf3ad234d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -527,7 +527,8 @@ steps: # since torchao nightly is only compatible with torch nightly currently # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # we can only upgrade after this is resolved - - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.13.0 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ - label: LM Eval Small Models # 53min @@ -733,6 +734,16 @@ steps: - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work +- label: Multi-Modal Accuracy Eval (Small Models) # 50min + timeout_in_minutes: 70 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + - label: Multi-Modal Models Test (Extended) 1 mirror_hardwares: [amdexperimental] optional: true diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 61ac9fefc59f4..3fbc38d9a26c7 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -5,9 +5,7 @@ /vllm/attention @LucasWilkinson /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn -/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/model_executor/layers/fused_moe @mgoin -/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 /vllm/model_executor/layers/mamba @tdoublep /vllm/model_executor/model_loader @22quinn @@ -26,7 +24,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345 # vLLM V1 -/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat /vllm/v1/attention @LucasWilkinson /vllm/v1/attention/backends/flashinfer.py @mgoin /vllm/v1/attention/backends/triton_attn.py @tdoublep diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index d3040e9738f7b..9298d3b58dfb9 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -631,7 +631,7 @@ def main(args: argparse.Namespace): else: ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size") shard_intermediate_size = 2 * intermediate_size // args.tp_size - dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype + dtype = torch.float16 if current_platform.is_rocm() else config.dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" block_quant_shape = get_weight_block_size_safety(config) diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index 04d2205aa3722..459eafa6d907d 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -344,7 +344,7 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok hidden_size = config.hidden_size - dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype + dtype = torch.float16 if current_platform.is_rocm() else config.dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" use_customized_permute = args.use_customized_permute diff --git a/cmake/external_projects/qutlass.cmake b/cmake/external_projects/qutlass.cmake index 9aace7693077a..5a59a409999ad 100644 --- a/cmake/external_projects/qutlass.cmake +++ b/cmake/external_projects/qutlass.cmake @@ -22,10 +22,10 @@ else() CONFIGURE_COMMAND "" 
BUILD_COMMAND "" ) - FetchContent_Populate(qutlass) - set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}") endif() +FetchContent_Populate(qutlass) + if(NOT qutlass_SOURCE_DIR) message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.") endif() diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index 6c3685f6f7cdc..aa7927f09cbbf 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -2,6 +2,7 @@ #include "dispatch_utils.h" #include "cub_helpers.h" #include "core/batch_invariant.hpp" +#include "quantization/vectorization_utils.cuh" #include #include @@ -18,11 +19,22 @@ __global__ void rms_norm_kernel( const float epsilon, const int num_tokens, const int hidden_size) { __shared__ float s_variance; float variance = 0.0f; + const scalar_t* input_row = input + blockIdx.x * input_stride; - for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - const float x = (float)input[blockIdx.x * input_stride + idx]; + constexpr int VEC_SIZE = 8; + auto vec_op = [&variance](const vec_n_t& vec) { +#pragma unroll + for (int i = 0; i < VEC_SIZE; ++i) { + float x = static_cast(vec.val[i]); + variance += x * x; + } + }; + auto scalar_op = [&variance](const scalar_t& val) { + float x = static_cast(val); variance += x * x; - } + }; + vllm::vectorize_read_with_alignment( + input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op); using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage reduceStore; diff --git a/csrc/layernorm_quant_kernels.cu b/csrc/layernorm_quant_kernels.cu index 0fc462194fcde..7f9a0bccdd348 100644 --- a/csrc/layernorm_quant_kernels.cu +++ b/csrc/layernorm_quant_kernels.cu @@ -10,6 +10,7 @@ #include "dispatch_utils.h" #include "cub_helpers.h" #include "core/batch_invariant.hpp" +#include "quantization/vectorization_utils.cuh" #include #include @@ -28,10 +29,22 @@ __global__ void rms_norm_static_fp8_quant_kernel( __shared__ float s_variance; float variance = 0.0f; - for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - const float x = (float)input[blockIdx.x * input_stride + idx]; + const scalar_t* input_row = input + blockIdx.x * input_stride; + + constexpr int VEC_SIZE = 8; + auto vec_op = [&variance](const vec_n_t& vec) { +#pragma unroll + for (int i = 0; i < VEC_SIZE; ++i) { + float x = static_cast(vec.val[i]); + variance += x * x; + } + }; + auto scalar_op = [&variance](const scalar_t& val) { + float x = static_cast(val); variance += x * x; - } + }; + vllm::vectorize_read_with_alignment( + input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op); using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage reduceStore; diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 2b0654fa6d463..85906d23dee33 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -58,12 +58,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc ```python from vllm import LLM - from vllm.config import CompilationConfig, CompilationLevel + from vllm.config import CompilationConfig, CompilationMode llm = LLM( model="meta-llama/Llama-3.1-8B-Instruct", compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, # By default, it goes up to max_num_seqs cudagraph_capture_sizes=[1, 2, 4, 8, 16], ), diff --git a/docs/design/cuda_graphs.md b/docs/design/cuda_graphs.md index 315746b0ef674..c6d71589be985 100644 --- 
a/docs/design/cuda_graphs.md +++ b/docs/design/cuda_graphs.md @@ -167,7 +167,7 @@ class AttentionCGSupport(enum.Enum): """NO CUDA Graphs support""" ``` -Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture]. +Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture]. The following table lists backends that support full CUDA Graphs at the time of writing. @@ -202,7 +202,7 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG") import vllm from vllm.config import CUDAGraphMode -compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} +compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} model = vllm.LLM( model="meta-llama/Llama-3.1-8B-Instruct", dtype="auto", diff --git a/docs/features/quantization/auto_awq.md b/docs/features/quantization/auto_awq.md index 182127bc91cc8..e77e8b5a1f415 100644 --- a/docs/features/quantization/auto_awq.md +++ b/docs/features/quantization/auto_awq.md @@ -22,13 +22,15 @@ After installing AutoAWQ, you are ready to quantize a model. 
Please refer to the from awq import AutoAWQForCausalLM from transformers import AutoTokenizer - model_path = 'mistralai/Mistral-7B-Instruct-v0.2' - quant_path = 'mistral-instruct-v0.2-awq' - quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } + model_path = "mistralai/Mistral-7B-Instruct-v0.2" + quant_path = "mistral-instruct-v0.2-awq" + quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"} # Load model model = AutoAWQForCausalLM.from_pretrained( - model_path, **{"low_cpu_mem_usage": True, "use_cache": False} + model_path, + low_cpu_mem_usage=True, + use_cache=False, ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/docs/features/quantization/auto_round.md b/docs/features/quantization/auto_round.md index ac766d5e29228..9c14f362b663f 100644 --- a/docs/features/quantization/auto_round.md +++ b/docs/features/quantization/auto_round.md @@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound model_name = "Qwen/Qwen3-0.6B" -model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) bits, group_size, sym = 4, 128, True diff --git a/docs/features/quantization/bitblas.md b/docs/features/quantization/bitblas.md index 53b689ad53ff6..c3a1276576223 100644 --- a/docs/features/quantization/bitblas.md +++ b/docs/features/quantization/bitblas.md @@ -34,7 +34,7 @@ llm = LLM( model=model_id, dtype=torch.bfloat16, trust_remote_code=True, - quantization="bitblas" + quantization="bitblas", ) ``` @@ -53,6 +53,6 @@ llm = LLM( dtype=torch.float16, trust_remote_code=True, quantization="bitblas", - max_model_len=1024 + max_model_len=1024, ) ``` diff --git a/docs/features/quantization/bnb.md b/docs/features/quantization/bnb.md index 3b15a6072d47a..2348c7739c066 100644 --- a/docs/features/quantization/bnb.md +++ b/docs/features/quantization/bnb.md @@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit" llm = LLM( model=model_id, dtype=torch.bfloat16, - trust_remote_code=True + trust_remote_code=True, ) ``` @@ -43,7 +43,7 @@ llm = LLM( model=model_id, dtype=torch.bfloat16, trust_remote_code=True, - quantization="bitsandbytes" + quantization="bitsandbytes", ) ``` diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index 834c03cbe05b0..0c5111fb8af0d 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", + MODEL_ID, + device_map="auto", + dtype="auto", ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` @@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio # Configure the simple PTQ quantization recipe = QuantizationModifier( - targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) + targets="Linear", + scheme="FP8_DYNAMIC", + ignore=["lm_head"], + ) # Apply the quantization algorithm. 
oneshot(model=model, recipe=recipe) diff --git a/docs/features/quantization/gguf.md b/docs/features/quantization/gguf.md index 2a1c3bdd775f1..2a731e9b7e032 100644 --- a/docs/features/quantization/gguf.md +++ b/docs/features/quantization/gguf.md @@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint: conversation = [ { "role": "system", - "content": "You are a helpful assistant" + "content": "You are a helpful assistant", }, { "role": "user", - "content": "Hello" + "content": "Hello", }, { "role": "assistant", - "content": "Hello! How can I assist you today?" + "content": "Hello! How can I assist you today?", }, { "role": "user", @@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint: sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. - llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", - tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") + llm = LLM( + model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", + tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + ) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.chat(conversation, sampling_params) diff --git a/docs/features/quantization/gptqmodel.md b/docs/features/quantization/gptqmodel.md index 47cb2d65bae47..f14a931725da4 100644 --- a/docs/features/quantization/gptqmodel.md +++ b/docs/features/quantization/gptqmodel.md @@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`: calibration_dataset = load_dataset( "allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", - split="train" + split="train", ).select(range(1024))["text"] quant_config = QuantizeConfig(bits=4, group_size=128) diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index d6fdac7b07f7f..035e7ea291f9e 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", + MODEL_ID, + device_map="auto", + dtype="auto", ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` @@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y }, ignore=["lm_head"], update_size=NUM_CALIBRATION_SAMPLES, - dampening_frac=0.01 + dampening_frac=0.01, ) ``` diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index af3650e701ad0..ec8a77f74ffef 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", + MODEL_ID, + device_map="auto", + dtype="auto", ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` diff --git a/docs/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md index 39ae03b1bdac0..c48ccb719a79d 100644 --- a/docs/features/quantization/modelopt.md +++ b/docs/features/quantization/modelopt.md @@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. 
As an example, the foll from vllm import LLM, SamplingParams def main(): - model_id = "nvidia/Llama-3.1-8B-Instruct-FP8" - # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint + + # Ensure you specify quantization="modelopt" when loading the modelopt checkpoint llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True) sampling_params = SamplingParams(temperature=0.8, top_p=0.9) diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md index b2b417309e92b..56cf057678be6 100644 --- a/docs/features/quantization/quantized_kvcache.md +++ b/docs/features/quantization/quantized_kvcache.md @@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization: from vllm import LLM, SamplingParams sampling_params = SamplingParams(temperature=0.7, top_p=0.8) - llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", - kv_cache_dtype="fp8", - calculate_kv_scales=True) + llm = LLM( + model="meta-llama/Llama-2-7b-chat-hf", + kv_cache_dtype="fp8", + calculate_kv_scales=True, + ) prompt = "London is the capital of" out = llm.generate(prompt, sampling_params)[0].outputs[0].text print(out) @@ -80,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models # Select model and load it MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" - model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto") + model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md index 85b7d8ec84ed3..385e3bbb8712f 100644 --- a/docs/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -48,7 +48,9 @@ to fetch model and tokenizer. MAX_SEQ_LEN = 512 model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", + MODEL_ID, + device_map="auto", + dtype="auto", ) model.eval() @@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") text_data = dataset["text"][:NUM_CALIBRATION_DATA] - tokenized_outputs = tokenizer(text_data, return_tensors="pt", - padding=True, truncation=True, max_length=MAX_SEQ_LEN) - calib_dataloader = DataLoader(tokenized_outputs['input_ids'], - batch_size=BATCH_SIZE, drop_last=True) + tokenized_outputs = tokenizer( + text_data, + return_tensors="pt", + padding=True, + truncation=True, + max_length=MAX_SEQ_LEN, + ) + calib_dataloader = DataLoader( + tokenized_outputs['input_ids'], + batch_size=BATCH_SIZE, + drop_last=True, + ) ``` ### 3. Set the Quantization Configuration @@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant. load_quant_algo_config_from_file) # Define fp8/per-tensor/static spec. - FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max", - is_dynamic=False).to_quantization_spec() + FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec( + observer_method="min_max", + is_dynamic=False, + ).to_quantization_spec() # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC. 
- global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC, - weight=FP8_PER_TENSOR_SPEC) + global_quant_config = QuantizationConfig( + input_tensors=FP8_PER_TENSOR_SPEC, + weight=FP8_PER_TENSOR_SPEC, + ) # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC. KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"] - kv_cache_quant_config = {name : - QuantizationConfig(input_tensors=global_quant_config.input_tensors, - weight=global_quant_config.weight, - output_tensors=KV_CACHE_SPEC) - for name in kv_cache_layer_names_for_llama} + kv_cache_quant_config = { + name: QuantizationConfig( + input_tensors=global_quant_config.input_tensors, + weight=global_quant_config.weight, + output_tensors=KV_CACHE_SPEC, + ) + for name in kv_cache_layer_names_for_llama + } layer_quant_config = kv_cache_quant_config.copy() # Define algorithm config by config file. - LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = - 'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json' + LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json" algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE) EXCLUDE_LAYERS = ["lm_head"] @@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant. layer_quant_config=layer_quant_config, kv_cache_quant_config=kv_cache_quant_config, exclude=EXCLUDE_LAYERS, - algo_config=algo_config) + algo_config=algo_config, + ) ``` ### 4. Quantize the Model and Export @@ -165,8 +182,11 @@ for more exporting format details. EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant" exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR) with torch.no_grad(): - exporter.export_safetensors_model(freezed_model, - quant_config=quant_config, tokenizer=tokenizer) + exporter.export_safetensors_model( + freezed_model, + quant_config=quant_config, + tokenizer=tokenizer, + ) ``` ### 5. Evaluation in vLLM @@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. - llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant", - kv_cache_dtype='fp8',quantization='quark') + llm = LLM( + model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant", + kv_cache_dtype="fp8", + quantization="quark", + ) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/docs/features/quantization/torchao.md b/docs/features/quantization/torchao.md index 6932445997012..b95b560882bb1 100644 --- a/docs/features/quantization/torchao.md +++ b/docs/features/quantization/torchao.md @@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. 
[transformers](ht quantization_config = TorchAoConfig(Int8WeightOnlyConfig()) quantized_model = AutoModelForCausalLM.from_pretrained( model_name, - torch_dtype="auto", + dtype="auto", device_map="auto", quantization_config=quantization_config ) diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index ab04a1efcc083..0b00b8805bb2c 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -11,6 +11,7 @@ vLLM currently supports the following reasoning models: | Model Series | Parser Name | Structured Output Support | Tool Calling | |--------------|-------------|------------------|-------------| | [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ | +| [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ | | [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ | | [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ | | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ | @@ -20,8 +21,9 @@ vLLM currently supports the following reasoning models: | [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ | !!! note - IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. + IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`. + DeepSeek-V3.1 tool calling is supported in non-thinking mode. ## Quickstart diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 02a700c09d391..5829bfa44e428 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -352,6 +352,16 @@ Supported models: Flags: `--tool-call-parser qwen3_xml` +### Olmo 3 Models (`olmo3`) + +Olmo 3 models output tool calls in a format that is very similar to the one expected by the `pythonic` parser (see below), with a few differences. Each tool call is a pythonic string, but the parallel tool calls are newline-delimited, and the calls are wrapped within XML tags as `..`. In addition, the parser also allows JSON boolean and null literals (`true`, `false`, and `null`) in addition to the pythonic ones (`True`, `False`, and `None`). + +Supported models: + +* TODO (will be updated after Olmo 3 release) + +Flags: `--tool-call-parser olmo3` + ### Models with Pythonic Tool Calls (`pythonic`) A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models. 
diff --git a/docs/getting_started/installation/cpu/arm.inc.md b/docs/getting_started/installation/cpu/arm.inc.md index 15fce69b44871..9cae9ed1a212e 100644 --- a/docs/getting_started/installation/cpu/arm.inc.md +++ b/docs/getting_started/installation/cpu/arm.inc.md @@ -23,7 +23,46 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. # --8<-- [end:pre-built-wheels] # --8<-- [start:build-wheel-from-source] ---8<-- "docs/getting_started/installation/cpu/build.inc.md:extra-information" +First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: + +```bash +sudo apt-get update -y +sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof +sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 +``` + +Second, clone the vLLM project: + +```bash +git clone https://github.com/vllm-project/vllm.git vllm_source +cd vllm_source +``` + +Third, install required dependencies: + +```bash +uv pip install -r requirements/cpu-build.txt --torch-backend cpu +uv pip install -r requirements/cpu.txt --torch-backend cpu +``` + +??? console "pip" + ```bash + pip install --upgrade pip + pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu + pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu + ``` + +Finally, build and install vLLM: + +```bash +VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation +``` + +If you want to develop vLLM, install it in editable mode instead. + +```bash +VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation +``` Testing has been conducted on AWS Graviton3 instances for compatibility. diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md deleted file mode 100644 index f99497128fd37..0000000000000 --- a/docs/getting_started/installation/cpu/build.inc.md +++ /dev/null @@ -1,44 +0,0 @@ -# --8<-- [start:extra-information] - -First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: - -```bash -sudo apt-get update -y -sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof -sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 -``` - -Second, clone the vLLM project: - -```bash -git clone https://github.com/vllm-project/vllm.git vllm_source -cd vllm_source -``` - -Third, install required dependencies: - -```bash -uv pip install -r requirements/cpu-build.txt --torch-backend cpu -uv pip install -r requirements/cpu.txt --torch-backend cpu -``` - -??? console "pip" - ```bash - pip install --upgrade pip - pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu - pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu - ``` - -Finally, build and install vLLM: - -```bash -VLLM_TARGET_DEVICE=cpu python setup.py install -``` - -If you want to develop vLLM, install it in editable mode instead. 
- -```bash -VLLM_TARGET_DEVICE=cpu python setup.py develop -``` - -# --8<-- [end:extra-information] diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 49e1f6fac7151..1cba21cf5f6d9 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -194,8 +194,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep api_key=openai_api_key, base_url=openai_api_base, ) - completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", - prompt="San Francisco is a") + completion = client.completions.create( + model="Qwen/Qwen2.5-1.5B-Instruct", + prompt="San Francisco is a", + ) print("Completion result:", completion) ``` @@ -239,7 +241,7 @@ Alternatively, you can use the `openai` Python package: messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Tell me a joke."}, - ] + ], ) print("Chat response:", chat_response) ``` diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index ecd71ee1f3f66..a4da5b933e159 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -22,6 +22,11 @@ sys.modules["vllm._C"] = MagicMock() class PydanticMagicMock(MagicMock): """`MagicMock` that's able to generate pydantic-core schemas.""" + def __init__(self, *args, **kwargs): + name = kwargs.pop("name", None) + super().__init__(*args, **kwargs) + self.__spec__ = importlib.machinery.ModuleSpec(name, None) + def __get_pydantic_core_schema__(self, source_type, handler): return core_schema.any_schema() @@ -42,7 +47,9 @@ def auto_mock(module, attr, max_mocks=50): raise e except ModuleNotFoundError as e: logger.info("Mocking %s for argparse doc generation", e.name) - sys.modules[e.name] = PydanticMagicMock() + sys.modules[e.name] = PydanticMagicMock(name=e.name) + except Exception as e: + logger.warning("Failed to import %s.%s: %s", module, attr, e) raise ImportError( f"Failed to import {module}.{attr} after mocking {max_mocks} imports" diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md index f70ab0c6f4e5c..3df80d5af6c4d 100644 --- a/docs/models/extensions/tensorizer.md +++ b/docs/models/extensions/tensorizer.md @@ -60,7 +60,7 @@ from vllm import LLM llm = LLM( "s3://my-bucket/vllm/facebook/opt-125m/v1", load_format="tensorizer", - enable_lora=True + enable_lora=True, ) ``` @@ -97,6 +97,6 @@ llm = LLM( "s3://my-bucket/vllm/facebook/opt-125m/v1", load_format="tensorizer", enable_lora=True, - model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}} + model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}}, ) ``` diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md index 05f8d16cc4ca7..9ea32ed616457 100644 --- a/docs/models/generative_models.md +++ b/docs/models/generative_models.md @@ -98,15 +98,15 @@ and automatically applies the model's [chat template](https://huggingface.co/doc conversation = [ { "role": "system", - "content": "You are a helpful assistant" + "content": "You are a helpful assistant", }, { "role": "user", - "content": "Hello" + "content": "Hello", }, { "role": "assistant", - "content": "Hello! How can I assist you today?" + "content": "Hello! 
How can I assist you today?", }, { "role": "user", diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 50982d3d0d0f3..45bfba2cbf594 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -130,8 +130,10 @@ It is designed for embedding models and cross-encoder models. Embedding models u from vllm import LLM llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling") -(output,) = llm.score("What is the capital of France?", - "The capital of Brazil is Brasilia.") +(output,) = llm.score( + "What is the capital of France?", + "The capital of Brazil is Brasilia.", +) score = output.outputs.score print(f"Score: {score}") @@ -209,7 +211,7 @@ For models that support Matryoshka Embeddings but not recognized by vLLM, please Here is an example to serve a model with Matryoshka Embeddings enabled. -```text +```bash vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}' ``` @@ -220,11 +222,15 @@ You can change the output dimensions of embedding models that support Matryoshka ```python from vllm import LLM, PoolingParams -llm = LLM(model="jinaai/jina-embeddings-v3", - runner="pooling", - trust_remote_code=True) -outputs = llm.embed(["Follow the white rabbit."], - pooling_params=PoolingParams(dimensions=32)) +llm = LLM( + model="jinaai/jina-embeddings-v3", + runner="pooling", + trust_remote_code=True, +) +outputs = llm.embed( + ["Follow the white rabbit."], + pooling_params=PoolingParams(dimensions=32), +) print(outputs[0].outputs) ``` @@ -234,13 +240,13 @@ A code example can be found here: ```python import os -os.environ['http_proxy'] = 'http://your.proxy.server:port' -os.environ['https_proxy'] = 'http://your.proxy.server:port' +os.environ["http_proxy"] = "http://your.proxy.server:port" +os.environ["https_proxy"] = "http://your.proxy.server:port" ``` ### ModelScope diff --git a/docs/serving/context_parallel_deployment.md b/docs/serving/context_parallel_deployment.md new file mode 100644 index 0000000000000..dacdf312ee55b --- /dev/null +++ b/docs/serving/context_parallel_deployment.md @@ -0,0 +1,47 @@ +# Context Parallel Deployment + +Context parallel mainly solves the problem of serving long context requests. As prefill and decode present quite different characteristics and have quite different SLO (service level objectives), we need to implement context parallel separately for them. The major considerations are: + +- For long context prefill, we need to control the TTFT (time to first token) by amortizing the computation time of the prefill across query tokens. +- For long context decode, we need more space for KV cache to increase the batchsize (and hence the throughput). + +## Prefill Context Parallel + +During prefill, for a long request with `T` new tokens, we need to compute query/key/value tensors for these new tokens. Say we have `N` GPUs, we can split the request into `N` chunks, and each GPU computes one chunk of the query/key/value tensors. + +Depending on the use case, there're two possible strategies: + +1. Partial query, full key/value: If the request token length is moderately long (we can afford holding the full key/value tensors), and the goal is to accelerate the prefill (and amortize the computation time of the prefill across query tokens), then we can gather the key/value tensors from all GPUs and let each GPU compute the attention output corresponding to the query tokens of its chunk. +2. 
Partial query, partial key/value: If the request token length is too long and we cannot afford to hold the full key/value tensors anymore, then we can only compute one chunk of query/key/value tensors for each GPU, and use techniques like [ring-attention](http://arxiv.org/abs/2310.01889) to send/recv key/value tensors chunk by chunk. + +Both approaches are under active development. + +## Decode Context Parallel + +Due to the auto-regressive nature of decoding, every decoding step needs to compute a small number of query tokens w.r.t. a large number of key/value tokens stored in the paged KV cache. The core of decode context parallel is how to shard the KV cache across GPUs. + +For a model with `H` kv-heads, a request with `T` tokens in the context needs to store `H * T` key/value tensors in the KV cache. + +1. If one GPU can hold them all, and the performance is good enough, then no parallelization is needed. +2. If one GPU cannot hold them all, or we want to hold more requests in the KV cache, we can first shard the KV cache along the `H` dimension; that's the plain tensor parallel sharding. It's as simple as adding `-tp <size>` to the command line. +3. Since `H` is limited (determined by the model architecture), when we continue to increase the tensor parallel size, the KV cache for each GPU will be duplicated `tp_size / H` times. Of course, duplication is not good for efficiency. Then we need to add decode context parallel to further shard the KV cache along the `T` dimension. This is as simple as adding `-dcp <size>` to the command line. Note that `size` does not increase the number of GPUs we need to launch, but just reduces the KV cache duplication. The dcp size should lie in the range of `[1, tp_size/H]`. With larger dcp size, the KV cache duplication is reduced, but the communication overhead increases. + +Theoretically, it is possible to extend the dcp size beyond `tp_size / H` to further shard the KV cache and accelerate the decoding phase. However, since the number of query tokens is limited in decoding, it's unclear what we should do for the remaining `dcp_size - tp_size / H` GPUs for non-attention layers. For the sake of simplicity, dcp size is upper bounded by `tp_size / H`. If you want to further accelerate the decoding phase, you can consider increasing the `tp_size` first, and then increasing the dcp size. + +Note that the KV cache can grow during decoding, and the sharding strategy needs to be carefully implemented. We use an interleaving strategy to shard the KV cache along the `T` dimension, so that the KV cache for future tokens can be naturally sharded along the `T` dimension. This is proposed by [Chao Hong from Moonshot](https://github.com/youzhedian), and also explained in detail in [this paper](http://arxiv.org/abs/2507.07120). + +Case study: + +For DeepSeek-R1, we have 1 kv-head when MLA is enabled. The typical single-node deployment with `-tp 8` causes 8x KV cache duplication. We can consider adding `-dcp 8` to reduce the KV cache duplication. + +For Kimi-K2, the architecture is similar to DeepSeek-R1, but with more parameters. When we deploy it with `-tp 16`, the KV cache duplication is 16x. We can add `-dcp 16` to completely remove the KV cache duplication, at the cost of more communication overhead. We can also add `-dcp 8` to reduce the KV cache duplication to 2x. Although it still duplicates the KV cache twice, the communication overhead is smaller since the DCP communication only happens inside one node. + +For Qwen3-235B-A22B, we have 4 kv-heads. 
When we deploy it with `-tp 8`, the KV cache duplication is 2x. Then we can add `-dcp 2` to remove the KV cache duplication. + +In short, for decode context parallel, try to increase `-tp` size until you get satisfactory performance, and then add `-dcp` to reduce the KV cache duplication. + +Decode context parallel is supported in vLLM, for both MLA and GQA models. Some attention backends also support the combination of decode context parallel and MTP (multi-token prediction) to further accelerate the decoding phase. + +## Technical Discussions + +The main discussions happen in the `#sig-context-parallel` channel of [vLLM Slack](https://slack.vllm.ai/). diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md index cd6515dde75ef..f1dfb05ea5d45 100644 --- a/docs/serving/expert_parallel_deployment.md +++ b/docs/serving/expert_parallel_deployment.md @@ -243,10 +243,10 @@ try: "remote_engine_id": None, # Will be populated by vLLM "remote_block_ids": None, # Will be populated by vLLM "remote_host": None, # Will be populated by vLLM - "remote_port": None # Will be populated by vLLM + "remote_port": None, # Will be populated by vLLM } }, - extra_headers={"X-Request-Id": request_id} + extra_headers={"X-Request-Id": request_id}, ) print("-" * 50) @@ -262,7 +262,7 @@ try: extra_body={ "kv_transfer_params": prefill_response.kv_transfer_params # Pass KV cache info }, - extra_headers={"X-Request-Id": request_id} # Same request ID + extra_headers={"X-Request-Id": request_id}, # Same request ID ) print("-" * 50) diff --git a/docs/serving/integrations/langchain.md b/docs/serving/integrations/langchain.md index 47074f411ac99..192a61ea5b903 100644 --- a/docs/serving/integrations/langchain.md +++ b/docs/serving/integrations/langchain.md @@ -15,13 +15,15 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain` ```python from langchain_community.llms import VLLM - llm = VLLM(model="mosaicml/mpt-7b", - trust_remote_code=True, # mandatory for hf models - max_new_tokens=128, - top_k=10, - top_p=0.95, - temperature=0.8, - # tensor_parallel_size=... # for distributed inference + llm = VLLM( + model="mosaicml/mpt-7b", + trust_remote_code=True, # mandatory for hf models + max_new_tokens=128, + top_k=10, + top_p=0.95, + temperature=0.8, + # for distributed inference + # tensor_parallel_size=..., ) print(llm("What is the capital of France ?")) diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index fe0e1e3df378b..215c7bf0ced3c 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -24,8 +24,8 @@ To call the server, in your preferred text editor, create a script that uses an completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", messages=[ - {"role": "user", "content": "Hello!"} - ] + {"role": "user", "content": "Hello!"}, + ], ) print(completion.choices[0].message) @@ -101,8 +101,13 @@ both a `type` and a `text` field. 
An example is provided below: completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", messages=[ - {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]} - ] + { + "role": "user", + "content": [ + {"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}, + ], + }, + ], ) ``` @@ -130,11 +135,11 @@ Or directly merge them into the JSON payload if you are using HTTP call directly completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}, ], extra_body={ - "structured_outputs": {"choice": ["positive", "negative"]} - } + "structured_outputs": {"choice": ["positive", "negative"]}, + }, ) ``` @@ -149,11 +154,11 @@ with `--enable-request-id-headers`. completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}, ], extra_headers={ "x-request-id": "sentiment-classification-00001", - } + }, ) print(completion._request_id) @@ -162,7 +167,7 @@ with `--enable-request-id-headers`. prompt="A robot may not injure a human being", extra_headers={ "x-request-id": "completion-test", - } + }, ) print(completion._request_id) ``` @@ -403,7 +408,7 @@ The Transcriptions API supports uploading audio files in various formats includi model="openai/whisper-large-v3-turbo", file=audio_file, language="en", - response_format="verbose_json" + response_format="verbose_json", ) print(transcription.text) @@ -812,22 +817,22 @@ You can pass multi-modal inputs to scoring models by passing `content` including "model": "jinaai/jina-reranker-m0", "text_1": "slm markdown", "text_2": { - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" - }, - }, - { - "type": "image_url", - "image_url": { - "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" - }, - }, - ] - } + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" + }, + }, + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" + }, + }, + ], }, + }, ) response.raise_for_status() response_json = response.json() diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index 0076d4d30ee8e..a3e671a0f4cca 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -95,7 +95,7 @@ def parse_args(): parser.add_argument( "--compilation-config", type=int, - help=("Compilation optimization (O) level 0-3."), + help=("Compilation optimization (O) mode 0-3."), ) parser.add_argument( "--quantization", diff --git a/examples/offline_inference/openai_batch/README.md b/examples/offline_inference/openai_batch/README.md index 3c6f6c7a6c588..7d5a1af8f5a4a 100644 --- a/examples/offline_inference/openai_batch/README.md +++ b/examples/offline_inference/openai_batch/README.md @@ -152,7 +152,9 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_ """ try: 
url = s3_client.generate_presigned_url( - ClientMethod=client_method, Params=method_parameters, ExpiresIn=expires_in + ClientMethod=client_method, + Params=method_parameters, + ExpiresIn=expires_in, ) except ClientError: raise @@ -161,10 +163,16 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_ s3_client = boto3.client("s3") input_url = generate_presigned_url( - s3_client, "get_object", {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, 3600 + s3_client, + "get_object", + {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, + expires_in=3600, ) output_url = generate_presigned_url( - s3_client, "put_object", {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, 3600 + s3_client, + "put_object", + {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, + expires_in=3600, ) print(f"{input_url=}") print(f"{output_url=}") diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md index 79afbd9cfac47..7c535e91afac8 100644 --- a/examples/offline_inference/pooling/README.md +++ b/examples/offline_inference/pooling/README.md @@ -26,6 +26,12 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py python examples/offline_inference/pooling/embed_matryoshka_fy.py ``` +## Multi vector retrieval usage + +```bash +python examples/offline_inference/pooling/multi_vector_retrieval.py +``` + ## Named Entity Recognition (NER) usage ```bash diff --git a/examples/offline_inference/pooling/multi_vector_retrieval.py b/examples/offline_inference/pooling/multi_vector_retrieval.py new file mode 100644 index 0000000000000..8b8892117d378 --- /dev/null +++ b/examples/offline_inference/pooling/multi_vector_retrieval.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from argparse import Namespace + +from vllm import LLM, EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def parse_args(): + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults( + model="BAAI/bge-m3", + runner="pooling", + enforce_eager=True, + ) + return parser.parse_args() + + +def main(args: Namespace): + # Sample prompts. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + # Create an LLM. + # You should pass runner="pooling" for embedding models + llm = LLM(**vars(args)) + + # Generate embedding. The output is a list of EmbeddingRequestOutputs. + outputs = llm.embed(prompts) + + # Print the outputs. + print("\nGenerated Outputs:\n" + "-" * 60) + for prompt, output in zip(prompts, outputs): + embeds = output.outputs.embedding + print(len(embeds)) + + # Generate embedding for each token. The output is a list of PoolingRequestOutput. + outputs = llm.encode(prompts, pooling_task="token_embed") + + # Print the outputs. 
+ print("\nGenerated Outputs:\n" + "-" * 60) + for prompt, output in zip(prompts, outputs): + multi_vector = output.outputs.data + print(multi_vector.shape) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py index 418c40645f9f2..6c47b57154386 100644 --- a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py +++ b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py @@ -40,7 +40,7 @@ def main(): model_impl="terratorch", ) - pooling_params = PoolingParams(task="encode", softmax=False) + pooling_params = PoolingParams(task="token_classify", activation=False) pooler_output = llm.encode( img_prompt, pooling_params=pooling_params, diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index ac4e40221edf1..91345e0ae7785 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -18,6 +18,12 @@ python examples/online_serving/pooling/embedding_embed_dtype_client.py python examples/online_serving/pooling/jinaai_rerank_client.py ``` +## Multi vector retrieval usage + +```bash +python examples/online_serving/pooling/multi_vector_retrieval_client.py +``` + ## Named Entity Recognition (NER) usage ```bash diff --git a/examples/online_serving/pooling/multi_vector_retrieval_client.py b/examples/online_serving/pooling/multi_vector_retrieval_client.py new file mode 100644 index 0000000000000..ef8c4745aa531 --- /dev/null +++ b/examples/online_serving/pooling/multi_vector_retrieval_client.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Example online usage of Pooling API for multi vector retrieval. + +Run `vllm serve --runner pooling` +to start up the server in vLLM. e.g. 
+ +vllm serve BAAI/bge-m3 +""" + +import argparse + +import requests +import torch + + +def post_http_request(prompt: dict, api_url: str) -> requests.Response: + headers = {"User-Agent": "Test Client"} + response = requests.post(api_url, headers=headers, json=prompt) + return response + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--model", type=str, default="BAAI/bge-m3") + + return parser.parse_args() + + +def main(args): + api_url = f"http://{args.host}:{args.port}/pooling" + model_name = args.model + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + prompt = {"model": model_name, "input": prompts} + + pooling_response = post_http_request(prompt=prompt, api_url=api_url) + for output in pooling_response.json()["data"]: + multi_vector = torch.tensor(output["data"]) + print(multi_vector.shape) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py index acbfd8cda489a..2601c9eff971b 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -84,7 +84,7 @@ directly to load models: from vllm import LLM llm = LLM( "s3://my-bucket/vllm/facebook/opt-125m/v1", - load_format="tensorizer" + load_format="tensorizer", ) ``` diff --git a/pyproject.toml b/pyproject.toml index 95dda76063bc1..eb9bdb593baac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,7 +107,6 @@ markers = [ "distributed: run this test only in distributed GPU tests", "skip_v1: do not run this test with v1", "optional: optional tests that are automatically skipped, include --optional to run them", - "extra_server_args: extra arguments to pass to the server fixture", ] [tool.ty.src] diff --git a/requirements/common.txt b/requirements/common.txt index ec668e16d0e97..5e7769561c4f4 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -7,7 +7,7 @@ requests >= 2.26.0 tqdm blake3 py-cpuinfo -transformers >= 4.55.2 +transformers >= 4.56.0 tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. 
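The multi vector retrieval examples added above only print the shape of the per-token embeddings (`pooling_task="token_embed"` offline, or the `/pooling` endpoint online). A natural follow-up is turning those token-level vectors into a relevance score. The snippet below is a minimal, illustrative sketch of ColBERT-style late-interaction (MaxSim) scoring; the `maxsim_score` helper is hypothetical and not part of the example scripts or the vLLM API.

```python
import torch


def maxsim_score(query_vectors: torch.Tensor, doc_vectors: torch.Tensor) -> float:
    """Hypothetical late-interaction (MaxSim) score over per-token embeddings.

    query_vectors: (num_query_tokens, dim); doc_vectors: (num_doc_tokens, dim),
    e.g. the tensors built from `output.outputs.data` in the offline example or
    `torch.tensor(output["data"])` in the online client.
    """
    q = torch.nn.functional.normalize(query_vectors, dim=-1)
    d = torch.nn.functional.normalize(doc_vectors, dim=-1)
    # For every query token, keep its best-matching document token, then sum.
    return (q @ d.T).max(dim=-1).values.sum().item()
```

Under these assumptions, ranking a set of documents against a query reduces to computing `maxsim_score` for each query/document pair and sorting by the result.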
diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py index 84194f3ed01e8..e01b58220959f 100644 --- a/tests/compile/piecewise/test_full_cudagraph.py +++ b/tests/compile/piecewise/test_full_cudagraph.py @@ -11,6 +11,7 @@ from tests.v1.attention.utils import full_cg_backend_configs as backend_configs from vllm import LLM, SamplingParams from vllm.config import CompilationConfig from vllm.platforms import current_platform +from vllm.utils import is_torch_equal_or_newer @contextlib.contextmanager @@ -32,13 +33,13 @@ def temporary_environ(env_vars): os.environ[k] = v -test_params_full_cudagraph = [] +model_backends_full_cudagraph = [] # deepseek-ai/DeepSeek-V2-Lite with MLA MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"] for mla_backend in MLA_backends: - test_params_full_cudagraph.append( - pytest.param(("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend])) + model_backends_full_cudagraph.append( + ("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend]) ) # Qwen/Qwen2-1.5B-Instruct with other backends @@ -46,14 +47,18 @@ other_backend_configs = [ backend_configs[c] for c in backend_configs if c not in MLA_backends ] for backend_config in other_backend_configs: - test_params_full_cudagraph.append( - pytest.param(("Qwen/Qwen2-1.5B-Instruct", backend_config)) - ) + model_backends_full_cudagraph.append(("Qwen/Qwen2-1.5B-Instruct", backend_config)) @pytest.fixture(scope="class") def llm_pair(request): - model, backend_config = request.param + model, backend_config, use_inductor_graph_partition = request.param + backend_config.comp_config["use_inductor_graph_partition"] = ( + use_inductor_graph_partition + ) + + if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("Inductor graph partition only supported in torch>=2.9") # Dynamically skip test if GPU capability is not met if ( @@ -104,7 +109,15 @@ def llm_pair(request): ) -@pytest.mark.parametrize("llm_pair", test_params_full_cudagraph, indirect=True) +@pytest.mark.parametrize( + "llm_pair", + [ + pytest.param((model, backend_config, use_inductor_graph_partition)) + for model, backend_config in model_backends_full_cudagraph + for use_inductor_graph_partition in [True, False] + ], + indirect=True, +) class TestFullCUDAGraph: """ Use a class such that an llm pair is constructed once for all diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py index d88645e3bfd62..246239b87d5fe 100644 --- a/tests/compile/piecewise/test_multiple_graphs.py +++ b/tests/compile/piecewise/test_multiple_graphs.py @@ -5,6 +5,7 @@ Test (piecewise) compilation with a simple model where multiple submodules are compiled and graph captured separately. """ +import pytest import torch from torch import nn @@ -13,12 +14,13 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, ) from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import is_torch_equal_or_newer # This import automatically registers `torch.ops.silly.attention` from .. 
import silly_attention # noqa: F401 @@ -190,16 +192,21 @@ def run_model( return output.cpu() -def test_multi_graph_piecewise_compile_outputs_equal(): +@pytest.mark.parametrize("use_inductor_graph_partition", [False, True]) +def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): + if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available in PyTorch 2.9+") + outputs = [] - # piecewise compile + # vllmcompile compile vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], + use_inductor_graph_partition=use_inductor_graph_partition, ) ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE @@ -220,23 +227,31 @@ def test_multi_graph_piecewise_compile_outputs_equal(): # static tensor addresses inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda() - with compilation_counter.expect( - num_graphs_seen=2, # two graphs for the model - num_piecewise_graphs_seen=6, + if use_inductor_graph_partition: + # Splitting happens at Inductor lowering level, + # total piecewise fx graphs is equal to total graphs + num_piecewise_fx = 2 + num_piecewise_capturable_fx = 2 + else: # attn_one, attn_two each has 3 piecewise graphs # (pre attn, post attn, silly_attention) each - num_piecewise_capturable_graphs_seen=4, + num_piecewise_fx = 6 # attn_one, attn_two has pre attn and post attn each, total=4 - num_backend_compilations=4, # num_piecewise_capturable_graphs_seen - num_cudagraph_captured=8, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_piecewise_capturable_fx = 4 + + with compilation_counter.expect( + num_graphs_seen=2, # two graphs for the model + num_piecewise_graphs_seen=num_piecewise_fx, + num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx, + num_backend_compilations=num_piecewise_capturable_fx, + num_cudagraph_captured=8, # num_cudagraph_sizes * num_partitions ): outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) # no compile or cudagraph vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.NO_COMPILATION, + mode=CompilationMode.NONE, ) ) cudagraph_runtime_mode = CUDAGraphMode.NONE @@ -265,9 +280,10 @@ def test_multi_graph_piecewise_compile_outputs_equal(): # piecewise compile without CUDA graph vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=False, splitting_ops=["silly::attention"], + use_inductor_graph_partition=use_inductor_graph_partition, ) ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE @@ -286,9 +302,9 @@ def test_multi_graph_piecewise_compile_outputs_equal(): with compilation_counter.expect( num_graphs_seen=2, - num_piecewise_graphs_seen=6, - num_piecewise_capturable_graphs_seen=4, - num_backend_compilations=4, + num_piecewise_graphs_seen=num_piecewise_fx, + num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx, + num_backend_compilations=num_piecewise_capturable_fx, num_cudagraph_captured=0, # no cudagraph captured ): outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index bc65e3da0ae74..f61a0a4eb740d 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -13,7 +13,7 @@ 
from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, @@ -61,7 +61,7 @@ def _run_simple_model( ): vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, use_inductor=use_inductor, splitting_ops=splitting_ops, diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index eaf0a15479e97..75a89d692fa8f 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -9,6 +9,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are initialized randomly with a fixed seed. """ +from copy import deepcopy from dataclasses import dataclass from typing import Any @@ -20,12 +21,13 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, ) from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import is_torch_equal_or_newer # This import automatically registers `torch.ops.silly.attention` from .. import silly_attention # noqa: F401 @@ -257,27 +259,13 @@ def tractable_computation( @torch.inference_mode -def run_model( - llama_config, use_compile: bool, backend: str, split_attn: bool = False -) -> torch.Tensor: - if use_compile: - compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, - use_cudagraph=True, - backend=backend, - cudagraph_capture_sizes=[1, 2], - ) - if split_attn: - compilation_config.splitting_ops = ["silly::attention"] - cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE - else: - compilation_config = CompilationConfig( - level=CompilationLevel.NO_COMPILATION, - ) - cudagraph_runtime_mode = CUDAGraphMode.NONE +def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor: + # Start with a fresh copy to make sure there's no cache dir sharing + compile_config = deepcopy(compile_config) + cudagraph_runtime_mode = compile_config.cudagraph_mode vllm_config = VllmConfig( - compilation_config=compilation_config, additional_config=llama_config + compilation_config=compile_config, additional_config=llama_config ) with set_current_vllm_config(vllm_config): model = ( @@ -338,8 +326,25 @@ def run_model( return output.cpu() -@pytest.mark.parametrize("backend", ["inductor", "eager"]) -def test_toy_llama(backend: str): +@pytest.mark.parametrize( + "backend, use_inductor_graph_partition", + [ + ("eager", False), # No inductor + ("inductor", False), # Inductor, Dynamo partition + ("inductor", True), # Inductor, Inductor partition + ], +) +def test_toy_llama( + backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path +): + # We disable the vLLM compile cache into a new tmp dir for 2 reasons: + # 1. To make sure we can properly track the number of Inductor compilations. + # 2. 
Inductor partitioning does not play nicely with Autograd cache (below) + monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") + + if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("Inductor graph partition only supported in torch>=2.9") + # compare output with and without piecewise compilation llama_config = LlamaConfig( @@ -350,6 +355,32 @@ def test_toy_llama(backend: str): hidden_size=128, mlp_size=256, vocab_size=128, num_layers=2, tractable_init=True ) + compile_config_no_compile = CompilationConfig( + level=CompilationMode.NONE, + cudagraph_mode=CUDAGraphMode.NONE, + backend="eager", + ) + + compile_config_no_split = CompilationConfig( + level=CompilationMode.VLLM_COMPILE, + use_inductor_graph_partition=use_inductor_graph_partition, + cudagraph_mode=CUDAGraphMode.PIECEWISE, + backend=backend, + cudagraph_capture_sizes=[1, 2], + ) + + # FIXME(luka/boyuan): the graph from the previous test case + # (no inductor partition) gets cached by AotAutograd so then the + # compilation with inductor partitioning incorrectly loads an unpartitioned + # graph and never partitions. I think this is a bug with custom inductor + # partitioning but does not affect vLLM more generally as vLLM uses its own + # cache (which takes inductor partitioning into account). + if use_inductor_graph_partition: + compile_config_no_split.inductor_compile_config["force_disable_caches"] = True + + compile_config_split = deepcopy(compile_config_no_split) + compile_config_split.splitting_ops = ["silly::attention"] + outputs = [] with compilation_counter.expect( num_graphs_seen=0, @@ -358,8 +389,9 @@ def test_toy_llama(backend: str): num_backend_compilations=0, num_cudagraph_captured=0, ): - outputs.append(run_model(llama_config, backend="eager", use_compile=False)) - run_model(tractable_config, backend="eager", use_compile=False) + outputs.append(run_model(llama_config, compile_config_no_compile)) + + run_model(tractable_config, compile_config_no_compile) if backend == "inductor": kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0} @@ -367,35 +399,34 @@ def test_toy_llama(backend: str): kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0} with compilation_counter.expect( - # One graph for the model - num_graphs_seen=1, + num_graphs_seen=1, # one graph for the model num_piecewise_graphs_seen=1, num_piecewise_capturable_graphs_seen=1, - # num_piecewise_capturable_graphs_seen - num_backend_compilations=1, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_backend_compilations=1, # num_piecewise_capturable_graphs_seen num_cudagraph_captured=2, **kwargs, ): - outputs.append(run_model(llama_config, backend=backend, use_compile=True)) - run_model(tractable_config, backend=backend, use_compile=True) + outputs.append(run_model(llama_config, compile_config_no_split)) + + run_model(tractable_config, compile_config_no_split) + + if use_inductor_graph_partition: + num_piecewise_fx = 1 + num_piecewise_capturable_fx = 1 + else: + num_piecewise_fx = 2 * llama_config.num_layers + 1 + num_piecewise_capturable_fx = 1 + llama_config.num_layers with compilation_counter.expect( num_graphs_seen=1, # one graph for the model - num_piecewise_graphs_seen=2 * llama_config.num_layers + 1, # 2 * num_layers + 1 - num_piecewise_capturable_graphs_seen=1 - + llama_config.num_layers, # 1 + num_layers - num_backend_compilations=1 - + llama_config.num_layers, # num_piecewise_capturable_graphs_seen - num_cudagraph_captured=2 - * ( - 1 + llama_config.num_layers - ), # 
num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_piecewise_graphs_seen=num_piecewise_fx, + num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx, + num_backend_compilations=num_piecewise_capturable_fx, + # num_cudagraph_sizes * num_partitions + num_cudagraph_captured=2 * (1 + llama_config.num_layers), ): - outputs.append( - run_model(llama_config, backend=backend, use_compile=True, split_attn=True) - ) - run_model(tractable_config, backend=backend, use_compile=True, split_attn=True) + outputs.append(run_model(llama_config, compile_config_split)) + run_model(tractable_config, compile_config_split) for i in range(1, len(outputs)): assert torch.allclose(outputs[0], outputs[i]) @@ -427,14 +458,14 @@ def benchmark(): for piecewise in [False, True]: if piecewise: compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=cudagraph_sizes, ) else: compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, cudagraph_capture_sizes=cudagraph_sizes, ) diff --git a/tests/compile/silly_attention.py b/tests/compile/silly_attention.py index c0d3f908149f6..f33c5772906a6 100644 --- a/tests/compile/silly_attention.py +++ b/tests/compile/silly_attention.py @@ -62,5 +62,4 @@ direct_register_custom_op( mutates_args=["out"], fake_impl=silly_attention_fake, target_lib=silly_lib, - tags=(torch._C.Tag.cudagraph_unsafe,), ) diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py index 08f79d90cd367..1701d85fe84e7 100644 --- a/tests/compile/test_aot_compile.py +++ b/tests/compile/test_aot_compile.py @@ -10,7 +10,7 @@ import torch from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, VllmConfig, set_current_vllm_config, ) @@ -38,7 +38,7 @@ class CompiledMod(torch.nn.Module): def make_vllm_config() -> VllmConfig: return VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, ) ) diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py index 102a929bf2409..60856f5a58067 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/test_async_tp.py @@ -10,6 +10,7 @@ import vllm.envs as envs from vllm.compilation.collective_fusion import AsyncTPPass from vllm.config import ( CompilationConfig, + CompilationMode, DeviceConfig, ModelConfig, PassConfig, @@ -400,7 +401,7 @@ def test_async_tp_pass_correctness( common_args.append("--enforce-eager") compilation_config = { - "level": 3, + "mode": CompilationMode.VLLM_COMPILE, "compile_sizes": [2, 4, 8], "splitting_ops": [], "pass_config": {"enable_async_tp": async_tp_enabled}, diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index ab6a17e149fcd..954774a8e3983 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -4,7 +4,7 @@ import dataclasses import pytest -from vllm.config import CompilationLevel +from vllm.config import CompilationMode from vllm.utils import cuda_device_count_stateless from ..utils import compare_all_settings @@ -21,7 +21,7 @@ class TestSetting: # we cannot afford testing the full Cartesian product -# of all models and all levels +# of all models and all modes @pytest.mark.parametrize( "test_setting", [ @@ -121,15 +121,13 @@ def 
test_compile_correctness( all_args: list[list[str]] = [] all_envs: list[dict[str, str] | None] = [] - for comp_level in [ - CompilationLevel.DYNAMO_AS_IS, - CompilationLevel.DYNAMO_ONCE, - CompilationLevel.PIECEWISE, + for comp_mode in [ + CompilationMode.STOCK_TORCH_COMPILE, + CompilationMode.DYNAMO_TRACE_ONCE, + CompilationMode.VLLM_COMPILE, ]: - for level in [CompilationLevel.NO_COMPILATION, comp_level]: - all_args.append( - final_args + [f"-O.level={level}", "-O.backend=inductor"] - ) + for mode in [CompilationMode.NONE, comp_mode]: + all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=inductor"]) # inductor will change the output, so we only compare if the output # is close, not exactly the same. @@ -142,13 +140,13 @@ def test_compile_correctness( all_envs.clear() all_args.clear() - for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.DYNAMO_AS_IS, - CompilationLevel.DYNAMO_ONCE, - CompilationLevel.PIECEWISE, + for mode in [ + CompilationMode.NONE, + CompilationMode.STOCK_TORCH_COMPILE, + CompilationMode.DYNAMO_TRACE_ONCE, + CompilationMode.VLLM_COMPILE, ]: - all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"]) + all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=eager"]) all_envs.append({}) all_envs.append({}) diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index ae8b0b226c313..7f51c763da73c 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -4,7 +4,7 @@ import pytest from vllm.compilation.counter import compilation_counter from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig -from vllm.config.compilation import CompilationLevel +from vllm.config.compilation import CompilationMode from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer @@ -90,16 +90,16 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled): # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 @pytest.mark.forked -def test_dynamo_as_is(vllm_runner, monkeypatch): +def test_stock_torch_compile(vllm_runner, monkeypatch): # Disable multiprocessing so that the counter is in the same process monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") with ( - compilation_counter.expect(dynamo_as_is_count=1), + compilation_counter.expect(stock_torch_compile_count=1), # loading the model causes compilation (if enabled) to happen vllm_runner( "facebook/opt-125m", - compilation_config={"level": 1}, + compilation_config={"mode": CompilationMode.STOCK_TORCH_COMPILE}, gpu_memory_utilization=0.4, ) as _, ): @@ -112,11 +112,11 @@ def test_no_compilation(vllm_runner, monkeypatch): # Disable multiprocessing so that the counter is in the same process monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") with ( - compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0), + compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0), # loading the model causes compilation (if enabled) to happen vllm_runner( "facebook/opt-125m", - compilation_config={"level": 0}, + compilation_config={"mode": CompilationMode.NONE}, gpu_memory_utilization=0.4, ) as _, ): @@ -130,7 +130,7 @@ def test_enforce_eager(vllm_runner, monkeypatch): monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") with ( - compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0), + compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0), # loading the model causes compilation (if enabled) to happen vllm_runner( "facebook/opt-125m", 
enforce_eager=True, gpu_memory_utilization=0.4 @@ -151,7 +151,7 @@ def test_splitting_ops_dynamic(): if is_torch_equal_or_newer("2.9.0.dev"): config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=True, splitting_ops=["vllm::unified_attention"], ) @@ -163,7 +163,7 @@ def test_splitting_ops_dynamic(): # When attn_fusion pass enabled, splitting_ops now default to attention ops. config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, pass_config={"enable_attn_fusion": True, "enable_noop": True}, custom_ops=["+quant_fp8"], cudagraph_mode=CUDAGraphMode.PIECEWISE, @@ -178,7 +178,7 @@ def test_splitting_ops_dynamic(): if is_torch_equal_or_newer("2.9.0.dev"): config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=True, pass_config={"enable_attn_fusion": True, "enable_noop": True}, custom_ops=["+quant_fp8"], diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py index 6b050207ec41b..e459bc539f2b8 100644 --- a/tests/compile/test_decorator.py +++ b/tests/compile/test_decorator.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest import torch from torch import nn @@ -8,12 +9,13 @@ from vllm.compilation.decorators import ignore_torch_compile, support_torch_comp from vllm.config import ( CacheConfig, CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, ) from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import is_torch_equal_or_newer # This import automatically registers `torch.ops.silly.attention` from . 
import silly_attention # noqa: F401 @@ -65,18 +67,40 @@ def run_model( return output.cpu() -def test_ignore_torch_compile_decorator(): +@pytest.mark.parametrize("use_inductor_graph_partition", [True, False]) +def test_ignore_torch_compile_decorator(use_inductor_graph_partition, monkeypatch): + # disable compile cache so that we can count the number of compilations + # appropriately + monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") + + if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available in PyTorch 2.9+") + # piecewise vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], + use_inductor_graph_partition=use_inductor_graph_partition, ) ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE + expected_num_graphs_seen = 1 + expected_num_cudagraph_captured = ( + 4 # num_cudagraph_sizes * num cudagraphs to capture + ) + if use_inductor_graph_partition: + expected_num_piecewise_graphs_seen = 1 + expected_num_piecewise_capturable_graphs_seen = 1 + expected_num_backend_compilations = 1 + else: + expected_num_piecewise_graphs_seen = 3 + expected_num_piecewise_capturable_graphs_seen = 2 + expected_num_backend_compilations = 2 + @support_torch_compile class A(nn.Module): def __init__( @@ -103,12 +127,11 @@ def test_ignore_torch_compile_decorator(): # A has support_torch_compile with compilation_counter.expect( - num_graphs_seen=1, - num_piecewise_graphs_seen=3, - num_piecewise_capturable_graphs_seen=2, - num_backend_compilations=2, - num_cudagraph_captured=4, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_graphs_seen=expected_num_graphs_seen, + num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen, + num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen, + num_backend_compilations=expected_num_backend_compilations, + num_cudagraph_captured=expected_num_cudagraph_captured, ): run_model(vllm_config, mod_A, cudagraph_runtime_mode) @@ -130,12 +153,11 @@ def test_ignore_torch_compile_decorator(): # C's support_torch_compile should override B's ignore_torch_compile with compilation_counter.expect( - num_graphs_seen=1, - num_piecewise_graphs_seen=3, - num_piecewise_capturable_graphs_seen=2, - num_backend_compilations=2, - num_cudagraph_captured=4, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_graphs_seen=expected_num_graphs_seen, + num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen, + num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen, + num_backend_compilations=expected_num_backend_compilations, + num_cudagraph_captured=expected_num_cudagraph_captured, ): run_model(vllm_config, mod_C, cudagraph_runtime_mode) @@ -178,16 +200,25 @@ class A(nn.Module): return x -def test_conditional_compile_enable_if(): +@pytest.mark.parametrize("use_inductor_graph_partition", [True, False]) +def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch): + # disable compile cache so that we can count the number of compilations + # appropriately + monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") + + if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available in PyTorch 2.9+") + vllm_config = VllmConfig( cache_config=CacheConfig( kv_sharing_fast_prefill=True, 
), compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], + use_inductor_graph_partition=use_inductor_graph_partition, ), ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE @@ -195,17 +226,26 @@ def test_conditional_compile_enable_if(): with set_current_vllm_config(vllm_config): mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda() + if use_inductor_graph_partition: + expected_num_piecewise_graphs_seen = 2 + expected_num_piecewise_capturable_graphs_seen = 2 + expected_num_backend_compilations = 2 + else: + expected_num_piecewise_graphs_seen = 6 + expected_num_piecewise_capturable_graphs_seen = 4 + expected_num_backend_compilations = 4 + # A has support_torch_compile but enable_if fn returns False # enalbe_if will be True for B, so we expect mod1 and mod2 # to be compiled with compilation_counter.expect( num_graphs_seen=2, - num_piecewise_graphs_seen=6, + num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen, # 3 piecewise graphs per instance of B() - num_piecewise_capturable_graphs_seen=4, - num_backend_compilations=4, + num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen, + num_backend_compilations=expected_num_backend_compilations, num_cudagraph_captured=8, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + # num_cudagraph_sizes * num cudagraphable graphs to capture ): run_model(vllm_config, mod_A, cudagraph_runtime_mode) @@ -216,23 +256,34 @@ def test_conditional_compile_enable_if(): kv_sharing_fast_prefill=False, ), compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], + use_inductor_graph_partition=use_inductor_graph_partition, ), ) with set_current_vllm_config(vllm_config): mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda() + if use_inductor_graph_partition: + expected_num_piecewise_graphs_seen = 1 + expected_num_piecewise_capturable_graphs_seen = 1 + expected_num_backend_compilations = 1 + else: + # 3 attn ops and 4 non-attn ops + expected_num_piecewise_graphs_seen = 7 + expected_num_piecewise_capturable_graphs_seen = 4 + expected_num_backend_compilations = 4 + with compilation_counter.expect( num_graphs_seen=1, - num_piecewise_graphs_seen=7, + num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen, # 3 attn ops and 4 non-attn ops - num_piecewise_capturable_graphs_seen=4, - num_backend_compilations=4, + num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen, + num_backend_compilations=expected_num_backend_compilations, num_cudagraph_captured=8, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + # num_cudagraph_sizes * num cudagraphable graphs to capture ): run_model(vllm_config, mod_A, cudagraph_runtime_mode) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 2f3794c90b204..2d290771f9ad7 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -12,7 +12,7 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams from vllm.attention.backends.registry import _Backend from vllm.attention.selector import global_force_attn_backend_context_manager -from vllm.config import CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig +from vllm.config import CompilationConfig, 
CompilationMode, CUDAGraphMode, PassConfig from vllm.platforms import current_platform from vllm.utils import is_torch_equal_or_newer @@ -80,22 +80,22 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None): @pytest.mark.parametrize( - "optimization_level", - [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE], + "compilation_mode", + [CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE], ) @pytest.mark.parametrize("model_info", models_list(all=True)) @create_new_process_for_each_test() def test_full_graph( monkeypatch: pytest.MonkeyPatch, model_info: tuple[str, dict[str, Any]], - optimization_level: int, + compilation_mode: int, ): model, model_kwargs = model_info with monkeypatch.context(): print(f"MODEL={model}") - run_model(optimization_level, model, model_kwargs) + run_model(compilation_mode, model, model_kwargs) # TODO(luka) add other supported compilation config scenarios here @@ -104,7 +104,7 @@ def test_full_graph( [ # additional compile sizes, only some of the models ( - CompilationConfig(level=CompilationLevel.PIECEWISE, compile_sizes=[1, 2]), + CompilationConfig(mode=CompilationMode.VLLM_COMPILE, compile_sizes=[1, 2]), model, ) for model in models_list(all=False) @@ -113,7 +113,7 @@ def test_full_graph( # RMSNorm + quant fusion, only 8-bit quant models ( CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm"], pass_config=PassConfig(enable_fusion=True, enable_noop=True), ), @@ -125,7 +125,8 @@ def test_full_graph( # Test depyf integration works ( CompilationConfig( - level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir() + mode=CompilationMode.VLLM_COMPILE, + debug_dump_path=tempfile.gettempdir(), ), ("facebook/opt-125m", {}), ), @@ -134,7 +135,7 @@ def test_full_graph( # graph inductor partition ( CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, # inductor graph partition uses # torch._C.Tag.cudagraph_unsafe to specify splitting ops use_inductor_graph_partition=True, @@ -164,10 +165,10 @@ def test_custom_compile_config( @pytest.mark.parametrize( - "optimization_level", - [CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE], + "compilation_mode", + [CompilationMode.NONE, CompilationMode.VLLM_COMPILE], ) -def test_fp8_kv_scale_compile(optimization_level: int): +def test_fp8_kv_scale_compile(compilation_mode: int): model = "Qwen/Qwen2-0.5B" model_kwargs = { "quantization": "fp8", @@ -175,7 +176,7 @@ def test_fp8_kv_scale_compile(optimization_level: int): "calculate_kv_scales": True, "max_model_len": 512, } - run_model(optimization_level, model, model_kwargs) + run_model(compilation_mode, model, model_kwargs) def test_inductor_graph_partition_attn_fusion(caplog_vllm): @@ -184,7 +185,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm): model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8" compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=True, cudagraph_mode=CUDAGraphMode.PIECEWISE, custom_ops=["+quant_fp8"], diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index 7c22336432299..1a5eaf2639b36 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -13,7 +13,7 @@ from vllm.compilation.fusion import ( ) from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.post_cleanup import PostCleanupPass -from vllm.config import CompilationConfig, 
CompilationLevel, PassConfig, VllmConfig +from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape, @@ -114,7 +114,7 @@ def test_fusion_rmsnorm_quant( vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"], pass_config=PassConfig(enable_fusion=True, enable_noop=True), ) diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py index 455d1bb039057..fbcd6c71fb723 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/test_fusion_all_reduce.py @@ -12,7 +12,7 @@ from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, DeviceConfig, ModelConfig, PassConfig, @@ -219,7 +219,7 @@ def all_reduce_fusion_pass_on_test_model( vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm", "+quant_fp8"] + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"] ) ) vllm_config.compilation_config.pass_config = PassConfig( diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index d1ab85cfb875c..4d6f4b471a3a4 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -19,7 +19,7 @@ from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import ( CacheConfig, CompilationConfig, - CompilationLevel, + CompilationMode, ModelConfig, PassConfig, SchedulerConfig, @@ -321,7 +321,7 @@ def test_attention_quant_pattern( ), scheduler_config=SchedulerConfig(max_num_seqs=1024), compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+quant_fp8"], use_inductor_graph_partition=use_inductor_graph_partition, ), @@ -421,7 +421,9 @@ def test_attention_quant_pattern( ] if any(attn_fusion_supported): # Check quantization ops in the graph before and after fusion - test_backend.check_before_ops([QUANT_OPS[quant_key]], fully_replaced=True) + # Note: fully_replaced=False because query quant ops remain in graph. + # Only output quant ops are fused into attention. 
+ test_backend.check_before_ops([QUANT_OPS[quant_key]], fully_replaced=False) # access the underlying `AttnFusionPass` on the `LazyInitPass` assert attn_pass.pass_.matched_count == sum(attn_fusion_supported) diff --git a/tests/compile/test_noop_elimination.py b/tests/compile/test_noop_elimination.py index 188f4514dda5f..0ccc1a0161629 100644 --- a/tests/compile/test_noop_elimination.py +++ b/tests/compile/test_noop_elimination.py @@ -6,7 +6,7 @@ import torch import vllm from vllm.compilation.noop_elimination import NoOpEliminationPass -from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig +from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig from .backend import TestBackend @@ -50,7 +50,7 @@ def test_noop_elimination(dtype, num_tokens, hidden_size, buffer_size): vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, pass_config=PassConfig(enable_noop=True), ) ) @@ -98,7 +98,7 @@ def test_non_noop_slice_preserved(): vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, pass_config=PassConfig(enable_noop=True), ) ) diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index b2fff822bbbb5..da0afd9eaa49f 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -5,7 +5,7 @@ import torch from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import CompilationLevel +from vllm.config import CompilationMode class MyMod(torch.nn.Module): @@ -20,7 +20,7 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher): self.model = model compiled_callable = torch.compile(self.forward, backend="eager") super().__init__( - compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE + compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE ) def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None): diff --git a/tests/conftest.py b/tests/conftest.py index 2fde7f97836d6..369acb92cfb91 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -334,7 +334,7 @@ class HfRunner: trust_remote_code=trust_remote_code, ) self.device = self.get_default_device() - self.dtype = torch_dtype = _get_and_verify_dtype( + self.dtype = dtype = _get_and_verify_dtype( self.model_name, self.config, dtype=dtype, @@ -342,7 +342,7 @@ class HfRunner: ) model_kwargs = model_kwargs if model_kwargs is not None else {} - model_kwargs.setdefault("torch_dtype", torch_dtype) + model_kwargs.setdefault("dtype", dtype) if is_sentence_transformer: # Lazy init required for AMD CI @@ -388,7 +388,7 @@ class HfRunner: if not skip_tokenizer_init: self.tokenizer = AutoTokenizer.from_pretrained( model_name, - torch_dtype=torch_dtype, + dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -398,7 +398,7 @@ class HfRunner: self.processor = AutoProcessor.from_pretrained( model_name, - torch_dtype=torch_dtype, + dtype=dtype, trust_remote_code=trust_remote_code, ) if skip_tokenizer_init: @@ -1011,8 +1011,12 @@ class VllmRunner: req_outputs = self.llm.embed(inputs, *args, **kwargs) return [req_output.outputs.embedding for req_output in req_outputs] - def encode(self, prompts: list[str]) -> list[list[float]]: - req_outputs = self.llm.encode(prompts) + def token_embed(self, prompts: list[str]) -> list[list[float]]: + req_outputs = self.llm.encode(prompts, pooling_task="token_embed") + return [req_output.outputs.data for 
req_output in req_outputs] + + def token_classify(self, prompts: list[str]) -> list[list[float]]: + req_outputs = self.llm.encode(prompts, pooling_task="token_classify") return [req_output.outputs.data for req_output in req_outputs] def reward(self, prompts: list[str]) -> list[list[float]]: diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index a431bf30fc890..362e9daf5ae04 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -15,6 +15,7 @@ from typing import Literal, NamedTuple import pytest +from vllm.config.compilation import CompilationMode from vllm.config.model import RunnerOption from vllm.logger import init_logger @@ -234,7 +235,7 @@ def _compare_sp( common_args.append("--skip-tokenizer-init") compilation_config = { - "level": 3, + "mode": CompilationMode.VLLM_COMPILE, "custom_ops": ["+rms_norm"], "compile_sizes": [4, 8], "pass_config": { diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 78928a53942f9..c73083b0b5ef6 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -226,30 +226,30 @@ def test_compilation_config(): # set to O3 args = parser.parse_args(["-O0"]) - assert args.compilation_config.level == 0 + assert args.compilation_config.mode == 0 # set to O 3 (space) args = parser.parse_args(["-O", "1"]) - assert args.compilation_config.level == 1 + assert args.compilation_config.mode == 1 # set to O 3 (equals) args = parser.parse_args(["-O=2"]) - assert args.compilation_config.level == 2 + assert args.compilation_config.mode == 2 - # set to O.level 3 - args = parser.parse_args(["-O.level", "3"]) - assert args.compilation_config.level == 3 + # set to O.mode 3 + args = parser.parse_args(["-O.mode", "3"]) + assert args.compilation_config.mode == 3 # set to string form of a dict args = parser.parse_args( [ "-O", - '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' + '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '"use_inductor": false}', ] ) assert ( - args.compilation_config.level == 3 + args.compilation_config.mode == 3 and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and not args.compilation_config.use_inductor ) @@ -258,12 +258,12 @@ def test_compilation_config(): args = parser.parse_args( [ "--compilation-config=" - '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' + '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '"use_inductor": true}', ] ) assert ( - args.compilation_config.level == 3 + args.compilation_config.mode == 3 and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and args.compilation_config.use_inductor ) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index a96f0134c2ffb..a2d8993441fcd 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -53,21 +53,34 @@ def base64_encoded_audio() -> dict[str, str]: } +def dummy_messages_from_audio_url( + audio_urls: str | list[str], + content_text: str = "What's happening in this audio?", +): + if isinstance(audio_urls, str): + audio_urls = [audio_urls] + + return [ + { + "role": "user", + "content": [ + *( + {"type": "audio_url", "audio_url": {"url": audio_url}} + for audio_url in audio_urls + ), + {"type": "text", "text": content_text}, + ], + } + ] + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) async def 
test_single_chat_session_audio( client: openai.AsyncOpenAI, model_name: str, audio_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "audio_url", "audio_url": {"url": audio_url}}, - {"type": "text", "text": "What's happening in this audio?"}, - ], - } - ] + messages = dummy_messages_from_audio_url(audio_url) # test single completion chat_completion = await client.chat.completions.create( @@ -138,20 +151,9 @@ async def test_single_chat_session_audio_base64encoded( audio_url: str, base64_encoded_audio: dict[str, str], ): - messages = [ - { - "role": "user", - "content": [ - { - "type": "audio_url", - "audio_url": { - "url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" # noqa: E501 - }, - }, - {"type": "text", "text": "What's happening in this audio?"}, - ], - } - ] + messages = dummy_messages_from_audio_url( + f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" + ) # test single completion chat_completion = await client.chat.completions.create( @@ -252,15 +254,7 @@ async def test_single_chat_session_input_audio( async def test_chat_streaming_audio( client: openai.AsyncOpenAI, model_name: str, audio_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "audio_url", "audio_url": {"url": audio_url}}, - {"type": "text", "text": "What's happening in this audio?"}, - ], - } - ] + messages = dummy_messages_from_audio_url(audio_url) # test single completion chat_completion = await client.chat.completions.create( @@ -365,18 +359,7 @@ async def test_chat_streaming_input_audio( async def test_multi_audio_input( client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str] ): - messages = [ - { - "role": "user", - "content": [ - *( - {"type": "audio_url", "audio_url": {"url": audio_url}} - for audio_url in audio_urls - ), - {"type": "text", "text": "What's happening in this audio?"}, - ], - } - ] + messages = dummy_messages_from_audio_url(audio_urls) if len(audio_urls) > MAXIMUM_AUDIOS: with pytest.raises(openai.BadRequestError): # test multi-audio input diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index 4c7d1c14ca17b..7ecdac518f97f 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -55,21 +55,34 @@ def base64_encoded_video() -> dict[str, str]: } +def dummy_messages_from_video_url( + video_urls: str | list[str], + content_text: str = "What's in this video?", +): + if isinstance(video_urls, str): + video_urls = [video_urls] + + return [ + { + "role": "user", + "content": [ + *( + {"type": "video_url", "video_url": {"url": video_url}} + for video_url in video_urls + ), + {"type": "text", "text": content_text}, + ], + } + ] + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) async def test_single_chat_session_video( client: openai.AsyncOpenAI, model_name: str, video_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "video_url", "video_url": {"url": video_url}}, - {"type": "text", "text": "What's in this video?"}, - ], - } - ] + messages = dummy_messages_from_video_url(video_url) # test single completion chat_completion = await client.chat.completions.create( @@ -137,15 +150,7 @@ async def test_error_on_invalid_video_url_type( async def test_single_chat_session_video_beamsearch( client: openai.AsyncOpenAI, model_name: str, video_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "video_url", "video_url": {"url": 
video_url}}, - {"type": "text", "text": "What's in this video?"}, - ], - } - ] + messages = dummy_messages_from_video_url(video_url) chat_completion = await client.chat.completions.create( model=model_name, @@ -172,20 +177,9 @@ async def test_single_chat_session_video_base64encoded( video_url: str, base64_encoded_video: dict[str, str], ): - messages = [ - { - "role": "user", - "content": [ - { - "type": "video_url", - "video_url": { - "url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501 - }, - }, - {"type": "text", "text": "What's in this video?"}, - ], - } - ] + messages = dummy_messages_from_video_url( + f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" + ) # test single completion chat_completion = await client.chat.completions.create( @@ -231,20 +225,10 @@ async def test_single_chat_session_video_base64encoded_beamsearch( video_url: str, base64_encoded_video: dict[str, str], ): - messages = [ - { - "role": "user", - "content": [ - { - "type": "video_url", - "video_url": { - "url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501 - }, - }, - {"type": "text", "text": "What's in this video?"}, - ], - } - ] + messages = dummy_messages_from_video_url( + f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" + ) + chat_completion = await client.chat.completions.create( model=model_name, messages=messages, @@ -265,15 +249,7 @@ async def test_single_chat_session_video_base64encoded_beamsearch( async def test_chat_streaming_video( client: openai.AsyncOpenAI, model_name: str, video_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "video_url", "video_url": {"url": video_url}}, - {"type": "text", "text": "What's in this video?"}, - ], - } - ] + messages = dummy_messages_from_video_url(video_url) # test single completion chat_completion = await client.chat.completions.create( @@ -318,18 +294,7 @@ async def test_chat_streaming_video( async def test_multi_video_input( client: openai.AsyncOpenAI, model_name: str, video_urls: list[str] ): - messages = [ - { - "role": "user", - "content": [ - *( - {"type": "video_url", "video_url": {"url": video_url}} - for video_url in video_urls - ), - {"type": "text", "text": "What's in this video?"}, - ], - } - ] + messages = dummy_messages_from_video_url(video_urls) if len(video_urls) > MAXIMUM_VIDEOS: with pytest.raises(openai.BadRequestError): # test multi-video input diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 5a15a352f45cc..09bd0dabb799a 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -78,6 +78,27 @@ def base64_encoded_image(local_asset_server) -> dict[str, str]: } +def dummy_messages_from_image_url( + image_urls: str | list[str], + content_text: str = "What's in this image?", +): + if isinstance(image_urls, str): + image_urls = [image_urls] + + return [ + { + "role": "user", + "content": [ + *( + {"type": "image_url", "image_url": {"url": image_url}} + for image_url in image_urls + ), + {"type": "text", "text": content_text}, + ], + } + ] + + def get_hf_prompt_tokens(model_name, content, image_url): processor = AutoProcessor.from_pretrained( model_name, trust_remote_code=True, num_crops=4 @@ -107,15 +128,7 @@ async def test_single_chat_session_image( client: openai.AsyncOpenAI, model_name: str, image_url: str ): content_text = "What's in this image?" 
- messages = [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": content_text}, - ], - } - ] + messages = dummy_messages_from_image_url(image_url, content_text) max_completion_tokens = 10 # test single completion @@ -188,15 +201,8 @@ async def test_error_on_invalid_image_url_type( async def test_single_chat_session_image_beamsearch( client: openai.AsyncOpenAI, model_name: str, image_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": "What's in this image?"}, - ], - } - ] + content_text = "What's in this image?" + messages = dummy_messages_from_image_url(image_url, content_text) chat_completion = await client.chat.completions.create( model=model_name, @@ -226,20 +232,10 @@ async def test_single_chat_session_image_base64encoded( base64_encoded_image: dict[str, str], ): content_text = "What's in this image?" - messages = [ - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501 - }, - }, - {"type": "text", "text": content_text}, - ], - } - ] + messages = dummy_messages_from_image_url( + f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", + content_text, + ) max_completion_tokens = 10 # test single completion @@ -293,20 +289,10 @@ async def test_single_chat_session_image_base64encoded_beamsearch( raw_image_url = TEST_IMAGE_ASSETS[image_idx] expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx] - messages = [ - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501 - }, - }, - {"type": "text", "text": "What's in this image?"}, - ], - } - ] + messages = dummy_messages_from_image_url( + f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" + ) + chat_completion = await client.chat.completions.create( model=model_name, messages=messages, @@ -326,15 +312,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch( async def test_chat_streaming_image( client: openai.AsyncOpenAI, model_name: str, image_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": "What's in this image?"}, - ], - } - ] + messages = dummy_messages_from_image_url(image_url) # test single completion chat_completion = await client.chat.completions.create( @@ -381,18 +359,7 @@ async def test_chat_streaming_image( async def test_multi_image_input( client: openai.AsyncOpenAI, model_name: str, image_urls: list[str] ): - messages = [ - { - "role": "user", - "content": [ - *( - {"type": "image_url", "image_url": {"url": image_url}} - for image_url in image_urls - ), - {"type": "text", "text": "What's in this image?"}, - ], - } - ] + messages = dummy_messages_from_image_url(image_urls) if len(image_urls) > MAXIMUM_IMAGES: with pytest.raises(openai.BadRequestError): # test multi-image input diff --git a/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py new file mode 100644 index 0000000000000..224196b9a0b2e --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py @@ -0,0 +1,243 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from unittest.mock import 
MagicMock, patch + +import pytest + +from tests.entrypoints.openai.tool_parsers.utils import ( + run_tool_extraction, + run_tool_extraction_streaming, +) +from vllm.entrypoints.openai.protocol import FunctionCall +from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager + +# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1 +SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')" +SIMPLE_FUNCTION_CALL = FunctionCall( + name="get_weather", + arguments='{"city": "San Francisco", "metric": "celsius"}', +) +MORE_TYPES_FUNCTION_OUTPUT = ( + "register_user(name='John Doe', " + "age=37, " + "address={'city': 'San Francisco', 'state': 'CA'}, " + "role=None, " + "passed_test=True, " + "aliases=['John', 'Johnny'])" +) +MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS = ( + "register_user(name='John Doe', " + "age=37, " + "address={'city': 'San Francisco', 'state': 'CA'}, " + "role=null, " + "passed_test=true, " + "aliases=['John', 'Johnny'])" +) +MORE_TYPES_FUNCTION_CALL = FunctionCall( + name="register_user", + arguments='{"name": "John Doe", ' + '"age": 37, ' + '"address": {"city": "San Francisco", "state": "CA"}, ' + '"role": null, ' + '"passed_test": true, ' + '"aliases": ["John", "Johnny"]}', +) +PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()" +PARAMETERLESS_FUNCTION_CALL = FunctionCall( + name="get_weather", + arguments="{}", +) +EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})" +EMPTY_DICT_FUNCTION_CALL = FunctionCall( + name="do_something_cool", + arguments='{"additional_data": {}}', +) +EMPTY_LIST_FUNCTION_OUTPUT = "do_something_cool(steps=[])" +EMPTY_LIST_FUNCTION_CALL = FunctionCall( + name="do_something_cool", + arguments='{"steps": []}', +) +ESCAPED_STRING_FUNCTION_OUTPUT = ( + r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')" +) +ESCAPED_STRING_FUNCTION_CALL = FunctionCall( + name="get_weather", + arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}', +) + + +@pytest.mark.parametrize("streaming", [True, False]) +def test_no_tool_call(streaming: bool): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(mock_tokenizer) + model_output = "How can I help you today?" 
+ + content, tool_calls = run_tool_extraction( + tool_parser, model_output, streaming=streaming + ) + + assert content == model_output + assert len(tool_calls) == 0 + + +TEST_CASES = [ + pytest.param( + True, + f"{SIMPLE_FUNCTION_OUTPUT}", + [SIMPLE_FUNCTION_CALL], + id="simple_streaming", + ), + pytest.param( + False, + f"{SIMPLE_FUNCTION_OUTPUT}", + [SIMPLE_FUNCTION_CALL], + id="simple_nonstreaming", + ), + pytest.param( + True, + f"{MORE_TYPES_FUNCTION_OUTPUT}", + [MORE_TYPES_FUNCTION_CALL], + id="more_types_streaming", + ), + pytest.param( + False, + f"{MORE_TYPES_FUNCTION_OUTPUT}", + [MORE_TYPES_FUNCTION_CALL], + id="more_types_nonstreaming", + ), + pytest.param( + True, + f"{MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS}", + [MORE_TYPES_FUNCTION_CALL], + id="more_types_streaming_json_literals", + ), + pytest.param( + False, + f"{MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS}", + [MORE_TYPES_FUNCTION_CALL], + id="more_types_nonstreaming_json_literals", + ), + pytest.param( + True, + f"{PARAMETERLESS_FUNCTION_OUTPUT}", + [PARAMETERLESS_FUNCTION_CALL], + id="parameterless_streaming", + ), + pytest.param( + False, + f"{PARAMETERLESS_FUNCTION_OUTPUT}", + [PARAMETERLESS_FUNCTION_CALL], + id="parameterless_nonstreaming", + ), + pytest.param( + True, + f"{EMPTY_DICT_FUNCTION_OUTPUT}", + [EMPTY_DICT_FUNCTION_CALL], + id="empty_dict_streaming", + ), + pytest.param( + False, + f"{EMPTY_DICT_FUNCTION_OUTPUT}", + [EMPTY_DICT_FUNCTION_CALL], + id="empty_dict_nonstreaming", + ), + pytest.param( + True, + f"{EMPTY_LIST_FUNCTION_OUTPUT}", + [EMPTY_LIST_FUNCTION_CALL], + id="empty_list_streaming", + ), + pytest.param( + False, + f"{EMPTY_LIST_FUNCTION_OUTPUT}", + [EMPTY_LIST_FUNCTION_CALL], + id="empty_list_nonstreaming", + ), + pytest.param( + True, + f"{ESCAPED_STRING_FUNCTION_OUTPUT}", + [ESCAPED_STRING_FUNCTION_CALL], + id="escaped_string_streaming", + ), + pytest.param( + False, + f"{ESCAPED_STRING_FUNCTION_OUTPUT}", + [ESCAPED_STRING_FUNCTION_CALL], + id="escaped_string_nonstreaming", + ), + pytest.param( + True, + f"{SIMPLE_FUNCTION_OUTPUT}\n{MORE_TYPES_FUNCTION_OUTPUT}", + [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL], + id="parallel_calls_streaming", + ), + pytest.param( + False, + f"{SIMPLE_FUNCTION_OUTPUT}\n{MORE_TYPES_FUNCTION_OUTPUT}", + [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL], + id="parallel_calls_nonstreaming", + ), +] + + +@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES) +def test_tool_call( + streaming: bool, model_output: str, expected_tool_calls: list[FunctionCall] +): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(mock_tokenizer) + + content, tool_calls = run_tool_extraction( + tool_parser, model_output, streaming=streaming + ) + + assert content is None + assert len(tool_calls) == len(expected_tool_calls) + for actual, expected in zip(tool_calls, expected_tool_calls): + assert actual.type == "function" + assert actual.function == expected + + +def test_streaming_tool_call_with_large_steps(): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(mock_tokenizer) + model_output_deltas = [ + "get_weather(city='San", + " Francisco', metric='celsius')\n" + f"{PARAMETERLESS_FUNCTION_OUTPUT}\n" + f"{EMPTY_LIST_FUNCTION_OUTPUT}", + ] + + reconstructor = run_tool_extraction_streaming( + tool_parser, model_output_deltas, assert_one_tool_per_delta=False + ) + + assert reconstructor.other_content == "" + assert len(reconstructor.tool_calls) == 3 + assert 
reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL + assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL + assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL + + +@pytest.mark.parametrize("streaming", [False]) +def test_regex_timeout_handling(streaming: bool): + """test regex timeout is handled gracefully""" + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(mock_tokenizer) + + fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2 + + # create a mock regex that raises TimeoutError + mock_regex = MagicMock() + mock_regex.match.side_effect = TimeoutError("Regex timeout") + + with patch.object(tool_parser, "TOOL_CALL_REGEX", mock_regex): + content, tool_calls = run_tool_extraction( + tool_parser, fake_problematic_input, streaming=streaming + ) + + # should treat as regular text when regex times out + assert content == fake_problematic_input + assert len(tool_calls) == 0 + mock_regex.match.assert_called_once() diff --git a/tests/entrypoints/pooling/llm/test_classify.py b/tests/entrypoints/pooling/llm/test_classify.py index 488c82c9fe7fd..96f634ee0a8c7 100644 --- a/tests/entrypoints/pooling/llm/test_classify.py +++ b/tests/entrypoints/pooling/llm/test_classify.py @@ -63,7 +63,7 @@ def test_encode_api(llm: LLM): # chunked prefill does not support all pooling err_msg = "pooling_task must be one of.+" with pytest.raises(ValueError, match=err_msg): - llm.encode(prompts, use_tqdm=False) + llm.encode(prompts, pooling_task="token_classify", use_tqdm=False) def test_score_api(llm: LLM): diff --git a/tests/entrypoints/pooling/llm/test_embedding.py b/tests/entrypoints/pooling/llm/test_embedding.py index c53941390bd10..5455b5f91fc09 100644 --- a/tests/entrypoints/pooling/llm/test_embedding.py +++ b/tests/entrypoints/pooling/llm/test_embedding.py @@ -35,6 +35,13 @@ def llm(): cleanup_dist_env_and_memory() +@pytest.mark.skip_global_cleanup +def test_encode_api(llm: LLM): + outputs = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False) + multi_vector = outputs[0].outputs.data + assert multi_vector.shape == (11, 384) + + def test_pooling_params(llm: LLM): def get_outputs(normalize): outputs = llm.embed( diff --git a/tests/entrypoints/pooling/llm/test_encode.py b/tests/entrypoints/pooling/llm/test_encode.py index 9ba380334e5a2..ca85d2758fce4 100644 --- a/tests/entrypoints/pooling/llm/test_encode.py +++ b/tests/entrypoints/pooling/llm/test_encode.py @@ -57,20 +57,24 @@ def test_multiple_pooling_params(llm: LLM): ] # Multiple PoolingParams should be matched with each prompt - outputs = llm.encode(PROMPTS, pooling_params=pooling_params) + outputs = llm.encode(PROMPTS, pooling_params=pooling_params, pooling_task="embed") assert len(PROMPTS) == len(outputs) # Exception raised, if the size of params does not match the size of prompts with pytest.raises(ValueError): - outputs = llm.encode(PROMPTS, pooling_params=pooling_params[:3]) + outputs = llm.encode( + PROMPTS, pooling_params=pooling_params[:3], pooling_task="embed" + ) # Single PoolingParams should be applied to every prompt single_pooling_params = PoolingParams() - outputs = llm.encode(PROMPTS, pooling_params=single_pooling_params) + outputs = llm.encode( + PROMPTS, pooling_params=single_pooling_params, pooling_task="embed" + ) assert len(PROMPTS) == len(outputs) # pooling_params is None, default params should be applied - outputs = llm.encode(PROMPTS, pooling_params=None) + outputs = llm.encode(PROMPTS, pooling_params=None, 
pooling_task="embed") assert len(PROMPTS) == len(outputs) diff --git a/tests/entrypoints/pooling/llm/test_reward.py b/tests/entrypoints/pooling/llm/test_reward.py index 8312ff180b36f..81058dbad891b 100644 --- a/tests/entrypoints/pooling/llm/test_reward.py +++ b/tests/entrypoints/pooling/llm/test_reward.py @@ -36,22 +36,23 @@ def llm(): cleanup_dist_env_and_memory() -@pytest.mark.skip_global_cleanup def test_pooling_params(llm: LLM): - def get_outputs(softmax): + def get_outputs(activation): outputs = llm.reward( - prompts, pooling_params=PoolingParams(softmax=softmax), use_tqdm=False + prompts, pooling_params=PoolingParams(activation=activation), use_tqdm=False ) return torch.cat([x.outputs.data for x in outputs]) - default = get_outputs(softmax=None) - w_softmax = get_outputs(softmax=True) - wo_softmax = get_outputs(softmax=False) + default = get_outputs(activation=None) + w_activation = get_outputs(activation=True) + wo_activation = get_outputs(activation=False) - assert torch.allclose(default, w_softmax, atol=1e-2), "Default should use softmax." - assert not torch.allclose(w_softmax, wo_softmax, atol=1e-2), ( - "wo_softmax should not use softmax." + assert torch.allclose(default, w_activation, atol=1e-2), ( + "Default should use activation." ) - assert torch.allclose(softmax(wo_softmax), w_softmax, atol=1e-2), ( - "w_softmax should be close to softmax(wo_softmax)." + assert not torch.allclose(w_activation, wo_activation, atol=1e-2), ( + "wo_activation should not use activation." + ) + assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), ( + "w_activation should be close to activation(wo_activation)." ) diff --git a/tests/entrypoints/pooling/openai/test_embedding.py b/tests/entrypoints/pooling/openai/test_embedding.py index 8a3d298a48e2e..ab8ca9d68e0e7 100644 --- a/tests/entrypoints/pooling/openai/test_embedding.py +++ b/tests/entrypoints/pooling/openai/test_embedding.py @@ -17,6 +17,7 @@ from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import ( EMBED_DTYPE_TO_TORCH_DTYPE, EmbeddingResponse, + PoolingResponse, ) from vllm.transformers_utils.tokenizer import get_tokenizer @@ -509,3 +510,20 @@ async def test_normalize(server: RemoteOpenAIServer, model_name: str): assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), ( "w_normal should be close to normal(wo_normal)." 
) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling(server: RemoteOpenAIServer, model_name: str): + input_text = ["The chef prepared a delicious meal."] + + response = requests.post( + server.url_for("pooling"), + json={"model": model_name, "input": input_text, "encoding_format": "float"}, + ) + + poolings = PoolingResponse.model_validate(response.json()) + + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 11 + assert len(poolings.data[0].data[0]) == 384 diff --git a/tests/entrypoints/pooling/openai/test_rerank.py b/tests/entrypoints/pooling/openai/test_rerank.py index 9980fcff16c15..e43148d25feeb 100644 --- a/tests/entrypoints/pooling/openai/test_rerank.py +++ b/tests/entrypoints/pooling/openai/test_rerank.py @@ -7,7 +7,7 @@ import torch import torch.nn.functional as F from tests.utils import RemoteOpenAIServer -from vllm.entrypoints.openai.protocol import RerankResponse +from vllm.entrypoints.openai.protocol import PoolingResponse, RerankResponse MODEL_NAME = "BAAI/bge-reranker-base" DTYPE = "bfloat16" @@ -159,3 +159,20 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str): assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), ( "w_activation should be close to activation(wo_activation)." ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling(server: RemoteOpenAIServer, model_name: str): + input_text = ["The chef prepared a delicious meal."] + + response = requests.post( + server.url_for("pooling"), + json={"model": model_name, "input": input_text, "encoding_format": "float"}, + ) + + poolings = PoolingResponse.model_validate(response.json()) + + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 11 + assert len(poolings.data[0].data[0]) == 1 diff --git a/tests/entrypoints/test_context.py b/tests/entrypoints/test_context.py index b0faa870a9272..31ea856224f90 100644 --- a/tests/entrypoints/test_context.py +++ b/tests/entrypoints/test_context.py @@ -6,7 +6,11 @@ from unittest.mock import MagicMock, patch import pytest from openai_harmony import Author, Message, Role, StreamState, TextContent -from vllm.entrypoints.context import HarmonyContext, StreamingHarmonyContext +from vllm.entrypoints.context import ( + HarmonyContext, + StreamingHarmonyContext, + TurnMetrics, +) from vllm.outputs import CompletionOutput, RequestOutput @@ -101,8 +105,12 @@ def test_single_turn_token_counting(): # Verify internal state tracking assert not context.is_first_turn - assert context.previous_turn.input_tokens == 5 - assert context.previous_turn.output_tokens == 3 + assert len(context.all_turn_metrics) == 1 + previous_turn = context.all_turn_metrics[0] + assert previous_turn.input_tokens == 5 + assert previous_turn.output_tokens == 3 + assert previous_turn.cached_input_tokens == 2 + assert previous_turn.tool_output_tokens == 0 @pytest.mark.asyncio @@ -156,6 +164,15 @@ async def test_multi_turn_token_counting(): assert context.num_tool_output_tokens == expected_tool_output assert context.num_cached_tokens == 5 + 15 + # Validate all turn metrics + assert len(context.all_turn_metrics) == 3 + for i, turn in enumerate(context.all_turn_metrics): + assert turn.input_tokens == prompt_token_counts[i] + assert turn.output_tokens == output_token_counts[i] + assert turn.cached_input_tokens == cached_token_counts[i] + assert context.all_turn_metrics[1].tool_output_tokens == 7 + assert context.all_turn_metrics[2].tool_output_tokens == 1 + 
def test_empty_output_tokens(): """Test behavior when RequestOutput has empty output tokens.""" @@ -314,6 +331,10 @@ async def test_streaming_multi_turn_token_counting(mock_parser): # Create a streaming context context = StreamingHarmonyContext(messages=[], available_tools=["browser"]) + num_prompt_tokens = [3, 8, 13] + num_output_tokens = [3, 3, 2] + num_cached_tokens = [0, 3, 8] + # Simulate three turns of conversation: # Turn 1: stream tokens one by one, then finish the message # Turn 2: new prompt, stream more tokens with a reasoning segment @@ -325,7 +346,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser): create_mock_request_output( prompt_token_ids=[1, 2, 3], # 3 prompt tokens output_token_ids=[101], # Single token - num_cached_tokens=0, + num_cached_tokens=num_cached_tokens[0], finished=False, # Not end of message yet ) ) @@ -370,7 +391,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser): 5, ], # 8 tokens (includes previous) output_token_ids=[201], - num_cached_tokens=3, # Some tokens cached + num_cached_tokens=num_cached_tokens[1], # Some tokens cached finished=False, ) ) @@ -422,7 +443,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser): 7, ], # 13 tokens output_token_ids=[301], - num_cached_tokens=8, # More cached tokens + num_cached_tokens=num_cached_tokens[2], # More cached tokens finished=False, ) ) @@ -435,10 +456,12 @@ async def test_streaming_multi_turn_token_counting(mock_parser): ) # Final token counts check - assert context.num_prompt_tokens == 3 + 8 + 13 # All prompts - assert context.num_output_tokens == 3 + 3 + 2 # All outputs + assert context.num_prompt_tokens == sum(num_prompt_tokens) # All prompts + assert context.num_output_tokens == sum(num_output_tokens) # All outputs assert context.num_reasoning_tokens == 3 # Unchanged from second turn - assert context.num_cached_tokens == 3 + 8 # Accumulated cached tokens + assert context.num_cached_tokens == sum( + num_cached_tokens + ) # Accumulated cached tokens # Additional tool tokens from third turn # Formula: this turn prompt - last turn prompt - last turn output @@ -447,6 +470,15 @@ async def test_streaming_multi_turn_token_counting(mock_parser): context.num_tool_output_tokens == expected_tool_tokens + additional_tool_tokens ) + # Validate all turn metrics + assert len(context.all_turn_metrics) == 3 + for i, turn in enumerate(context.all_turn_metrics): + assert turn.input_tokens == num_prompt_tokens[i] + assert turn.output_tokens == num_output_tokens[i] + assert turn.cached_input_tokens == num_cached_tokens[i] + assert context.all_turn_metrics[1].tool_output_tokens == 2 + assert context.all_turn_metrics[2].tool_output_tokens == 2 + @pytest.mark.asyncio async def test_streaming_message_synchronization(mock_parser): @@ -522,3 +554,46 @@ async def test_streaming_message_synchronization(mock_parser): assert len(context._messages) == 3 assert context.num_init_messages == 1 assert context._messages[2].content[0].text == "Response 4" + + +def test_turn_metrics_copy_and_reset(): + """Test TurnMetrics copy and reset methods work correctly.""" + # Create a TurnMetrics with specific values + original_metrics = TurnMetrics( + input_tokens=10, + output_tokens=20, + cached_input_tokens=5, + tool_output_tokens=3, + ) + + # Test copy functionality + copied_metrics = original_metrics.copy() + + # Verify copy has same values + assert copied_metrics.input_tokens == 10 + assert copied_metrics.output_tokens == 20 + assert copied_metrics.cached_input_tokens == 5 + assert 
copied_metrics.tool_output_tokens == 3 + + # Verify they are separate objects + assert copied_metrics is not original_metrics + + # Modify copy to ensure independence + copied_metrics.input_tokens = 999 + assert original_metrics.input_tokens == 10 # Original unchanged + assert copied_metrics.input_tokens == 999 + + # Test reset functionality + original_metrics.reset() + + # Verify all fields are reset to zero + assert original_metrics.input_tokens == 0 + assert original_metrics.output_tokens == 0 + assert original_metrics.cached_input_tokens == 0 + assert original_metrics.tool_output_tokens == 0 + + # Verify copied metrics are unaffected by reset + assert copied_metrics.input_tokens == 999 + assert copied_metrics.output_tokens == 20 + assert copied_metrics.cached_input_tokens == 5 + assert copied_metrics.tool_output_tokens == 3 diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index 2f28253bce536..9a82ab99ea9c9 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -12,7 +12,7 @@ from vllm.entrypoints.openai.api_server import ( from vllm.inputs import TextPrompt from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams -from vllm.utils import merge_async_iterators +from vllm.utils.async_utils import merge_async_iterators MODEL_PATH = "zai-org/chatglm3-6b" LORA_RANK = 64 diff --git a/tests/models/language/pooling/test_multi_vector_retrieval.py b/tests/models/language/pooling/test_multi_vector_retrieval.py new file mode 100644 index 0000000000000..302f2df135579 --- /dev/null +++ b/tests/models/language/pooling/test_multi_vector_retrieval.py @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch +from transformers import AutoModel + +from tests.models.utils import check_embeddings_close + + +@pytest.mark.parametrize( + "model", + ["BAAI/bge-m3"], +) +@pytest.mark.parametrize("dtype", ["half"]) +@torch.inference_mode +def test_embed_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str): + with vllm_runner( + model, + runner="pooling", + max_model_len=None, + ) as vllm_model: + vllm_outputs = vllm_model.token_embed(example_prompts) + + with hf_runner( + model, + auto_cls=AutoModel, + ) as hf_model: + tokenizer = hf_model.tokenizer + hf_outputs = [] + for prompt in example_prompts: + inputs = tokenizer([prompt], return_tensors="pt") + inputs = hf_model.wrap_device(inputs) + output = hf_model.model(**inputs) + embedding = output.last_hidden_state[0].float() + # normal + hf_outputs.append(embedding.cpu()) + + for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): + check_embeddings_close( + embeddings_0_lst=hf_output, + embeddings_1_lst=vllm_output, + name_0="hf", + name_1="vllm", + tol=1e-2, + ) diff --git a/tests/models/language/pooling/test_pooler_config_init_behaviour.py b/tests/models/language/pooling/test_pooler_config_init_behaviour.py index 674bf02b7b98b..55663ee3f1b41 100644 --- a/tests/models/language/pooling/test_pooler_config_init_behaviour.py +++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py @@ -93,7 +93,7 @@ def test_embed_models_using_normalize( ], ) @pytest.mark.parametrize("dtype", ["half"]) -def test_reward_models_using_softmax( +def test_reward_models_using_activation( hf_runner, vllm_runner, example_prompts, @@ -104,22 +104,64 @@ def test_reward_models_using_softmax( model, max_model_len=1024, dtype=dtype, - pooler_config=PoolerConfig(softmax=False), + 
pooler_config=PoolerConfig(activation=False), ) as vllm_model: - wo_softmax = vllm_model.encode(example_prompts) + wo_activation = vllm_model.reward(example_prompts) with vllm_runner( - model, max_model_len=1024, dtype=dtype, pooler_config=PoolerConfig(softmax=True) + model, + max_model_len=1024, + dtype=dtype, + pooler_config=PoolerConfig(activation=True), ) as vllm_model: - w_softmax = vllm_model.encode(example_prompts) + w_activation = vllm_model.reward(example_prompts) - for wo, w in zip(wo_softmax, w_softmax): + for wo, w in zip(wo_activation, w_activation): wo = torch.tensor(wo) w = torch.tensor(w) assert not torch.allclose(wo, w, atol=1e-2), ( - "pooler_config softmax is not working" + "pooler_config activation is not working" ) assert torch.allclose(softmax(wo), w, atol=1e-2), ( - "w_softmax should be close to softmax(wo_softmax)." + "w_activation should be close to activation(wo_activation)." + ) + + +@pytest.mark.parametrize( + "model", + [ + "intfloat/multilingual-e5-small", + ], +) +@pytest.mark.parametrize("dtype", ["half"]) +def test_multi_vector_retrieval_models_using_normalize( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, +) -> None: + with vllm_runner( + model, + max_model_len=512, + dtype=dtype, + pooler_config=PoolerConfig(normalize=False), + ) as vllm_model: + wo_normalize = vllm_model.token_embed(example_prompts) + + with vllm_runner( + model, + max_model_len=512, + dtype=dtype, + pooler_config=PoolerConfig(normalize=True), + ) as vllm_model: + w_normalize = vllm_model.token_embed(example_prompts) + + for wo, w in zip(wo_normalize, w_normalize): + assert not torch.allclose(wo, w, atol=1e-2), ( + "pooler_config normalize is not working" + ) + assert torch.allclose(F.normalize(wo, p=2, dim=-1), w, atol=1e-2), ( + "w_normal should be close to normal(wo_normal)." 
) diff --git a/tests/models/language/pooling/test_token_classification.py b/tests/models/language/pooling/test_token_classification.py index 784d9fc312679..2dfc0072126bc 100644 --- a/tests/models/language/pooling/test_token_classification.py +++ b/tests/models/language/pooling/test_token_classification.py @@ -19,7 +19,7 @@ def test_bert_models( dtype: str, ) -> None: with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.encode(example_prompts) + vllm_outputs = vllm_model.token_classify(example_prompts) with hf_runner( model, dtype=dtype, auto_cls=AutoModelForTokenClassification @@ -50,7 +50,7 @@ def test_modernbert_models( dtype: str, ) -> None: with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.encode(example_prompts) + vllm_outputs = vllm_model.token_classify(example_prompts) with hf_runner( model, dtype=dtype, auto_cls=AutoModelForTokenClassification diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index f124220bb16d9..af7dad079a9b3 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -17,7 +17,7 @@ from transformers import ( ) from vllm.platforms import current_platform -from vllm.utils import identity +from vllm.utils.func import identity from ....conftest import ( IMAGE_ASSETS, diff --git a/tests/models/multimodal/pooling/test_intern_vit.py b/tests/models/multimodal/pooling/test_intern_vit.py index b474e851319ae..74e30c4307fac 100644 --- a/tests/models/multimodal/pooling/test_intern_vit.py +++ b/tests/models/multimodal/pooling/test_intern_vit.py @@ -38,7 +38,7 @@ def run_intern_vit_test( config.norm_type = "rms_norm" hf_model = AutoModel.from_pretrained( - model, torch_dtype=torch_dtype, trust_remote_code=True + model, dtype=torch_dtype, trust_remote_code=True ).to("cuda") hf_outputs_per_image = [ hf_model(pixel_value.to("cuda")).last_hidden_state diff --git a/tests/models/multimodal/pooling/test_prithvi_mae.py b/tests/models/multimodal/pooling/test_prithvi_mae.py index abf4150a91329..62154b0834878 100644 --- a/tests/models/multimodal/pooling/test_prithvi_mae.py +++ b/tests/models/multimodal/pooling/test_prithvi_mae.py @@ -39,7 +39,7 @@ def _run_test( max_num_seqs=32, default_torch_num_threads=1, ) as vllm_model: - vllm_model.encode(prompt) + vllm_model.llm.encode(prompt, pooling_task="token_classify") MODELS = ["mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"] diff --git a/tests/models/multimodal/pooling/test_radio.py b/tests/models/multimodal/pooling/test_radio.py index 80f594021ca8a..414e99a71e7b0 100644 --- a/tests/models/multimodal/pooling/test_radio.py +++ b/tests/models/multimodal/pooling/test_radio.py @@ -45,7 +45,7 @@ def run_radio_test( hf_model = AutoModel.from_pretrained( model_id, config=config, - torch_dtype=torch_dtype, + dtype=torch_dtype, trust_remote_code=True, ).to("cuda") hf_model.eval() diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py index d1dae587d38eb..98245cdf0c984 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py @@ -30,7 +30,7 @@ class MyGemma2Embedding(nn.Module): self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), + "token_embed": 
Pooler.for_token_embed(pooler_config), "embed": Pooler.for_embed(pooler_config), } ) diff --git a/tests/plugins_tests/test_io_processor_plugins.py b/tests/plugins_tests/test_io_processor_plugins.py index 912b32755e80f..936f27fb69bc6 100644 --- a/tests/plugins_tests/test_io_processor_plugins.py +++ b/tests/plugins_tests/test_io_processor_plugins.py @@ -93,7 +93,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str): out_data_format="b64_json", ) - pooling_params = PoolingParams(task="encode", softmax=False) + pooling_params = PoolingParams(activation=False) with vllm_runner( model_name, @@ -108,8 +108,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str): io_processor_plugin="prithvi_to_tiff", ) as llm_runner: pooler_output = llm_runner.get_llm().encode( - img_prompt, - pooling_params=pooling_params, + img_prompt, pooling_params=pooling_params, pooling_task="token_classify" ) output = pooler_output[0].outputs diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index ef7164c8813da..5aeb002238cf9 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -697,7 +697,8 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4): @pytest.mark.parametrize( "args", [ - ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4), + # TODO: Enable once model is available again + # ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4), ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4), ], ) diff --git a/tests/reasoning/test_deepseekv3_reasoning_parser.py b/tests/reasoning/test_deepseekv3_reasoning_parser.py new file mode 100644 index 0000000000000..3d12f3e5b30e8 --- /dev/null +++ b/tests/reasoning/test_deepseekv3_reasoning_parser.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +from transformers import AutoTokenizer + +from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.reasoning import ( + DeepSeekR1ReasoningParser, + DeepSeekV3ReasoningParser, + IdentityReasoningParser, +) + +REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-V3.1" + + +@pytest.fixture(scope="module") +def tokenizer(): + return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) + + +@pytest.mark.parametrize( + "thinking,expected_parser_type", + [ + (True, DeepSeekR1ReasoningParser), + (False, IdentityReasoningParser), + ], +) +def test_parser_selection(tokenizer, thinking, expected_parser_type): + parser = DeepSeekV3ReasoningParser( + tokenizer, chat_template_kwargs={"thinking": thinking} + ) + + assert isinstance(parser._parser, expected_parser_type) + + +def test_identity_reasoning_parser_basic(tokenizer): + parser = IdentityReasoningParser(tokenizer) + + # Test is_reasoning_end always returns True + input_text = "This is some output" + input_tokens = tokenizer.tokenize(input_text) + input_ids = tokenizer.convert_tokens_to_ids(input_tokens) + assert parser.is_reasoning_end(input_ids) is True + + # Test extract_content_ids returns all input_ids + assert parser.extract_content_ids(input_ids) == input_ids + + # Test extract_reasoning_content returns (None, model_output) + request = ChatCompletionRequest(model="test-model", messages=[], temperature=1.0) + reasoning, content = parser.extract_reasoning_content(input_text, request) + assert reasoning is None + assert content == input_text 
+ + # Test extract_reasoning_content_streaming returns DeltaMessage or None + result = parser.extract_reasoning_content_streaming( + previous_text="", + current_text="Hello world", + delta_text="Hello world", + previous_token_ids=[], + current_token_ids=input_ids, + delta_token_ids=input_ids, + ) + assert isinstance(result, DeltaMessage) + assert result.content == "Hello world" + + # If delta_text is empty, should return None + result_none = parser.extract_reasoning_content_streaming( + previous_text="Hello world", + current_text="Hello world", + delta_text="", + previous_token_ids=input_ids, + current_token_ids=input_ids, + delta_token_ids=[], + ) + assert result_none is None diff --git a/tests/test_pooling_params.py b/tests/test_pooling_params.py index e3561ac3a577e..e73d7efc1483a 100644 --- a/tests/test_pooling_params.py +++ b/tests/test_pooling_params.py @@ -1,10 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass + import pytest from tests.models.utils import EmbedModelInfo from vllm import PoolingParams -from vllm.config import ModelConfig +from vllm.config import ModelConfig, PoolerConfig EMBEDDING_MODELS = [ EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), @@ -15,6 +17,15 @@ EMBEDDING_MODELS = [ ), ] +classify_parameters = ["activation"] +embed_parameters = ["dimensions", "normalize"] +step_pooling_parameters = ["step_tag_id", "returned_token_ids"] + + +@dataclass() +class MockModelConfig: + pooler_config: PoolerConfig + def test_task(): pooling_params = PoolingParams() @@ -24,25 +35,27 @@ def test_task(): pooling_params.verify(task="score") with pytest.raises(ValueError): - pooling_params.verify(task="encode") + pooling_params.verify(task="classify") def test_embed(): task = "embed" + model_config = MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS")) + pooling_params = PoolingParams(normalize=None) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) pooling_params = PoolingParams(normalize=True) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) pooling_params = PoolingParams(normalize=False) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) - invalid_parameters = ["activation", "softmax"] + invalid_parameters = classify_parameters + step_pooling_parameters for p in invalid_parameters: with pytest.raises(ValueError): pooling_params = PoolingParams(**{p: True}) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) @pytest.mark.parametrize("model_info", EMBEDDING_MODELS) @@ -73,35 +86,71 @@ def test_embed_dimensions(model_info: EmbedModelInfo): @pytest.mark.parametrize("task", ["score", "classify"]) def test_classify(task): + model_config = MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS")) + pooling_params = PoolingParams(activation=None) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) pooling_params = PoolingParams(activation=True) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) pooling_params = PoolingParams(activation=False) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) - invalid_parameters = ["dimensions", "normalize", "softmax"] + invalid_parameters = embed_parameters + step_pooling_parameters for p in 
invalid_parameters: with pytest.raises(ValueError): pooling_params = PoolingParams(**{p: True}) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) -def test_encode(): - task = "encode" - pooling_params = PoolingParams(softmax=None) - pooling_params.verify(task=task) +@pytest.mark.parametrize("pooling_type", ["ALL", "STEP"]) +def test_token_embed(pooling_type: str): + task = "token_embed" + model_config = MockModelConfig( + pooler_config=PoolerConfig(pooling_type=pooling_type) + ) - pooling_params = PoolingParams(softmax=True) - pooling_params.verify(task=task) + pooling_params = PoolingParams(normalize=None) + pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(softmax=False) - pooling_params.verify(task=task) + pooling_params = PoolingParams(normalize=True) + pooling_params.verify(task=task, model_config=model_config) + + pooling_params = PoolingParams(normalize=False) + pooling_params.verify(task=task, model_config=model_config) + + invalid_parameters = classify_parameters + if pooling_type != "STEP": + invalid_parameters = classify_parameters + step_pooling_parameters - invalid_parameters = ["dimensions", "normalize", "activation"] for p in invalid_parameters: with pytest.raises(ValueError): pooling_params = PoolingParams(**{p: True}) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) + + +@pytest.mark.parametrize("pooling_type", ["ALL", "STEP"]) +def test_token_classify(pooling_type: str): + task = "token_classify" + model_config = MockModelConfig( + pooler_config=PoolerConfig(pooling_type=pooling_type) + ) + + pooling_params = PoolingParams(activation=None) + pooling_params.verify(task=task, model_config=model_config) + + pooling_params = PoolingParams(activation=True) + pooling_params.verify(task=task, model_config=model_config) + + pooling_params = PoolingParams(activation=False) + pooling_params.verify(task=task, model_config=model_config) + + invalid_parameters = embed_parameters + if pooling_type != "STEP": + invalid_parameters = embed_parameters + step_pooling_parameters + + for p in invalid_parameters: + with pytest.raises(ValueError): + pooling_params = PoolingParams(**{p: True}) + pooling_params.verify(task=task, model_config=model_config) diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py index b4f0989b1b19c..93ef1049fc07e 100644 --- a/tests/tool_use/test_qwen3coder_tool_parser.py +++ b/tests/tool_use/test_qwen3coder_tool_parser.py @@ -40,7 +40,7 @@ def qwen3_xml_tool_parser(qwen3_tokenizer): return Qwen3XMLToolParser(qwen3_tokenizer) -@pytest.fixture(params=["original", "xml"]) +@pytest.fixture(params=["xml"]) def qwen3_tool_parser_parametrized(qwen3_tool_parser, qwen3_xml_tool_parser, request): """Parameterized fixture that provides both parser types for testing""" if request.param == "original": @@ -664,6 +664,9 @@ def test_extract_tool_calls_streaming( # Verify we got all expected tool calls assert len(tool_states) == len(expected_tool_calls) + assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == len( + expected_tool_calls + ) # Verify each tool call for idx, expected_tool in enumerate(expected_tool_calls): @@ -780,9 +783,10 @@ fahrenheit # Verify content was streamed assert "Let me check the weather for you:" in other_content - # Verify we got the tool call assert len(tool_states) == 1 + assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1 + state = 
tool_states[0] assert state["id"] is not None assert state["type"] == "function" @@ -892,3 +896,83 @@ def test_extract_tool_calls_complex_type_with_single_quote( args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) assert args["obj_param"] == {"key": "value"} + + +def test_extract_tool_calls_streaming_missing_opening_tag( + qwen3_tool_parser_parametrized, qwen3_tokenizer, sample_tools +): + """Test streaming with missing opening tag + + This tests that the streaming parser correctly handles + tool calls that start directly with <function=...> instead of the <tool_call> opening tag + """ + model_output = """I'll check the weather for you. + +<function=get_current_weather> +<parameter=city> +Dallas +</parameter> +<parameter=state> +TX +</parameter> +<parameter=unit> +fahrenheit +</parameter> +</function> +""" + + request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools) + + other_content = "" + tool_states = {} + + for delta_message in stream_delta_message_generator( + qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request + ): + if delta_message.content: + other_content += delta_message.content + + if delta_message.tool_calls: + for tool_call in delta_message.tool_calls: + idx = tool_call.index + + if idx not in tool_states: + tool_states[idx] = { + "id": None, + "name": None, + "arguments": "", + "type": None, + } + + if tool_call.id: + tool_states[idx]["id"] = tool_call.id + + if tool_call.type: + assert tool_call.type == "function" + tool_states[idx]["type"] = tool_call.type + + if tool_call.function: + if tool_call.function.name: + tool_states[idx]["name"] = tool_call.function.name + + if tool_call.function.arguments is not None: + tool_states[idx]["arguments"] += tool_call.function.arguments + + # Verify content was streamed + assert "I'll check the weather for you." in other_content + + # Verify we got the tool call + assert len(tool_states) == 1 + assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1 + + state = tool_states[0] + assert state["id"] is not None + assert state["type"] == "function" + assert state["name"] == "get_current_weather" + + # Verify arguments were parsed correctly despite missing opening tag + assert state["arguments"] is not None + args = json.loads(state["arguments"]) + assert args["city"] == "Dallas" + assert args["state"] == "TX" + assert args["unit"] == "fahrenheit" diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index 102e5ddf16d6d..cf455ff3edbd3 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -3,7 +3,7 @@ import pytest -from vllm.config import CompilationLevel +from vllm.config import CompilationMode from ..utils import compare_two_settings @@ -21,13 +21,13 @@ def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch): "--max-model-len=256", "--max-num-seqs=32", "--enforce-eager", - f"-O{CompilationLevel.DYNAMO_ONCE}", + f"-O{CompilationMode.DYNAMO_TRACE_ONCE}", ], arg2=[ "--max-model-len=256", "--max-num-seqs=32", "--enforce-eager", - f"-O{CompilationLevel.DYNAMO_AS_IS}", + f"-O{CompilationMode.STOCK_TORCH_COMPILE}", ], env1={}, env2={}, diff --git a/tests/utils_/test_async_utils.py b/tests/utils_/test_async_utils.py new file mode 100644 index 0000000000000..03d116bdfd814 --- /dev/null +++ b/tests/utils_/test_async_utils.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio +from collections.abc import AsyncIterator + +import pytest + +from vllm.utils.async_utils import merge_async_iterators + + +async def _mock_async_iterator(idx: int): + try: + while True: + yield f"item from iterator 
{idx}" + await asyncio.sleep(0.1) + except asyncio.CancelledError: + print(f"iterator {idx} cancelled") + + +@pytest.mark.asyncio +async def test_merge_async_iterators(): + iterators = [_mock_async_iterator(i) for i in range(3)] + merged_iterator = merge_async_iterators(*iterators) + + async def stream_output(generator: AsyncIterator[tuple[int, str]]): + async for idx, output in generator: + print(f"idx: {idx}, output: {output}") + + task = asyncio.create_task(stream_output(merged_iterator)) + await asyncio.sleep(0.5) + task.cancel() + with pytest.raises(asyncio.CancelledError): + await task + + for iterator in iterators: + try: + await asyncio.wait_for(anext(iterator), 1) + except StopAsyncIteration: + # All iterators should be cancelled and print this message. + print("Iterator was cancelled normally") + except (Exception, asyncio.CancelledError) as e: + raise AssertionError() from e diff --git a/tests/utils_/test_func_utils.py b/tests/utils_/test_func_utils.py new file mode 100644 index 0000000000000..147a396994596 --- /dev/null +++ b/tests/utils_/test_func_utils.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa + +import pytest + +from vllm.utils.func import deprecate_kwargs, supports_kw + +from ..utils import error_on_warning + + +def test_deprecate_kwargs_always(): + @deprecate_kwargs("old_arg", is_deprecated=True) + def dummy(*, old_arg: object = None, new_arg: object = None): + pass + + with pytest.warns(DeprecationWarning, match="'old_arg'"): + dummy(old_arg=1) + + with error_on_warning(DeprecationWarning): + dummy(new_arg=1) + + +def test_deprecate_kwargs_never(): + @deprecate_kwargs("old_arg", is_deprecated=False) + def dummy(*, old_arg: object = None, new_arg: object = None): + pass + + with error_on_warning(DeprecationWarning): + dummy(old_arg=1) + + with error_on_warning(DeprecationWarning): + dummy(new_arg=1) + + +def test_deprecate_kwargs_dynamic(): + is_deprecated = True + + @deprecate_kwargs("old_arg", is_deprecated=lambda: is_deprecated) + def dummy(*, old_arg: object = None, new_arg: object = None): + pass + + with pytest.warns(DeprecationWarning, match="'old_arg'"): + dummy(old_arg=1) + + with error_on_warning(DeprecationWarning): + dummy(new_arg=1) + + is_deprecated = False + + with error_on_warning(DeprecationWarning): + dummy(old_arg=1) + + with error_on_warning(DeprecationWarning): + dummy(new_arg=1) + + +def test_deprecate_kwargs_additional_message(): + @deprecate_kwargs("old_arg", is_deprecated=True, additional_message="abcd") + def dummy(*, old_arg: object = None, new_arg: object = None): + pass + + with pytest.warns(DeprecationWarning, match="abcd"): + dummy(old_arg=1) + + +@pytest.mark.parametrize( + ("callable", "kw_name", "requires_kw_only", "allow_var_kwargs", "is_supported"), + [ + # Tests for positional argument support + (lambda foo: None, "foo", True, True, False), + (lambda foo: None, "foo", False, True, True), + # Tests for positional or keyword / keyword only + (lambda foo=100: None, "foo", True, True, False), + (lambda *, foo: None, "foo", False, True, True), + # Tests to make sure the names of variadic params are NOT supported + (lambda *args: None, "args", False, True, False), + (lambda **kwargs: None, "kwargs", False, True, False), + # Tests for if we allow var kwargs to add support + (lambda foo: None, "something_else", False, True, False), + (lambda foo, **kwargs: None, "something_else", False, True, True), + (lambda foo, **kwargs: None, "kwargs", True, 
True, False), + (lambda foo, **kwargs: None, "foo", True, True, False), + ], +) +def test_supports_kw( + callable, kw_name, requires_kw_only, allow_var_kwargs, is_supported +): + assert ( + supports_kw( + callable=callable, + kw_name=kw_name, + requires_kw_only=requires_kw_only, + allow_var_kwargs=allow_var_kwargs, + ) + == is_supported + ) diff --git a/tests/utils_/test_jsontree.py b/tests/utils_/test_jsontree.py new file mode 100644 index 0000000000000..0af2751b2638c --- /dev/null +++ b/tests/utils_/test_jsontree.py @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.utils.jsontree import json_count_leaves + + +def test_json_count_leaves(): + """Test json_count_leaves function from jsontree utility.""" + + # Single leaf values + assert json_count_leaves(42) == 1 + assert json_count_leaves("hello") == 1 + assert json_count_leaves(None) == 1 + + # Empty containers + assert json_count_leaves([]) == 0 + assert json_count_leaves({}) == 0 + assert json_count_leaves(()) == 0 + + # Flat structures + assert json_count_leaves([1, 2, 3]) == 3 + assert json_count_leaves({"a": 1, "b": 2}) == 2 + assert json_count_leaves((1, 2, 3)) == 3 + + # Nested structures + nested_dict = {"a": 1, "b": {"c": 2, "d": 3}} + assert json_count_leaves(nested_dict) == 3 + + nested_list = [1, [2, 3], 4] + assert json_count_leaves(nested_list) == 4 + + mixed_nested = {"list": [1, 2], "dict": {"x": 3}, "value": 4} + assert json_count_leaves(mixed_nested) == 4 diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index 308629ab05834..3bc4d3536d58e 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -2,14 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa -import asyncio import hashlib import json import os import pickle import socket import tempfile -from collections.abc import AsyncIterator from pathlib import Path from unittest.mock import patch @@ -30,7 +28,6 @@ from vllm.utils import ( bind_kv_cache, common_broadcastable_dtype, current_stream, - deprecate_kwargs, get_open_port, get_tcp_uri, is_lossless_cast, @@ -38,104 +35,14 @@ from vllm.utils import ( make_zmq_path, make_zmq_socket, memory_profiling, - merge_async_iterators, sha256, split_host_port, split_zmq_path, - supports_kw, swap_dict_values, unique_filepath, ) -from ..utils import create_new_process_for_each_test, error_on_warning - - -@pytest.mark.asyncio -async def test_merge_async_iterators(): - async def mock_async_iterator(idx: int): - try: - while True: - yield f"item from iterator {idx}" - await asyncio.sleep(0.1) - except asyncio.CancelledError: - print(f"iterator {idx} cancelled") - - iterators = [mock_async_iterator(i) for i in range(3)] - merged_iterator = merge_async_iterators(*iterators) - - async def stream_output(generator: AsyncIterator[tuple[int, str]]): - async for idx, output in generator: - print(f"idx: {idx}, output: {output}") - - task = asyncio.create_task(stream_output(merged_iterator)) - await asyncio.sleep(0.5) - task.cancel() - with pytest.raises(asyncio.CancelledError): - await task - - for iterator in iterators: - try: - await asyncio.wait_for(anext(iterator), 1) - except StopAsyncIteration: - # All iterators should be cancelled and print this message. 
- print("Iterator was cancelled normally") - except (Exception, asyncio.CancelledError) as e: - raise AssertionError() from e - - -def test_deprecate_kwargs_always(): - @deprecate_kwargs("old_arg", is_deprecated=True) - def dummy(*, old_arg: object = None, new_arg: object = None): - pass - - with pytest.warns(DeprecationWarning, match="'old_arg'"): - dummy(old_arg=1) - - with error_on_warning(DeprecationWarning): - dummy(new_arg=1) - - -def test_deprecate_kwargs_never(): - @deprecate_kwargs("old_arg", is_deprecated=False) - def dummy(*, old_arg: object = None, new_arg: object = None): - pass - - with error_on_warning(DeprecationWarning): - dummy(old_arg=1) - - with error_on_warning(DeprecationWarning): - dummy(new_arg=1) - - -def test_deprecate_kwargs_dynamic(): - is_deprecated = True - - @deprecate_kwargs("old_arg", is_deprecated=lambda: is_deprecated) - def dummy(*, old_arg: object = None, new_arg: object = None): - pass - - with pytest.warns(DeprecationWarning, match="'old_arg'"): - dummy(old_arg=1) - - with error_on_warning(DeprecationWarning): - dummy(new_arg=1) - - is_deprecated = False - - with error_on_warning(DeprecationWarning): - dummy(old_arg=1) - - with error_on_warning(DeprecationWarning): - dummy(new_arg=1) - - -def test_deprecate_kwargs_additional_message(): - @deprecate_kwargs("old_arg", is_deprecated=True, additional_message="abcd") - def dummy(*, old_arg: object = None, new_arg: object = None): - pass - - with pytest.warns(DeprecationWarning, match="abcd"): - dummy(old_arg=1) +from ..utils import create_new_process_for_each_test def test_get_open_port(monkeypatch: pytest.MonkeyPatch): @@ -299,7 +206,7 @@ def test_dict_args(parser): "val2", "--hf-overrides.key2.key4", "val3", - # Test compile config and compilation level + # Test compile config and compilation mode "-O.use_inductor=true", "-O.backend", "custom", @@ -352,7 +259,7 @@ def test_dict_args(parser): }, } assert parsed_args.compilation_config == { - "level": 1, + "mode": 1, "use_inductor": True, "backend": "custom", "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"], @@ -367,7 +274,7 @@ def test_duplicate_dict_args(caplog_vllm, parser): "--hf-overrides.key1", "val2", "-O1", - "-O.level", + "-O.mode", "2", "-O3", ] @@ -375,45 +282,12 @@ def test_duplicate_dict_args(caplog_vllm, parser): parsed_args = parser.parse_args(args) # Should be the last value assert parsed_args.hf_overrides == {"key1": "val2"} - assert parsed_args.compilation_config == {"level": 3} + assert parsed_args.compilation_config == {"mode": 3} assert len(caplog_vllm.records) == 1 assert "duplicate" in caplog_vllm.text assert "--hf-overrides.key1" in caplog_vllm.text - assert "-O.level" in caplog_vllm.text - - -@pytest.mark.parametrize( - "callable,kw_name,requires_kw_only,allow_var_kwargs,is_supported", - [ - # Tests for positional argument support - (lambda foo: None, "foo", True, True, False), - (lambda foo: None, "foo", False, True, True), - # Tests for positional or keyword / keyword only - (lambda foo=100: None, "foo", True, True, False), - (lambda *, foo: None, "foo", False, True, True), - # Tests to make sure the names of variadic params are NOT supported - (lambda *args: None, "args", False, True, False), - (lambda **kwargs: None, "kwargs", False, True, False), - # Tests for if we allow var kwargs to add support - (lambda foo: None, "something_else", False, True, False), - (lambda foo, **kwargs: None, "something_else", False, True, True), - (lambda foo, **kwargs: None, "kwargs", True, True, False), - (lambda foo, **kwargs: None, "foo", 
True, True, False), - ], -) -def test_supports_kw( - callable, kw_name, requires_kw_only, allow_var_kwargs, is_supported -): - assert ( - supports_kw( - callable=callable, - kw_name=kw_name, - requires_kw_only=requires_kw_only, - allow_var_kwargs=allow_var_kwargs, - ) - == is_supported - ) + assert "-O.mode" in caplog_vllm.text @create_new_process_for_each_test() @@ -863,36 +737,6 @@ def test_join_host_port(): assert join_host_port("::1", 5555) == "[::1]:5555" -def test_json_count_leaves(): - """Test json_count_leaves function from jsontree utility.""" - from vllm.utils.jsontree import json_count_leaves - - # Single leaf values - assert json_count_leaves(42) == 1 - assert json_count_leaves("hello") == 1 - assert json_count_leaves(None) == 1 - - # Empty containers - assert json_count_leaves([]) == 0 - assert json_count_leaves({}) == 0 - assert json_count_leaves(()) == 0 - - # Flat structures - assert json_count_leaves([1, 2, 3]) == 3 - assert json_count_leaves({"a": 1, "b": 2}) == 2 - assert json_count_leaves((1, 2, 3)) == 3 - - # Nested structures - nested_dict = {"a": 1, "b": {"c": 2, "d": 3}} - assert json_count_leaves(nested_dict) == 3 - - nested_list = [1, [2, 3], 4] - assert json_count_leaves(nested_list) == 4 - - mixed_nested = {"list": [1, 2], "dict": {"x": 3}, "value": 4} - assert json_count_leaves(mixed_nested) == 4 - - def test_convert_ids_list_to_tokens(): tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") token_ids = tokenizer.encode("Hello, world!") diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 76408fba2e169..aaac2deb12ac2 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -30,7 +30,6 @@ from vllm.v1.kv_cache_interface import ( from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager -from vllm.v1.structured_output.request import StructuredOutputRequest from .utils import EOS_TOKEN_ID, create_requests, create_scheduler @@ -335,10 +334,10 @@ def test_stop_via_update_from_output(): requests[0].request_id: [], requests[1].request_id: [10], }, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -383,10 +382,10 @@ def test_stop_via_update_from_output(): requests[0].request_id: [10, 42], requests[1].request_id: [13], }, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -429,10 +428,10 @@ def test_stop_via_update_from_output(): requests[0].request_id: [10, 11], requests[1].request_id: [], }, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -470,10 +469,10 @@ def test_stop_via_update_from_output(): total_num_scheduled_tokens=3, scheduled_encoder_inputs={}, scheduled_spec_decode_tokens={requests[0].request_id: [EOS_TOKEN_ID, 10]}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -1941,7 +1940,6 @@ def 
test_schedule_skip_tokenizer_init_structured_output_request(): sampling_params=sampling_params, pooling_params=None, eos_token_id=EOS_TOKEN_ID, - structured_output_request=StructuredOutputRequest(sampling_params), ) scheduler.add_request(request) output = scheduler.schedule() diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py index 59841a446db3e..02fa27e3f05f7 100644 --- a/tests/v1/cudagraph/test_cudagraph_dispatch.py +++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py @@ -11,7 +11,7 @@ from vllm.compilation.cuda_graph import CUDAGraphWrapper from vllm.compilation.monitor import set_cudagraph_capturing_enabled from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, ParallelConfig, SchedulerConfig, @@ -42,7 +42,7 @@ def _create_vllm_config( mock_config.parallel_config = ParallelConfig() # Mimic the behavior of VllmConfig.__post_init__() - if compilation_config.level == CompilationLevel.PIECEWISE: + if compilation_config.mode == CompilationMode.VLLM_COMPILE: compilation_config.set_splitting_ops_for_v1() return mock_config @@ -50,23 +50,23 @@ def _create_vllm_config( class TestCudagraphDispatcher: @pytest.mark.parametrize( - "case_id,cudagraph_mode_str,compilation_level", + "case_id,cudagraph_mode_str,compilation_mode", [ # Test case 0: Full CG for mixed batches, no separate routine - (0, "FULL", CompilationLevel.NO_COMPILATION), + (0, "FULL", CompilationMode.NONE), # Test case 1: Full CG for uniform batches, piecewise for mixed - (1, "FULL_AND_PIECEWISE", CompilationLevel.NO_COMPILATION), + (1, "FULL_AND_PIECEWISE", CompilationMode.NONE), # Test case 2: Full CG for uniform batches, no CG for mixed - (2, "FULL_DECODE_ONLY", CompilationLevel.NO_COMPILATION), - # Test case 3: Piecewise for all - (3, "PIECEWISE", CompilationLevel.PIECEWISE), + (2, "FULL_DECODE_ONLY", CompilationMode.NONE), + # Test case 3: PIECEWISE for all + (3, "PIECEWISE", CompilationMode.VLLM_COMPILE), ], ) - def test_dispatcher(self, cudagraph_mode_str, compilation_level): + def test_dispatcher(self, cudagraph_mode_str, compilation_mode): # Setup dispatcher comp_config = CompilationConfig( cudagraph_mode=cudagraph_mode_str, - level=compilation_level, + mode=compilation_mode, cudagraph_capture_sizes=[1, 8], ) @@ -242,7 +242,7 @@ class TestCudagraphIntegration: def setup_method(self): # only FULL mode for non-uniform batches self.comp_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, cudagraph_mode="FULL", cudagraph_capture_sizes=[10, 20], ) diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py index 8c8148ae20948..818ae1d7ba677 100644 --- a/tests/v1/cudagraph/test_cudagraph_mode.py +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -10,7 +10,7 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear from tests.v1.attention.utils import full_cg_backend_configs as backend_configs from vllm import LLM -from vllm.config import CompilationConfig +from vllm.config import CompilationConfig, CompilationMode from vllm.platforms import current_platform @@ -73,7 +73,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte gpu_memory_utilization=0.45, max_model_len=1024, compilation_config=CompilationConfig( - level=3, cudagraph_mode=cudagraph_mode + mode=CompilationMode.VLLM_COMPILE, cudagraph_mode=cudagraph_mode ), ) llm.generate(["Hello, my name is"] * 10) @@ -90,32 +90,27 @@ def 
test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte ) -# test cudagraph_mode with different compilation level. -# (backend_name, cudagraph_mode, compilation_level, supported) +# test cudagraph_mode with different compilation mode. +# (backend_name, cudagraph_mode, compilation_mode, supported) combo_cases_2 = [ - ("FA2", "FULL", 0, True), # no compilation + full cudagraph - ("FA2", "FULL", 3, True), # piecewise compilation + full cudagraph - ("FA2", "PIECEWISE", 0, False), # no compilation + piecewise cudagraph - ("FA2", "PIECEWISE", 3, True), # piecewise compilation + piecewise cudagraph - ( - "FA2", - "FULL_AND_PIECEWISE", - 0, - False, - ), # piecewise cudagraph not supported without piecewise compilation - ("FA2", "FULL_AND_PIECEWISE", 3, True), - ("FA2", "FULL_DECODE_ONLY", 0, True), - ("FA2", "FULL_DECODE_ONLY", 3, True), - ("FA2", "NONE", 0, True), # no compilation + no cudagraph - ("FA2", "NONE", 3, True), # piecewise compilation + no cudagraph + ("FA2", "FULL", CompilationMode.NONE, True), + ("FA2", "FULL", CompilationMode.VLLM_COMPILE, True), + ("FA2", "PIECEWISE", CompilationMode.NONE, False), + ("FA2", "PIECEWISE", CompilationMode.VLLM_COMPILE, True), + ("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, False), + ("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True), + ("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True), + ("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True), + ("FA2", "NONE", CompilationMode.NONE, True), + ("FA2", "NONE", CompilationMode.VLLM_COMPILE, True), ] @pytest.mark.parametrize( - "backend_name,cudagraph_mode,compilation_level,supported", combo_cases_2 + "backend_name,cudagraph_mode,compilation_mode,supported", combo_cases_2 ) def test_cudagraph_compilation_combo(combo_case): - backend_name, cudagraph_mode, compilation_level, supported = combo_case + backend_name, cudagraph_mode, compilation_mode, supported = combo_case env_vars = backend_configs[backend_name].env_vars @@ -130,7 +125,7 @@ def test_cudagraph_compilation_combo(combo_case): gpu_memory_utilization=0.45, max_model_len=1024, compilation_config=CompilationConfig( - level=compilation_level, cudagraph_mode=cudagraph_mode + mode=compilation_mode, cudagraph_mode=cudagraph_mode ), ) llm.generate(["Hello, my name is"] * 10) diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py index 89e5f26ac627f..f2c6d1c1fd1a4 100644 --- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py +++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py @@ -7,7 +7,7 @@ import pytest import torch from vllm import LLM, SamplingParams -from vllm.config import CompilationConfig, CompilationLevel +from vllm.config import CompilationConfig, CompilationMode from vllm.distributed import cleanup_dist_env_and_memory from ...utils import fork_new_process_for_each_test @@ -75,9 +75,9 @@ def test_kv_sharing_fast_prefill( # This allows vLLM compilation backend to handle allocating and # managing buffers for cudagraph cudagraph_copy_inputs=True, - level=CompilationLevel.PIECEWISE + mode=CompilationMode.VLLM_COMPILE if not enforce_eager - else CompilationLevel.NO_COMPILATION, + else CompilationMode.NONE, ) with monkeypatch.context() as m: diff --git a/tests/v1/engine/conftest.py b/tests/v1/engine/conftest.py index c5c5d35b83c3e..283a76dab6723 100644 --- a/tests/v1/engine/conftest.py +++ b/tests/v1/engine/conftest.py @@ -6,6 +6,7 @@ import torch from transformers import AutoTokenizer from tests.v1.engine.utils import ( + FULL_STRINGS, 
NUM_PROMPT_LOGPROBS_UNDER_TEST, NUM_SAMPLE_LOGPROBS_UNDER_TEST, PROMPT_LEN, @@ -18,8 +19,6 @@ from vllm.engine.arg_utils import EngineArgs from ...distributed.conftest import publisher_config, random_port # noqa: F401 -from tests.v1.engine.utils import FULL_STRINGS # isort: skip - EngineCoreSampleLogprobsType = list[tuple[torch.Tensor, torch.Tensor]] EngineCorePromptLogprobsType = tuple[torch.Tensor, torch.Tensor] diff --git a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py index 0bb67b574fa14..b5c8f378be182 100644 --- a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py +++ b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py @@ -26,7 +26,7 @@ def _make_empty_scheduler_output(): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, kv_connector_metadata=SharedStorageConnectorMetadata(), ) diff --git a/tests/v1/tpu/test_topk_topp_sampler.py b/tests/v1/tpu/test_topk_topp_sampler.py index c2fc24442c7cd..c6634395bb167 100644 --- a/tests/v1/tpu/test_topk_topp_sampler.py +++ b/tests/v1/tpu/test_topk_topp_sampler.py @@ -8,10 +8,7 @@ import torch_xla from vllm.platforms import current_platform from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p - -# isort: off from vllm.v1.sample.tpu.sampler import apply_top_k_top_p as apply_top_k_top_p_tpu -# isort: on if not current_platform.is_tpu(): pytest.skip("This test needs a TPU.", allow_module_level=True) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index df9fcdc37fa37..e471174ef6744 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -89,10 +89,10 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -168,10 +168,10 @@ def test_update_states_request_finished(model_runner): total_num_scheduled_tokens=0, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids={req_id}, free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -198,10 +198,10 @@ def test_update_states_request_resumed(model_runner): total_num_scheduled_tokens=0, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -225,10 +225,10 @@ def test_update_states_request_resumed(model_runner): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -256,10 +256,10 @@ def test_update_states_no_changes(model_runner): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - 
num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -291,10 +291,10 @@ def test_update_states_request_unscheduled(model_runner): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 817cd7f10c1c6..fe52f565c8a86 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -146,10 +146,10 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -212,10 +212,10 @@ def test_update_states_request_finished(model_runner, dist_init): total_num_scheduled_tokens=0, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids={req_id}, free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -244,10 +244,10 @@ def test_update_states_request_resumed(model_runner, dist_init): total_num_scheduled_tokens=0, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -273,10 +273,10 @@ def test_update_states_request_resumed(model_runner, dist_init): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -366,10 +366,10 @@ def test_update_states_no_changes(model_runner, dist_init): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -403,10 +403,10 @@ def test_update_states_request_unscheduled(model_runner, dist_init): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index 7fdfdb37a0c0f..a3aa546347255 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -28,6 +28,7 @@ FILES = [ "vllm/assets", "vllm/distributed", "vllm/entrypoints", + "vllm/executor", "vllm/inputs", "vllm/logging_utils", "vllm/multimodal", @@ -44,7 +45,6 @@ SEPARATE_GROUPS = [ "vllm/attention", 
"vllm/compilation", "vllm/engine", - "vllm/executor", "vllm/inputs", "vllm/lora", "vllm/model_executor", diff --git a/vllm/assets/video.py b/vllm/assets/video.py index a4e67ca0b63e3..277c8ea1bf0d7 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -5,7 +5,6 @@ from dataclasses import dataclass from functools import lru_cache from typing import Any, ClassVar, Literal -import cv2 import numpy as np import numpy.typing as npt from huggingface_hub import hf_hub_download @@ -43,6 +42,8 @@ def download_video_asset(filename: str) -> str: def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: + import cv2 + cap = cv2.VideoCapture(path) if not cap.isOpened(): raise ValueError(f"Could not open video file {path}") @@ -78,6 +79,8 @@ def video_to_pil_images_list(path: str, num_frames: int = -1) -> list[Image.Imag def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]: + import cv2 + cap = cv2.VideoCapture(path) if not cap.isOpened(): raise ValueError(f"Could not open video file {path}") diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 421b0c4beb370..fb2db4d0b0ec3 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -41,14 +41,6 @@ class AttentionBackend(ABC): # makes sure the output tensor is allocated inside the cudagraph. accept_output_buffer: bool = False - # Whether this backend supports receiving pre-quantized query input. - # If True, the attention layer will handle query quantization instead - # of the backend, allowing torch.compile to fuse quantization with - # previous operations. - # Needs to be worked through for all backends - # https://github.com/vllm-project/vllm/issues/25584 - supports_quant_query_input: bool = False - @staticmethod @abstractmethod def get_name() -> str: @@ -199,6 +191,22 @@ class AttentionImpl(ABC, Generic[T]): """ return False + def supports_quant_query_input(self) -> bool: + """ + Check if this attention implementation supports pre-quantized query input. + + When True, the attention layer will quantize queries before passing them + to this backend, allowing torch.compile to fuse the quantization with + previous operations. This is typically supported when using FP8 KV cache + with compatible attention kernels (e.g., TRT-LLM). + TODO add support to more backends: + https://github.com/vllm-project/vllm/issues/25584 + + Returns: + bool: True if the implementation can accept pre-quantized queries. 
+ """ + return False + class MLAAttentionImpl(AttentionImpl[T], Generic[T]): @abstractmethod diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 929c3b6a4906b..9f879f7272e21 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -36,12 +36,9 @@ from vllm.model_executor.models.vision import get_vit_attn_backend from vllm.platforms import current_platform from vllm.utils import GiB_bytes, direct_register_custom_op +FP8_DTYPE = current_platform.fp8_dtype() logger = init_logger(__name__) USE_XFORMERS_OPS = None -try: - tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe,) -except AttributeError: - tag_cudagraph_unsafe = () # type: ignore[assignment] def check_xformers_availability(): @@ -308,7 +305,7 @@ class Attention(nn.Module, AttentionLayerBase): self.query_quant = None if ( self.kv_cache_dtype.startswith("fp8") - and self.attn_backend.supports_quant_query_input + and self.impl.supports_quant_query_input() ): self.query_quant = QuantFP8(static=True, group_shape=GroupShape.PER_TENSOR) @@ -333,7 +330,6 @@ class Attention(nn.Module, AttentionLayerBase): """ if self.calculate_kv_scales: torch.ops.vllm.maybe_calc_kv_scales(query, key, value, self.layer_name) - output_dtype = query.dtype if self.query_quant is not None: # quantizing with a simple torch operation enables @@ -342,11 +338,14 @@ class Attention(nn.Module, AttentionLayerBase): # Otherwise queries are quantized using custom ops # which causes decoding overheads assert self.kv_cache_dtype in {"fp8", "fp8_e4m3"} - query, _ = self.query_quant(query, self._q_scale) + + # check if query quantization is supported + if self.impl.supports_quant_query_input(): + query, _ = self.query_quant(query, self._q_scale) if self.use_output: output_shape = output_shape if output_shape is not None else query.shape - output = torch.zeros(output_shape, dtype=output_dtype, device=query.device) + output = torch.empty(output_shape, dtype=output_dtype, device=query.device) hidden_size = output_shape[-1] # Reshape the query, key, and value tensors. 
# NOTE(woosuk): We do this outside the custom op to minimize the @@ -591,6 +590,7 @@ class MLAAttention(nn.Module, AttentionLayerBase): prefix: str = "", use_sparse: bool = False, indexer: object | None = None, + **extra_impl_args, ): super().__init__() self.num_heads = num_heads @@ -643,6 +643,7 @@ class MLAAttention(nn.Module, AttentionLayerBase): v_head_dim=self.v_head_dim, kv_b_proj=kv_b_proj, indexer=indexer, + **extra_impl_args, ) self.use_direct_call = not current_platform.opaque_attention_op() @@ -705,7 +706,7 @@ class MLAAttention(nn.Module, AttentionLayerBase): self.calc_kv_scales(q, kv_c_normed, k_pe) if self.attn_backend.accept_output_buffer: - output = torch.zeros(output_shape, dtype=q.dtype, device=q.device) + output = torch.empty(output_shape, dtype=q.dtype, device=q.device) self.impl.forward( self, q, @@ -722,7 +723,7 @@ class MLAAttention(nn.Module, AttentionLayerBase): ) else: if self.attn_backend.accept_output_buffer: - output = torch.zeros(output_shape, dtype=q.dtype, device=q.device) + output = torch.empty(output_shape, dtype=q.dtype, device=q.device) torch.ops.vllm.unified_mla_attention_with_output( q, kv_c_normed, @@ -879,7 +880,6 @@ direct_register_custom_op( op_name="unified_attention", op_func=unified_attention, fake_impl=unified_attention_fake, - tags=tag_cudagraph_unsafe, ) @@ -931,7 +931,6 @@ direct_register_custom_op( op_func=unified_attention_with_output, mutates_args=["output", "output_block_scale"], fake_impl=unified_attention_with_output_fake, - tags=tag_cudagraph_unsafe, ) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index d610389ddb6b0..20a15bbc31e38 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -2979,13 +2979,14 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset): requests = [] token_mismatch_total = 0 for _ in range(num_prefixes): - prefix_tokens = _generate_exact_length_tokens(prefix_len) + prefix_tokens, prefix_mismatch = _generate_exact_length_tokens(prefix_len) + token_mismatch_total += prefix_mismatch for _ in range(prompts_per_prefix): - suffix_tokens, token_mistmatch = _generate_exact_length_tokens( + suffix_tokens, suffix_mismatch = _generate_exact_length_tokens( suffix_len ) - token_mismatch_total += token_mistmatch + token_mismatch_total += suffix_mismatch combined_tokens = prefix_tokens + suffix_tokens prompt = tokenizer.decode(combined_tokens) prompt_len = len(combined_tokens) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index c52e384a40023..3c85a1e8fdd9e 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1230,6 +1230,15 @@ def add_cli_args(parser: argparse.ArgumentParser): "the ready check will be skipped.", ) + parser.add_argument( + "--extra-body", + help="A JSON string representing extra body parameters to include " + "in each request." + 'Example: \'{"chat_template_kwargs":{"enable_thinking":false}}\'', + type=json.loads, + default=None, + ) + def main(args: argparse.Namespace) -> dict[str, Any]: return asyncio.run(main_async(args)) @@ -1330,6 +1339,9 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: else: sampling_params = {} + extra_body = args.extra_body or {} + extra_body = {**sampling_params, **extra_body} + # Avoid GC processing "static" data - reduce pause times. 
gc.collect() gc.freeze() @@ -1355,7 +1367,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: max_concurrency=args.max_concurrency, lora_modules=args.lora_modules, extra_headers=headers, - extra_body=sampling_params, + extra_body=extra_body, ramp_up_strategy=args.ramp_up_strategy, ramp_up_start_rps=args.ramp_up_start_rps, ramp_up_end_rps=args.ramp_up_end_rps, diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 01c6824ac91f8..866365ac18eb9 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -34,7 +34,7 @@ from vllm.inputs import TextPrompt, TokensPrompt from vllm.lora.request import LoRARequest from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams -from vllm.utils import merge_async_iterators +from vllm.utils.async_utils import merge_async_iterators def run_vllm( @@ -251,7 +251,7 @@ def run_hf( disable_detokenize: bool = False, ) -> float: llm = AutoModelForCausalLM.from_pretrained( - model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code + model, dtype=torch.float16, trust_remote_code=trust_remote_code ) if llm.config.model_type == "llama": # To enable padding in the HF backend. diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 46c433fe6aefb..91be7e85af518 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -56,7 +56,7 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: return InductorAdaptor() else: assert compilation_config.backend == "eager", ( - "Custom backends not supported with CompilationLevel.PIECEWISE" + "Custom backends not supported with CompilationMode.VLLM_COMPILE" ) logger.debug("Using EagerAdaptor") @@ -481,7 +481,7 @@ def set_model_tag(tag: str): class VllmBackend: """The compilation backend for `torch.compile` with vLLM. - It is used for compilation level of `CompilationLevel.PIECEWISE`, + It is used for compilation mode of `CompilationMode.VLLM_COMPILE`, where we customize the compilation. The major work of this backend is to split the graph into diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 4553007027e39..e2369a635ad1f 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -575,7 +575,7 @@ class InductorAdaptor(CompilerInterface): Because it is re-entrant, we always set it (even if entering via Dynamo and the context was already entered). We might want to revisit if it - should be set at a different level of compilation. + should be set at a different mode of compilation. This is likely a bug in PyTorch: public APIs should not rely on manually setting up internal contexts. 
But we also rely on non-public diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py index 9e8de831bcb29..20918099f169d 100644 --- a/vllm/compilation/counter.py +++ b/vllm/compilation/counter.py @@ -27,8 +27,8 @@ class CompilationCounter: num_cache_entries_updated: int = 0 # The number of standalone_compile compiled artifacts saved num_compiled_artifacts_saved: int = 0 - # Number of times a model was loaded with CompilationLevel.DYNAMO_AS_IS - dynamo_as_is_count: int = 0 + # Number of times a model was loaded with CompilationMode.STOCK_TORCH_COMPILE + stock_torch_compile_count: int = 0 def clone(self) -> "CompilationCounter": return copy.deepcopy(self) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index fe19d4e851294..20d4681e2c789 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -18,7 +18,7 @@ from torch._dynamo.symbolic_convert import InliningInstructionTranslator import vllm.envs as envs from vllm.compilation.counter import compilation_counter from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import CompilationLevel, VllmConfig, set_current_vllm_config +from vllm.config import CompilationMode, VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.sequence import IntermediateTensors from vllm.utils import resolve_obj_by_qualname, supports_dynamo @@ -233,11 +233,11 @@ def _support_torch_compile( old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) self.vllm_config = vllm_config enable_compile = enable_if is None or enable_if(vllm_config) - # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner + # for CompilationMode.STOCK_TORCH_COMPILE , the upper level model runner # will handle the compilation, so we don't need to do anything here. 
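For reference, a self-contained sketch of the gate that follows this comment: only DYNAMO_TRACE_ONCE and VLLM_COMPILE reach the decorator's compilation path. The CompilationMode constants are inlined here rather than imported from vllm.config, and the helper name is invented for illustration.

# CompilationMode constants inlined for illustration (values match the new class).
NONE, STOCK_TORCH_COMPILE, DYNAMO_TRACE_ONCE, VLLM_COMPILE = 0, 1, 2, 3

def should_skip_model_compile(mode: int, dynamo_supported: bool = True) -> bool:
    # Mirrors the `do_not_compile` condition: NONE compiles nothing, and for
    # STOCK_TORCH_COMPILE the upper-level model runner applies torch.compile
    # itself, so the decorator stays out of the way.
    return mode in (NONE, STOCK_TORCH_COMPILE) or not dynamo_supported

assert should_skip_model_compile(STOCK_TORCH_COMPILE) is True
assert should_skip_model_compile(VLLM_COMPILE) is False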
self.do_not_compile = ( - vllm_config.compilation_config.level - in [CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS] + vllm_config.compilation_config.mode + in [CompilationMode.NONE, CompilationMode.STOCK_TORCH_COMPILE] or not supports_dynamo() or _should_ignore_torch_compile(self.__class__) or not enable_compile @@ -247,7 +247,7 @@ def _support_torch_compile( compilation_counter.num_models_seen += 1 TorchCompileWrapperWithCustomDispatcher.__init__( - self, compilation_level=vllm_config.compilation_config.level + self, compilation_mode=vllm_config.compilation_config.mode ) cls.__init__ = __init__ diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index d3c437795fabb..1e6d0e79228b0 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -3,7 +3,7 @@ import time -from vllm.config import CompilationConfig, CompilationLevel, VllmConfig +from vllm.config import CompilationConfig, CompilationMode, VllmConfig from vllm.logger import init_logger logger = init_logger(__name__) @@ -18,7 +18,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig): compilation_config: CompilationConfig = vllm_config.compilation_config path = vllm_config.compile_debug_dump_path() - if compilation_config.level == CompilationLevel.PIECEWISE and path: + if compilation_config.mode == CompilationMode.VLLM_COMPILE and path: import depyf path.mkdir(parents=True, exist_ok=True) @@ -29,7 +29,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig): def end_monitoring_torch_compile(vllm_config: VllmConfig): compilation_config: CompilationConfig = vllm_config.compilation_config - if compilation_config.level == CompilationLevel.PIECEWISE: + if compilation_config.mode == CompilationMode.VLLM_COMPILE: logger.info( "torch.compile takes %.2f s in total", compilation_config.compilation_time ) diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py index 5ea1b30860f59..cea4f9a816377 100644 --- a/vllm/compilation/partition_rules.py +++ b/vllm/compilation/partition_rules.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib +import logging from typing import TYPE_CHECKING from torch._library.utils import lookup_op @@ -38,8 +39,16 @@ def resolve_defined_ops(op_names: list[str]) -> list["torch._ops.OpOverload"]: resolved.append(lookup_op(op_name)) except Exception: # Skip operators that don't exist (e.g., model-specific ops) - logger.warning( - "Failed to resolve operator for Inductor partition: %s", op_name + # Do not warn for attention ops, warn for others + # (most likely manually specified) + from vllm.config import CompilationConfig + + logger.log( + logging.DEBUG + if op_name in CompilationConfig._attention_ops + else logging.WARNING, + "Failed to resolve operator for CUDAGraph partition: %s", + op_name, ) continue diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index b4a0d89af0d6d..4b10c85209f63 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -11,7 +11,7 @@ from types import CodeType import torch import vllm.envs as envs -from vllm.config import CompilationLevel, CUDAGraphMode, get_current_vllm_config +from vllm.config import CompilationMode, CUDAGraphMode, get_current_vllm_config from vllm.logger import init_logger logger = init_logger(__name__) @@ -31,7 +31,7 @@ class TorchCompileWrapperWithCustomDispatcher: """ def __init__( - self, compiled_callable: Callable | None = None, compilation_level: int = 0 + self, 
compiled_callable: Callable | None = None, compilation_mode: int = 0 ): vllm_config = get_current_vllm_config() self.vllm_config = vllm_config @@ -72,7 +72,7 @@ class TorchCompileWrapperWithCustomDispatcher: # subclasses can use this to switch between the custom dispatcher # and the default Dynamo guard mechanism. self.use_custom_dispatcher: bool = ( - compilation_level >= CompilationLevel.DYNAMO_ONCE + compilation_mode >= CompilationMode.DYNAMO_TRACE_ONCE ) def aot_compile(self, *args, **kwargs): @@ -85,7 +85,7 @@ class TorchCompileWrapperWithCustomDispatcher: return self.compiled_callable.aot_compile((args, kwargs)) def __call__(self, *args, **kwargs): - """Implement the dispatch logic here, beyond the torch.compile level. + """Implement the dispatch logic here, beyond the torch.compile mode. NOTE: this function can have additional arguments beyond the forward method, for directly dispatching to the compiled code. """ diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 6a0197d044dcd..7f1cc52024205 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -4,7 +4,7 @@ from vllm.config.cache import CacheConfig from vllm.config.compilation import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, PassConfig, ) @@ -49,7 +49,7 @@ __all__ = [ "CacheConfig", # From vllm.config.compilation "CompilationConfig", - "CompilationLevel", + "CompilationMode", "CUDAGraphMode", "PassConfig", # From vllm.config.device diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 60aef2f6f7e1c..a34fb0bf920c0 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -26,12 +26,20 @@ else: logger = init_logger(__name__) -class CompilationLevel: - # constants for the levels of the compilation process - NO_COMPILATION = 0 - DYNAMO_AS_IS = 1 - DYNAMO_ONCE = 2 - PIECEWISE = 3 +class CompilationMode: + """The compilation approach used for torch.compile-based compilation of the + model.""" + + NONE = 0 + """No torch.compile compilation is applied, model runs in fully eager pytorch mode. + The model runs as-is.""" + STOCK_TORCH_COMPILE = 1 + """The standard `torch.compile` compilation pipeline.""" + DYNAMO_TRACE_ONCE = 2 + """Single Dynamo trace through the model, avoiding recompilation.""" + VLLM_COMPILE = 3 + """Custom vLLM Inductor-based backend with caching, piecewise compilation, + shape specialization, and custom passes.""" class CUDAGraphMode(enum.Enum): @@ -134,7 +142,7 @@ class CompilationConfig: """Configuration for compilation. It has three parts: - Top-level Compilation control: - - [`level`][vllm.config.CompilationConfig.level] + - [`mode`][vllm.config.CompilationConfig.mode] - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path] - [`cache_dir`][vllm.config.CompilationConfig.cache_dir] - [`backend`][vllm.config.CompilationConfig.backend] @@ -171,14 +179,26 @@ class CompilationConfig: # Top-level Compilation control level: int | None = None - """The level of compilation: + """ + Level is deprecated and will be removed in the next release, + either 0.12.0 or 0.11.2 whichever is soonest. + Please use mode. Currently all levels are mapped to mode. + """ + # Top-level Compilation control + mode: int | None = None + """The compilation approach used for torch.compile-based compilation of the + model. - - None: If None, we will select the default compilation level. - For V1 engine this is 3, for V0 engine this is 0. - - 0: no compilation. - - 1: dynamo as is. - - 2: dynamo once. 
- - 3: piecewise compilation.""" + - None: If None, we will select the default compilation mode. + For V1 engine this is 3. + - 0: NONE: No torch.compile compilation is applied, model runs in fully + eager pytorch mode. The model runs as-is. + - 1: STOCK_TORCH_COMPILE: The standard `torch.compile` compilation pipeline. + - 2: DYNAMO_TRACE_ONCE: Single Dynamo trace through the model, avoiding + recompilation by removing guards. + Requires no dynamic-shape-dependent control-flow. + - 3: VLLM_COMPILE: Custom vLLM Inductor-based backend with caching, + piecewise compilation, shape specialization, and custom passes.""" debug_dump_path: Path | None = None """The path to dump the debug information.""" cache_dir: str = "" @@ -195,13 +215,13 @@ class CompilationConfig: backend function. We use string to avoid serialization issues when using compilation in a - distributed setting. When the compilation level is 1 or 2, the backend is + distributed setting. When the compilation mode is 1 or 2, the backend is used for the compilation directly (it sees the whole graph). When the - compilation level is 3, the backend is used for the piecewise compilation + compilation mode is 3, the backend is used for the piecewise compilation (it sees a part of the graph). The backend can not be custom for compilation - level 3, i.e. the backend must be either eager or inductor. Furthermore, + mode 3, i.e. the backend must be either eager or inductor. Furthermore, compilation is only piecewise if splitting ops is set accordingly and - use_inductor_cudagraphs_partition is off. Note that the default options for + use_inductor_graph_partition is off. Note that the default options for splitting ops are sufficient for piecewise compilation. """ custom_ops: list[str] = field(default_factory=list) @@ -214,7 +234,7 @@ class CompilationConfig: - 'none,+op1,+op2' to enable only op1 and op2 By default, all custom ops are enabled when running without Inductor and - disabled when running with Inductor: level>=PIECEWISE and use_inductor=True. + disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True. Inductor generates (fused) Triton kernels for disabled custom ops.""" splitting_ops: list[str] | None = None """A list of ops to exclude from cudagraphs, used in piecewise compilation. @@ -249,7 +269,7 @@ class CompilationConfig: One graph for symbolic shape and one graph per size in compile_sizes are compiled using configurations in inductor_compile_config. - This setting is ignored if level