diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index 76f6d7aeca0d8..77ee313687fc8 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -5,11 +5,11 @@
 import os
 import sys
 import zipfile
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
 # Note that we have 800 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
 def print_top_10_largest_files(zip_file):
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
new file mode 100644
index 0000000000000..56ec933c9cc0e
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.419
+  - name: "exact_match,flexible-extract"
+    value: 0.416
+limit: 1000
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
new file mode 100644
index 0000000000000..f10b937249975
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
@@ -0,0 +1,11 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    value: 0.90
+limit: 100
+num_fewshot: 0
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
new file mode 100644
index 0000000000000..96eeed04a9dc0
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -0,0 +1,11 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5 +model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" +backend: "vllm-vlm" +tasks: +- name: "mmlu_pro" + metrics: + - name: "exact_match,custom-extract" + value: 0.80 +limit: 250 # will run on 250 * 14 subjects = 3500 samples +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml index a2f235f485815..aa4fb9fa03d6d 100644 --- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml +++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml @@ -1,4 +1,5 @@ -# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1 +# For vllm script, with -t option (tensor parallel size) +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic" tasks: - name: "gsm8k" diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml new file mode 100644 index 0000000000000..5f3c31743e75b --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1 + +model_name: "Qwen/Qwen2.5-VL-7B-Instruct" +backend: "vllm-vlm" +tasks: +- name: "chartqa" + metrics: + - name: "relaxed_accuracy,none" + value: 0.855 +limit: 2500 +num_fewshot: 0 diff --git a/.buildkite/lm-eval-harness/configs/models-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-large-h100.txt new file mode 100644 index 0000000000000..4fb0b84bc4d81 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-large-h100.txt @@ -0,0 +1 @@ +Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt new file mode 100644 index 0000000000000..91e22b6459c12 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt @@ -0,0 +1 @@ +Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-mm-small.txt b/.buildkite/lm-eval-harness/configs/models-mm-small.txt new file mode 100644 index 0000000000000..1097d220245fc --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-mm-small.txt @@ -0,0 +1 @@ +Qwen2.5-VL-7B-Instruct.yaml \ No newline at end of file diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh new file mode 100755 index 0000000000000..c8db951381b0b --- /dev/null +++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on chartqa for vllm. +# +# Make sure you have lm-eval-harness installed: +# pip install lm-eval==0.4.9 + +usage() { + echo`` + echo "Runs lm eval harness on ChartQA using multimodal vllm." + echo "This pathway is intended to be used to create baselines for " + echo "our correctness tests in vllm's CI." 
+ echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -l - limit number of samples to run" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:l:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm-vlm \ + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \ + --tasks chartqa \ + --batch_size auto \ + --apply_chat_template \ + --limit $LIMIT diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh old mode 100644 new mode 100755 diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh new file mode 100644 index 0000000000000..d85a1721db9a5 --- /dev/null +++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on MMLUPRO for vllm. +# We use this for fp8, which HF does not support. +# +# Make sure you have lm-eval-harness installed: +# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] + +usage() { + echo`` + echo "Runs lm eval harness on MMLU Pro using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:b:l:f:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm \ + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \ + --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size auto diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index ceea01166b7f4..f10de82b1d8e8 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -19,21 +19,27 @@ RTOL = 0.08 def launch_lm_eval(eval_config, tp_size): trust_remote_code = eval_config.get("trust_remote_code", False) max_model_len = eval_config.get("max_model_len", 4096) + batch_size = eval_config.get("batch_size", "auto") + backend = eval_config.get("backend", "vllm") model_args = ( f"pretrained={eval_config['model_name']}," f"tensor_parallel_size={tp_size}," f"enforce_eager=true," f"add_bos_token=true," f"trust_remote_code={trust_remote_code}," - f"max_model_len={max_model_len}" + f"max_model_len={max_model_len}," ) results = lm_eval.simple_evaluate( - model="vllm", + model=backend, model_args=model_args, tasks=[task["name"] for task in eval_config["tasks"]], num_fewshot=eval_config["num_fewshot"], limit=eval_config["limit"], - batch_size="auto", + # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help + # text models. 
however, this is regressing measured strict-match for
+        # existing text models in CI, so only apply it for mm.
+        apply_chat_template=backend == "vllm-vlm",
+        batch_size=batch_size,
     )
     return results
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
new file mode 100644
index 0000000000000..91f0b850575c4
--- /dev/null
+++ b/.buildkite/test-amd.yaml
@@ -0,0 +1,1266 @@
+# In this file, you can add more tests to run either by adding a new step or
+# adding a new command to an existing step. See different options here for examples.
+# This script will be fed into the Jinja template in `test-template-aws.j2` at
+# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
+# to generate the final pipeline yaml file.
+# Documentation
+# label(str): the name of the test. emojis allowed.
+# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
+# fast_check_only(bool): run this test on the fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
+# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
+# command(str): the single command to run for tests. incompatible with commands.
+# commands(list): the list of commands to run for the test. incompatible with command.
+# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
+# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
+# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
+#   in this case, commands must be specified. the first command runs on the first host, the second
+#   command runs on the second host.
+# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
+# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
+#   and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
+# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
+# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
+
+# When adding a test
+# - If the test belongs to an existing group, add it there
+# - If the test is short, add to any existing step
+# - If the test takes more than 10min, then it is okay to create a new step.
+# Note that all steps execute in parallel.
+
+steps:
+##### fast check tests #####
+
+- label: Pytorch Nightly Dependency Override Check # 2min
+  # if this test fails, it means the nightly torch version is not compatible with some
+  # of the dependencies.
Please check the error message and add the package to whitelist + # in /vllm/tools/generate_nightly_torch_test.py + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + soft_fail: true + source_file_dependencies: + - requirements/nightly_torch_test.txt + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh + +- label: Async Engine, Inputs, Utils, Worker Test # 36min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/multimodal + - tests/utils_ + commands: + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ + +- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins + timeout_in_minutes: 10 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/test_inputs.py + - tests/test_outputs.py + - tests/multimodal + - tests/standalone_tests/lazy_imports.py + - tests/transformers_utils + no_gpu: true + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s transformers_utils + +- label: Python-only Installation Test # 10min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - tests/standalone_tests/python_only_compile.sh + - setup.py + commands: + - bash standalone_tests/python_only_compile.sh + +- label: Basic Correctness Test # 20min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_basic_correctness + - tests/basic_correctness/test_cpu_offload + - tests/basic_correctness/test_cumem.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py + +- label: Entrypoints Unit Tests # 5min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 10 + working_dir: "/vllm-workspace/tests" + fast_check: true + source_file_dependencies: + - vllm/entrypoints + - tests/entrypoints/ + commands: + - pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + +- label: Entrypoints Integration Test (LLM) # 30min + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/llm + - tests/entrypoints/offline_mode + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process + - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + +- label: Entrypoints Integration Test (API Server) # 100min + timeout_in_minutes: 130 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: 
Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/test_chat_utils.py + +- label: Entrypoints Integration Test (Pooling) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/pooling + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + +- label: Distributed Tests (4 GPUs) # 35min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - tests/distributed/test_utils + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/test_basic_correctness + - examples/offline_inference/rlhf.py + - examples/offline_inference/rlhf_colocate.py + - tests/examples/offline_inference/data_parallel.py + - tests/v1/distributed + - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_symm_mem_allreduce.py + commands: + # test with torchrun tp=2 and external_dp=2 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=2 and pp=2 + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=4 and dp=1 + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2, pp=2 and dp=1 + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=1 and dp=4 with ep + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2 and dp=2 with ep + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with internal dp + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + # TODO: create a dedicated test section for multi-GPU example tests + # when we have multiple distributed example tests + - pushd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 
python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd + +- label: EPLB Algorithm Test # 5min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_algo.py + commands: + - pytest -v -s distributed/test_eplb_algo.py + +- label: EPLB Execution Test # 5min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_execute.py + commands: + - pytest -v -s distributed/test_eplb_execute.py + +- label: Metrics, Tracing Test # 12min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + # grade: Blocking + num_gpus: 2 + source_file_dependencies: + - vllm/ + - tests/v1/tracing + commands: + - "pip install \ + 'opentelemetry-sdk>=1.26.0' \ + 'opentelemetry-api>=1.26.0' \ + 'opentelemetry-exporter-otlp>=1.26.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1'" + - pytest -v -s v1/tracing + +##### fast check tests ##### +##### 1 GPU test ##### + +- label: Regression Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + grade: Blocking + source_file_dependencies: + - vllm/ + - tests/test_regression + commands: + - pip install modelscope + - pytest -v -s test_regression.py + working_dir: "/vllm-workspace/tests" # optional + +- label: Engine Test # 25min + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + #grade: Blocking + source_file_dependencies: + - vllm/ + - tests/engine + - tests/tokenization + - tests/test_sequence + - tests/test_config + - tests/test_logger + - tests/test_vllm_port + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + # OOM in the CI unless we run this separately + - pytest -v -s tokenization + +- label: V1 Test e2e + engine # 30min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - pytest -v -s v1/e2e + - pytest -v -s v1/engine + +- label: V1 Test entrypoints # 35min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/entrypoints + +- label: V1 Test others # 42min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # split the test to avoid interference + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/worker + - pytest -v -s v1/spec_decode + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + # Integration test for streaming correctness (requires special branch). 
+ - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +- label: V1 Test others (CPU) # 5 mins + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + no_gpu: true + commands: + # split the test to avoid interference + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + + +- label: Examples Test # 30min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/examples" + source_file_dependencies: + - vllm/entrypoints + - examples/ + commands: + - pip install tensorizer # for tensorizer test + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 offline_inference/basic/chat.py + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_pooling.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + +- label: Platform Tests (CUDA) # 4min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/cuda + commands: + - pytest -v -s cuda/test_cuda_context.py + +- label: Samplers Test # 56min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/model_executor/layers + - vllm/sampling_metadata.py + - tests/samplers + - tests/conftest.py + commands: + - pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers + +- label: LoRA Test %N # 20min each + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - pytest -v -s lora \ + --shard-id=$$BUILDKITE_PARALLEL_JOB \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --ignore=lora/test_chatglm3_tp.py \ + --ignore=lora/test_llama_tp.py \ + 
--ignore=lora/test_llm_with_multi_loras.py + parallelism: 4 + +- label: PyTorch Compilation Unit Tests # 15min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_pass_manager.py + - pytest -v -s compile/test_fusion.py + - pytest -v -s compile/test_fusion_attn.py + - pytest -v -s compile/test_functionalization.py + - pytest -v -s compile/test_silu_mul_quant_fusion.py + - pytest -v -s compile/test_sequence_parallelism.py + - pytest -v -s compile/test_async_tp.py + - pytest -v -s compile/test_fusion_all_reduce.py + - pytest -v -s compile/test_decorator.py + - pytest -v -s compile/test_noop_elimination.py + +- label: PyTorch Fullgraph Smoke Test # 15min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s compile/piecewise/ + +- label: PyTorch Fullgraph Test # 20min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_full_graph.py + +- label: Kernels Core Operation Test # 48min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - tests/kernels/core + commands: + - pytest -v -s kernels/core + +- label: Kernels Attention Test %N # 23min + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - csrc/attention/ + - vllm/attention + - vllm/v1/attention + - tests/kernels/attention + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Quantization Test %N # 64min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels MoE Test %N # 40min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - csrc/quantization/cutlass_w8a8/moe/ + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ + commands: + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Mamba Test # 31min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + - vllm/model_executor/layers/mamba/ops + commands: + - pytest -v -s kernels/mamba + +- label: Model Executor Test # 23min + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/model_executor + - tests/model_executor + - 
tests/entrypoints/openai/test_tensorizer_entrypoint.py + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py + +- label: Benchmarks # 11min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_8 + # grade: Blocking + working_dir: "/vllm-workspace/.buildkite" + source_file_dependencies: + - benchmarks/ + commands: + - bash scripts/run-benchmarks.sh + +- label: Benchmarks CLI Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/benchmarks/ + commands: + - pytest -v -s benchmarks/ + +- label: Quantization Test # 70min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/quantization + commands: + # temporary install here since we need nightly, will move to requirements/test.in + # after torchao 0.12 release, and pin a working version of torchao nightly here + + # since torchao nightly is only compatible with torch nightly currently + # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now + # we can only upgrade after this is resolved + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.13.0 + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ + +- label: LM Eval Small Models # 53min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + +- label: OpenAI API correctness # 22min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/entrypoints/openai/ + - vllm/model_executor/models/whisper.py + commands: # LMEval+Transcription WER check + - pytest -s entrypoints/openai/correctness/ + +- label: OpenAI-Compatible Tool Use # 23 min + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + fast_check: false + source_file_dependencies: + - vllm/ + - tests/tool_use + commands: + - pytest -v -s -m 'not cpu_test' tool_use + +- label: OpenAI-Compatible Tool Use (CPU) # 5 mins + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 10 + source_file_dependencies: + - vllm/ + - tests/tool_use + no_gpu: true + commands: + - pytest -v -s -m 'cpu_test' tool_use + +##### models test ##### + +- label: Basic Models Tests (Initialization) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_initialization.py + commands: + # Run a subset of model initialization tests + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + +- label: Basic Models Tests (Extra Initialization) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_8 + # grade: Blocking + torch_nightly: true + 
source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/test_initialization.py + commands: + # Only when vLLM model source is modified - test initialization of a large + # subset of supported models (the complement of the small subset in the above + # test.) Also run if model initialization test file is modified + - pytest -v -s models/test_initialization.py \ + -k 'not test_can_initialize_small_subset' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Basic Models Tests (Other) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_transformers.py + - tests/models/test_registry.py + commands: + - pytest -v -s models/test_transformers.py models/test_registry.py + +- label: Basic Models Test (Other CPU) # 5min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 10 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_utils.py + - tests/models/test_vision.py + no_gpu: true + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + +- label: Language Models Tests (Standard) + timeout_in_minutes: 25 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language + commands: + # Test standard language models, excluding a subset of slow tests + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' + +- label: Language Models Tests (Extra Standard) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/language/pooling/test_embedding.py + - tests/models/language/generation/test_common.py + - tests/models/language/pooling/test_classification.py + commands: + # Shard slow subset of standard language models tests. 
Only run when model + # source is modified, or when specified test files are modified + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and slow_test' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Tests (Hybrid) %N + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + # Shard hybrid language model tests + - pytest -v -s models/language/generation \ + -m hybrid_model \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Test (Extended Generation) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. + - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + +- label: Language Models Test (PPL) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test + +- label: Language Models Test (Extended Pooling) # 36min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' + +- label: Language Models Test (MTEB) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling_mteb_test + commands: + - pytest -v -s models/language/pooling_mteb_test + +- label: Multi-Modal Processor Test # 44min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing + +- label: Multi-Modal Models Test (Standard) # 60min + timeout_in_minutes: 80 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pip freeze | grep -E 'torch' + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + +- label: Multi-Modal Models Test (Extended) 1 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing + +- label: Multi-Modal Models Test (Extended) 2 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + +- label: Multi-Modal Models Test (Extended) 3 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + +- label: Quantized Models Test # 45 min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/model_executor/layers/quantization + - tests/models/quantization + commands: + - pytest -v -s models/quantization + +# This test is used only in PR development phase to test individual models and should never run on main +- label: Custom Models Test + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + optional: true + commands: + - echo 'Testing custom models...' + # PR authors can temporarily add commands below to test individual models + # e.g. 
pytest -v -s models/encoder_decoder/vision_language/test_mllama.py + # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* + +- label: Transformers Nightly Models Test + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + working_dir: "/vllm-workspace/" + optional: true + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py + - python3 examples/offline_inference/basic/chat.py + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # Whisper needs spawn method to avoid deadlock + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper + +- label: Blackwell Test # 38 min + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + # optional: true + source_file_dependencies: + - csrc/quantization/fp4/ + - csrc/attention/mla/ + - csrc/quantization/cutlass_w8a8/moe/ + - vllm/model_executor/layers/fused_moe/cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/fusion.py + - vllm/compilation/fusion_attn.py + commands: + - nvidia-smi + - python3 examples/offline_inference/basic/chat.py + # Attention + # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py + - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py + # Quantization + - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py + # Fusion + - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern + - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + +- label: Blackwell GPT-OSS Eval + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + +- label: Blackwell Quantized MoE Test + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - tests/quantization/test_blackwell_moe.py + - 
vllm/model_executor/models/deepseek_v2.py + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/models/llama4.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization/compressed_tensors + - vllm/model_executor/layers/quantization/modelopt.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - pytest -s -v tests/quantization/test_blackwell_moe.py + +- label: Blackwell LM Eval Small Models + timeout_in_minutes: 120 + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 + +##### 1 GPU test ##### +##### multi gpus test ##### + +- label: Distributed Comm Ops Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/distributed + - tests/distributed + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py + +- label: 2 Node Tests (4 GPUs in total) # 16min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + num_nodes: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + - tests/examples/offline_inference/data_parallel.py + commands: + - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py + - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + +- label: Distributed Tests (2 GPUs) # 68min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/compilation/ + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - 
vllm/v1/engine/ + - vllm/v1/worker/ + - tests/compile/test_basic_correctness.py + - tests/compile/test_wrapper.py + - tests/distributed/ + - tests/entrypoints/llm/test_collective_rpc.py + - tests/v1/distributed + - tests/v1/entrypoints/openai/test_multi_api_servers.py + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py + commands: + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - pytest -v -s distributed/test_sequence_parallel.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py + +- label: Distributed Model Tests (2 GPUs) # 37min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/model_executor/model_loader/sharded_state_loader.py + - vllm/model_executor/models/ + - tests/basic_correctness/ + - tests/model_executor/model_loader/test_sharded_state_loader.py + - tests/models/ + commands: + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py + # Avoid importing model tests that cause CUDA reinitialization error + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' + +- label: Plugin Tests (2 GPUs) # 40min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/plugins/ + - tests/plugins/ + commands: + # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + # end platform plugin tests + # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + # end io_processor plugins test + # other tests continue here: + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process + - pytest -v -s models/test_oot_registration.py # it needs a clean process + - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins + +- label: Pipeline + Context Parallelism Test # 45min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: 
mi325_4
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  commands:
+  - pytest -v -s distributed/test_pp_cudagraph.py
+  - pytest -v -s distributed/test_pipeline_parallel.py
+
+- label: LoRA TP Test (Distributed) # 17 min
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/lora
+  - tests/lora
+  commands:
+  # FIXIT: find out which code initializes cuda before running the test
+  # before the fix, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  # There is some Tensor Parallelism related processing logic in LoRA that
+  # requires multi-GPU testing for validation.
+  - pytest -v -s -x lora/test_chatglm3_tp.py
+  - pytest -v -s -x lora/test_llama_tp.py
+  - pytest -v -s -x lora/test_llm_with_multi_loras.py
+
+
+- label: Weight Loading Multiple GPU Test # 33min
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU Test - Large Models # optional
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  gpu: a100
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+
+
+##### multi gpus test #####
+##### A100 test #####
+
+- label: Distributed Tests (A100) # optional
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/
+  commands:
+  # NOTE: don't test llama model here, it seems hf implementation is buggy
+  # see https://github.com/vllm-project/vllm/pull/5689 for details
+  - pytest -v -s distributed/test_custom_all_reduce.py
+  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - pytest -v -s -x lora/test_mixtral.py
+
+- label: LM Eval Large Models # optional
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+##### H200 test #####
+- label: Distributed Tests (H200) # optional
+  gpu: h200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+  - pytest -v -s tests/distributed/test_context_parallel.py
+  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+
+##### B200 test #####
+- label: Distributed Tests (B200) # optional
+  gpu: b200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+  - pytest -v -s tests/distributed/test_context_parallel.py
+  - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+##### RL Integration Tests ##### +- label: Prime-RL Integration Test # 15min + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + timeout_in_minutes: 30 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/ + - .buildkite/scripts/run-prime-rl-test.sh + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ebe0602a1b5db..a8a5bf3ad234d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -527,7 +527,8 @@ steps: # since torchao nightly is only compatible with torch nightly currently # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # we can only upgrade after this is resolved - - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.13.0 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ - label: LM Eval Small Models # 53min @@ -733,6 +734,16 @@ steps: - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work +- label: Multi-Modal Accuracy Eval (Small Models) # 50min + timeout_in_minutes: 70 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + - label: Multi-Modal Models Test (Extended) 1 mirror_hardwares: [amdexperimental] optional: true diff --git a/.coveragerc b/.coveragerc index bc6342956109b..b7a9fdb4e05a8 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,5 +1,10 @@ [run] -source = vllm +# Track the installed vllm package (this is what actually gets imported during tests) +# Use wildcard pattern to match the installed location +source = + vllm + */dist-packages/vllm + */site-packages/vllm omit = */tests/* */test_* @@ -12,6 +17,16 @@ omit = */benchmarks/* */docs/* +[paths] +# Map all possible vllm locations to a canonical "vllm" path +# This ensures coverage.combine properly merges data from different test runs +source = + vllm + /vllm-workspace/src/vllm + /vllm-workspace/vllm + */site-packages/vllm + */dist-packages/vllm + [report] exclude_lines = pragma: no cover diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml index c2b17abe811cd..7d565ef9f2e45 100644 --- a/.github/workflows/issue_autolabel.yml +++ b/.github/workflows/issue_autolabel.yml @@ -13,6 +13,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Label issues based on keywords + id: label-step uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: script: | @@ -42,7 +43,6 @@ jobs: searchIn: "body" }, ], - // Substring search - matches anywhere in text (partial matches) substrings: [ { @@ -89,14 +89,12 @@ jobs: term: "hip_", searchIn: "both" }, - // ROCm tools and libraries { term: "hipify", searchIn: "both" }, ], - // Regex patterns - for complex pattern matching regexPatterns: [ { @@ -107,13 +105,17 @@ jobs: } ], }, + // Add more label configurations here as needed + // example: { + // keywords: [...], + // substrings: [...], + // 
regexPatterns: [...] + // }, }; - // Helper function to create regex based on search type function createSearchRegex(term, type) { // Escape special regex characters in the term const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); - switch (type) { case 'keyword': // Word boundary search - matches whole words only @@ -125,16 +127,13 @@ jobs: throw new Error(`Unknown search type: ${type}`); } } - // Helper function to find matching terms in text with line information function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') { const matches = []; const lines = text.split('\n'); - for (const termConfig of searchTerms) { let regex; let term, searchIn, pattern, description, flags; - // Handle different input formats (string or object) if (typeof termConfig === 'string') { term = termConfig; @@ -146,21 +145,17 @@ jobs: description = termConfig.description; flags = termConfig.flags; } - // Skip if this term shouldn't be searched in the current location if (searchIn !== 'both' && searchIn !== searchLocation) { continue; } - // Create appropriate regex if (searchType === 'regex') { regex = new RegExp(pattern, flags || "gi"); } else { regex = createSearchRegex(term, searchType); } - const termMatches = []; - // Check each line for matches lines.forEach((line, lineIndex) => { const lineMatches = line.match(regex); @@ -175,15 +170,14 @@ jobs: originalTerm: term || pattern, description: description, // Show context around the match in the line - context: line.length > 100 ? - line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), - line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' + context: line.length > 100 ? + line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), + line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' 
: line.trim() }); }); } }); - if (termMatches.length > 0) { matches.push({ term: term || (description || pattern), @@ -196,64 +190,48 @@ jobs: }); } } - return matches; } - // Helper function to check if label should be added async function processLabel(labelName, config) { const body = context.payload.issue.body || ""; const title = context.payload.issue.title || ""; - core.notice(`Processing label: ${labelName}`); core.notice(`Issue Title: "${title}"`); core.notice(`Issue Body length: ${body.length} characters`); - let shouldAddLabel = false; let allMatches = []; let reason = ''; - const keywords = config.keywords || []; const substrings = config.substrings || []; const regexPatterns = config.regexPatterns || []; - core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`); - // Search in title if (title.trim()) { core.notice(`Searching in title: "${title}"`); - const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title'); const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title'); const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title'); - allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches); } - // Search in body if (body.trim()) { core.notice(`Searching in body (${body.length} characters)`); - const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body'); const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body'); const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body'); - allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches); } - if (allMatches.length > 0) { core.notice(`Found ${allMatches.length} matching term(s):`); - for (const termMatch of allMatches) { const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body'; const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn; - if (termMatch.searchType === 'regex') { core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); } else { core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); } - // Show details for each match termMatch.matches.forEach((match, index) => { core.notice(` ${index + 1}. 
Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`); @@ -266,7 +244,6 @@ jobs: } }); } - shouldAddLabel = true; const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0); const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0); @@ -274,13 +251,10 @@ jobs: const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0); const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0); const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0); - reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`; } - core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`); core.notice(`Reason: ${reason || 'No matching terms found'}`); - if (shouldAddLabel) { const existingLabels = context.payload.issue.labels.map(l => l.name); if (!existingLabels.includes(labelName)) { @@ -296,14 +270,92 @@ jobs: core.notice(`Label "${labelName}" already present.`); return false; } - core.notice(`No matching terms found for label "${labelName}".`); return false; } - // Process all configured labels - const processLabels = Object.entries(labelConfig) - .map(([labelName, config]) => processLabel(labelName, config)); - const labelsAdded = await Promise.all(processLabels); - const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0); - core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`); \ No newline at end of file + const labelsAddedResults = await Promise.all( + Object.entries(labelConfig).map(([labelName, config]) => + processLabel(labelName, config).then(added => ({ labelName, added })) + ) + ); + + const numLabelsAdded = labelsAddedResults.filter(r => r.added).length; + core.notice(`Processing complete. 
${numLabelsAdded} label(s) added.`); + + // Return which labels were added for the next step + const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName); + core.setOutput('labels_added', JSON.stringify(addedLabels)); + return addedLabels; + + - name: CC users for labeled issues + if: steps.label-step.outputs.labels_added != '[]' + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 + with: + script: | + // Configuration: Map labels to GitHub users to CC + // You can add multiple users per label, and multiple label configurations + const ccConfig = { + rocm: { + users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3'] + message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions + }, + // Add more label -> user mappings here + // Example: + // cuda: { + // users: ['user1', 'user2'], + // message: 'CC {users} for CUDA-related issue' + // }, + // performance: { + // users: ['perfexpert'], + // message: 'CC {users} for performance issue' + // }, + }; + + const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}'); + core.notice(`Labels added: ${labelsAdded.join(', ')}`); + + // Get existing comments to check for already mentioned users + const comments = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + }); + + const issueBody = context.payload.issue.body || ''; + const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n'); + + // Process each label that was added + for (const label of labelsAdded) { + if (ccConfig[label]) { + const config = ccConfig[label]; + const usersToMention = []; + + // Check which users haven't been mentioned yet + for (const user of config.users) { + const mentionPattern = new RegExp(`@${user}\\b`, 'i'); + if (!mentionPattern.test(allExistingText)) { + usersToMention.push(user); + } else { + core.notice(`@${user} already mentioned for label "${label}", skipping`); + } + } + + // Post comment if there are users to mention + if (usersToMention.length > 0) { + const mentions = usersToMention.map(u => `@${u}`).join(' '); + const message = config.message.replace('{users}', mentions); + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: message + }); + + core.notice(`CC comment added for label "${label}": ${mentions}`); + } else { + core.notice(`All users for label "${label}" already mentioned, skipping comment`); + } + } + } \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 832c3edcdc7fe..121bdb750de5d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,6 +16,7 @@ repos: rev: v1.38.1 hooks: - id: typos + args: [--force-exclude] - repo: https://github.com/pre-commit/mirrors-clang-format rev: v21.1.2 hooks: diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index d3040e9738f7b..9298d3b58dfb9 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -631,7 +631,7 @@ def main(args: argparse.Namespace): else: ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size") shard_intermediate_size = 2 * intermediate_size // args.tp_size - dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype + dtype = torch.float16 if current_platform.is_rocm() else config.dtype 
use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" block_quant_shape = get_weight_block_size_safety(config) diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index 04d2205aa3722..459eafa6d907d 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -344,7 +344,7 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok hidden_size = config.hidden_size - dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype + dtype = torch.float16 if current_platform.is_rocm() else config.dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" use_customized_permute = args.use_customized_permute diff --git a/cmake/external_projects/qutlass.cmake b/cmake/external_projects/qutlass.cmake index 9aace7693077a..5a59a409999ad 100644 --- a/cmake/external_projects/qutlass.cmake +++ b/cmake/external_projects/qutlass.cmake @@ -22,10 +22,10 @@ else() CONFIGURE_COMMAND "" BUILD_COMMAND "" ) - FetchContent_Populate(qutlass) - set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}") endif() +FetchContent_Populate(qutlass) + if(NOT qutlass_SOURCE_DIR) message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.") endif() diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000000000..304c0be8105fc --- /dev/null +++ b/codecov.yml @@ -0,0 +1,12 @@ +codecov: + require_ci_to_pass: false + +fixes: + # Map source code paths to repository root paths + # Wildcards match any Python version (python3.*) + - "/vllm-workspace/src/vllm/::vllm/" + - "/vllm-workspace/vllm/::vllm/" + - "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/" + - "/usr/local/lib/python3.*/site-packages/vllm/::vllm/" + - "/usr/lib/python3.*/dist-packages/vllm/::vllm/" + - "/usr/lib/python3.*/site-packages/vllm/::vllm/" diff --git a/docker/Dockerfile b/docker/Dockerfile index 3a0db3cc49f61..f9e07acb855c3 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -229,7 +229,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py -ARG VLLM_MAX_SIZE_MB=450 +ARG VLLM_MAX_SIZE_MB=500 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB ARG RUN_WHEEL_CHECK=true RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 26b95ad053337..85906d23dee33 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -11,8 +11,7 @@ The following code splits the model across 2 GPUs. ```python from vllm import LLM -llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", - tensor_parallel_size=2) +llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) ``` !!! warning @@ -43,9 +42,7 @@ and the maximum batch size (`max_num_seqs` option). 
```python from vllm import LLM -llm = LLM(model="adept/fuyu-8b", - max_model_len=2048, - max_num_seqs=2) +llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2) ``` ## Reduce CUDA Graphs @@ -61,12 +58,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc ```python from vllm import LLM - from vllm.config import CompilationConfig, CompilationLevel + from vllm.config import CompilationConfig, CompilationMode llm = LLM( model="meta-llama/Llama-3.1-8B-Instruct", compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, # By default, it goes up to max_num_seqs cudagraph_capture_sizes=[1, 2, 4, 8, 16], ), @@ -78,8 +75,7 @@ You can disable graph capturing completely via the `enforce_eager` flag: ```python from vllm import LLM -llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", - enforce_eager=True) +llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True) ``` ## Adjust cache size @@ -97,8 +93,10 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem from vllm import LLM # Accept up to 3 images and 1 video per prompt -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - limit_mm_per_prompt={"image": 3, "video": 1}) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + limit_mm_per_prompt={"image": 3, "video": 1}, +) ``` You can go a step further and disable unused modalities completely by setting its limit to zero. @@ -108,8 +106,10 @@ For example, if your application only accepts image input, there is no need to a from vllm import LLM # Accept any number of images but no videos -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - limit_mm_per_prompt={"video": 0}) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + limit_mm_per_prompt={"video": 0}, +) ``` You can even run a multi-modal model for text-only inference: @@ -118,8 +118,10 @@ You can even run a multi-modal model for text-only inference: from vllm import LLM # Don't accept images. Just text. 
-llm = LLM(model="google/gemma-3-27b-it", - limit_mm_per_prompt={"image": 0}) +llm = LLM( + model="google/gemma-3-27b-it", + limit_mm_per_prompt={"image": 0}, +) ``` ### Configurable options @@ -173,14 +175,14 @@ Here are some examples: from vllm import LLM # Available for Qwen2-VL series models -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - mm_processor_kwargs={ - "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 - }) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_kwargs={"max_pixels": 768 * 768}, # Default is 1280 * 28 * 28 +) # Available for InternVL series models -llm = LLM(model="OpenGVLab/InternVL2-2B", - mm_processor_kwargs={ - "max_dynamic_patch": 4, # Default is 12 - }) +llm = LLM( + model="OpenGVLab/InternVL2-2B", + mm_processor_kwargs={"max_dynamic_patch": 4}, # Default is 12 +) ``` diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 5c74610ebd290..24c1efa61f286 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -100,7 +100,7 @@ from vllm import LLM llm = LLM( model="meta-llama/Llama-3.3-70B-Instruct, tensor_parallel_size=4, - pipeline_parallel_size=2 + pipeline_parallel_size=2, ) ``` @@ -257,18 +257,24 @@ Examples: ```python # Use a larger cache -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - mm_processor_cache_gb=8) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_cache_gb=8, +) # Use a shared-memory based IPC cache -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - tensor_parallel_size=2, - mm_processor_cache_type="shm", - mm_processor_cache_gb=8) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + tensor_parallel_size=2, + mm_processor_cache_type="shm", + mm_processor_cache_gb=8, +) # Disable the cache -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - mm_processor_cache_gb=0) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_cache_gb=0, +) ``` ### Cache Placement diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index 6b1eabf3d67fa..0f2c4a5d7f069 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -35,6 +35,7 @@ th { | Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` | | Random | ✅ | ✅ | `synthetic` | | RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` | +| RandomForReranking | ✅ | ✅ | `synthetic` | | Prefix Repetition | ✅ | ✅ | `synthetic` | | HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` | | HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` | @@ -878,6 +879,51 @@ vllm bench serve \ +#### Reranker Benchmark + +Benchmark the performance of rerank requests in vLLM. + +
+<summary>Show more</summary>
+
+Unlike generative models, which use the Completions or Chat Completions API,
+you should set `--backend vllm-rerank` and `--endpoint /v1/rerank` to use the Reranker API.
+
+For reranking, the only supported dataset is `--dataset-name random-rerank`.
+
+Start the server:
+
+```bash
+vllm serve BAAI/bge-reranker-v2-m3
+```
+
+Run the benchmark:
+
+```bash
+vllm bench serve \
+  --model BAAI/bge-reranker-v2-m3 \
+  --backend vllm-rerank \
+  --endpoint /v1/rerank \
+  --dataset-name random-rerank \
+  --tokenizer BAAI/bge-reranker-v2-m3 \
+  --random-input-len 512 \
+  --num-prompts 10 \
+  --random-batch-size 5
+```
+
+For reranker models, this will create `num_prompts / random_batch_size` requests with
+`random_batch_size` "documents", where each one has close to `random_input_len` tokens.
+In the example above, this results in 2 rerank requests with 5 "documents" each, where
+each document has close to 512 tokens.
+
+Please note that the `/v1/rerank` endpoint is also supported by embedding models, so if you're
+running with an embedding model, also set `--no_reranker`. Because in this case the query is
+treated as an individual prompt by the server, we send `random_batch_size - 1` documents
+to account for the extra prompt, which is the query. The token accounting used to report
+throughput numbers is adjusted accordingly.
+
+</details>
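+
+If you want to sanity-check the endpoint this benchmark exercises, you can send a single
+rerank request by hand. The snippet below is a minimal sketch, assuming the
+`BAAI/bge-reranker-v2-m3` server from the example above is listening on the default
+`localhost:8000`; the query and documents are illustrative placeholders.
+
+```bash
+# One hand-written rerank request (assumes the server started above on localhost:8000)
+curl -s http://localhost:8000/v1/rerank \
+  -H "Content-Type: application/json" \
+  -d '{
+        "model": "BAAI/bge-reranker-v2-m3",
+        "query": "What is the capital of France?",
+        "documents": [
+          "The capital of Brazil is Brasilia.",
+          "The capital of France is Paris."
+        ]
+      }'
+```
+
+The response should contain one relevance score per document; the benchmark above simply
+issues many such requests with synthetic documents and measures the resulting throughput.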
+ [](){ #performance-benchmarks } ## Performance Benchmarks diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index aafdb1058e03c..a423f4e683378 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -73,8 +73,8 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, ) -> torch.Tensor: ... ``` diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 724dc2284e282..721081dffb499 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -16,7 +16,7 @@ Further update the model as follows: ... @classmethod - def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: + def get_placeholder_str(cls, modality: str, i: int) -> str | None: if modality.startswith("image"): return "" @@ -45,14 +45,14 @@ Further update the model as follows: ... def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: - assert self.vision_encoder is not None image_features = self.vision_encoder(image_input) return self.multi_modal_projector(image_features) def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: - + self, + **kwargs: object, + ) -> MultiModalEmbeddings | None: # Validate the multimodal input keyword arguments image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: @@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m For example, if the model supports any number of images but only one video per prompt: ```python -def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: +def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"image": None, "video": 1} ``` @@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, + mm_options: Mapping[str, BaseDummyOptions] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) @@ -421,8 +421,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in ```python def get_image_size_with_most_features(self) -> ImageSize: image_processor = self.get_image_processor() - return ImageSize(width=image_processor.size["width"], - height=image_processor.size["height"]) + return ImageSize( + width=image_processor.size["width"], + height=image_processor.size["height"], + ) ``` Fuyu does not expect image placeholders in the inputs to HF processor, so @@ -452,10 +454,12 @@ Assuming that the memory usage increases with the number of tokens, the dummy in return { "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images, - overrides=image_overrides) + self._get_dummy_images( + width=target_width, + height=target_height, + num_images=num_images, + overrides=image_overrides, + ) } ``` @@ -744,8 +748,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies image_width=image_size.width, image_height=image_size.height, ) - image_tokens = ([_IMAGE_TOKEN_ID] * ncols + - [_NEWLINE_TOKEN_ID]) * nrows + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows return 
PromptUpdateDetails.select_token_id( image_tokens + [bos_token_id], @@ -781,8 +784,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies image_width=image_size.width, image_height=image_size.height, ) - image_tokens = ([_IMAGE_TOKEN_ID] * ncols + - [_NEWLINE_TOKEN_ID]) * nrows + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows return PromptUpdateDetails.select_token_id( image_tokens + [bos_token_id], @@ -810,9 +812,11 @@ to register them to the multi-modal registry: from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY -+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor, -+ info=YourProcessingInfo, -+ dummy_inputs=YourDummyInputsBuilder) ++ @MULTIMODAL_REGISTRY.register_processor( ++ YourMultiModalProcessor, ++ info=YourProcessingInfo, ++ dummy_inputs=YourDummyInputsBuilder, ++ ) class YourModelForImage2Seq(nn.Module, SupportsMultiModal): ``` diff --git a/docs/contributing/model/registration.md b/docs/contributing/model/registration.md index 35f35ffa4cde6..3bb4f961ef15f 100644 --- a/docs/contributing/model/registration.md +++ b/docs/contributing/model/registration.md @@ -42,7 +42,7 @@ def register(): ModelRegistry.register_model( "YourModelForCausalLM", - "your_code:YourModelForCausalLM" + "your_code:YourModelForCausalLM", ) ``` diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md index 4ce748ce1fed4..59f14a5ea27b9 100644 --- a/docs/contributing/model/transcription.md +++ b/docs/contributing/model/transcription.md @@ -15,6 +15,7 @@ Declare supported languages and capabilities: - Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper). ??? code "supported_languages and supports_transcription_only" + ```python from typing import ClassVar, Mapping, Literal import numpy as np @@ -43,6 +44,7 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor This is for controlling general behavior of the API when serving your model: ??? code "get_speech_to_text_config()" + ```python class YourASRModel(nn.Module, SupportsTranscription): ... @@ -71,6 +73,7 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`: ??? code "get_generation_prompt()" + ```python class YourASRModel(nn.Module, SupportsTranscription): ... @@ -107,6 +110,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: ??? code "get_generation_prompt()" + ```python class YourASRModel(nn.Module, SupportsTranscription): ... @@ -148,12 +152,16 @@ Language validation via [validate_language][vllm.model_executor.models.interface If your model requires a language and you want a default, override this method (see Whisper): ??? code "validate_language()" + ```python @classmethod def validate_language(cls, language: str | None) -> str | None: if language is None: logger.warning( - "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.") + "Defaulting to language='en'. If you wish to transcribe " + "audio in a different language, pass the `language` field " + "in the TranscriptionRequest." 
+ ) language = "en" return super().validate_language(language) ``` @@ -165,6 +173,7 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo Provide a fast duration→token estimate to improve streaming usage statistics: ??? code "get_num_audio_tokens()" + ```python class YourASRModel(nn.Module, SupportsTranscription): ... @@ -191,6 +200,7 @@ The API server takes care of basic audio I/O and optional chunking before buildi Relevant server logic: ??? code "_preprocess_speech_to_text()" + ```python # vllm/entrypoints/openai/speech_to_text.py async def _preprocess_speech_to_text(...): diff --git a/docs/deployment/frameworks/cerebrium.md b/docs/deployment/frameworks/cerebrium.md index 1f233c3204a15..960347d9525c4 100644 --- a/docs/deployment/frameworks/cerebrium.md +++ b/docs/deployment/frameworks/cerebrium.md @@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference ??? console "Command" - ```python + ```bash curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ -H 'Content-Type: application/json' \ -H 'Authorization: ' \ @@ -81,7 +81,7 @@ You should get a response like: ??? console "Response" - ```python + ```json { "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", "result": { diff --git a/docs/deployment/frameworks/dstack.md b/docs/deployment/frameworks/dstack.md index fe4d87f78f2aa..9d2c7f5bb565f 100644 --- a/docs/deployment/frameworks/dstack.md +++ b/docs/deployment/frameworks/dstack.md @@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK: client = OpenAI( base_url="https://gateway.", - api_key="" + api_key="", ) completion = client.chat.completions.create( @@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK: "role": "user", "content": "Compose a poem that explains the concept of recursion in programming.", } - ] + ], ) print(completion.choices[0].message.content) diff --git a/docs/deployment/frameworks/haystack.md b/docs/deployment/frameworks/haystack.md index 836305cf15c42..b53b829d6d3c0 100644 --- a/docs/deployment/frameworks/haystack.md +++ b/docs/deployment/frameworks/haystack.md @@ -34,7 +34,7 @@ pip install vllm haystack-ai api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"), model="mistralai/Mistral-7B-Instruct-v0.1", api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1", - generation_kwargs = {"max_tokens": 512} + generation_kwargs={"max_tokens": 512}, ) response = generator.run( diff --git a/docs/deployment/frameworks/hf_inference_endpoints.md b/docs/deployment/frameworks/hf_inference_endpoints.md index 75a234bdf1422..d39bb9a899c8a 100644 --- a/docs/deployment/frameworks/hf_inference_endpoints.md +++ b/docs/deployment/frameworks/hf_inference_endpoints.md @@ -32,28 +32,28 @@ This is the easiest way to get started with vLLM on Hugging Face Inference Endpo import os client = OpenAI( - base_url = DEPLOYMENT_URL, - api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens + base_url=DEPLOYMENT_URL, + api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens ) chat_completion = client.chat.completions.create( - model = "HuggingFaceTB/SmolLM3-3B", - messages = [ + model="HuggingFaceTB/SmolLM3-3B", + messages=[ { "role": "user", "content": [ { "type": "text", - "text": "Give me a brief explanation of gravity in simple terms." 
+ "text": "Give me a brief explanation of gravity in simple terms.", } - ] + ], } ], - stream = True + stream=True, ) for message in chat_completion: - print(message.choices[0].delta.content, end = "") + print(message.choices[0].delta.content, end="") ``` !!! note @@ -86,34 +86,34 @@ This method applies to models with the [`transformers` library tag](https://hugg import os client = OpenAI( - base_url = DEPLOYMENT_URL, - api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens + base_url=DEPLOYMENT_URL, + api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens ) chat_completion = client.chat.completions.create( - model = "ibm-granite/granite-docling-258M", - messages = [ + model="ibm-granite/granite-docling-258M", + messages=[ { "role": "user", "content": [ { "type": "image_url", "image_url": { - "url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png" - } + "url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png", + }, }, { "type": "text", - "text": "Convert this page to docling." - } + "text": "Convert this page to docling.", + }, ] } ], - stream = True + stream=True, ) for message in chat_completion: - print(message.choices[0].delta.content, end = "") + print(message.choices[0].delta.content, end="") ``` !!! note diff --git a/docs/deployment/frameworks/litellm.md b/docs/deployment/frameworks/litellm.md index 0d6c3729911ad..9ea7c0373d2a1 100644 --- a/docs/deployment/frameworks/litellm.md +++ b/docs/deployment/frameworks/litellm.md @@ -36,15 +36,16 @@ pip install vllm litellm ```python import litellm - messages = [{ "content": "Hello, how are you?","role": "user"}] + messages = [{"content": "Hello, how are you?", "role": "user"}] # hosted_vllm is prefix key word and necessary response = litellm.completion( - model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name - messages=messages, - api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1", - temperature=0.2, - max_tokens=80) + model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name + messages=messages, + api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1", + temperature=0.2, + max_tokens=80, + ) print(response) ``` diff --git a/docs/deployment/frameworks/retrieval_augmented_generation.md b/docs/deployment/frameworks/retrieval_augmented_generation.md index d86ab1600f126..37f90ef08f32e 100644 --- a/docs/deployment/frameworks/retrieval_augmented_generation.md +++ b/docs/deployment/frameworks/retrieval_augmented_generation.md @@ -40,7 +40,7 @@ pip install -U vllm \ 1. Run the script - ```python + ```bash python retrieval_augmented_generation_with_langchain.py ``` @@ -78,6 +78,6 @@ pip install vllm \ 1. Run the script: - ```python + ```bash python retrieval_augmented_generation_with_llamaindex.py ``` diff --git a/docs/design/cuda_graphs.md b/docs/design/cuda_graphs.md index f88a29f6eadd8..c6d71589be985 100644 --- a/docs/design/cuda_graphs.md +++ b/docs/design/cuda_graphs.md @@ -106,9 +106,11 @@ The dispatch code looks like: batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...) runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor) # execution -with set_forward_context(..., - cudagraph_runtime_mode=runtime_mode, - batch_descriptor=batch_descriptor): +with set_forward_context( + ..., + cudagraph_runtime_mode=runtime_mode, + batch_descriptor=batch_descriptor, +): output = self.model(...) 
``` @@ -165,7 +167,7 @@ class AttentionCGSupport(enum.Enum): """NO CUDA Graphs support""" ``` -Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture]. +Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture]. The following table lists backends that support full CUDA Graphs at the time of writing. @@ -200,12 +202,12 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG") import vllm from vllm.config import CUDAGraphMode -compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} +compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} model = vllm.LLM( - model="meta-llama/Llama-3.1-8B-Instruct", - dtype='auto', - compilation_config = compilation_config, - ) + model="meta-llama/Llama-3.1-8B-Instruct", + dtype="auto", + compilation_config=compilation_config, +) sampling_params = vllm.SamplingParams( temperature=0, # greedy decoding max_tokens=1024, diff --git a/docs/design/dbo.md b/docs/design/dbo.md index d92c47c80f951..f2d98ccd063fa 100644 --- a/docs/design/dbo.md +++ b/docs/design/dbo.md @@ -34,10 +34,10 @@ To enable the DBO system pass in the `--enable-dbo` argument to your vllm serve * `--dbo-decode-token-threshold` the minimum number of tokens in a decode-only batch required to enable DBO for that batch * `--dbo-prefill-token-threshold` the minimum number of tokens in a batch containing at least one prefill required to enable DBO for that batch -Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `VLLM_ALL2ALL_BACKEND` environment variable must be set to `deepep_low_latency` if your workload is primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests. +Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `--all2all-backend` argument must be set to `deepep_low_latency` if your workload is primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests. Below is a command that will spin up a two DP rank server with expert parallelism and DBO enabled. 
-EX: `VLLM_ALL2ALL_BACKEND=deepep_low_latency vllm serve --model="deepseek-ai/DeepSeek-V2-Lite" --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo` +EX: `vllm serve deepseek-ai/DeepSeek-V2-Lite --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo --all2all-backend deepep_low_latency` Note that there must be at least two GPUs visible in `CUDA_VISIBLE_DEVICES` diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md index e70ee4a076e54..682fc5c413e2d 100644 --- a/docs/design/io_processor_plugins.md +++ b/docs/design/io_processor_plugins.md @@ -9,8 +9,8 @@ When performing an inference with IO Processor plugins, the prompt type is defin IO Processor plugins implement the `IOProcessor` interface (): ```python -IOProcessorInput = TypeVar('IOProcessorInput') -IOProcessorOutput = TypeVar('IOProcessorOutput') +IOProcessorInput = TypeVar("IOProcessorInput") +IOProcessorOutput = TypeVar("IOProcessorOutput") class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): @@ -21,30 +21,32 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): def pre_process( self, prompt: IOProcessorInput, - request_id: Optional[str] = None, + request_id: str | None = None, **kwargs, - ) -> Union[PromptType, Sequence[PromptType]]: + ) -> PromptType | Sequence[PromptType]: raise NotImplementedError async def pre_process_async( self, prompt: IOProcessorInput, - request_id: Optional[str] = None, + request_id: str | None = None, **kwargs, - ) -> Union[PromptType, Sequence[PromptType]]: + ) -> PromptType | Sequence[PromptType]: return self.pre_process(prompt, request_id, **kwargs) @abstractmethod - def post_process(self, - model_output: Sequence[PoolingRequestOutput], - request_id: Optional[str] = None, - **kwargs) -> IOProcessorOutput: + def post_process( + self, + model_output: Sequence[PoolingRequestOutput], + request_id: str | None = None, + **kwargs, + ) -> IOProcessorOutput: raise NotImplementedError async def post_process_async( self, model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]], - request_id: Optional[str] = None, + request_id: str | None = None, **kwargs, ) -> IOProcessorOutput: collected_output = [item async for i, item in model_output] @@ -56,7 +58,8 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): @abstractmethod def output_to_response( - self, plugin_output: IOProcessorOutput) -> IOProcessorResponse: + self, plugin_output: IOProcessorOutput + ) -> IOProcessorResponse: raise NotImplementedError ``` diff --git a/docs/design/metrics.md b/docs/design/metrics.md index 90b2fd32f2979..c4a2d72a2f4a4 100644 --- a/docs/design/metrics.md +++ b/docs/design/metrics.md @@ -478,15 +478,17 @@ us with: ```python if seq_group.is_finished(): - if (seq_group.metrics.first_scheduled_time is not None and - seq_group.metrics.first_token_time is not None): + if ( + seq_group.metrics.first_scheduled_time is not None + and seq_group.metrics.first_token_time is not None + ): time_queue_requests.append( seq_group.metrics.first_scheduled_time - - seq_group.metrics.arrival_time) + seq_group.metrics.arrival_time + ) ... if seq_group.metrics.time_in_queue is not None: - time_in_queue_requests.append( - seq_group.metrics.time_in_queue) + time_in_queue_requests.append(seq_group.metrics.time_in_queue) ``` This seems duplicative, and one of them should be removed. 
The latter diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md index 9941837bf1652..270699df623e0 100644 --- a/docs/design/prefix_caching.md +++ b/docs/design/prefix_caching.md @@ -112,8 +112,8 @@ class KVCacheBlock: ref_cnt: int # The pointers to form a doubly linked list for the free queue. - prev_free_block: Optional["KVCacheBlock"] = None - next_free_block: Optional["KVCacheBlock"] = None + prev_free_block: "KVCacheBlock | None" = None + next_free_block: "KVCacheBlock | None" = None ``` There are two design points to highlight: diff --git a/docs/features/lora.md b/docs/features/lora.md index db794b2ebd71d..d3b44520a5a79 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -32,7 +32,7 @@ the third parameter is the path to the LoRA adapter. sampling_params = SamplingParams( temperature=0, max_tokens=256, - stop=["[/assistant]"] + stop=["[/assistant]"], ) prompts = [ @@ -43,7 +43,7 @@ the third parameter is the path to the LoRA adapter. outputs = llm.generate( prompts, sampling_params, - lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) + lora_request=LoRARequest("sql_adapter", 1, sql_lora_path), ) ``` @@ -197,7 +197,7 @@ Alternatively, follow these example steps to implement your own plugin: lora_request = LoRARequest( lora_name=lora_name, lora_path=local_path, - lora_int_id=abs(hash(lora_name)) + lora_int_id=abs(hash(lora_name)), ) return lora_request ``` @@ -296,10 +296,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au if has_audio: question = f"<|audio|>{question}" chat = [ - { - "role": "user", - "content": question - } + {"role": "user", "content": question}, ] return tokenizer.apply_chat_template(chat, tokenize=False) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index dcc5ea3b90964..8f75f714d4b01 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -154,9 +154,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis outputs = llm.generate({ "prompt": prompt, - "multi_modal_data": { - "image": [image1, image2] - }, + "multi_modal_data": {"image": [image1, image2]}, }) for o in outputs: @@ -183,21 +181,24 @@ conversation = [ {"role": "assistant", "content": "Hello! How can I assist you today?"}, { "role": "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - },{ - "type": "image_pil", - "image_pil": image_pil - }, { - "type": "image_embeds", - "image_embeds": image_embeds - }, { - "type": "text", - "text": "What's in these images?" - }], + "content": [ + { + "type": "image_url", + "image_url": {"url": image_url}, + }, + { + "type": "image_pil", + "image_pil": image_pil, + }, + { + "type": "image_embeds", + "image_embeds": image_embeds, + }, + { + "type": "text", + "text": "What's in these images?", + }, + ], }, ] @@ -224,7 +225,10 @@ Multi-image input can be extended to perform video captioning. We show this with message = { "role": "user", "content": [ - {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}, + { + "type": "text", + "text": "Describe this set of frames. 
Consider the frames to be a part of the same video.", + }, ], } for i in range(len(video_frames)): @@ -255,13 +259,13 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f # Custom black background for dark theme llm = LLM( model="llava-hf/llava-1.5-7b-hf", - media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}} + media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}, ) # Custom brand color background (e.g., blue) llm = LLM( model="llava-hf/llava-1.5-7b-hf", - media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}} + media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}, ) ``` @@ -294,20 +298,23 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown limit_mm_per_prompt={"video": 1}, ) - sampling_params = SamplingParams( - max_tokens=1024, - ) + sampling_params = SamplingParams(max_tokens=1024) video_messages = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": [ + { + "role": "system", + "content": "You are a helpful assistant.", + }, + { + "role": "user", + "content": [ {"type": "text", "text": "describe this video."}, { "type": "video", "video": video_path, "total_pixels": 20480 * 28 * 28, - "min_pixels": 16 * 28 * 28 - } + "min_pixels": 16 * 28 * 28, + }, ] }, ] @@ -465,21 +472,24 @@ Then, you can use the OpenAI client as follows: chat_response = client.chat.completions.create( model="microsoft/Phi-3.5-vision-instruct", - messages=[{ - "role": "user", - "content": [ - # NOTE: The prompt formatting with the image token `` is not needed - # since the prompt will be processed automatically by the API server. - {"type": "text", "text": "What’s in this image?"}, - { - "type": "image_url", - "image_url": { - url": image_url + messages=[ + { + "role": "user", + "content": [ + # NOTE: The prompt formatting with the image token `` is not needed + # since the prompt will be processed automatically by the API server. + { + "type": "text", + "text": "What’s in this image?", }, - "uuid": image_url # Optional - }, - ], - }], + { + "type": "image_url", + "image_url": {"url": image_url}, + "uuid": image_url, # Optional + }, + ], + } + ], ) print("Chat completion output:", chat_response.choices[0].message.content) @@ -489,26 +499,27 @@ Then, you can use the OpenAI client as follows: chat_response = client.chat.completions.create( model="microsoft/Phi-3.5-vision-instruct", - messages=[{ - "role": "user", - "content": [ - {"type": "text", "text": "What are the animals in these images?"}, - { - "type": "image_url", - "image_url": { - "url": image_url_duck + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What are the animals in these images?", }, - "uuid": image_url_duck # Optional - }, - { - "type": "image_url", - "image_url": { - "url": image_url_lion + { + "type": "image_url", + "image_url": {"url": image_url_duck}, + "uuid": image_url_duck, # Optional }, - "uuid": image_url_lion # Optional - }, - ], - }], + { + "type": "image_url", + "image_url": {"url": image_url_lion}, + "uuid": image_url_lion, # Optional + }, + ], + } + ], ) print("Chat completion output:", chat_response.choices[0].message.content) ``` @@ -560,23 +571,22 @@ Then, you can use the OpenAI client as follows: ## Use video url in the payload chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this video?" 
- }, - { - "type": "video_url", - "video_url": { - "url": video_url + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this video?", }, - "uuid": video_url # Optional - }, - ], - }], + { + "type": "video_url", + "video_url": {"url": video_url}, + "uuid": video_url, # Optional + }, + ], + } + ], model=model, max_completion_tokens=64, ) @@ -652,23 +662,25 @@ Then, you can use the OpenAI client as follows: audio_base64 = encode_base64_content_from_url(audio_url) chat_completion_from_base64 = client.chat.completions.create( - messages=[{ - "role": "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" - }, - { - "type": "input_audio", - "input_audio": { - "data": audio_base64, - "format": "wav" + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this audio?", }, - "uuid": audio_url # Optional - }, - ], - }], + { + "type": "input_audio", + "input_audio": { + "data": audio_base64, + "format": "wav", + }, + "uuid": audio_url, # Optional + }, + ], + }, + ], model=model, max_completion_tokens=64, ) @@ -683,22 +695,22 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag ```python chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" - }, - { - "type": "audio_url", - "audio_url": { - "url": audio_url + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this audio?", }, - "uuid": audio_url # Optional - }, - ], - }], + { + "type": "audio_url", + "audio_url": {"url": audio_url}, + "uuid": audio_url, # Optional + }, + ], + } + ], model=model, max_completion_tokens=64, ) @@ -747,43 +759,48 @@ The following example demonstrates how to pass image embeddings to the OpenAI se # Basic usage - this is equivalent to the LLaVA example for offline inference model = "llava-hf/llava-1.5-7b-hf" - embeds = { + embeds = { "type": "image_embeds", "image_embeds": f"{base64_image_embedding}", - "uuid": image_url # Optional + "uuid": image_url, # Optional } # Pass additional parameters (available to Qwen2-VL and MiniCPM-V) model = "Qwen/Qwen2-VL-2B-Instruct" - embeds = { + embeds = { "type": "image_embeds", "image_embeds": { - "image_embeds": f"{base64_image_embedding}" , # Required - "image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct + "image_embeds": f"{base64_image_embedding}", # Required + "image_grid_thw": f"{base64_image_grid_thw}", # Required by Qwen/Qwen2-VL-2B-Instruct }, - "uuid": image_url # Optional + "uuid": image_url, # Optional } model = "openbmb/MiniCPM-V-2_6" - embeds = { + embeds = { "type": "image_embeds", "image_embeds": { - "image_embeds": f"{base64_image_embedding}" , # Required - "image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6 + "image_embeds": f"{base64_image_embedding}", # Required + "image_sizes": f"{base64_image_sizes}", # Required by openbmb/MiniCPM-V-2_6 }, - "uuid": image_url # Optional + "uuid": image_url, # Optional } chat_completion = client.chat.completions.create( messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": [ { - "type": "text", - "text": "What's in this image?", + "role": "system", + "content": "You are a helpful assistant.", }, - embeds, - ], - }, - ], + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this image?", + }, + embeds, + ], + }, + ], 
model=model, ) ``` @@ -802,22 +819,22 @@ For Online Serving, you can also skip sending media if you expect cache hits wit { "type": "image_embeds", "image_embeds": None, - "uuid": image_uuid + "uuid": image_uuid, }, # input_audio: { "type": "input_audio", "input_audio": None, - "uuid": audio_uuid + "uuid": audio_uuid, }, # PIL Image: { "type": "image_pil", - "image_pil": None - "uuid": image_uuid - } + "image_pil": None, + "uuid": image_uuid, + }, ``` diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md index 795b0c77d610e..bfc0e0d86c6ae 100644 --- a/docs/features/nixl_connector_usage.md +++ b/docs/features/nixl_connector_usage.md @@ -156,6 +156,16 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \ NixlConnector currently does not distinguish `kv_role`; the actual prefiller/decoder roles are determined by the upper-level proxy (e.g., `toy_proxy_server.py` using `--prefiller-hosts` and `--decoder-hosts`). Therefore, `kv_role` in `--kv-transfer-config` is effectively a placeholder and does not affect NixlConnector's behavior. +## Experimental Feature + +### Heterogenuous KV Layout support + +Support use case: Prefill with 'HND' and decode with 'NHD' with experimental configuration + +```bash +--kv-transfer-config '{..., "enable_permute_local_kv":"True"}' +``` + ## Example Scripts/Code Refer to these example scripts in the vLLM repository: diff --git a/docs/features/quantization/auto_awq.md b/docs/features/quantization/auto_awq.md index fc998387d29aa..e77e8b5a1f415 100644 --- a/docs/features/quantization/auto_awq.md +++ b/docs/features/quantization/auto_awq.md @@ -1,5 +1,9 @@ # AutoAWQ +> ⚠️ **Warning:** + The `AutoAWQ` library is deprecated. This functionality has been adopted by the vLLM project in [`llm-compressor`](https://github.com/vllm-project/llm-compressor/tree/main/examples/awq). + For the recommended quantization workflow, please see the AWQ examples in [`llm-compressor`](https://github.com/vllm-project/llm-compressor/tree/main/examples/awq). For more details on the deprecation, refer to the original [AutoAWQ repository](https://github.com/casper-hansen/AutoAWQ). + To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ). Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint. The main benefits are lower latency and memory usage. @@ -18,13 +22,15 @@ After installing AutoAWQ, you are ready to quantize a model. 
Please refer to the from awq import AutoAWQForCausalLM from transformers import AutoTokenizer - model_path = 'mistralai/Mistral-7B-Instruct-v0.2' - quant_path = 'mistral-instruct-v0.2-awq' - quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } + model_path = "mistralai/Mistral-7B-Instruct-v0.2" + quant_path = "mistral-instruct-v0.2-awq" + quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"} # Load model model = AutoAWQForCausalLM.from_pretrained( - model_path, **{"low_cpu_mem_usage": True, "use_cache": False} + model_path, + low_cpu_mem_usage=True, + use_cache=False, ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/docs/features/quantization/auto_round.md b/docs/features/quantization/auto_round.md index ac766d5e29228..9c14f362b663f 100644 --- a/docs/features/quantization/auto_round.md +++ b/docs/features/quantization/auto_round.md @@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound model_name = "Qwen/Qwen3-0.6B" -model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) bits, group_size, sym = 4, 128, True diff --git a/docs/features/quantization/bitblas.md b/docs/features/quantization/bitblas.md index 53b689ad53ff6..c3a1276576223 100644 --- a/docs/features/quantization/bitblas.md +++ b/docs/features/quantization/bitblas.md @@ -34,7 +34,7 @@ llm = LLM( model=model_id, dtype=torch.bfloat16, trust_remote_code=True, - quantization="bitblas" + quantization="bitblas", ) ``` @@ -53,6 +53,6 @@ llm = LLM( dtype=torch.float16, trust_remote_code=True, quantization="bitblas", - max_model_len=1024 + max_model_len=1024, ) ``` diff --git a/docs/features/quantization/bnb.md b/docs/features/quantization/bnb.md index 3b15a6072d47a..2348c7739c066 100644 --- a/docs/features/quantization/bnb.md +++ b/docs/features/quantization/bnb.md @@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit" llm = LLM( model=model_id, dtype=torch.bfloat16, - trust_remote_code=True + trust_remote_code=True, ) ``` @@ -43,7 +43,7 @@ llm = LLM( model=model_id, dtype=torch.bfloat16, trust_remote_code=True, - quantization="bitsandbytes" + quantization="bitsandbytes", ) ``` diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index 834c03cbe05b0..0c5111fb8af0d 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", + MODEL_ID, + device_map="auto", + dtype="auto", ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` @@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio # Configure the simple PTQ quantization recipe = QuantizationModifier( - targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) + targets="Linear", + scheme="FP8_DYNAMIC", + ignore=["lm_head"], + ) # Apply the quantization algorithm. 
oneshot(model=model, recipe=recipe) diff --git a/docs/features/quantization/gguf.md b/docs/features/quantization/gguf.md index 2a1c3bdd775f1..2a731e9b7e032 100644 --- a/docs/features/quantization/gguf.md +++ b/docs/features/quantization/gguf.md @@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint: conversation = [ { "role": "system", - "content": "You are a helpful assistant" + "content": "You are a helpful assistant", }, { "role": "user", - "content": "Hello" + "content": "Hello", }, { "role": "assistant", - "content": "Hello! How can I assist you today?" + "content": "Hello! How can I assist you today?", }, { "role": "user", @@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint: sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. - llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", - tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") + llm = LLM( + model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", + tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + ) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.chat(conversation, sampling_params) diff --git a/docs/features/quantization/gptqmodel.md b/docs/features/quantization/gptqmodel.md index 47cb2d65bae47..f14a931725da4 100644 --- a/docs/features/quantization/gptqmodel.md +++ b/docs/features/quantization/gptqmodel.md @@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`: calibration_dataset = load_dataset( "allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", - split="train" + split="train", ).select(range(1024))["text"] quant_config = QuantizeConfig(bits=4, group_size=128) diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index d6fdac7b07f7f..035e7ea291f9e 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", + MODEL_ID, + device_map="auto", + dtype="auto", ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` @@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y }, ignore=["lm_head"], update_size=NUM_CALIBRATION_SAMPLES, - dampening_frac=0.01 + dampening_frac=0.01, ) ``` diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index af3650e701ad0..ec8a77f74ffef 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", + MODEL_ID, + device_map="auto", + dtype="auto", ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` diff --git a/docs/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md index 39ae03b1bdac0..c48ccb719a79d 100644 --- a/docs/features/quantization/modelopt.md +++ b/docs/features/quantization/modelopt.md @@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. 
As an example, the foll from vllm import LLM, SamplingParams def main(): - model_id = "nvidia/Llama-3.1-8B-Instruct-FP8" - # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint + + # Ensure you specify quantization="modelopt" when loading the modelopt checkpoint llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True) sampling_params = SamplingParams(temperature=0.8, top_p=0.9) diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md index b2b417309e92b..56cf057678be6 100644 --- a/docs/features/quantization/quantized_kvcache.md +++ b/docs/features/quantization/quantized_kvcache.md @@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization: from vllm import LLM, SamplingParams sampling_params = SamplingParams(temperature=0.7, top_p=0.8) - llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", - kv_cache_dtype="fp8", - calculate_kv_scales=True) + llm = LLM( + model="meta-llama/Llama-2-7b-chat-hf", + kv_cache_dtype="fp8", + calculate_kv_scales=True, + ) prompt = "London is the capital of" out = llm.generate(prompt, sampling_params)[0].outputs[0].text print(out) @@ -80,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models # Select model and load it MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" - model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto") + model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md index 85b7d8ec84ed3..385e3bbb8712f 100644 --- a/docs/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -48,7 +48,9 @@ to fetch model and tokenizer. MAX_SEQ_LEN = 512 model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", + MODEL_ID, + device_map="auto", + dtype="auto", ) model.eval() @@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") text_data = dataset["text"][:NUM_CALIBRATION_DATA] - tokenized_outputs = tokenizer(text_data, return_tensors="pt", - padding=True, truncation=True, max_length=MAX_SEQ_LEN) - calib_dataloader = DataLoader(tokenized_outputs['input_ids'], - batch_size=BATCH_SIZE, drop_last=True) + tokenized_outputs = tokenizer( + text_data, + return_tensors="pt", + padding=True, + truncation=True, + max_length=MAX_SEQ_LEN, + ) + calib_dataloader = DataLoader( + tokenized_outputs['input_ids'], + batch_size=BATCH_SIZE, + drop_last=True, + ) ``` ### 3. Set the Quantization Configuration @@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant. load_quant_algo_config_from_file) # Define fp8/per-tensor/static spec. - FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max", - is_dynamic=False).to_quantization_spec() + FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec( + observer_method="min_max", + is_dynamic=False, + ).to_quantization_spec() # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC. 
- global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC, - weight=FP8_PER_TENSOR_SPEC) + global_quant_config = QuantizationConfig( + input_tensors=FP8_PER_TENSOR_SPEC, + weight=FP8_PER_TENSOR_SPEC, + ) # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC. KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"] - kv_cache_quant_config = {name : - QuantizationConfig(input_tensors=global_quant_config.input_tensors, - weight=global_quant_config.weight, - output_tensors=KV_CACHE_SPEC) - for name in kv_cache_layer_names_for_llama} + kv_cache_quant_config = { + name: QuantizationConfig( + input_tensors=global_quant_config.input_tensors, + weight=global_quant_config.weight, + output_tensors=KV_CACHE_SPEC, + ) + for name in kv_cache_layer_names_for_llama + } layer_quant_config = kv_cache_quant_config.copy() # Define algorithm config by config file. - LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = - 'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json' + LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json" algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE) EXCLUDE_LAYERS = ["lm_head"] @@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant. layer_quant_config=layer_quant_config, kv_cache_quant_config=kv_cache_quant_config, exclude=EXCLUDE_LAYERS, - algo_config=algo_config) + algo_config=algo_config, + ) ``` ### 4. Quantize the Model and Export @@ -165,8 +182,11 @@ for more exporting format details. EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant" exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR) with torch.no_grad(): - exporter.export_safetensors_model(freezed_model, - quant_config=quant_config, tokenizer=tokenizer) + exporter.export_safetensors_model( + freezed_model, + quant_config=quant_config, + tokenizer=tokenizer, + ) ``` ### 5. Evaluation in vLLM @@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. - llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant", - kv_cache_dtype='fp8',quantization='quark') + llm = LLM( + model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant", + kv_cache_dtype="fp8", + quantization="quark", + ) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/docs/features/quantization/torchao.md b/docs/features/quantization/torchao.md index 6932445997012..b95b560882bb1 100644 --- a/docs/features/quantization/torchao.md +++ b/docs/features/quantization/torchao.md @@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. 
[transformers](ht quantization_config = TorchAoConfig(Int8WeightOnlyConfig()) quantized_model = AutoModelForCausalLM.from_pretrained( model_name, - torch_dtype="auto", + dtype="auto", device_map="auto", quantization_config=quantization_config ) diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 389b3cb21ef5d..0b00b8805bb2c 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -11,6 +11,7 @@ vLLM currently supports the following reasoning models: | Model Series | Parser Name | Structured Output Support | Tool Calling | |--------------|-------------|------------------|-------------| | [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ | +| [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ | | [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ | | [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ | | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ | @@ -20,8 +21,9 @@ vLLM currently supports the following reasoning models: | [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ | !!! note - IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. + IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`. + DeepSeek-V3.1 tool calling is supported in non-thinking mode. 
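For example, here is a minimal sketch of opting in to reasoning at request time through `chat_template_kwargs` (it assumes a server is already running via something like `vllm serve deepseek-ai/DeepSeek-V3.1 --reasoning-parser deepseek_v3`; an IBM Granite 3.2 model would use the same `thinking` flag):

```python
from openai import OpenAI

# Assumes a local vLLM OpenAI-compatible server on the default port.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

response = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=[{"role": "user", "content": "Which is larger, 9.11 or 9.8?"}],
    # Reasoning is disabled by default for Granite 3.2 and DeepSeek-V3.1,
    # so it has to be enabled explicitly per request.
    extra_body={"chat_template_kwargs": {"thinking": True}},
)

message = response.choices[0].message
# The official OpenAI client does not declare `reasoning_content`,
# so read it defensively; vLLM returns it as an extra field.
print("reasoning_content:", getattr(message, "reasoning_content", None))
print("content:", message.content)
```

The same `extra_body` pattern carries over to the streaming example in the Quickstart below.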
## Quickstart @@ -117,9 +119,11 @@ OpenAI Python client library does not officially support `reasoning_content` att # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` # For Qwen3 series, if you want to disable thinking in reasoning mode, add: # extra_body={"chat_template_kwargs": {"enable_thinking": False}} - stream = client.chat.completions.create(model=model, - messages=messages, - stream=True) + stream = client.chat.completions.create( + model=model, + messages=messages, + stream=True, + ) print("client: Start streaming chat completions...") printed_reasoning_content = False @@ -159,27 +163,29 @@ The reasoning content is also available when both tool calling and the reasoning client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") - tools = [{ - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, - "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} - }, - "required": ["location", "unit"] - } + tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["location", "unit"], + } + }, } - }] + ] response = client.chat.completions.create( model=client.models.list().data[0].id, messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], tools=tools, - tool_choice="auto" + tool_choice="auto", ) print(response) @@ -225,7 +231,7 @@ You can add a new `ReasoningParser` similar to Union[DeltaMessage, None]: + ) -> DeltaMessage | None: """ Instance method that should be implemented for extracting reasoning from an incomplete response; for use when handling reasoning calls and @@ -235,8 +241,10 @@ You can add a new `ReasoningParser` similar to tuple[Optional[str], Optional[str]]: + self, + model_output: str, + request: ChatCompletionRequest | ResponsesRequest, + ) -> tuple[str | None, str | None]: """ Extract reasoning content from a complete model-generated string. @@ -274,10 +282,10 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner @classmethod def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner: - return cls(start_token_id=tokenizer.encode( - "", add_special_tokens=False)[0], - end_token_id=tokenizer.encode("", - add_special_tokens=False)[0]) + return cls( + start_token_id=tokenizer.encode("", add_special_tokens=False)[0], + end_token_id=tokenizer.encode("", add_special_tokens=False)[0], + ) def is_reasoning_end(self, input_ids: list[int]) -> bool: return self.end_token_id in input_ids diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index e57a8945971f5..02a700c09d391 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -27,27 +27,29 @@ Next, make a request that triggers the model to use the available tools: return f"Getting the weather for {location} in {unit}..." 
tool_functions = {"get_weather": get_weather} - tools = [{ - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, - "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} + tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} + }, + "required": ["location", "unit"], }, - "required": ["location", "unit"] - } - } - }] + }, + }, + ] response = client.chat.completions.create( model=client.models.list().data[0].id, messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], tools=tools, - tool_choice="auto" + tool_choice="auto", ) tool_call = response.choices[0].message.tool_calls[0].function @@ -402,8 +404,7 @@ Here is a summary of a plugin file: # adjust request. e.g.: set skip special tokens # to False for tool call output. - def adjust_request( - self, request: ChatCompletionRequest) -> ChatCompletionRequest: + def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: return request # implement the tool call parse for stream call @@ -416,7 +417,7 @@ Here is a summary of a plugin file: current_token_ids: Sequence[int], delta_token_ids: Sequence[int], request: ChatCompletionRequest, - ) -> Union[DeltaMessage, None]: + ) -> DeltaMessage | None: return delta # implement the tool parse for non-stream call diff --git a/docs/getting_started/installation/cpu/arm.inc.md b/docs/getting_started/installation/cpu/arm.inc.md index e45baa0aa4938..9cae9ed1a212e 100644 --- a/docs/getting_started/installation/cpu/arm.inc.md +++ b/docs/getting_started/installation/cpu/arm.inc.md @@ -23,7 +23,46 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. # --8<-- [end:pre-built-wheels] # --8<-- [start:build-wheel-from-source] ---8<-- "docs/getting_started/installation/cpu/build.inc.md" +First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: + +```bash +sudo apt-get update -y +sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof +sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 +``` + +Second, clone the vLLM project: + +```bash +git clone https://github.com/vllm-project/vllm.git vllm_source +cd vllm_source +``` + +Third, install required dependencies: + +```bash +uv pip install -r requirements/cpu-build.txt --torch-backend cpu +uv pip install -r requirements/cpu.txt --torch-backend cpu +``` + +??? console "pip" + ```bash + pip install --upgrade pip + pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu + pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu + ``` + +Finally, build and install vLLM: + +```bash +VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation +``` + +If you want to develop vLLM, install it in editable mode instead. 
+ +```bash +VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation +``` Testing has been conducted on AWS Graviton3 instances for compatibility. diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md deleted file mode 100644 index 4bd4d39a6f80b..0000000000000 --- a/docs/getting_started/installation/cpu/build.inc.md +++ /dev/null @@ -1,45 +0,0 @@ -First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: - -```bash -sudo apt-get update -y -sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof -sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 -``` - -Second, clone the vLLM project: - -```bash -git clone https://github.com/vllm-project/vllm.git vllm_source -cd vllm_source -``` - -Third, install required dependencies: - -```bash -uv pip install -r requirements/cpu-build.txt --torch-backend cpu -uv pip install -r requirements/cpu.txt --torch-backend cpu -``` - -??? console "pip" - ```bash - pip install --upgrade pip - pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu - pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu - ``` - -Finally, build and install vLLM: - -```bash -VLLM_TARGET_DEVICE=cpu python setup.py install -``` - -If you want to develop vLLM, install it in editable mode instead. - -```bash -VLLM_TARGET_DEVICE=cpu python setup.py develop -``` - -!!! note - If you are building vLLM from source and not using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM. 
- -# --8<-- [end:extra-information] diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 49e1f6fac7151..1cba21cf5f6d9 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -194,8 +194,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep api_key=openai_api_key, base_url=openai_api_base, ) - completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", - prompt="San Francisco is a") + completion = client.completions.create( + model="Qwen/Qwen2.5-1.5B-Instruct", + prompt="San Francisco is a", + ) print("Completion result:", completion) ``` @@ -239,7 +241,7 @@ Alternatively, you can use the `openai` Python package: messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Tell me a joke."}, - ] + ], ) print("Chat response:", chat_response) ``` diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index ecd71ee1f3f66..a4da5b933e159 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -22,6 +22,11 @@ sys.modules["vllm._C"] = MagicMock() class PydanticMagicMock(MagicMock): """`MagicMock` that's able to generate pydantic-core schemas.""" + def __init__(self, *args, **kwargs): + name = kwargs.pop("name", None) + super().__init__(*args, **kwargs) + self.__spec__ = importlib.machinery.ModuleSpec(name, None) + def __get_pydantic_core_schema__(self, source_type, handler): return core_schema.any_schema() @@ -42,7 +47,9 @@ def auto_mock(module, attr, max_mocks=50): raise e except ModuleNotFoundError as e: logger.info("Mocking %s for argparse doc generation", e.name) - sys.modules[e.name] = PydanticMagicMock() + sys.modules[e.name] = PydanticMagicMock(name=e.name) + except Exception as e: + logger.warning("Failed to import %s.%s: %s", module, attr, e) raise ImportError( f"Failed to import {module}.{attr} after mocking {max_mocks} imports" diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md index f70ab0c6f4e5c..3df80d5af6c4d 100644 --- a/docs/models/extensions/tensorizer.md +++ b/docs/models/extensions/tensorizer.md @@ -60,7 +60,7 @@ from vllm import LLM llm = LLM( "s3://my-bucket/vllm/facebook/opt-125m/v1", load_format="tensorizer", - enable_lora=True + enable_lora=True, ) ``` @@ -97,6 +97,6 @@ llm = LLM( "s3://my-bucket/vllm/facebook/opt-125m/v1", load_format="tensorizer", enable_lora=True, - model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}} + model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}}, ) ``` diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md index 05f8d16cc4ca7..9ea32ed616457 100644 --- a/docs/models/generative_models.md +++ b/docs/models/generative_models.md @@ -98,15 +98,15 @@ and automatically applies the model's [chat template](https://huggingface.co/doc conversation = [ { "role": "system", - "content": "You are a helpful assistant" + "content": "You are a helpful assistant", }, { "role": "user", - "content": "Hello" + "content": "Hello", }, { "role": "assistant", - "content": "Hello! How can I assist you today?" + "content": "Hello! 
How can I assist you today?", }, { "role": "user", diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 50982d3d0d0f3..45bfba2cbf594 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -130,8 +130,10 @@ It is designed for embedding models and cross-encoder models. Embedding models u from vllm import LLM llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling") -(output,) = llm.score("What is the capital of France?", - "The capital of Brazil is Brasilia.") +(output,) = llm.score( + "What is the capital of France?", + "The capital of Brazil is Brasilia.", +) score = output.outputs.score print(f"Score: {score}") @@ -209,7 +211,7 @@ For models that support Matryoshka Embeddings but not recognized by vLLM, please Here is an example to serve a model with Matryoshka Embeddings enabled. -```text +```bash vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}' ``` @@ -220,11 +222,15 @@ You can change the output dimensions of embedding models that support Matryoshka ```python from vllm import LLM, PoolingParams -llm = LLM(model="jinaai/jina-embeddings-v3", - runner="pooling", - trust_remote_code=True) -outputs = llm.embed(["Follow the white rabbit."], - pooling_params=PoolingParams(dimensions=32)) +llm = LLM( + model="jinaai/jina-embeddings-v3", + runner="pooling", + trust_remote_code=True, +) +outputs = llm.embed( + ["Follow the white rabbit."], + pooling_params=PoolingParams(dimensions=32), +) print(outputs[0].outputs) ``` @@ -234,13 +240,13 @@ A code example can be found here: ```python import os -os.environ['http_proxy'] = 'http://your.proxy.server:port' -os.environ['https_proxy'] = 'http://your.proxy.server:port' +os.environ["http_proxy"] = "http://your.proxy.server:port" +os.environ["https_proxy"] = "http://your.proxy.server:port" ``` ### ModelScope @@ -335,108 +335,108 @@ th { } -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | ✅︎ | -| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `BailingMoeV2ForCausalLM` | Ling | `inclusionAI/Ling-mini-2.0`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ | -| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | ✅︎ | -| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. 
| ✅︎ | ✅︎ | ✅︎ | -| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ | -| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ | -| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ | ✅︎ | -| `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ | -| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Exaone4ForCausalLM` | EXAONE-4 | `LGAI-EXAONE/EXAONE-4.0-32B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ | ✅︎ | -| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ | ✅︎ | -| `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `FlexOlmoForCausalLM` | FlexOlmo | `allenai/FlexOlmo-7x7B-1T`, `allenai/FlexOlmo-7x7B-1T-RT`, etc. | | ✅︎ | ✅︎ | -| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | -| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | ✅︎ | -| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ | -| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ | ✅︎ | -| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | | ✅︎ | ✅︎ | -| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. 
| ✅︎ | ✅︎ | ✅︎ | -| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ | -| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ | -| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ | -| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | ✅︎ | ✅︎ | -| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ | -| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | ✅︎ | -| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Lfm2ForCausalLM` | LFM2 | `LiquidAI/LFM2-1.2B`, `LiquidAI/LFM2-700M`, `LiquidAI/LFM2-350M`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Lfm2MoeForCausalLM` | LFM2MoE | `LiquidAI/LFM2-8B-A1B-preview`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | ✅︎ | -| `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ | ✅︎ | -| `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | ✅︎ | -| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. 
| ✅︎ | ✅︎ | ✅︎ | -| `OLMo3ForCausalLM` | OLMo3 | TBA | ✅︎ | ✅︎ | ✅︎ | -| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ | -| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ | -| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | ✅︎ | -| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ | ✅︎ | -| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | ✅︎ | -| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | ✅︎ | -| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | ✅︎ | -| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | ✅︎ | -| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | ✅︎ | -| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ |✅︎ | ✅︎ | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|-------------------|----------------------|---------------------------| +| `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ | +| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | +| `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. 
| ✅︎ | ✅︎ | +| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | +| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | +| `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | ✅︎ | ✅︎ | +| `BailingMoeV2ForCausalLM` | Ling | `inclusionAI/Ling-mini-2.0`, etc. | ✅︎ | ✅︎ | +| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | +| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | +| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | +| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ | +| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | +| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | +| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | ✅︎ | ✅︎ | +| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | +| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | +| `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | +| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ | +| `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | +| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | +| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | +| `Exaone4ForCausalLM` | EXAONE-4 | `LGAI-EXAONE/EXAONE-4.0-32B`, etc. | ✅︎ | ✅︎ | +| `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ | +| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ | +| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ | +| `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ | +| `FlexOlmoForCausalLM` | FlexOlmo | `allenai/FlexOlmo-7x7B-1T`, `allenai/FlexOlmo-7x7B-1T-RT`, etc. | | ✅︎ | +| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | +| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | +| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ | +| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | +| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | +| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | +| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | +| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. 
| | ✅︎ | +| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | +| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | +| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ | +| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | | ✅︎ | +| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ | +| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | +| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | +| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | +| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | +| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | ✅︎ | +| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ | +| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | +| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | +| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | +| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | +| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | +| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | +| `Lfm2ForCausalLM` | LFM2 | `LiquidAI/LFM2-1.2B`, `LiquidAI/LFM2-700M`, `LiquidAI/LFM2-350M`, etc. | ✅︎ | ✅︎ | +| `Lfm2MoeForCausalLM` | LFM2MoE | `LiquidAI/LFM2-8B-A1B-preview`, etc. | ✅︎ | ✅︎ | +| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | +| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | +| `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ | +| `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ | +| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | +| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | +| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. 
| ✅︎ | ✅︎ | +| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | +| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | +| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | +| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | +| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ | +| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | +| `OLMo3ForCausalLM` | OLMo3 | TBA | ✅︎ | ✅︎ | +| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | +| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ | +| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | +| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | +| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | +| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | +| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | +| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ | +| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | +| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | +| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | +| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | +| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ | +| `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | +| `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ | +| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | +| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | +| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | +| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | +| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ | +| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ | +| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | +| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | +| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. 
| | | +| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ | Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `SmolLM3ForCausalLM` | SmolLM3 | `HuggingFaceTB/SmolLM3-3B` | ✅︎ | ✅︎ | ✅︎ | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|-------------------|----------------------|---------------------------| +| `SmolLM3ForCausalLM` | SmolLM3 | `HuggingFaceTB/SmolLM3-3B` | ✅︎ | ✅︎ | !!! note Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. @@ -453,21 +453,21 @@ See [this page](./pooling_models.md) for more information on how to use pooling These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `BertModel`C | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | ✅︎ | -| `Gemma2Model`C | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Gemma3TextModel`C | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ | -| `GteModel`C | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | ✅︎ | -| `GteNewModel`C | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | ✅︎ | -| `ModernBertModel`C | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | ✅︎ | -| `NomicBertModel`C | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | ✅︎ | -| `LlamaModel`C, `LlamaForCausalLM`C, `MistralModel`C, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2Model`C, `Qwen2ForCausalLM`C | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen3Model`C, `Qwen3ForCausalLM`C | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | ✅︎ | -| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|-------------------|----------------------|---------------------------| +| `BertModel`C | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. 
| | | +| `Gemma2Model`C | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | +| `Gemma3TextModel`C | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | +| `GteModel`C | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | +| `GteNewModel`C | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | +| `ModernBertModel`C | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | +| `NomicBertModel`C | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | +| `LlamaModel`C, `LlamaForCausalLM`C, `MistralModel`C, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | +| `Qwen2Model`C, `Qwen2ForCausalLM`C | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | +| `Qwen3Model`C, `Qwen3ForCausalLM`C | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | +| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | +| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | C Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion)) \* Feature support is the same as that of the original model. @@ -494,11 +494,11 @@ of the whole prompt are extracted from the normalized hidden state corresponding These models primarily support the [`LLM.classify`](./pooling_models.md#llmclassify) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ | -| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|-------------------|----------------------|---------------------------| +| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | +| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | +| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | C Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion)) \* Feature support is the same as that of the original model. @@ -511,16 +511,16 @@ If your model is not in the above list, we will try to automatically convert the Cross-encoder and reranker models are a subset of classification models that accept two prompts as input. These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. 
| | | ✅︎ | -| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | -| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | | | ✅︎ | -| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | -| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | ✅︎ | -| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | ✅︎ | -| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|-------------------|----------------------|---------------------------| +| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | +| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | +| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | | | +| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | +| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | +| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | +| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | +| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | C Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion)) \* Feature support is the same as that of the original model. @@ -553,13 +553,13 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A These models primarily support the [`LLM.reward`](./pooling_models.md#llmreward) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `LlamaForCausalLM`C | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|-------------------|----------------------|---------------------------| +| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | +| `LlamaForCausalLM`C | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. 
| ✅︎ | ✅︎ | +| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | +| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ | +| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | C Automatically converted into a reward model via `--convert reward`. ([details](./pooling_models.md#model-conversion)) \* Feature support is the same as that of the original model. @@ -575,10 +575,10 @@ If your model is not in the above list, we will try to automatically convert the These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------| -| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | | ✅︎ | -| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | | ✅︎ | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|-------------------|-----------------------------|-----------------------------------------| +| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | | +| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | | !!! note Named Entity Recognition (NER) usage, please refer to , . @@ -604,29 +604,6 @@ On the other hand, modalities separated by `/` are mutually exclusive. See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inputs to the model. -!!! important - **To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference) - or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt: - - Offline inference: - - ```python - from vllm import LLM - - llm = LLM( - model="Qwen/Qwen2-VL-7B-Instruct", - limit_mm_per_prompt={"image": 4}, - ) - ``` - - Online serving: - - ```bash - vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt '{"image":4}' - ``` - - **This is no longer required if you are using vLLM V1.** - !!! tip For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g, `--limit-mm-per-prompt '{"image":0}`) so that their multimodal modules will not be loaded to free up more GPU memory for KV cache. @@ -663,70 +640,70 @@ See [this page](generative_models.md) for more information on how to use generat These models primarily accept the [`LLM.generate`](./generative_models.md#llmgenerate) API. Chat/Instruct models additionally support the [`LLM.chat`](./generative_models.md#llmchat) API. -| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------| -| `AriaForConditionalGeneration` | Aria | T + I+ | `rhymes-ai/Aria` | | | ✅︎ | -| `AyaVisionForConditionalGeneration` | Aya Vision | T + I+ | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. 
| | ✅︎ | ✅︎ | -| `Blip2ForConditionalGeneration` | BLIP-2 | T + IE | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ | -| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | ✅︎ | -| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I+ | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ | -| `DeepseekVLV2ForCausalLM`^ | DeepSeek-VL2 | T + I+ | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ | -| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I+/ V+ | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | ✅︎ | -| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ | -| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ | -| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | -| `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | -| `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | -| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | -| `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + IE+ + VE+ | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | ✅︎ | -| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ | ✅︎ | -| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | ✅︎ | -| `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ | -| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + IE+ | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | ✅︎ | -| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + IE+ | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ | -| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + IE+ | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. 
| | ✅︎ | ✅︎ | -| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ | -| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ | -| `MiDashengLMModel` | MiDashengLM | T + A+ | `mispeech/midashenglm-7b` | | ✅︎ | ✅︎ | -| `MiniCPMO` | MiniCPM-O | T + IE+ + VE+ + AE+ | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ | -| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ | -| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MolmoForCausalLM` | Molmo | T + I+ | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `NVLM_D_Model` | NVLM-D 1.0 | T + I+ | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ | -| `Ovis` | Ovis2, Ovis1.6 | T + I+ | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ | -| `Ovis2_5` | Ovis2.5 | T + I+ + V | `AIDC-AI/Ovis2.5-9B`, etc. | | | ✅︎ | -| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + IE | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ | -| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + IE+ | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ | -| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | ✅︎ | -| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ | -| `QwenVLForConditionalGeneration`^ | Qwen-VL | T + IE+ | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A+ | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ | -| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + IE+ + VE+ | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | ✅︎ | -| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + IE+ + VE+ | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + IE+ + VE+ | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. 
| ✅︎ | ✅︎ | ✅︎ | -| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, `Qwen/Qwen3-Omni-30B-A3B-Thinking` | ✅︎ | ✅︎ | ✅︎ | -| `RForConditionalGeneration` | R-VL-4B | T + IE+ | `YannQi/R-4B` | | ✅︎ | ✅︎ | -| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ | -| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ | -| `Step3VLForConditionalGeneration` | Step3-VL | T + I+ | `stepfun-ai/step3` | | ✅︎ | ✅︎ | -| `TarsierForConditionalGeneration` | Tarsier | T + IE+ | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ | ✅︎ | -| `Tarsier2ForConditionalGeneration`^ | Tarsier2 | T + IE+ + VE+ | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ | +| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|--------|-------------------|----------------------|---------------------------| +| `AriaForConditionalGeneration` | Aria | T + I+ | `rhymes-ai/Aria` | | | +| `AyaVisionForConditionalGeneration` | Aya Vision | T + I+ | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | +| `Blip2ForConditionalGeneration` | BLIP-2 | T + IE | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | +| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | +| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I+ | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | +| `DeepseekVLV2ForCausalLM`^ | DeepSeek-VL2 | T + I+ | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | +| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I+/ V+ | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | +| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | +| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | +| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | +| `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | +| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | +| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | +| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | +| `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | +| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | +| `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ | +| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | +| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + IE+ + VE+ | `OpenGVLab/InternVL3-1B-hf`, etc. 
| ✅︎ | ✅︎ | +| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | +| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ | +| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | +| `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | +| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + IE+ | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | +| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + IE+ | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ | +| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + IE+ | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | +| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | +| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | +| `MiDashengLMModel` | MiDashengLM | T + A+ | `mispeech/midashenglm-7b` | | ✅︎ | +| `MiniCPMO` | MiniCPM-O | T + IE+ + VE+ + AE+ | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | +| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | +| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | +| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | +| `MolmoForCausalLM` | Molmo | T + I+ | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | +| `NVLM_D_Model` | NVLM-D 1.0 | T + I+ | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | +| `Ovis` | Ovis2, Ovis1.6 | T + I+ | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | +| `Ovis2_5` | Ovis2.5 | T + I+ + V | `AIDC-AI/Ovis2.5-9B`, etc. | | | +| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + IE | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | +| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + IE+ | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | +| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | +| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | +| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | +| `QwenVLForConditionalGeneration`^ | Qwen-VL | T + IE+ | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. 
| ✅︎ | ✅︎ | +| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A+ | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | +| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | +| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + IE+ + VE+ | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | +| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | +| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + IE+ + VE+ | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ | +| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + IE+ + VE+ | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | +| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, `Qwen/Qwen3-Omni-30B-A3B-Thinking` | ✅︎ | ✅︎ | +| `RForConditionalGeneration` | R-VL-4B | T + IE+ | `YannQi/R-4B` | | ✅︎ | +| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | +| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | +| `Step3VLForConditionalGeneration` | Step3-VL | T + I+ | `stepfun-ai/step3` | | ✅︎ | +| `TarsierForConditionalGeneration` | Tarsier | T + IE+ | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ | +| `Tarsier2ForConditionalGeneration`^ | Tarsier2 | T + IE+ + VE+ | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! -| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------| -| `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ | ✅︎ | +| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------| +| `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ | ^ You need to set the architecture name via `--hf-overrides` to match the one in vLLM.     • For example, to use DeepSeek-VL2 series models: @@ -811,11 +788,11 @@ Some models are supported only via the [Transformers backend](#transformers). Th Speech2Text models trained specifically for Automatic Speech Recognition. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. 
| | | ✅︎ | -| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|-------------------|----------------------|---------------------------| +| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | +| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ | +| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ### Pooling Models @@ -830,12 +807,12 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A The following table lists those that are tested in vLLM. -| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------| -| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | | ✅︎ | -| `LlavaNextForConditionalGeneration`C | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ | ✅︎ | -| `Phi3VForCausalLM`C | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ | ✅︎ | -| `*ForConditionalGeneration`C, `*ForCausalLM`C, etc. | Generative models | \* | N/A | \* | \* | \* | +| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|--------|-------------------|----------------------|---------------------------| +| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | | +| `LlavaNextForConditionalGeneration`C | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ | +| `Phi3VForCausalLM`C | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ | +| `*ForConditionalGeneration`C, `*ForCausalLM`C, etc. | Generative models | \* | N/A | \* | \* | C Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion)) \* Feature support is the same as that of the original model. @@ -847,9 +824,9 @@ The following table lists those that are tested in vLLM. Cross-encoder and reranker models are a subset of classification models that accept two prompts as input. These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API. -| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------| -| `JinaVLForSequenceClassification` | JinaVL-based | T + IE+ | `jinaai/jina-reranker-m0`, etc. 
| | | ✅︎ |
+| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
+|--------------|--------|--------|-------------------|----------------------|---------------------------|
+| `JinaVLForSequenceClassification` | JinaVL-based | T + IE+ | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ |
 
 C Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))
 
 \* Feature support is the same as that of the original model.
diff --git a/docs/serving/context_parallel_deployment.md b/docs/serving/context_parallel_deployment.md
new file mode 100644
index 0000000000000..dacdf312ee55b
--- /dev/null
+++ b/docs/serving/context_parallel_deployment.md
@@ -0,0 +1,47 @@
+# Context Parallel Deployment
+
+Context parallel mainly solves the problem of serving long-context requests. Because prefill and decode have quite different characteristics and quite different SLOs (service level objectives), context parallel needs to be implemented separately for them. The major considerations are:
+
+- For long-context prefill, we need to control the TTFT (time to first token) by amortizing the computation time of the prefill across query tokens.
+- For long-context decode, we need more space for the KV cache to increase the batch size (and hence the throughput).
+
+## Prefill Context Parallel
+
+During prefill, for a long request with `T` new tokens, we need to compute query/key/value tensors for these new tokens. Say we have `N` GPUs: we can split the request into `N` chunks, and each GPU computes one chunk of the query/key/value tensors.
+
+Depending on the use case, there are two possible strategies:
+
+1. Partial query, full key/value: If the request token length is moderately long (we can afford to hold the full key/value tensors), and the goal is to accelerate the prefill (and amortize its computation time across query tokens), then we can gather the key/value tensors from all GPUs and let each GPU compute the attention output corresponding to the query tokens of its chunk.
+2. Partial query, partial key/value: If the request token length is so long that we can no longer afford to hold the full key/value tensors, then each GPU can only compute one chunk of the query/key/value tensors, and we use techniques like [ring-attention](http://arxiv.org/abs/2310.01889) to send/recv key/value tensors chunk by chunk.
+
+Both approaches are under active development.
+
+## Decode Context Parallel
+
+Due to the auto-regressive nature of decoding, every decoding step computes a small number of query tokens against a large number of key/value tokens stored in the paged KV cache. The core of decode context parallel is how to shard the KV cache across GPUs.
+
+For a model with `H` kv-heads, a request with `T` tokens in the context needs to store `H * T` key/value tensors in the KV cache.
+
+1. If one GPU can hold them all, and the performance is good enough, then no parallelization is needed.
+2. If one GPU cannot hold them all, or we want to hold more requests in the KV cache, we can first shard the KV cache along the `H` dimension; this is plain tensor parallel sharding. It's as simple as adding `-tp <size>` to the command line.
+3. Since `H` is limited (determined by the model architecture), when we continue to increase the tensor parallel size, the KV cache on each GPU is duplicated `tp_size / H` times. This duplication is, of course, not good for efficiency. To avoid it, we add decode context parallel to further shard the KV cache along the `T` dimension. This is as simple as adding `-dcp <size>` to the command line. Note that `size` does not increase the number of GPUs we need to launch; it only reduces the KV cache duplication. The dcp size should lie in the range `[1, tp_size/H]`. With a larger dcp size, the KV cache duplication is reduced, but the communication overhead increases.
+
+Theoretically, it is possible to extend the dcp size beyond `tp_size / H` to further shard the KV cache and accelerate the decoding phase. However, since the number of query tokens is limited in decoding, it is unclear what we should do with the remaining `dcp_size - tp_size / H` GPUs for the non-attention layers. For the sake of simplicity, the dcp size is upper bounded by `tp_size / H`. If you want to further accelerate the decoding phase, consider increasing `tp_size` first, and then increasing the dcp size.
+
+Note that the KV cache can grow during decoding, so the sharding strategy needs to be carefully implemented. We use an interleaving strategy to shard the KV cache along the `T` dimension, so that the KV cache for future tokens is naturally sharded along the `T` dimension as well. This was proposed by [Chao Hong from Moonshot](https://github.com/youzhedian), and is also explained in detail in [this paper](http://arxiv.org/abs/2507.07120).
+
+Case studies:
+
+For DeepSeek-R1, we have 1 kv-head when MLA is enabled. The typical single-node deployment with `-tp 8` causes 8x KV cache duplication. We can consider adding `-dcp 8` to reduce the KV cache duplication.
+
+For Kimi-K2, the architecture is similar to DeepSeek-R1, but with more parameters. When we deploy it with `-tp 16`, the KV cache duplication is 16x. We can add `-dcp 16` to completely remove the KV cache duplication, at the cost of more communication overhead. We can also add `-dcp 8` to reduce the KV cache duplication to 2x. Although the KV cache is still duplicated twice, the communication overhead is smaller since the DCP communication only happens inside one node.
+
+For Qwen3-235B-A22B, we have 4 kv-heads. When we deploy it with `-tp 8`, the KV cache duplication is 2x. Then we can add `-dcp 2` to remove the KV cache duplication.
+
+In short, for decode context parallel, increase the `-tp` size until you get satisfactory performance, and then add `-dcp` to reduce the KV cache duplication (see the arithmetic sketch below).
+
+Decode context parallel is supported in vLLM for both MLA and GQA models. Some attention backends also support combining decode context parallel with MTP (multi-token prediction) to further accelerate the decoding phase.
+
+## Technical Discussions
+
+The main discussions happen in the `#sig-context-parallel` channel of [vLLM Slack](https://slack.vllm.ai/).
diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md
index 93ed383395f27..f1dfb05ea5d45
--- a/docs/serving/expert_parallel_deployment.md
+++ b/docs/serving/expert_parallel_deployment.md
@@ -14,13 +14,16 @@ Before using EP, you need to install the necessary dependencies. We are actively
 
 ### Backend Selection Guide
 
-vLLM provides three communication backends for EP:
+vLLM provides multiple communication backends for EP.
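The KV-cache duplication rule from the decode context parallel section above can be sanity-checked with a few lines of Python. This is only an illustrative helper (not part of vLLM's API); it applies the `tp_size / H` duplication rule and the `[1, tp_size / H]` bound on the dcp size, and reproduces the case-study numbers quoted in that section.

```python
def kv_cache_duplication(num_kv_heads: int, tp_size: int, dcp_size: int = 1) -> float:
    """Per-GPU KV-cache duplication factor under -tp / -dcp, following the rule above."""
    if tp_size <= num_kv_heads:
        return 1.0  # KV heads are simply sharded across GPUs, no duplication
    max_dcp = tp_size / num_kv_heads  # duplication caused by tensor parallelism
    assert 1 <= dcp_size <= max_dcp, "dcp size should lie in [1, tp_size / H]"
    return max_dcp / dcp_size  # -dcp shards the duplicated copies along the token dim

# Case studies from the decode context parallel section:
print(kv_cache_duplication(num_kv_heads=1, tp_size=8, dcp_size=8))   # DeepSeek-R1: 1.0
print(kv_cache_duplication(num_kv_heads=1, tp_size=16, dcp_size=8))  # Kimi-K2: 2.0
print(kv_cache_duplication(num_kv_heads=4, tp_size=8, dcp_size=2))   # Qwen3-235B-A22B: 1.0
```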
Use `--all2all-backend` to select one: | Backend | Use Case | Features | Best For | |---------|----------|----------|----------| -| `pplx` | Single node | Chunked prefill support | Development, best for intra-node deployments | -| `deepep_high_throughput` | Multi-node prefill | Grouped GEMM with continuous layout | High-throughput scenarios, prefill-dominated workloads | -| `deepep_low_latency` | Multi-node decode | CUDA graph support, masked layout | Low-latency scenarios, decode-dominated workloads | +| `allgather_reducescatter` | Default backend | Standard all2all using allgather/reducescatter primitives | General purpose, works with any EP+DP configuration | +| `pplx` | Single node | Chunked prefill support, efficient intra-node communication | Single-node deployments, development | +| `deepep_high_throughput` | Multi-node prefill | Grouped GEMM with continuous layout, optimized for prefill | Prefill-dominated workloads, high-throughput scenarios | +| `deepep_low_latency` | Multi-node decode | CUDA graph support, masked layout, optimized for decode | Decode-dominated workloads, low-latency scenarios | +| `flashinfer_all2allv` | MNNVL systems | FlashInfer alltoallv kernels for multi-node NVLink | Systems with NVLink across nodes | +| `naive` | Testing/debugging | Simple broadcast-based implementation | Debugging, not recommended for production | ## Single Node Deployment @@ -47,11 +50,11 @@ The following command serves a `DeepSeek-V3-0324` model with 1-way tensor parall ```bash # Single node EP deployment with pplx backend -VLLM_ALL2ALL_BACKEND=pplx VLLM_USE_DEEP_GEMM=1 \ - vllm serve deepseek-ai/DeepSeek-V3-0324 \ - --tensor-parallel-size 1 \ # Tensor parallelism across 1 GPU +vllm serve deepseek-ai/DeepSeek-V3-0324 \ + --tensor-parallel-size 1 \ # Tensor parallelism across 1 GPU --data-parallel-size 8 \ # Data parallelism across 8 processes - --enable-expert-parallel # Enable expert parallelism + --enable-expert-parallel \ # Enable expert parallelism + --all2all-backend pplx # Use pplx communication backend ``` ## Multi-Node Deployment @@ -70,8 +73,8 @@ The following example deploys `DeepSeek-V3-0324` across 2 nodes using `deepep_lo ```bash # Node 1 (Primary - handles incoming requests) -VLLM_ALL2ALL_BACKEND=deepep_low_latency VLLM_USE_DEEP_GEMM=1 \ - vllm serve deepseek-ai/DeepSeek-V3-0324 \ +vllm serve deepseek-ai/DeepSeek-V3-0324 \ + --all2all-backend deepep_low_latency \ --tensor-parallel-size 1 \ # TP size per node --enable-expert-parallel \ # Enable EP --data-parallel-size 16 \ # Total DP size across all nodes @@ -81,8 +84,8 @@ VLLM_ALL2ALL_BACKEND=deepep_low_latency VLLM_USE_DEEP_GEMM=1 \ --api-server-count=8 # Number of API servers for load handling (scaling this out to total ranks are recommended) # Node 2 (Secondary - headless mode, no API server) -VLLM_ALL2ALL_BACKEND=deepep_low_latency VLLM_USE_DEEP_GEMM=1 \ - vllm serve deepseek-ai/DeepSeek-V3-0324 \ +vllm serve deepseek-ai/DeepSeek-V3-0324 \ + --all2all-backend deepep_low_latency \ --tensor-parallel-size 1 \ # TP size per node --enable-expert-parallel \ # Enable EP --data-parallel-size 16 \ # Total DP size across all nodes @@ -169,11 +172,12 @@ Single node deployment with EPLB enabled: ```bash # Single node with EPLB load balancing -VLLM_ALL2ALL_BACKEND=pplx VLLM_USE_DEEP_GEMM=1 vllm serve deepseek-ai/DeepSeek-V3-0324 \ - --tensor-parallel-size 1 \ # Tensor parallelism - --data-parallel-size 8 \ # Data parallelism - --enable-expert-parallel \ # Enable EP - --enable-eplb \ # Enable load balancer +vllm serve 
deepseek-ai/DeepSeek-V3-0324 \ + --tensor-parallel-size 1 \ # Tensor parallelism + --data-parallel-size 8 \ # Data parallelism + --enable-expert-parallel \ # Enable EP + --all2all-backend pplx \ # Use pplx communication backend + --enable-eplb \ # Enable load balancer --eplb-config '{"window_size":1000,"step_interval":3000,"num_redundant_experts":2,"log_balancedness":true}' ``` @@ -239,10 +243,10 @@ try: "remote_engine_id": None, # Will be populated by vLLM "remote_block_ids": None, # Will be populated by vLLM "remote_host": None, # Will be populated by vLLM - "remote_port": None # Will be populated by vLLM + "remote_port": None, # Will be populated by vLLM } }, - extra_headers={"X-Request-Id": request_id} + extra_headers={"X-Request-Id": request_id}, ) print("-" * 50) @@ -258,7 +262,7 @@ try: extra_body={ "kv_transfer_params": prefill_response.kv_transfer_params # Pass KV cache info }, - extra_headers={"X-Request-Id": request_id} # Same request ID + extra_headers={"X-Request-Id": request_id}, # Same request ID ) print("-" * 50) diff --git a/docs/serving/integrations/langchain.md b/docs/serving/integrations/langchain.md index 47074f411ac99..192a61ea5b903 100644 --- a/docs/serving/integrations/langchain.md +++ b/docs/serving/integrations/langchain.md @@ -15,13 +15,15 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain` ```python from langchain_community.llms import VLLM - llm = VLLM(model="mosaicml/mpt-7b", - trust_remote_code=True, # mandatory for hf models - max_new_tokens=128, - top_k=10, - top_p=0.95, - temperature=0.8, - # tensor_parallel_size=... # for distributed inference + llm = VLLM( + model="mosaicml/mpt-7b", + trust_remote_code=True, # mandatory for hf models + max_new_tokens=128, + top_k=10, + top_p=0.95, + temperature=0.8, + # for distributed inference + # tensor_parallel_size=..., ) print(llm("What is the capital of France ?")) diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index fe0e1e3df378b..215c7bf0ced3c 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -24,8 +24,8 @@ To call the server, in your preferred text editor, create a script that uses an completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", messages=[ - {"role": "user", "content": "Hello!"} - ] + {"role": "user", "content": "Hello!"}, + ], ) print(completion.choices[0].message) @@ -101,8 +101,13 @@ both a `type` and a `text` field. An example is provided below: completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", messages=[ - {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]} - ] + { + "role": "user", + "content": [ + {"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}, + ], + }, + ], ) ``` @@ -130,11 +135,11 @@ Or directly merge them into the JSON payload if you are using HTTP call directly completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}, ], extra_body={ - "structured_outputs": {"choice": ["positive", "negative"]} - } + "structured_outputs": {"choice": ["positive", "negative"]}, + }, ) ``` @@ -149,11 +154,11 @@ with `--enable-request-id-headers`. 
completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}, ], extra_headers={ "x-request-id": "sentiment-classification-00001", - } + }, ) print(completion._request_id) @@ -162,7 +167,7 @@ with `--enable-request-id-headers`. prompt="A robot may not injure a human being", extra_headers={ "x-request-id": "completion-test", - } + }, ) print(completion._request_id) ``` @@ -403,7 +408,7 @@ The Transcriptions API supports uploading audio files in various formats includi model="openai/whisper-large-v3-turbo", file=audio_file, language="en", - response_format="verbose_json" + response_format="verbose_json", ) print(transcription.text) @@ -812,22 +817,22 @@ You can pass multi-modal inputs to scoring models by passing `content` including "model": "jinaai/jina-reranker-m0", "text_1": "slm markdown", "text_2": { - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" - }, - }, - { - "type": "image_url", - "image_url": { - "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" - }, - }, - ] - } + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" + }, + }, + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" + }, + }, + ], }, + }, ) response.raise_for_status() response_json = response.json() diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 340aaf54bb720..889648b3e7ed2 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -88,12 +88,6 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the | **Mamba Models** | 🟢 (Mamba-2), 🟢 (Mamba-1) | | **Multimodal Models** | 🟢 Functional | -vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol. - -!!! tip - - This corresponds to the V1 column in our [list of supported models](../models/supported_models.md). - See below for the status of models that are not yet supported or have more features planned in V1. 
#### Embedding Models diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index 0076d4d30ee8e..a3e671a0f4cca 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -95,7 +95,7 @@ def parse_args(): parser.add_argument( "--compilation-config", type=int, - help=("Compilation optimization (O) level 0-3."), + help=("Compilation optimization (O) mode 0-3."), ) parser.add_argument( "--quantization", diff --git a/examples/offline_inference/openai_batch/README.md b/examples/offline_inference/openai_batch/README.md index 3c6f6c7a6c588..7d5a1af8f5a4a 100644 --- a/examples/offline_inference/openai_batch/README.md +++ b/examples/offline_inference/openai_batch/README.md @@ -152,7 +152,9 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_ """ try: url = s3_client.generate_presigned_url( - ClientMethod=client_method, Params=method_parameters, ExpiresIn=expires_in + ClientMethod=client_method, + Params=method_parameters, + ExpiresIn=expires_in, ) except ClientError: raise @@ -161,10 +163,16 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_ s3_client = boto3.client("s3") input_url = generate_presigned_url( - s3_client, "get_object", {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, 3600 + s3_client, + "get_object", + {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, + expires_in=3600, ) output_url = generate_presigned_url( - s3_client, "put_object", {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, 3600 + s3_client, + "put_object", + {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, + expires_in=3600, ) print(f"{input_url=}") print(f"{output_url=}") diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md index 79afbd9cfac47..7c535e91afac8 100644 --- a/examples/offline_inference/pooling/README.md +++ b/examples/offline_inference/pooling/README.md @@ -26,6 +26,12 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py python examples/offline_inference/pooling/embed_matryoshka_fy.py ``` +## Multi vector retrieval usage + +```bash +python examples/offline_inference/pooling/multi_vector_retrieval.py +``` + ## Named Entity Recognition (NER) usage ```bash diff --git a/examples/offline_inference/pooling/multi_vector_retrieval.py b/examples/offline_inference/pooling/multi_vector_retrieval.py new file mode 100644 index 0000000000000..8b8892117d378 --- /dev/null +++ b/examples/offline_inference/pooling/multi_vector_retrieval.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from argparse import Namespace + +from vllm import LLM, EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def parse_args(): + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults( + model="BAAI/bge-m3", + runner="pooling", + enforce_eager=True, + ) + return parser.parse_args() + + +def main(args: Namespace): + # Sample prompts. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + # Create an LLM. + # You should pass runner="pooling" for embedding models + llm = LLM(**vars(args)) + + # Generate embedding. The output is a list of EmbeddingRequestOutputs. + outputs = llm.embed(prompts) + + # Print the outputs. 
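+    # Note: `llm.embed` returns one pooled vector per prompt, while the
+    # `token_embed` pooling task used further below returns one vector per
+    # token (the multi-vector representation used for retrieval).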
+ print("\nGenerated Outputs:\n" + "-" * 60) + for prompt, output in zip(prompts, outputs): + embeds = output.outputs.embedding + print(len(embeds)) + + # Generate embedding for each token. The output is a list of PoolingRequestOutput. + outputs = llm.encode(prompts, pooling_task="token_embed") + + # Print the outputs. + print("\nGenerated Outputs:\n" + "-" * 60) + for prompt, output in zip(prompts, outputs): + multi_vector = output.outputs.data + print(multi_vector.shape) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py index 418c40645f9f2..6c47b57154386 100644 --- a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py +++ b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py @@ -40,7 +40,7 @@ def main(): model_impl="terratorch", ) - pooling_params = PoolingParams(task="encode", softmax=False) + pooling_params = PoolingParams(task="token_classify", activation=False) pooler_output = llm.encode( img_prompt, pooling_params=pooling_params, diff --git a/examples/online_serving/multi_instance_data_parallel.py b/examples/online_serving/multi_instance_data_parallel.py index b46cea5619671..04d21e0489402 100644 --- a/examples/online_serving/multi_instance_data_parallel.py +++ b/examples/online_serving/multi_instance_data_parallel.py @@ -1,11 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import threading from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams +from vllm.v1.metrics.loggers import AggregatedLoggingStatLogger """ To run this example, run the following commands simultaneously with @@ -21,37 +23,64 @@ send a request to the instance with DP rank 1. """ +def _do_background_logging(engine, interval, stop_event): + try: + while not stop_event.is_set(): + asyncio.run(engine.do_log_stats()) + stop_event.wait(interval) + except Exception as e: + print(f"vLLM background logging shutdown: {e}") + pass + + async def main(): engine_args = AsyncEngineArgs( model="ibm-research/PowerMoE-3b", data_parallel_size=2, + tensor_parallel_size=1, dtype="auto", max_model_len=2048, data_parallel_address="127.0.0.1", data_parallel_rpc_port=62300, data_parallel_size_local=1, enforce_eager=True, + enable_log_requests=True, + disable_custom_all_reduce=True, ) - engine_client = AsyncLLMEngine.from_engine_args(engine_args) - + engine_client = AsyncLLMEngine.from_engine_args( + engine_args, + # Example: Using aggregated logger + stat_loggers=[AggregatedLoggingStatLogger], + ) + stop_logging_event = threading.Event() + logging_thread = threading.Thread( + target=_do_background_logging, + args=(engine_client, 5, stop_logging_event), + daemon=True, + ) + logging_thread.start() sampling_params = SamplingParams( temperature=0.7, top_p=0.9, max_tokens=100, ) + num_prompts = 10 + for i in range(num_prompts): + prompt = "Who won the 2004 World Series?" + final_output: RequestOutput | None = None + async for output in engine_client.generate( + prompt=prompt, + sampling_params=sampling_params, + request_id=f"abcdef-{i}", + data_parallel_rank=1, + ): + final_output = output + if final_output: + print(final_output.outputs[0].text) - prompt = "Who won the 2004 World Series?" 
- final_output: RequestOutput | None = None - async for output in engine_client.generate( - prompt=prompt, - sampling_params=sampling_params, - request_id="abcdef", - data_parallel_rank=1, - ): - final_output = output - if final_output: - print(final_output.outputs[0].text) + stop_logging_event.set() + logging_thread.join() if __name__ == "__main__": diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index ac4e40221edf1..91345e0ae7785 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -18,6 +18,12 @@ python examples/online_serving/pooling/embedding_embed_dtype_client.py python examples/online_serving/pooling/jinaai_rerank_client.py ``` +## Multi vector retrieval usage + +```bash +python examples/online_serving/pooling/multi_vector_retrieval_client.py +``` + ## Named Entity Recognition (NER) usage ```bash diff --git a/examples/online_serving/pooling/multi_vector_retrieval_client.py b/examples/online_serving/pooling/multi_vector_retrieval_client.py new file mode 100644 index 0000000000000..ef8c4745aa531 --- /dev/null +++ b/examples/online_serving/pooling/multi_vector_retrieval_client.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Example online usage of Pooling API for multi vector retrieval. + +Run `vllm serve --runner pooling` +to start up the server in vLLM. e.g. + +vllm serve BAAI/bge-m3 +""" + +import argparse + +import requests +import torch + + +def post_http_request(prompt: dict, api_url: str) -> requests.Response: + headers = {"User-Agent": "Test Client"} + response = requests.post(api_url, headers=headers, json=prompt) + return response + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--model", type=str, default="BAAI/bge-m3") + + return parser.parse_args() + + +def main(args): + api_url = f"http://{args.host}:{args.port}/pooling" + model_name = args.model + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + prompt = {"model": model_name, "input": prompts} + + pooling_response = post_http_request(prompt=prompt, api_url=api_url) + for output in pooling_response.json()["data"]: + multi_vector = torch.tensor(output["data"]) + print(multi_vector.shape) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py index acbfd8cda489a..2601c9eff971b 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -84,7 +84,7 @@ directly to load models: from vllm import LLM llm = LLM( "s3://my-bucket/vllm/facebook/opt-125m/v1", - load_format="tensorizer" + load_format="tensorizer", ) ``` diff --git a/requirements/common.txt b/requirements/common.txt index a7aa801208969..5e7769561c4f4 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -7,7 +7,7 @@ requests >= 2.26.0 tqdm blake3 py-cpuinfo -transformers >= 4.55.2 +transformers >= 4.56.0 tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. 
@@ -39,7 +39,7 @@ six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that need setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. compressed-tensors == 0.12.2 # required for compressed-tensors -depyf==0.19.0 # required for profiling and debugging with compilation config +depyf==0.20.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py watchfiles # required for http server to monitor the updates of TLS files python-json-logger # Used by logging as per examples/others/logging_configuration.md diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py index 84194f3ed01e8..e01b58220959f 100644 --- a/tests/compile/piecewise/test_full_cudagraph.py +++ b/tests/compile/piecewise/test_full_cudagraph.py @@ -11,6 +11,7 @@ from tests.v1.attention.utils import full_cg_backend_configs as backend_configs from vllm import LLM, SamplingParams from vllm.config import CompilationConfig from vllm.platforms import current_platform +from vllm.utils import is_torch_equal_or_newer @contextlib.contextmanager @@ -32,13 +33,13 @@ def temporary_environ(env_vars): os.environ[k] = v -test_params_full_cudagraph = [] +model_backends_full_cudagraph = [] # deepseek-ai/DeepSeek-V2-Lite with MLA MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"] for mla_backend in MLA_backends: - test_params_full_cudagraph.append( - pytest.param(("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend])) + model_backends_full_cudagraph.append( + ("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend]) ) # Qwen/Qwen2-1.5B-Instruct with other backends @@ -46,14 +47,18 @@ other_backend_configs = [ backend_configs[c] for c in backend_configs if c not in MLA_backends ] for backend_config in other_backend_configs: - test_params_full_cudagraph.append( - pytest.param(("Qwen/Qwen2-1.5B-Instruct", backend_config)) - ) + model_backends_full_cudagraph.append(("Qwen/Qwen2-1.5B-Instruct", backend_config)) @pytest.fixture(scope="class") def llm_pair(request): - model, backend_config = request.param + model, backend_config, use_inductor_graph_partition = request.param + backend_config.comp_config["use_inductor_graph_partition"] = ( + use_inductor_graph_partition + ) + + if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("Inductor graph partition only supported in torch>=2.9") # Dynamically skip test if GPU capability is not met if ( @@ -104,7 +109,15 @@ def llm_pair(request): ) -@pytest.mark.parametrize("llm_pair", test_params_full_cudagraph, indirect=True) +@pytest.mark.parametrize( + "llm_pair", + [ + pytest.param((model, backend_config, use_inductor_graph_partition)) + for model, backend_config in model_backends_full_cudagraph + for use_inductor_graph_partition in [True, False] + ], + indirect=True, +) class TestFullCUDAGraph: """ Use a class such that an llm pair is constructed once for all diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py index d88645e3bfd62..246239b87d5fe 100644 --- a/tests/compile/piecewise/test_multiple_graphs.py +++ b/tests/compile/piecewise/test_multiple_graphs.py @@ -5,6 +5,7 @@ Test (piecewise) compilation with a simple model where multiple submodules are 
compiled and graph captured separately. """ +import pytest import torch from torch import nn @@ -13,12 +14,13 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, ) from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import is_torch_equal_or_newer # This import automatically registers `torch.ops.silly.attention` from .. import silly_attention # noqa: F401 @@ -190,16 +192,21 @@ def run_model( return output.cpu() -def test_multi_graph_piecewise_compile_outputs_equal(): +@pytest.mark.parametrize("use_inductor_graph_partition", [False, True]) +def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): + if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available in PyTorch 2.9+") + outputs = [] - # piecewise compile + # vllmcompile compile vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], + use_inductor_graph_partition=use_inductor_graph_partition, ) ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE @@ -220,23 +227,31 @@ def test_multi_graph_piecewise_compile_outputs_equal(): # static tensor addresses inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda() - with compilation_counter.expect( - num_graphs_seen=2, # two graphs for the model - num_piecewise_graphs_seen=6, + if use_inductor_graph_partition: + # Splitting happens at Inductor lowering level, + # total piecewise fx graphs is equal to total graphs + num_piecewise_fx = 2 + num_piecewise_capturable_fx = 2 + else: # attn_one, attn_two each has 3 piecewise graphs # (pre attn, post attn, silly_attention) each - num_piecewise_capturable_graphs_seen=4, + num_piecewise_fx = 6 # attn_one, attn_two has pre attn and post attn each, total=4 - num_backend_compilations=4, # num_piecewise_capturable_graphs_seen - num_cudagraph_captured=8, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_piecewise_capturable_fx = 4 + + with compilation_counter.expect( + num_graphs_seen=2, # two graphs for the model + num_piecewise_graphs_seen=num_piecewise_fx, + num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx, + num_backend_compilations=num_piecewise_capturable_fx, + num_cudagraph_captured=8, # num_cudagraph_sizes * num_partitions ): outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) # no compile or cudagraph vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.NO_COMPILATION, + mode=CompilationMode.NONE, ) ) cudagraph_runtime_mode = CUDAGraphMode.NONE @@ -265,9 +280,10 @@ def test_multi_graph_piecewise_compile_outputs_equal(): # piecewise compile without CUDA graph vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=False, splitting_ops=["silly::attention"], + use_inductor_graph_partition=use_inductor_graph_partition, ) ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE @@ -286,9 +302,9 @@ def test_multi_graph_piecewise_compile_outputs_equal(): with compilation_counter.expect( num_graphs_seen=2, - num_piecewise_graphs_seen=6, - 
num_piecewise_capturable_graphs_seen=4, - num_backend_compilations=4, + num_piecewise_graphs_seen=num_piecewise_fx, + num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx, + num_backend_compilations=num_piecewise_capturable_fx, num_cudagraph_captured=0, # no cudagraph captured ): outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index bc65e3da0ae74..f61a0a4eb740d 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -13,7 +13,7 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, @@ -61,7 +61,7 @@ def _run_simple_model( ): vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, use_inductor=use_inductor, splitting_ops=splitting_ops, diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 45317b456af48..75a89d692fa8f 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -9,6 +9,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are initialized randomly with a fixed seed. """ +from copy import deepcopy from dataclasses import dataclass from typing import Any @@ -20,12 +21,13 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, ) from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import is_torch_equal_or_newer # This import automatically registers `torch.ops.silly.attention` from .. 
import silly_attention # noqa: F401 @@ -257,27 +259,13 @@ def tractable_computation( @torch.inference_mode -def run_model( - llama_config, use_compile: bool, use_inductor: bool, split_attn: bool = False -) -> torch.Tensor: - if use_compile: - compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, - use_cudagraph=True, - use_inductor=use_inductor, - cudagraph_capture_sizes=[1, 2], - ) - if split_attn: - compilation_config.splitting_ops = ["silly::attention"] - cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE - else: - compilation_config = CompilationConfig( - level=CompilationLevel.NO_COMPILATION, - ) - cudagraph_runtime_mode = CUDAGraphMode.NONE +def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor: + # Start with a fresh copy to make sure there's no cache dir sharing + compile_config = deepcopy(compile_config) + cudagraph_runtime_mode = compile_config.cudagraph_mode vllm_config = VllmConfig( - compilation_config=compilation_config, additional_config=llama_config + compilation_config=compile_config, additional_config=llama_config ) with set_current_vllm_config(vllm_config): model = ( @@ -338,8 +326,25 @@ def run_model( return output.cpu() -@pytest.mark.parametrize("use_inductor", [True, False]) -def test_toy_llama(use_inductor: bool): +@pytest.mark.parametrize( + "backend, use_inductor_graph_partition", + [ + ("eager", False), # No inductor + ("inductor", False), # Inductor, Dynamo partition + ("inductor", True), # Inductor, Inductor partition + ], +) +def test_toy_llama( + backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path +): + # We disable the vLLM compile cache into a new tmp dir for 2 reasons: + # 1. To make sure we can properly track the number of Inductor compilations. + # 2. Inductor partitioning does not play nicely with Autograd cache (below) + monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") + + if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("Inductor graph partition only supported in torch>=2.9") + # compare output with and without piecewise compilation llama_config = LlamaConfig( @@ -350,6 +355,32 @@ def test_toy_llama(use_inductor: bool): hidden_size=128, mlp_size=256, vocab_size=128, num_layers=2, tractable_init=True ) + compile_config_no_compile = CompilationConfig( + level=CompilationMode.NONE, + cudagraph_mode=CUDAGraphMode.NONE, + backend="eager", + ) + + compile_config_no_split = CompilationConfig( + level=CompilationMode.VLLM_COMPILE, + use_inductor_graph_partition=use_inductor_graph_partition, + cudagraph_mode=CUDAGraphMode.PIECEWISE, + backend=backend, + cudagraph_capture_sizes=[1, 2], + ) + + # FIXME(luka/boyuan): the graph from the previous test case + # (no inductor partition) gets cached by AotAutograd so then the + # compilation with inductor partitioning incorrectly loads an unpartitioned + # graph and never partitions. I think this is a bug with custom inductor + # partitioning but does not affect vLLM more generally as vLLM uses its own + # cache (which takes inductor partitioning into account). 
+ if use_inductor_graph_partition: + compile_config_no_split.inductor_compile_config["force_disable_caches"] = True + + compile_config_split = deepcopy(compile_config_no_split) + compile_config_split.splitting_ops = ["silly::attention"] + outputs = [] with compilation_counter.expect( num_graphs_seen=0, @@ -358,53 +389,44 @@ def test_toy_llama(use_inductor: bool): num_backend_compilations=0, num_cudagraph_captured=0, ): - outputs.append(run_model(llama_config, use_inductor=False, use_compile=False)) - run_model(tractable_config, use_inductor=False, use_compile=False) + outputs.append(run_model(llama_config, compile_config_no_compile)) - if use_inductor: + run_model(tractable_config, compile_config_no_compile) + + if backend == "inductor": kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0} else: kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0} with compilation_counter.expect( - # One graph for the model - num_graphs_seen=1, + num_graphs_seen=1, # one graph for the model num_piecewise_graphs_seen=1, num_piecewise_capturable_graphs_seen=1, - # num_piecewise_capturable_graphs_seen - num_backend_compilations=1, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_backend_compilations=1, # num_piecewise_capturable_graphs_seen num_cudagraph_captured=2, **kwargs, ): - outputs.append( - run_model(llama_config, use_inductor=use_inductor, use_compile=True) - ) - run_model(tractable_config, use_inductor=use_inductor, use_compile=True) + outputs.append(run_model(llama_config, compile_config_no_split)) + + run_model(tractable_config, compile_config_no_split) + + if use_inductor_graph_partition: + num_piecewise_fx = 1 + num_piecewise_capturable_fx = 1 + else: + num_piecewise_fx = 2 * llama_config.num_layers + 1 + num_piecewise_capturable_fx = 1 + llama_config.num_layers with compilation_counter.expect( num_graphs_seen=1, # one graph for the model - num_piecewise_graphs_seen=2 * llama_config.num_layers + 1, # 2 * num_layers + 1 - num_piecewise_capturable_graphs_seen=1 - + llama_config.num_layers, # 1 + num_layers - num_backend_compilations=1 - + llama_config.num_layers, # num_piecewise_capturable_graphs_seen - num_cudagraph_captured=2 - * ( - 1 + llama_config.num_layers - ), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_piecewise_graphs_seen=num_piecewise_fx, + num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx, + num_backend_compilations=num_piecewise_capturable_fx, + # num_cudagraph_sizes * num_partitions + num_cudagraph_captured=2 * (1 + llama_config.num_layers), ): - outputs.append( - run_model( - llama_config, - use_inductor=use_inductor, - use_compile=True, - split_attn=True, - ) - ) - run_model( - tractable_config, use_inductor=use_inductor, use_compile=True, split_attn=True - ) + outputs.append(run_model(llama_config, compile_config_split)) + run_model(tractable_config, compile_config_split) for i in range(1, len(outputs)): assert torch.allclose(outputs[0], outputs[i]) @@ -436,14 +458,14 @@ def benchmark(): for piecewise in [False, True]: if piecewise: compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=cudagraph_sizes, ) else: compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, cudagraph_capture_sizes=cudagraph_sizes, ) diff --git a/tests/compile/silly_attention.py b/tests/compile/silly_attention.py index 
c0d3f908149f6..f33c5772906a6 100644 --- a/tests/compile/silly_attention.py +++ b/tests/compile/silly_attention.py @@ -62,5 +62,4 @@ direct_register_custom_op( mutates_args=["out"], fake_impl=silly_attention_fake, target_lib=silly_lib, - tags=(torch._C.Tag.cudagraph_unsafe,), ) diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py index 08f79d90cd367..1701d85fe84e7 100644 --- a/tests/compile/test_aot_compile.py +++ b/tests/compile/test_aot_compile.py @@ -10,7 +10,7 @@ import torch from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, VllmConfig, set_current_vllm_config, ) @@ -38,7 +38,7 @@ class CompiledMod(torch.nn.Module): def make_vllm_config() -> VllmConfig: return VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, ) ) diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py index d396d3940f67f..60856f5a58067 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/test_async_tp.py @@ -10,6 +10,7 @@ import vllm.envs as envs from vllm.compilation.collective_fusion import AsyncTPPass from vllm.config import ( CompilationConfig, + CompilationMode, DeviceConfig, ModelConfig, PassConfig, @@ -332,7 +333,7 @@ def async_tp_pass_on_test_model( # this is a fake model name to construct the model config # in the vllm_config, it's not really used. - model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e" + model_name = "RedHatAI/Llama-3.2-1B-Instruct-FP8" vllm_config.model_config = ModelConfig( model=model_name, trust_remote_code=True, dtype=dtype, seed=42 ) @@ -400,7 +401,7 @@ def test_async_tp_pass_correctness( common_args.append("--enforce-eager") compilation_config = { - "level": 3, + "mode": CompilationMode.VLLM_COMPILE, "compile_sizes": [2, 4, 8], "splitting_ops": [], "pass_config": {"enable_async_tp": async_tp_enabled}, diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 9bfd72260436b..954774a8e3983 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -4,7 +4,7 @@ import dataclasses import pytest -from vllm.config import CompilationLevel +from vllm.config import CompilationMode from vllm.utils import cuda_device_count_stateless from ..utils import compare_all_settings @@ -21,7 +21,7 @@ class TestSetting: # we cannot afford testing the full Cartesian product -# of all models and all levels +# of all models and all modes @pytest.mark.parametrize( "test_setting", [ @@ -77,14 +77,15 @@ class TestSetting: method="encode", ), # vision language model - TestSetting( - model="microsoft/Phi-3.5-vision-instruct", - model_args=["--trust-remote-code", "--max-model-len", "2048"], - pp_size=2, - tp_size=1, - attn_backend="FLASH_ATTN", - method="generate_with_image", - ), + # See https://github.com/vllm-project/vllm/issues/26716. 
+ # TestSetting( + # model="microsoft/Phi-3.5-vision-instruct", + # model_args=["--trust-remote-code", "--max-model-len", "2048"], + # pp_size=2, + # tp_size=1, + # attn_backend="FLASH_ATTN", + # method="generate_with_image", + # ), ], ) def test_compile_correctness( @@ -109,41 +110,44 @@ def test_compile_correctness( with monkeypatch.context() as m: m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) final_args = [ - "--enforce-eager", *model_args, "-pp", str(pp_size), "-tp", str(tp_size), + "-O.cudagraph_mode=none", ] all_args: list[list[str]] = [] all_envs: list[dict[str, str] | None] = [] - for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.PIECEWISE, + for comp_mode in [ + CompilationMode.STOCK_TORCH_COMPILE, + CompilationMode.DYNAMO_TRACE_ONCE, + CompilationMode.VLLM_COMPILE, ]: - all_args.append(final_args + [f"-O{level}"]) + for mode in [CompilationMode.NONE, comp_mode]: + all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=inductor"]) + + # inductor will change the output, so we only compare if the output + # is close, not exactly the same. + compare_all_settings( + model, + all_args, + all_envs, + method=method if method != "generate" else "generate_close", + ) + all_envs.clear() + all_args.clear() + + for mode in [ + CompilationMode.NONE, + CompilationMode.STOCK_TORCH_COMPILE, + CompilationMode.DYNAMO_TRACE_ONCE, + CompilationMode.VLLM_COMPILE, + ]: + all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=eager"]) all_envs.append({}) - - # inductor will change the output, so we only compare if the output - # is close, not exactly the same. - compare_all_settings( - model, - all_args, - all_envs, - method=method if method != "generate" else "generate_close", - ) - all_envs.clear() - all_args.clear() - - for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.DYNAMO_AS_IS, - CompilationLevel.DYNAMO_ONCE, - ]: - all_args.append(final_args + [f"-O{level}"]) all_envs.append({}) compare_all_settings(model, all_args * 3, all_envs, method=method) diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index ae8b0b226c313..7f51c763da73c 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -4,7 +4,7 @@ import pytest from vllm.compilation.counter import compilation_counter from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig -from vllm.config.compilation import CompilationLevel +from vllm.config.compilation import CompilationMode from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer @@ -90,16 +90,16 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled): # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 @pytest.mark.forked -def test_dynamo_as_is(vllm_runner, monkeypatch): +def test_stock_torch_compile(vllm_runner, monkeypatch): # Disable multiprocessing so that the counter is in the same process monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") with ( - compilation_counter.expect(dynamo_as_is_count=1), + compilation_counter.expect(stock_torch_compile_count=1), # loading the model causes compilation (if enabled) to happen vllm_runner( "facebook/opt-125m", - compilation_config={"level": 1}, + compilation_config={"mode": CompilationMode.STOCK_TORCH_COMPILE}, gpu_memory_utilization=0.4, ) as _, ): @@ -112,11 +112,11 @@ def test_no_compilation(vllm_runner, monkeypatch): # Disable multiprocessing so that the counter is in the same process monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") with ( - 
compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0), + compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0), # loading the model causes compilation (if enabled) to happen vllm_runner( "facebook/opt-125m", - compilation_config={"level": 0}, + compilation_config={"mode": CompilationMode.NONE}, gpu_memory_utilization=0.4, ) as _, ): @@ -130,7 +130,7 @@ def test_enforce_eager(vllm_runner, monkeypatch): monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") with ( - compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0), + compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0), # loading the model causes compilation (if enabled) to happen vllm_runner( "facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4 @@ -151,7 +151,7 @@ def test_splitting_ops_dynamic(): if is_torch_equal_or_newer("2.9.0.dev"): config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=True, splitting_ops=["vllm::unified_attention"], ) @@ -163,7 +163,7 @@ def test_splitting_ops_dynamic(): # When attn_fusion pass enabled, splitting_ops now default to attention ops. config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, pass_config={"enable_attn_fusion": True, "enable_noop": True}, custom_ops=["+quant_fp8"], cudagraph_mode=CUDAGraphMode.PIECEWISE, @@ -178,7 +178,7 @@ def test_splitting_ops_dynamic(): if is_torch_equal_or_newer("2.9.0.dev"): config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=True, pass_config={"enable_attn_fusion": True, "enable_noop": True}, custom_ops=["+quant_fp8"], diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py index 6b050207ec41b..e459bc539f2b8 100644 --- a/tests/compile/test_decorator.py +++ b/tests/compile/test_decorator.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest import torch from torch import nn @@ -8,12 +9,13 @@ from vllm.compilation.decorators import ignore_torch_compile, support_torch_comp from vllm.config import ( CacheConfig, CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, ) from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import is_torch_equal_or_newer # This import automatically registers `torch.ops.silly.attention` from . 
import silly_attention # noqa: F401 @@ -65,18 +67,40 @@ def run_model( return output.cpu() -def test_ignore_torch_compile_decorator(): +@pytest.mark.parametrize("use_inductor_graph_partition", [True, False]) +def test_ignore_torch_compile_decorator(use_inductor_graph_partition, monkeypatch): + # disable compile cache so that we can count the number of compilations + # appropriately + monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") + + if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available in PyTorch 2.9+") + # piecewise vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], + use_inductor_graph_partition=use_inductor_graph_partition, ) ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE + expected_num_graphs_seen = 1 + expected_num_cudagraph_captured = ( + 4 # num_cudagraph_sizes * num cudagraphs to capture + ) + if use_inductor_graph_partition: + expected_num_piecewise_graphs_seen = 1 + expected_num_piecewise_capturable_graphs_seen = 1 + expected_num_backend_compilations = 1 + else: + expected_num_piecewise_graphs_seen = 3 + expected_num_piecewise_capturable_graphs_seen = 2 + expected_num_backend_compilations = 2 + @support_torch_compile class A(nn.Module): def __init__( @@ -103,12 +127,11 @@ def test_ignore_torch_compile_decorator(): # A has support_torch_compile with compilation_counter.expect( - num_graphs_seen=1, - num_piecewise_graphs_seen=3, - num_piecewise_capturable_graphs_seen=2, - num_backend_compilations=2, - num_cudagraph_captured=4, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_graphs_seen=expected_num_graphs_seen, + num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen, + num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen, + num_backend_compilations=expected_num_backend_compilations, + num_cudagraph_captured=expected_num_cudagraph_captured, ): run_model(vllm_config, mod_A, cudagraph_runtime_mode) @@ -130,12 +153,11 @@ def test_ignore_torch_compile_decorator(): # C's support_torch_compile should override B's ignore_torch_compile with compilation_counter.expect( - num_graphs_seen=1, - num_piecewise_graphs_seen=3, - num_piecewise_capturable_graphs_seen=2, - num_backend_compilations=2, - num_cudagraph_captured=4, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_graphs_seen=expected_num_graphs_seen, + num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen, + num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen, + num_backend_compilations=expected_num_backend_compilations, + num_cudagraph_captured=expected_num_cudagraph_captured, ): run_model(vllm_config, mod_C, cudagraph_runtime_mode) @@ -178,16 +200,25 @@ class A(nn.Module): return x -def test_conditional_compile_enable_if(): +@pytest.mark.parametrize("use_inductor_graph_partition", [True, False]) +def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch): + # disable compile cache so that we can count the number of compilations + # appropriately + monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") + + if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available in PyTorch 2.9+") + vllm_config = VllmConfig( cache_config=CacheConfig( kv_sharing_fast_prefill=True, 
), compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], + use_inductor_graph_partition=use_inductor_graph_partition, ), ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE @@ -195,17 +226,26 @@ def test_conditional_compile_enable_if(): with set_current_vllm_config(vllm_config): mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda() + if use_inductor_graph_partition: + expected_num_piecewise_graphs_seen = 2 + expected_num_piecewise_capturable_graphs_seen = 2 + expected_num_backend_compilations = 2 + else: + expected_num_piecewise_graphs_seen = 6 + expected_num_piecewise_capturable_graphs_seen = 4 + expected_num_backend_compilations = 4 + # A has support_torch_compile but enable_if fn returns False # enable_if will be True for B, so we expect mod1 and mod2 # to be compiled with compilation_counter.expect( num_graphs_seen=2, - num_piecewise_graphs_seen=6, + num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen, # 3 piecewise graphs per instance of B() - num_piecewise_capturable_graphs_seen=4, - num_backend_compilations=4, + num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen, + num_backend_compilations=expected_num_backend_compilations, num_cudagraph_captured=8, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + # num_cudagraph_sizes * num cudagraphable graphs to capture ): run_model(vllm_config, mod_A, cudagraph_runtime_mode) @@ -216,23 +256,34 @@ def test_conditional_compile_enable_if(): kv_sharing_fast_prefill=False, ), compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], + use_inductor_graph_partition=use_inductor_graph_partition, ), ) with set_current_vllm_config(vllm_config): mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda() + if use_inductor_graph_partition: + expected_num_piecewise_graphs_seen = 1 + expected_num_piecewise_capturable_graphs_seen = 1 + expected_num_backend_compilations = 1 + else: + # 3 attn ops and 4 non-attn ops + expected_num_piecewise_graphs_seen = 7 + expected_num_piecewise_capturable_graphs_seen = 4 + expected_num_backend_compilations = 4 + with compilation_counter.expect( num_graphs_seen=1, - num_piecewise_graphs_seen=7, + num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen, # 3 attn ops and 4 non-attn ops - num_piecewise_capturable_graphs_seen=4, - num_backend_compilations=4, + num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen, + num_backend_compilations=expected_num_backend_compilations, num_cudagraph_captured=8, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + # num_cudagraph_sizes * num cudagraphable graphs to capture ): run_model(vllm_config, mod_A, cudagraph_runtime_mode) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 2f3794c90b204..2d290771f9ad7 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -12,7 +12,7 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams from vllm.attention.backends.registry import _Backend from vllm.attention.selector import global_force_attn_backend_context_manager -from vllm.config import CompilationConfig, 
CompilationMode, CUDAGraphMode, PassConfig from vllm.platforms import current_platform from vllm.utils import is_torch_equal_or_newer @@ -80,22 +80,22 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None): @pytest.mark.parametrize( - "optimization_level", - [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE], + "compilation_mode", + [CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE], ) @pytest.mark.parametrize("model_info", models_list(all=True)) @create_new_process_for_each_test() def test_full_graph( monkeypatch: pytest.MonkeyPatch, model_info: tuple[str, dict[str, Any]], - optimization_level: int, + compilation_mode: int, ): model, model_kwargs = model_info with monkeypatch.context(): print(f"MODEL={model}") - run_model(optimization_level, model, model_kwargs) + run_model(compilation_mode, model, model_kwargs) # TODO(luka) add other supported compilation config scenarios here @@ -104,7 +104,7 @@ def test_full_graph( [ # additional compile sizes, only some of the models ( - CompilationConfig(level=CompilationLevel.PIECEWISE, compile_sizes=[1, 2]), + CompilationConfig(mode=CompilationMode.VLLM_COMPILE, compile_sizes=[1, 2]), model, ) for model in models_list(all=False) @@ -113,7 +113,7 @@ def test_full_graph( # RMSNorm + quant fusion, only 8-bit quant models ( CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm"], pass_config=PassConfig(enable_fusion=True, enable_noop=True), ), @@ -125,7 +125,8 @@ def test_full_graph( # Test depyf integration works ( CompilationConfig( - level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir() + mode=CompilationMode.VLLM_COMPILE, + debug_dump_path=tempfile.gettempdir(), ), ("facebook/opt-125m", {}), ), @@ -134,7 +135,7 @@ def test_full_graph( # graph inductor partition ( CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, # inductor graph partition uses # torch._C.Tag.cudagraph_unsafe to specify splitting ops use_inductor_graph_partition=True, @@ -164,10 +165,10 @@ def test_custom_compile_config( @pytest.mark.parametrize( - "optimization_level", - [CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE], + "compilation_mode", + [CompilationMode.NONE, CompilationMode.VLLM_COMPILE], ) -def test_fp8_kv_scale_compile(optimization_level: int): +def test_fp8_kv_scale_compile(compilation_mode: int): model = "Qwen/Qwen2-0.5B" model_kwargs = { "quantization": "fp8", @@ -175,7 +176,7 @@ def test_fp8_kv_scale_compile(optimization_level: int): "calculate_kv_scales": True, "max_model_len": 512, } - run_model(optimization_level, model, model_kwargs) + run_model(compilation_mode, model, model_kwargs) def test_inductor_graph_partition_attn_fusion(caplog_vllm): @@ -184,7 +185,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm): model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8" compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=True, cudagraph_mode=CUDAGraphMode.PIECEWISE, custom_ops=["+quant_fp8"], diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index 7c22336432299..1a5eaf2639b36 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -13,7 +13,7 @@ from vllm.compilation.fusion import ( ) from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.post_cleanup import PostCleanupPass -from vllm.config import CompilationConfig, 
CompilationLevel, PassConfig, VllmConfig +from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape, @@ -114,7 +114,7 @@ def test_fusion_rmsnorm_quant( vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"], pass_config=PassConfig(enable_fusion=True, enable_noop=True), ) diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py index 7e5c460db1744..fbcd6c71fb723 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/test_fusion_all_reduce.py @@ -12,7 +12,7 @@ from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, DeviceConfig, ModelConfig, PassConfig, @@ -219,7 +219,7 @@ def all_reduce_fusion_pass_on_test_model( vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm", "+quant_fp8"] + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"] ) ) vllm_config.compilation_config.pass_config = PassConfig( @@ -229,7 +229,7 @@ def all_reduce_fusion_pass_on_test_model( # this is a fake model name to construct the model config # in the vllm_config, it's not really used. - model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e" + model_name = "RedHatAI/Llama-3.2-1B-Instruct-FP8" vllm_config.model_config = ModelConfig( model=model_name, trust_remote_code=True, dtype=dtype, seed=42 ) diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index d1ab85cfb875c..a8d78daa32a1d 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -19,7 +19,7 @@ from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import ( CacheConfig, CompilationConfig, - CompilationLevel, + CompilationMode, ModelConfig, PassConfig, SchedulerConfig, @@ -321,7 +321,7 @@ def test_attention_quant_pattern( ), scheduler_config=SchedulerConfig(max_num_seqs=1024), compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+quant_fp8"], use_inductor_graph_partition=use_inductor_graph_partition, ), diff --git a/tests/compile/test_noop_elimination.py b/tests/compile/test_noop_elimination.py index 188f4514dda5f..0ccc1a0161629 100644 --- a/tests/compile/test_noop_elimination.py +++ b/tests/compile/test_noop_elimination.py @@ -6,7 +6,7 @@ import torch import vllm from vllm.compilation.noop_elimination import NoOpEliminationPass -from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig +from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig from .backend import TestBackend @@ -50,7 +50,7 @@ def test_noop_elimination(dtype, num_tokens, hidden_size, buffer_size): vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, pass_config=PassConfig(enable_noop=True), ) ) @@ -98,7 +98,7 @@ def test_non_noop_slice_preserved(): vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, pass_config=PassConfig(enable_noop=True), ) ) diff 
--git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py index afb31cb95be09..6abab88e63696 100644 --- a/tests/compile/test_sequence_parallelism.py +++ b/tests/compile/test_sequence_parallelism.py @@ -278,7 +278,7 @@ def sequence_parallelism_pass_on_test_model( # this is a fake model name to construct the model config # in the vllm_config, it's not really used. - model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e" + model_name = "RedHatAI/Llama-3.2-1B-Instruct-FP8" vllm_config.model_config = ModelConfig( model=model_name, trust_remote_code=True, dtype=dtype, seed=42 ) diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index b2fff822bbbb5..da0afd9eaa49f 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -5,7 +5,7 @@ import torch from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import CompilationLevel +from vllm.config import CompilationMode class MyMod(torch.nn.Module): @@ -20,7 +20,7 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher): self.model = model compiled_callable = torch.compile(self.forward, backend="eager") super().__init__( - compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE + compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE ) def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None): diff --git a/tests/conftest.py b/tests/conftest.py index 2fde7f97836d6..369acb92cfb91 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -334,7 +334,7 @@ class HfRunner: trust_remote_code=trust_remote_code, ) self.device = self.get_default_device() - self.dtype = torch_dtype = _get_and_verify_dtype( + self.dtype = dtype = _get_and_verify_dtype( self.model_name, self.config, dtype=dtype, @@ -342,7 +342,7 @@ class HfRunner: ) model_kwargs = model_kwargs if model_kwargs is not None else {} - model_kwargs.setdefault("torch_dtype", torch_dtype) + model_kwargs.setdefault("dtype", dtype) if is_sentence_transformer: # Lazy init required for AMD CI @@ -388,7 +388,7 @@ class HfRunner: if not skip_tokenizer_init: self.tokenizer = AutoTokenizer.from_pretrained( model_name, - torch_dtype=torch_dtype, + dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -398,7 +398,7 @@ class HfRunner: self.processor = AutoProcessor.from_pretrained( model_name, - torch_dtype=torch_dtype, + dtype=dtype, trust_remote_code=trust_remote_code, ) if skip_tokenizer_init: @@ -1011,8 +1011,12 @@ class VllmRunner: req_outputs = self.llm.embed(inputs, *args, **kwargs) return [req_output.outputs.embedding for req_output in req_outputs] - def encode(self, prompts: list[str]) -> list[list[float]]: - req_outputs = self.llm.encode(prompts) + def token_embed(self, prompts: list[str]) -> list[list[float]]: + req_outputs = self.llm.encode(prompts, pooling_task="token_embed") + return [req_output.outputs.data for req_output in req_outputs] + + def token_classify(self, prompts: list[str]) -> list[list[float]]: + req_outputs = self.llm.encode(prompts, pooling_task="token_classify") return [req_output.outputs.data for req_output in req_outputs] def reward(self, prompts: list[str]) -> list[list[float]]: diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py index 149b502a85a75..5495640af07eb 100644 --- a/tests/distributed/test_context_parallel.py +++ b/tests/distributed/test_context_parallel.py @@ -204,17 +204,21 @@ def _compare_cp_with_tp( CP_TEXT_GENERATION_MODELS = { - # [MLA attention only] 
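As a side note on the VllmRunner changes just above: the old catch-all encode() helper is replaced by token_embed() and token_classify(), which pass an explicit pooling_task to LLM.encode. Here is a hedged sketch of the same calls made directly against LLM; the model name is a placeholder, and depending on engine defaults you may need extra arguments to load it as a pooling model.

```python
# Sketch only: explicit pooling_task usage matching the new VllmRunner helpers.
from vllm import LLM

llm = LLM(model="BAAI/bge-small-en-v1.5")  # placeholder pooling-capable model

prompts = ["vLLM pooling example"]

# Per-token embeddings: one vector per input token (a 2-D tensor per prompt).
token_outputs = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False)
per_token_matrix = token_outputs[0].outputs.data

# Sequence-level embeddings still go through the dedicated helper.
embed_outputs = llm.embed(prompts, use_tqdm=False)
sentence_vector = embed_outputs[0].outputs.embedding
```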
"deepseek-ai/DeepSeek-V2-Lite-Chat": [ CPTestSettings.detailed(), CPTestSettings.detailed(tp_base=2), ], + "bigcode/gpt_bigcode-santacoder": [ + CPTestSettings.detailed(), + CPTestSettings.detailed(tp_base=2), + ], } CP_TEST_MODELS = [ # TODO support other models # [LANGUAGE GENERATION] "deepseek-ai/DeepSeek-V2-Lite-Chat", + "bigcode/gpt_bigcode-santacoder", ] diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index a431bf30fc890..362e9daf5ae04 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -15,6 +15,7 @@ from typing import Literal, NamedTuple import pytest +from vllm.config.compilation import CompilationMode from vllm.config.model import RunnerOption from vllm.logger import init_logger @@ -234,7 +235,7 @@ def _compare_sp( common_args.append("--skip-tokenizer-init") compilation_config = { - "level": 3, + "mode": CompilationMode.VLLM_COMPILE, "custom_ops": ["+rms_norm"], "compile_sizes": [4, 8], "pass_config": { diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 78928a53942f9..c73083b0b5ef6 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -226,30 +226,30 @@ def test_compilation_config(): # set to O3 args = parser.parse_args(["-O0"]) - assert args.compilation_config.level == 0 + assert args.compilation_config.mode == 0 # set to O 3 (space) args = parser.parse_args(["-O", "1"]) - assert args.compilation_config.level == 1 + assert args.compilation_config.mode == 1 # set to O 3 (equals) args = parser.parse_args(["-O=2"]) - assert args.compilation_config.level == 2 + assert args.compilation_config.mode == 2 - # set to O.level 3 - args = parser.parse_args(["-O.level", "3"]) - assert args.compilation_config.level == 3 + # set to O.mode 3 + args = parser.parse_args(["-O.mode", "3"]) + assert args.compilation_config.mode == 3 # set to string form of a dict args = parser.parse_args( [ "-O", - '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' + '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '"use_inductor": false}', ] ) assert ( - args.compilation_config.level == 3 + args.compilation_config.mode == 3 and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and not args.compilation_config.use_inductor ) @@ -258,12 +258,12 @@ def test_compilation_config(): args = parser.parse_args( [ "--compilation-config=" - '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' + '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '"use_inductor": true}', ] ) assert ( - args.compilation_config.level == 3 + args.compilation_config.mode == 3 and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and args.compilation_config.use_inductor ) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index a96f0134c2ffb..a2d8993441fcd 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -53,21 +53,34 @@ def base64_encoded_audio() -> dict[str, str]: } +def dummy_messages_from_audio_url( + audio_urls: str | list[str], + content_text: str = "What's happening in this audio?", +): + if isinstance(audio_urls, str): + audio_urls = [audio_urls] + + return [ + { + "role": "user", + "content": [ + *( + {"type": "audio_url", "audio_url": {"url": audio_url}} + for audio_url in audio_urls + ), + {"type": "text", "text": content_text}, + ], + } + ] + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) 
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) async def test_single_chat_session_audio( client: openai.AsyncOpenAI, model_name: str, audio_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "audio_url", "audio_url": {"url": audio_url}}, - {"type": "text", "text": "What's happening in this audio?"}, - ], - } - ] + messages = dummy_messages_from_audio_url(audio_url) # test single completion chat_completion = await client.chat.completions.create( @@ -138,20 +151,9 @@ async def test_single_chat_session_audio_base64encoded( audio_url: str, base64_encoded_audio: dict[str, str], ): - messages = [ - { - "role": "user", - "content": [ - { - "type": "audio_url", - "audio_url": { - "url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" # noqa: E501 - }, - }, - {"type": "text", "text": "What's happening in this audio?"}, - ], - } - ] + messages = dummy_messages_from_audio_url( + f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" + ) # test single completion chat_completion = await client.chat.completions.create( @@ -252,15 +254,7 @@ async def test_single_chat_session_input_audio( async def test_chat_streaming_audio( client: openai.AsyncOpenAI, model_name: str, audio_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "audio_url", "audio_url": {"url": audio_url}}, - {"type": "text", "text": "What's happening in this audio?"}, - ], - } - ] + messages = dummy_messages_from_audio_url(audio_url) # test single completion chat_completion = await client.chat.completions.create( @@ -365,18 +359,7 @@ async def test_chat_streaming_input_audio( async def test_multi_audio_input( client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str] ): - messages = [ - { - "role": "user", - "content": [ - *( - {"type": "audio_url", "audio_url": {"url": audio_url}} - for audio_url in audio_urls - ), - {"type": "text", "text": "What's happening in this audio?"}, - ], - } - ] + messages = dummy_messages_from_audio_url(audio_urls) if len(audio_urls) > MAXIMUM_AUDIOS: with pytest.raises(openai.BadRequestError): # test multi-audio input diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index 44d4176655375..6833f8d96d1c4 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -247,10 +247,10 @@ async def test_tool_id_kimi_k2( ) assert chat_completion.choices[0].message.tool_calls is not None assert len(chat_completion.choices[0].message.tool_calls) > 0 - assert ( - chat_completion.choices[0].message.tool_calls[0].id - == "functions.get_current_weather:0" - ) + assert chat_completion.choices[0].message.tool_calls[0].id in [ + "functions.get_current_weather:0", + "functions.get_forecast:1", + ] else: # Streaming test output_stream = await k2_client.chat.completions.create( @@ -266,7 +266,10 @@ async def test_tool_id_kimi_k2( if chunk.choices and chunk.choices[0].delta.tool_calls: output.extend(chunk.choices[0].delta.tool_calls) for o in output: - assert o.id is None or o.id == "functions.get_current_weather:0" + assert o.id is None or o.id in [ + "functions.get_current_weather:0", + "functions.get_forecast:1", + ] @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_enable_force_include_usage.py b/tests/entrypoints/openai/test_enable_force_include_usage.py new file mode 100644 index 0000000000000..3ddf2308eb1d5 --- /dev/null +++ 
b/tests/entrypoints/openai/test_enable_force_include_usage.py @@ -0,0 +1,126 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import openai +import pytest +import pytest_asyncio + +from ...utils import RemoteOpenAIServer + + +@pytest.fixture(scope="module") +def chat_server_with_force_include_usage(request): # noqa: F811 + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "128", + "--enforce-eager", + "--max-num-seqs", + "1", + "--enable-force-include-usage", + "--port", + "55857", + "--gpu-memory-utilization", + "0.2", + ] + + with RemoteOpenAIServer("Qwen/Qwen3-0.6B", args, auto_port=False) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def chat_client_with_force_include_usage(chat_server_with_force_include_usage): + async with chat_server_with_force_include_usage.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +async def test_chat_with_enable_force_include_usage( + chat_client_with_force_include_usage: openai.AsyncOpenAI, +): + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + ] + + stream = await chat_client_with_force_include_usage.chat.completions.create( + model="Qwen/Qwen3-0.6B", + messages=messages, + max_completion_tokens=10, + extra_body=dict(min_tokens=10), + temperature=0.0, + stream=True, + ) + last_completion_tokens = 0 + async for chunk in stream: + if not len(chunk.choices): + assert chunk.usage.prompt_tokens >= 0 + assert ( + last_completion_tokens == 0 + or chunk.usage.completion_tokens > last_completion_tokens + or ( + not chunk.choices + and chunk.usage.completion_tokens == last_completion_tokens + ) + ) + assert chunk.usage.total_tokens == ( + chunk.usage.prompt_tokens + chunk.usage.completion_tokens + ) + else: + assert chunk.usage is None + + +@pytest.fixture(scope="module") +def transcription_server_with_force_include_usage(): + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-num-seqs", + "1", + "--enforce-eager", + "--enable-force-include-usage", + "--gpu-memory-utilization", + "0.2", + ] + + with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def transcription_client_with_force_include_usage( + transcription_server_with_force_include_usage, +): + async with ( + transcription_server_with_force_include_usage.get_async_client() as async_client + ): + yield async_client + + +@pytest.mark.asyncio +async def test_transcription_with_enable_force_include_usage( + transcription_client_with_force_include_usage, winning_call +): + res = ( + await transcription_client_with_force_include_usage.audio.transcriptions.create( + model="openai/whisper-large-v3-turbo", + file=winning_call, + language="en", + temperature=0.0, + stream=True, + timeout=30, + ) + ) + + async for chunk in res: + if not len(chunk.choices): + # final usage sent + usage = chunk.usage + assert isinstance(usage, dict) + assert usage["prompt_tokens"] > 0 + assert usage["completion_tokens"] > 0 + assert usage["total_tokens"] > 0 + else: + assert not hasattr(chunk, "usage") diff --git a/tests/entrypoints/openai/test_protocol.py b/tests/entrypoints/openai/test_protocol.py new file mode 100644 index 0000000000000..e9b1cfb58b502 --- /dev/null +++ 
b/tests/entrypoints/openai/test_protocol.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from openai_harmony import ( + Message, +) + +from vllm.entrypoints.openai.protocol import serialize_message, serialize_messages + + +def test_serialize_message() -> None: + dict_value = {"a": 1, "b": "2"} + assert serialize_message(dict_value) == dict_value + + msg_value = { + "role": "assistant", + "name": None, + "content": [{"type": "text", "text": "Test 1"}], + "channel": "analysis", + } + msg = Message.from_dict(msg_value) + assert serialize_message(msg) == msg_value + + +def test_serialize_messages() -> None: + assert serialize_messages(None) is None + assert serialize_messages([]) is None + + dict_value = {"a": 3, "b": "4"} + msg_value = { + "role": "assistant", + "name": None, + "content": [{"type": "text", "text": "Test 2"}], + "channel": "analysis", + } + msg = Message.from_dict(msg_value) + assert serialize_messages([msg, dict_value]) == [msg_value, dict_value] diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 0720c8aa51219..4251d06435c11 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -12,12 +12,26 @@ from openai_harmony import ( Message, ) -from vllm.entrypoints.openai.protocol import serialize_message, serialize_messages - from ...utils import RemoteOpenAIServer MODEL_NAME = "openai/gpt-oss-20b" +GET_WEATHER_SCHEMA = { + "type": "function", + "name": "get_weather", + "description": "Get current temperature for provided coordinates in celsius.", # noqa + "parameters": { + "type": "object", + "properties": { + "latitude": {"type": "number"}, + "longitude": {"type": "number"}, + }, + "required": ["latitude", "longitude"], + "additionalProperties": False, + }, + "strict": True, +} + @pytest.fixture(scope="module") def server(): @@ -307,6 +321,54 @@ async def test_streaming_types(client: OpenAI, model_name: str): assert len(stack_of_event_types) == 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_function_calling_with_streaming_types(client: OpenAI, model_name: str): + # this links the "done" type with the "start" type + # so every "done" type should have a corresponding "start" type + # and every open block should be closed by the end of the stream + pairs_of_event_types = { + "response.completed": "response.created", + "response.output_item.done": "response.output_item.added", + "response.output_text.done": "response.output_text.delta", + "response.reasoning_text.done": "response.reasoning_text.delta", + "response.reasoning_part.done": "response.reasoning_part.added", + "response.function_call_arguments.done": "response.function_call_arguments.delta", # noqa + } + + tools = [GET_WEATHER_SCHEMA] + input_list = [ + { + "role": "user", + "content": "What's the weather like in Paris today?", + } + ] + stream_response = await client.responses.create( + model=model_name, + input=input_list, + tools=tools, + stream=True, + ) + + stack_of_event_types = [] + async for event in stream_response: + if event.type == "response.created": + stack_of_event_types.append(event.type) + elif event.type == "response.completed": + assert stack_of_event_types[-1] == pairs_of_event_types[event.type] + stack_of_event_types.pop() + if event.type.endswith("added"): + stack_of_event_types.append(event.type) + elif 
event.type.endswith("delta"): + if stack_of_event_types[-1] == event.type: + continue + stack_of_event_types.append(event.type) + elif event.type.endswith("done"): + assert stack_of_event_types[-1] == pairs_of_event_types[event.type] + stack_of_event_types.pop() + assert len(stack_of_event_types) == 0 + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("background", [True, False]) @@ -485,23 +547,7 @@ def call_function(name, args): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_function_calling(client: OpenAI, model_name: str): - tools = [ - { - "type": "function", - "name": "get_weather", - "description": "Get current temperature for provided coordinates in celsius.", # noqa - "parameters": { - "type": "object", - "properties": { - "latitude": {"type": "number"}, - "longitude": {"type": "number"}, - }, - "required": ["latitude", "longitude"], - "additionalProperties": False, - }, - "strict": True, - } - ] + tools = [GET_WEATHER_SCHEMA] response = await client.responses.create( model=model_name, @@ -567,21 +613,7 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str): }, "strict": True, }, - { - "type": "function", - "name": "get_weather", - "description": "Get current temperature for provided coordinates in celsius.", # noqa - "parameters": { - "type": "object", - "properties": { - "latitude": {"type": "number"}, - "longitude": {"type": "number"}, - }, - "required": ["latitude", "longitude"], - "additionalProperties": False, - }, - "strict": True, - }, + GET_WEATHER_SCHEMA, ] response = await client.responses.create( @@ -645,23 +677,7 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_function_calling_required(client: OpenAI, model_name: str): - tools = [ - { - "type": "function", - "name": "get_weather", - "description": "Get current temperature for provided coordinates in celsius.", # noqa - "parameters": { - "type": "object", - "properties": { - "latitude": {"type": "number"}, - "longitude": {"type": "number"}, - }, - "required": ["latitude", "longitude"], - "additionalProperties": False, - }, - "strict": True, - } - ] + tools = [GET_WEATHER_SCHEMA] with pytest.raises(BadRequestError): await client.responses.create( @@ -691,23 +707,7 @@ async def test_system_message_with_tools(client: OpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_function_calling_full_history(client: OpenAI, model_name: str): - tools = [ - { - "type": "function", - "name": "get_weather", - "description": "Get current temperature for provided coordinates in celsius.", # noqa - "parameters": { - "type": "object", - "properties": { - "latitude": {"type": "number"}, - "longitude": {"type": "number"}, - }, - "required": ["latitude", "longitude"], - "additionalProperties": False, - }, - "strict": True, - } - ] + tools = [GET_WEATHER_SCHEMA] input_messages = [ {"role": "user", "content": "What's the weather like in Paris today?"} @@ -747,6 +747,74 @@ async def test_function_calling_full_history(client: OpenAI, model_name: str): assert response_2.output_text is not None +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_function_calling_with_stream(client: OpenAI, model_name: str): + tools = [GET_WEATHER_SCHEMA] + input_list = [ + { + "role": "user", + "content": "What's the weather like 
in Paris today?", + } + ] + stream_response = await client.responses.create( + model=model_name, + input=input_list, + tools=tools, + stream=True, + ) + assert stream_response is not None + final_tool_calls = {} + final_tool_calls_named = {} + async for event in stream_response: + if event.type == "response.output_item.added": + if event.item.type != "function_call": + continue + final_tool_calls[event.output_index] = event.item + final_tool_calls_named[event.item.name] = event.item + elif event.type == "response.function_call_arguments.delta": + index = event.output_index + tool_call = final_tool_calls[index] + if tool_call: + tool_call.arguments += event.delta + final_tool_calls_named[tool_call.name] = tool_call + elif event.type == "response.function_call_arguments.done": + assert event.arguments == final_tool_calls_named[event.name].arguments + for tool_call in final_tool_calls.values(): + if ( + tool_call + and tool_call.type == "function_call" + and tool_call.name == "get_weather" + ): + args = json.loads(tool_call.arguments) + result = call_function(tool_call.name, args) + input_list += [tool_call] + break + assert result is not None + response = await client.responses.create( + model=model_name, + input=input_list + + [ + { + "type": "function_call_output", + "call_id": tool_call.call_id, + "output": str(result), + } + ], + tools=tools, + stream=True, + ) + assert response is not None + async for event in response: + # check that no function call events in the stream + assert event.type != "response.function_call_arguments.delta" + assert event.type != "response.function_call_arguments.done" + # check that the response contains output text + if event.type == "response.completed": + assert len(event.response.output) > 0 + assert event.response.output_text is not None + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_output_messages_enabled(client: OpenAI, model_name: str, server): @@ -760,32 +828,3 @@ async def test_output_messages_enabled(client: OpenAI, model_name: str, server): assert response.status == "completed" assert len(response.input_messages) > 0 assert len(response.output_messages) > 0 - - -def test_serialize_message() -> None: - dict_value = {"a": 1, "b": "2"} - assert serialize_message(dict_value) == dict_value - - msg_value = { - "role": "assistant", - "name": None, - "content": [{"type": "text", "text": "Test 1"}], - "channel": "analysis", - } - msg = Message.from_dict(msg_value) - assert serialize_message(msg) == msg_value - - -def test_serialize_messages() -> None: - assert serialize_messages(None) is None - assert serialize_messages([]) is None - - dict_value = {"a": 3, "b": "4"} - msg_value = { - "role": "assistant", - "name": None, - "content": [{"type": "text", "text": "Test 2"}], - "channel": "analysis", - } - msg = Message.from_dict(msg_value) - assert serialize_messages([msg, dict_value]) == [msg_value, dict_value] diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index 4c7d1c14ca17b..7ecdac518f97f 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -55,21 +55,34 @@ def base64_encoded_video() -> dict[str, str]: } +def dummy_messages_from_video_url( + video_urls: str | list[str], + content_text: str = "What's in this video?", +): + if isinstance(video_urls, str): + video_urls = [video_urls] + + return [ + { + "role": "user", + "content": [ + *( + {"type": "video_url", "video_url": {"url": video_url}} + for video_url in 
video_urls + ), + {"type": "text", "text": content_text}, + ], + } + ] + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) async def test_single_chat_session_video( client: openai.AsyncOpenAI, model_name: str, video_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "video_url", "video_url": {"url": video_url}}, - {"type": "text", "text": "What's in this video?"}, - ], - } - ] + messages = dummy_messages_from_video_url(video_url) # test single completion chat_completion = await client.chat.completions.create( @@ -137,15 +150,7 @@ async def test_error_on_invalid_video_url_type( async def test_single_chat_session_video_beamsearch( client: openai.AsyncOpenAI, model_name: str, video_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "video_url", "video_url": {"url": video_url}}, - {"type": "text", "text": "What's in this video?"}, - ], - } - ] + messages = dummy_messages_from_video_url(video_url) chat_completion = await client.chat.completions.create( model=model_name, @@ -172,20 +177,9 @@ async def test_single_chat_session_video_base64encoded( video_url: str, base64_encoded_video: dict[str, str], ): - messages = [ - { - "role": "user", - "content": [ - { - "type": "video_url", - "video_url": { - "url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501 - }, - }, - {"type": "text", "text": "What's in this video?"}, - ], - } - ] + messages = dummy_messages_from_video_url( + f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" + ) # test single completion chat_completion = await client.chat.completions.create( @@ -231,20 +225,10 @@ async def test_single_chat_session_video_base64encoded_beamsearch( video_url: str, base64_encoded_video: dict[str, str], ): - messages = [ - { - "role": "user", - "content": [ - { - "type": "video_url", - "video_url": { - "url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501 - }, - }, - {"type": "text", "text": "What's in this video?"}, - ], - } - ] + messages = dummy_messages_from_video_url( + f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" + ) + chat_completion = await client.chat.completions.create( model=model_name, messages=messages, @@ -265,15 +249,7 @@ async def test_single_chat_session_video_base64encoded_beamsearch( async def test_chat_streaming_video( client: openai.AsyncOpenAI, model_name: str, video_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "video_url", "video_url": {"url": video_url}}, - {"type": "text", "text": "What's in this video?"}, - ], - } - ] + messages = dummy_messages_from_video_url(video_url) # test single completion chat_completion = await client.chat.completions.create( @@ -318,18 +294,7 @@ async def test_chat_streaming_video( async def test_multi_video_input( client: openai.AsyncOpenAI, model_name: str, video_urls: list[str] ): - messages = [ - { - "role": "user", - "content": [ - *( - {"type": "video_url", "video_url": {"url": video_url}} - for video_url in video_urls - ), - {"type": "text", "text": "What's in this video?"}, - ], - } - ] + messages = dummy_messages_from_video_url(video_urls) if len(video_urls) > MAXIMUM_VIDEOS: with pytest.raises(openai.BadRequestError): # test multi-video input diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 5a15a352f45cc..09bd0dabb799a 100644 --- a/tests/entrypoints/openai/test_vision.py +++ 
b/tests/entrypoints/openai/test_vision.py @@ -78,6 +78,27 @@ def base64_encoded_image(local_asset_server) -> dict[str, str]: } +def dummy_messages_from_image_url( + image_urls: str | list[str], + content_text: str = "What's in this image?", +): + if isinstance(image_urls, str): + image_urls = [image_urls] + + return [ + { + "role": "user", + "content": [ + *( + {"type": "image_url", "image_url": {"url": image_url}} + for image_url in image_urls + ), + {"type": "text", "text": content_text}, + ], + } + ] + + def get_hf_prompt_tokens(model_name, content, image_url): processor = AutoProcessor.from_pretrained( model_name, trust_remote_code=True, num_crops=4 @@ -107,15 +128,7 @@ async def test_single_chat_session_image( client: openai.AsyncOpenAI, model_name: str, image_url: str ): content_text = "What's in this image?" - messages = [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": content_text}, - ], - } - ] + messages = dummy_messages_from_image_url(image_url, content_text) max_completion_tokens = 10 # test single completion @@ -188,15 +201,8 @@ async def test_error_on_invalid_image_url_type( async def test_single_chat_session_image_beamsearch( client: openai.AsyncOpenAI, model_name: str, image_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": "What's in this image?"}, - ], - } - ] + content_text = "What's in this image?" + messages = dummy_messages_from_image_url(image_url, content_text) chat_completion = await client.chat.completions.create( model=model_name, @@ -226,20 +232,10 @@ async def test_single_chat_session_image_base64encoded( base64_encoded_image: dict[str, str], ): content_text = "What's in this image?" 
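The vision, video, and audio test files in this patch all converge on the same dummy_messages_from_*_url helper shape. For readers unfamiliar with the OpenAI-style multimodal payload these helpers produce, a small standalone sketch follows; the URL, model name, and server address are placeholders rather than values from the patch.

```python
# Sketch only: the single-image chat request that dummy_messages_from_image_url
# (defined above) builds for the tests. All concrete values here are placeholders.
import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}},
            {"type": "text", "text": "What's in this image?"},
        ],
    }
]

chat_completion = client.chat.completions.create(
    model="placeholder-vision-model",
    messages=messages,
    max_completion_tokens=10,
)
print(chat_completion.choices[0].message.content)
```

For the multi-image case the helper simply emits one image_url part per URL ahead of the text part, which is what test_multi_image_input relies on.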
- messages = [ - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501 - }, - }, - {"type": "text", "text": content_text}, - ], - } - ] + messages = dummy_messages_from_image_url( + f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", + content_text, + ) max_completion_tokens = 10 # test single completion @@ -293,20 +289,10 @@ async def test_single_chat_session_image_base64encoded_beamsearch( raw_image_url = TEST_IMAGE_ASSETS[image_idx] expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx] - messages = [ - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501 - }, - }, - {"type": "text", "text": "What's in this image?"}, - ], - } - ] + messages = dummy_messages_from_image_url( + f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" + ) + chat_completion = await client.chat.completions.create( model=model_name, messages=messages, @@ -326,15 +312,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch( async def test_chat_streaming_image( client: openai.AsyncOpenAI, model_name: str, image_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": "What's in this image?"}, - ], - } - ] + messages = dummy_messages_from_image_url(image_url) # test single completion chat_completion = await client.chat.completions.create( @@ -381,18 +359,7 @@ async def test_chat_streaming_image( async def test_multi_image_input( client: openai.AsyncOpenAI, model_name: str, image_urls: list[str] ): - messages = [ - { - "role": "user", - "content": [ - *( - {"type": "image_url", "image_url": {"url": image_url}} - for image_url in image_urls - ), - {"type": "text", "text": "What's in this image?"}, - ], - } - ] + messages = dummy_messages_from_image_url(image_urls) if len(image_urls) > MAXIMUM_IMAGES: with pytest.raises(openai.BadRequestError): # test multi-image input diff --git a/tests/entrypoints/pooling/llm/test_classify.py b/tests/entrypoints/pooling/llm/test_classify.py index 488c82c9fe7fd..96f634ee0a8c7 100644 --- a/tests/entrypoints/pooling/llm/test_classify.py +++ b/tests/entrypoints/pooling/llm/test_classify.py @@ -63,7 +63,7 @@ def test_encode_api(llm: LLM): # chunked prefill does not support all pooling err_msg = "pooling_task must be one of.+" with pytest.raises(ValueError, match=err_msg): - llm.encode(prompts, use_tqdm=False) + llm.encode(prompts, pooling_task="token_classify", use_tqdm=False) def test_score_api(llm: LLM): diff --git a/tests/entrypoints/pooling/llm/test_embedding.py b/tests/entrypoints/pooling/llm/test_embedding.py index c53941390bd10..5455b5f91fc09 100644 --- a/tests/entrypoints/pooling/llm/test_embedding.py +++ b/tests/entrypoints/pooling/llm/test_embedding.py @@ -35,6 +35,13 @@ def llm(): cleanup_dist_env_and_memory() +@pytest.mark.skip_global_cleanup +def test_encode_api(llm: LLM): + outputs = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False) + multi_vector = outputs[0].outputs.data + assert multi_vector.shape == (11, 384) + + def test_pooling_params(llm: LLM): def get_outputs(normalize): outputs = llm.embed( diff --git a/tests/entrypoints/pooling/llm/test_encode.py b/tests/entrypoints/pooling/llm/test_encode.py index 9ba380334e5a2..ca85d2758fce4 100644 --- a/tests/entrypoints/pooling/llm/test_encode.py +++ 
b/tests/entrypoints/pooling/llm/test_encode.py @@ -57,20 +57,24 @@ def test_multiple_pooling_params(llm: LLM): ] # Multiple PoolingParams should be matched with each prompt - outputs = llm.encode(PROMPTS, pooling_params=pooling_params) + outputs = llm.encode(PROMPTS, pooling_params=pooling_params, pooling_task="embed") assert len(PROMPTS) == len(outputs) # Exception raised, if the size of params does not match the size of prompts with pytest.raises(ValueError): - outputs = llm.encode(PROMPTS, pooling_params=pooling_params[:3]) + outputs = llm.encode( + PROMPTS, pooling_params=pooling_params[:3], pooling_task="embed" + ) # Single PoolingParams should be applied to every prompt single_pooling_params = PoolingParams() - outputs = llm.encode(PROMPTS, pooling_params=single_pooling_params) + outputs = llm.encode( + PROMPTS, pooling_params=single_pooling_params, pooling_task="embed" + ) assert len(PROMPTS) == len(outputs) # pooling_params is None, default params should be applied - outputs = llm.encode(PROMPTS, pooling_params=None) + outputs = llm.encode(PROMPTS, pooling_params=None, pooling_task="embed") assert len(PROMPTS) == len(outputs) diff --git a/tests/entrypoints/pooling/llm/test_reward.py b/tests/entrypoints/pooling/llm/test_reward.py index 8312ff180b36f..81058dbad891b 100644 --- a/tests/entrypoints/pooling/llm/test_reward.py +++ b/tests/entrypoints/pooling/llm/test_reward.py @@ -36,22 +36,23 @@ def llm(): cleanup_dist_env_and_memory() -@pytest.mark.skip_global_cleanup def test_pooling_params(llm: LLM): - def get_outputs(softmax): + def get_outputs(activation): outputs = llm.reward( - prompts, pooling_params=PoolingParams(softmax=softmax), use_tqdm=False + prompts, pooling_params=PoolingParams(activation=activation), use_tqdm=False ) return torch.cat([x.outputs.data for x in outputs]) - default = get_outputs(softmax=None) - w_softmax = get_outputs(softmax=True) - wo_softmax = get_outputs(softmax=False) + default = get_outputs(activation=None) + w_activation = get_outputs(activation=True) + wo_activation = get_outputs(activation=False) - assert torch.allclose(default, w_softmax, atol=1e-2), "Default should use softmax." - assert not torch.allclose(w_softmax, wo_softmax, atol=1e-2), ( - "wo_softmax should not use softmax." + assert torch.allclose(default, w_activation, atol=1e-2), ( + "Default should use activation." ) - assert torch.allclose(softmax(wo_softmax), w_softmax, atol=1e-2), ( - "w_softmax should be close to softmax(wo_softmax)." + assert not torch.allclose(w_activation, wo_activation, atol=1e-2), ( + "wo_activation should not use activation." + ) + assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), ( + "w_activation should be close to activation(wo_activation)." ) diff --git a/tests/entrypoints/pooling/openai/test_embedding.py b/tests/entrypoints/pooling/openai/test_embedding.py index 8a3d298a48e2e..ab8ca9d68e0e7 100644 --- a/tests/entrypoints/pooling/openai/test_embedding.py +++ b/tests/entrypoints/pooling/openai/test_embedding.py @@ -17,6 +17,7 @@ from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import ( EMBED_DTYPE_TO_TORCH_DTYPE, EmbeddingResponse, + PoolingResponse, ) from vllm.transformers_utils.tokenizer import get_tokenizer @@ -509,3 +510,20 @@ async def test_normalize(server: RemoteOpenAIServer, model_name: str): assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), ( "w_normal should be close to normal(wo_normal)." 
) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling(server: RemoteOpenAIServer, model_name: str): + input_text = ["The chef prepared a delicious meal."] + + response = requests.post( + server.url_for("pooling"), + json={"model": model_name, "input": input_text, "encoding_format": "float"}, + ) + + poolings = PoolingResponse.model_validate(response.json()) + + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 11 + assert len(poolings.data[0].data[0]) == 384 diff --git a/tests/entrypoints/pooling/openai/test_rerank.py b/tests/entrypoints/pooling/openai/test_rerank.py index 9980fcff16c15..e43148d25feeb 100644 --- a/tests/entrypoints/pooling/openai/test_rerank.py +++ b/tests/entrypoints/pooling/openai/test_rerank.py @@ -7,7 +7,7 @@ import torch import torch.nn.functional as F from tests.utils import RemoteOpenAIServer -from vllm.entrypoints.openai.protocol import RerankResponse +from vllm.entrypoints.openai.protocol import PoolingResponse, RerankResponse MODEL_NAME = "BAAI/bge-reranker-base" DTYPE = "bfloat16" @@ -159,3 +159,20 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str): assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), ( "w_activation should be close to activation(wo_activation)." ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling(server: RemoteOpenAIServer, model_name: str): + input_text = ["The chef prepared a delicious meal."] + + response = requests.post( + server.url_for("pooling"), + json={"model": model_name, "input": input_text, "encoding_format": "float"}, + ) + + poolings = PoolingResponse.model_validate(response.json()) + + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 11 + assert len(poolings.data[0].data[0]) == 1 diff --git a/tests/entrypoints/test_context.py b/tests/entrypoints/test_context.py index b0faa870a9272..31ea856224f90 100644 --- a/tests/entrypoints/test_context.py +++ b/tests/entrypoints/test_context.py @@ -6,7 +6,11 @@ from unittest.mock import MagicMock, patch import pytest from openai_harmony import Author, Message, Role, StreamState, TextContent -from vllm.entrypoints.context import HarmonyContext, StreamingHarmonyContext +from vllm.entrypoints.context import ( + HarmonyContext, + StreamingHarmonyContext, + TurnMetrics, +) from vllm.outputs import CompletionOutput, RequestOutput @@ -101,8 +105,12 @@ def test_single_turn_token_counting(): # Verify internal state tracking assert not context.is_first_turn - assert context.previous_turn.input_tokens == 5 - assert context.previous_turn.output_tokens == 3 + assert len(context.all_turn_metrics) == 1 + previous_turn = context.all_turn_metrics[0] + assert previous_turn.input_tokens == 5 + assert previous_turn.output_tokens == 3 + assert previous_turn.cached_input_tokens == 2 + assert previous_turn.tool_output_tokens == 0 @pytest.mark.asyncio @@ -156,6 +164,15 @@ async def test_multi_turn_token_counting(): assert context.num_tool_output_tokens == expected_tool_output assert context.num_cached_tokens == 5 + 15 + # Validate all turn metrics + assert len(context.all_turn_metrics) == 3 + for i, turn in enumerate(context.all_turn_metrics): + assert turn.input_tokens == prompt_token_counts[i] + assert turn.output_tokens == output_token_counts[i] + assert turn.cached_input_tokens == cached_token_counts[i] + assert context.all_turn_metrics[1].tool_output_tokens == 7 + assert context.all_turn_metrics[2].tool_output_tokens == 1 + 
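The per-turn metrics asserted above rely on the tool-token bookkeeping spelled out in the streaming test further below ("this turn prompt - last turn prompt - last turn output"). A small worked example with made-up token counts, not the values used in the tests:

```python
# Illustrative token counts only; these are not the numbers used in the tests above.
prompt_tokens = [5, 15, 25]  # prompt length observed at each turn
output_tokens = [3, 7, 4]    # tokens generated at each turn

# Tool output tokens attributed to turn i (for i >= 1):
#   prompt_tokens[i] - prompt_tokens[i - 1] - output_tokens[i - 1]
tool_output_tokens = [0] + [
    prompt_tokens[i] - prompt_tokens[i - 1] - output_tokens[i - 1]
    for i in range(1, len(prompt_tokens))
]
assert tool_output_tokens == [0, 7, 3]  # the first turn never has tool output
```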
def test_empty_output_tokens(): """Test behavior when RequestOutput has empty output tokens.""" @@ -314,6 +331,10 @@ async def test_streaming_multi_turn_token_counting(mock_parser): # Create a streaming context context = StreamingHarmonyContext(messages=[], available_tools=["browser"]) + num_prompt_tokens = [3, 8, 13] + num_output_tokens = [3, 3, 2] + num_cached_tokens = [0, 3, 8] + # Simulate three turns of conversation: # Turn 1: stream tokens one by one, then finish the message # Turn 2: new prompt, stream more tokens with a reasoning segment @@ -325,7 +346,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser): create_mock_request_output( prompt_token_ids=[1, 2, 3], # 3 prompt tokens output_token_ids=[101], # Single token - num_cached_tokens=0, + num_cached_tokens=num_cached_tokens[0], finished=False, # Not end of message yet ) ) @@ -370,7 +391,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser): 5, ], # 8 tokens (includes previous) output_token_ids=[201], - num_cached_tokens=3, # Some tokens cached + num_cached_tokens=num_cached_tokens[1], # Some tokens cached finished=False, ) ) @@ -422,7 +443,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser): 7, ], # 13 tokens output_token_ids=[301], - num_cached_tokens=8, # More cached tokens + num_cached_tokens=num_cached_tokens[2], # More cached tokens finished=False, ) ) @@ -435,10 +456,12 @@ async def test_streaming_multi_turn_token_counting(mock_parser): ) # Final token counts check - assert context.num_prompt_tokens == 3 + 8 + 13 # All prompts - assert context.num_output_tokens == 3 + 3 + 2 # All outputs + assert context.num_prompt_tokens == sum(num_prompt_tokens) # All prompts + assert context.num_output_tokens == sum(num_output_tokens) # All outputs assert context.num_reasoning_tokens == 3 # Unchanged from second turn - assert context.num_cached_tokens == 3 + 8 # Accumulated cached tokens + assert context.num_cached_tokens == sum( + num_cached_tokens + ) # Accumulated cached tokens # Additional tool tokens from third turn # Formula: this turn prompt - last turn prompt - last turn output @@ -447,6 +470,15 @@ async def test_streaming_multi_turn_token_counting(mock_parser): context.num_tool_output_tokens == expected_tool_tokens + additional_tool_tokens ) + # Validate all turn metrics + assert len(context.all_turn_metrics) == 3 + for i, turn in enumerate(context.all_turn_metrics): + assert turn.input_tokens == num_prompt_tokens[i] + assert turn.output_tokens == num_output_tokens[i] + assert turn.cached_input_tokens == num_cached_tokens[i] + assert context.all_turn_metrics[1].tool_output_tokens == 2 + assert context.all_turn_metrics[2].tool_output_tokens == 2 + @pytest.mark.asyncio async def test_streaming_message_synchronization(mock_parser): @@ -522,3 +554,46 @@ async def test_streaming_message_synchronization(mock_parser): assert len(context._messages) == 3 assert context.num_init_messages == 1 assert context._messages[2].content[0].text == "Response 4" + + +def test_turn_metrics_copy_and_reset(): + """Test TurnMetrics copy and reset methods work correctly.""" + # Create a TurnMetrics with specific values + original_metrics = TurnMetrics( + input_tokens=10, + output_tokens=20, + cached_input_tokens=5, + tool_output_tokens=3, + ) + + # Test copy functionality + copied_metrics = original_metrics.copy() + + # Verify copy has same values + assert copied_metrics.input_tokens == 10 + assert copied_metrics.output_tokens == 20 + assert copied_metrics.cached_input_tokens == 5 + assert 
copied_metrics.tool_output_tokens == 3 + + # Verify they are separate objects + assert copied_metrics is not original_metrics + + # Modify copy to ensure independence + copied_metrics.input_tokens = 999 + assert original_metrics.input_tokens == 10 # Original unchanged + assert copied_metrics.input_tokens == 999 + + # Test reset functionality + original_metrics.reset() + + # Verify all fields are reset to zero + assert original_metrics.input_tokens == 0 + assert original_metrics.output_tokens == 0 + assert original_metrics.cached_input_tokens == 0 + assert original_metrics.tool_output_tokens == 0 + + # Verify copied metrics are unaffected by reset + assert copied_metrics.input_tokens == 999 + assert copied_metrics.output_tokens == 20 + assert copied_metrics.cached_input_tokens == 5 + assert copied_metrics.tool_output_tokens == 3 diff --git a/tests/kernels/core/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py index 418c700bbf003..63b5a37d3c779 100644 --- a/tests/kernels/core/test_fused_quant_layernorm.py +++ b/tests/kernels/core/test_fused_quant_layernorm.py @@ -15,7 +15,6 @@ VEC_HIDDEN_SIZES = range(1024, 1030) # Avoid combinatorial explosion with full Cartesian product NUM_TOKENS_HIDDEN_SIZES = [ *[(1, i) for i in [1, 64, *VEC_HIDDEN_SIZES, 5120, 5137]], - *[(83, i) for i in [1, 1033, 2048, 5120]], *[(2048, i) for i in [1, 64, *VEC_HIDDEN_SIZES, 5137]], *[(4096, i) for i in [1, 64, 5137]], ] diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py index 7553d45e00576..aaa13c06623ac 100644 --- a/tests/kernels/core/test_layernorm.py +++ b/tests/kernels/core/test_layernorm.py @@ -11,19 +11,7 @@ from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing -HIDDEN_SIZES = [ - 8, - 768, - 769, - 770, - 771, - 5120, - 5124, - 5125, - 5126, - 8192, - 8199, -] # Arbitrary values for testing +HIDDEN_SIZES = [8, 768, 769, 5120, 5125, 8192] # Arbitrary values for testing ADD_RESIDUAL = [False, True] SEEDS = [0] CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] @@ -118,7 +106,7 @@ def test_poly_norm( @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("add_residual", ADD_RESIDUAL) @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("quant_scale", [1.0, 0.01, 10.0]) +@pytest.mark.parametrize("quant_scale", [0.01, 1.0, 10.0]) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("strided_input", [False, True]) diff --git a/tests/kernels/core/test_permute_cols.py b/tests/kernels/core/test_permute_cols.py index 1e264735cb3c2..08fdd0e055eac 100644 --- a/tests/kernels/core/test_permute_cols.py +++ b/tests/kernels/core/test_permute_cols.py @@ -9,7 +9,7 @@ from vllm._custom_ops import permute_cols @pytest.mark.parametrize("shape", [(1, 512), (544, 4096), (67, 8192)]) -@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) def test_permute_cols(shape, dtype): x = torch.randn(shape, dtype=dtype).cuda() perm = torch.randperm(x.shape[1]).to(torch.int).cuda() diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index e1ddc5de067bb..c35ee5016ba05 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -12,8 +12,8 @@ from vllm.model_executor.layers.rotary_embedding import 
get_rope from vllm.platforms import current_platform IS_NEOX_STYLE = [True, False] -DTYPES = [torch.half, torch.bfloat16, torch.float] -HEAD_SIZES = [64, 80, 112, 120, 256] +DTYPES = [torch.bfloat16, torch.float] +HEAD_SIZES = [64, 80, 120, 256] ROTARY_DIMS = [None, 32] # None means rotary dim == head size NUM_HEADS = [17] # Arbitrary values for testing BATCH_SIZES = [5] # Arbitrary values for testing diff --git a/tests/kernels/mamba/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py index d9023490d7fc2..4647b97c47718 100644 --- a/tests/kernels/mamba/test_causal_conv1d.py +++ b/tests/kernels/mamba/test_causal_conv1d.py @@ -183,7 +183,7 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, ity assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) -@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16]) @pytest.mark.parametrize("silu_activation", [False, True]) @pytest.mark.parametrize("has_bias", [False, True]) @pytest.mark.parametrize("seqlen", [1, 3]) @@ -265,7 +265,7 @@ def test_causal_conv1d_update_with_batch_gather( @pytest.mark.parametrize("silu_activation", [True]) @pytest.mark.parametrize("has_bias", [True]) @pytest.mark.parametrize("width", [4]) -@pytest.mark.parametrize("seqlen", [8, 30, 249, 2049, 4096]) +@pytest.mark.parametrize("seqlen", [8, 249, 4096]) @pytest.mark.parametrize("dim", [64, 4096]) @pytest.mark.parametrize("with_padding", [True, False]) @pytest.mark.parametrize("batch", [4, 10]) diff --git a/tests/kernels/mamba/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py index d23daefa7b436..25934c409744b 100644 --- a/tests/kernels/mamba/test_mamba_mixer2.py +++ b/tests/kernels/mamba/test_mamba_mixer2.py @@ -25,7 +25,6 @@ from vllm.utils import update_environment_variables (64, 1), (64, 2), (64, 4), # hidden_size be divisible by num_gpus - (100, 5), # and n_groups must divide hidden_size ], ) @pytest.mark.parametrize("dtype", [torch.float16]) diff --git a/tests/kernels/mamba/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py index 9a6137239ebfc..c59fc7af0c897 100644 --- a/tests/kernels/mamba/test_mamba_ssm.py +++ b/tests/kernels/mamba/test_mamba_ssm.py @@ -229,8 +229,8 @@ def selective_scan_opcheck_fn( @pytest.mark.parametrize("wtype", [torch.float32]) -@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) -@pytest.mark.parametrize("seqlen", [128, 256, 512, 1024, 2048, 4096]) +@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16]) +@pytest.mark.parametrize("seqlen", [128, 1024, 4096]) @pytest.mark.parametrize("has_delta_bias", [True]) @pytest.mark.parametrize("delta_softplus", [True]) @pytest.mark.parametrize("has_z", [True]) @@ -238,7 +238,7 @@ def selective_scan_opcheck_fn( @pytest.mark.parametrize("varBC_groups", [1, 2]) @pytest.mark.parametrize("is_variable_C", [True]) @pytest.mark.parametrize("is_variable_B", [True]) -@pytest.mark.parametrize("scan_chunks", [1, 2, 3]) +@pytest.mark.parametrize("scan_chunks", [1, 3]) def test_selective_scan( is_variable_B, is_variable_C, @@ -375,9 +375,9 @@ def test_selective_scan( ) -@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16]) @pytest.mark.parametrize("has_z", [False, True]) -@pytest.mark.parametrize("dstate", [16, 32, 64]) +@pytest.mark.parametrize("dstate", [16, 64]) @pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096]) def 
test_selective_state_update(dim, dstate, has_z, itype): device = "cuda" @@ -413,7 +413,7 @@ def test_selective_state_update(dim, dstate, has_z, itype): @pytest.mark.parametrize("wtype", [torch.float32]) @pytest.mark.parametrize("itype", [torch.float32]) -@pytest.mark.parametrize("seqlen", [1, 128, 129, 256, 512, 1024, 2048, 4096]) +@pytest.mark.parametrize("seqlen", [1, 256, 1024, 4096]) @pytest.mark.parametrize("return_last_state", [True]) @pytest.mark.parametrize("has_delta_bias", [True]) @pytest.mark.parametrize("delta_softplus", [True]) @@ -589,9 +589,9 @@ def test_selective_scan_varlen( ) -@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16]) @pytest.mark.parametrize("has_z", [True]) -@pytest.mark.parametrize("dstate", [16, 32, 64]) +@pytest.mark.parametrize("dstate", [16, 64]) @pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096]) # tests correctness in case subset of the sequences are padded @pytest.mark.parametrize("with_padding", [True, False]) @@ -679,11 +679,11 @@ def test_selective_state_update_with_batch_indices( assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol) -@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16]) @pytest.mark.parametrize("has_z", [False, True]) @pytest.mark.parametrize("tie_hdim", [False, True]) -@pytest.mark.parametrize("ngroups", [1, 2, 4]) -@pytest.mark.parametrize("dstate", [16, 32, 64]) +@pytest.mark.parametrize("ngroups", [1, 4]) +@pytest.mark.parametrize("dstate", [16, 64]) @pytest.mark.parametrize("dim", [2048, 4096]) def test_selective_state_update_with_heads_with_batch_indices( dim, dstate, ngroups, has_z, tie_hdim, itype diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py index 57dcb789e97ba..0b0b82e484a1c 100644 --- a/tests/kernels/mamba/test_mamba_ssm_ssd.py +++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py @@ -188,9 +188,9 @@ def generate_continuous_batched_examples( ) -@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) -@pytest.mark.parametrize("n_heads", [3, 4, 11, 16, 32]) -@pytest.mark.parametrize("d_head", [5, 8, 19, 32, 128]) +@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16]) +@pytest.mark.parametrize("n_heads", [4, 16, 32]) +@pytest.mark.parametrize("d_head", [5, 8, 32, 128]) @pytest.mark.parametrize("seq_len_chunk_size", [(112, 16), (128, 32)]) def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, itype): # this tests the kernels on a single example (bs=1) @@ -254,15 +254,14 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, it ) -@pytest.mark.parametrize("itype", [torch.float32, torch.float16]) -@pytest.mark.parametrize("n_heads", [4, 8, 13]) -@pytest.mark.parametrize("d_head", [5, 16, 21, 32]) +@pytest.mark.parametrize("itype", [torch.float32]) +@pytest.mark.parametrize("n_heads", [4, 8]) +@pytest.mark.parametrize("d_head", [5, 16, 32]) @pytest.mark.parametrize( "seq_len_chunk_size_cases", [ # small-ish chunk_size (8) (64, 8, 2, [(64, 32), (64, 32)]), - (64, 8, 2, [(32, 32), (32, 32), (32, 32)]), (64, 8, 2, [(8, 8), (8, 8), (8, 8)]), # chunk size boundary ( 64, @@ -270,16 +269,7 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, it 2, [(4, 4), (4, 4), (4, 4), (4, 4)], ), # chunk_size larger than cont batches - ( - 64, - 8, - 5, - [ - (64, 32, 16, 8, 
8), - (8, 16, 32, 16, 8), - (8, 8, 16, 32, 16), - ], - ), # mode examples with varied lengths + (64, 8, 5, [(64, 32, 16, 8, 8)]), # large-ish chunk_size (256) (64, 256, 1, [(5,), (1,), (1,), (1,)]), # irregular sizes with small sequences ( @@ -359,11 +349,7 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, @pytest.mark.parametrize("chunk_size", [8, 256]) @pytest.mark.parametrize( "seqlens", - [ - (16, 2, 8, 13), - (270, 88, 212, 203), - (16, 20), - ], + [(16, 20), (270, 88, 212, 203)], ) def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens): # This test verifies the correctness of the chunked prefill implementation diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 6b391c173f0bc..966e2f8f3b131 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -26,6 +26,7 @@ from vllm.model_executor.layers.fused_moe.config import ( int4_w4a16_moe_quant_config, int8_w8a16_moe_quant_config, ) +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, modular_triton_fused_moe, @@ -724,7 +725,7 @@ def test_fused_marlin_moe( with set_current_vllm_config(vllm_config): torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, expert_map=e_map) - marlin_output = torch.ops.vllm.fused_marlin_moe( + marlin_output = fused_marlin_moe( a, qweight1, qweight2, @@ -837,7 +838,7 @@ def test_fused_marlin_moe_with_bias(m): with set_current_vllm_config(vllm_config): torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, b_bias1, b_bias2) - marlin_output = torch.ops.vllm.fused_marlin_moe( + marlin_output = fused_marlin_moe( a, qweight1, qweight2, diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index d8058c5f87a81..f4f151180decb 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -58,7 +58,6 @@ def test_chatglm3_lora(chatglm3_lora_files): max_loras=4, max_lora_rank=64, trust_remote_code=True, - enable_chunked_prefill=True, ) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) @@ -70,7 +69,6 @@ def test_chatglm3_lora(chatglm3_lora_files): @multi_gpu_test(num_gpus=4) -@create_new_process_for_each_test() def test_chatglm3_lora_tp4(chatglm3_lora_files): llm = vllm.LLM( MODEL_PATH, @@ -81,7 +79,6 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files): tensor_parallel_size=4, trust_remote_code=True, fully_sharded_loras=False, - enable_chunked_prefill=True, ) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) @@ -93,7 +90,6 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files): @multi_gpu_test(num_gpus=4) -@create_new_process_for_each_test() def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): # https://github.com/NVIDIA/nccl/issues/1790, set a lower value for # gpu_memory_utilization here because NCCL >= 2.26.3 seems to use @@ -107,7 +103,6 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): tensor_parallel_size=4, trust_remote_code=True, fully_sharded_loras=True, - enable_chunked_prefill=True, gpu_memory_utilization=0.85, ) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 50fd63d35cded..e1d6a8674a01a 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -113,7 +113,6 @@ def test_llama_lora(sql_lora_files): @multi_gpu_test(num_gpus=4) -@create_new_process_for_each_test() def test_llama_lora_tp4(sql_lora_files): llm = 
vllm.LLM( MODEL_PATH, @@ -127,7 +126,6 @@ def test_llama_lora_tp4(sql_lora_files): @multi_gpu_test(num_gpus=4) -@create_new_process_for_each_test() def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): llm = vllm.LLM( MODEL_PATH, @@ -142,7 +140,6 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): @multi_gpu_test(num_gpus=2) -@create_new_process_for_each_test() def test_tp2_serialize_and_deserialize_lora( tmp_path, sql_lora_files, sql_lora_huggingface_id ): diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py index ce98fe2f86137..1cf8ed602b6a4 100644 --- a/tests/lora/test_minicpmv_tp.py +++ b/tests/lora/test_minicpmv_tp.py @@ -8,7 +8,7 @@ from vllm.assets.image import ImageAsset from vllm.lora.request import LoRARequest from vllm.platforms import current_platform -from ..utils import create_new_process_for_each_test +from ..utils import multi_gpu_test MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" @@ -88,7 +88,7 @@ def test_minicpmv_lora(minicpmv_lora_files): current_platform.is_rocm(), reason="MiniCPM-V dependency xformers incompatible with ROCm", ) -@create_new_process_for_each_test() +@multi_gpu_test(num_gpus=4) def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files): llm = vllm.LLM( MODEL_PATH, @@ -112,7 +112,7 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files): current_platform.is_rocm(), reason="MiniCPM-V dependency xformers incompatible with ROCm", ) -@create_new_process_for_each_test() +@multi_gpu_test(num_gpus=4) def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files): llm = vllm.LLM( MODEL_PATH, diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index bf290079469aa..254e9b3ab8af0 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -36,55 +36,56 @@ class Relu3(ReLUSquaredActivation): @pytest.mark.parametrize( - "env, torch_level, use_inductor, ops_enabled, default_on", + "env, torch_level, backend, ops_enabled, default_on", [ # Default values based on compile level # - All by default (no Inductor compilation) - (None, 0, False, [True] * 4, True), - (None, 1, True, [True] * 4, True), - (None, 2, False, [True] * 4, True), + (None, 0, "eager", [True] * 4, True), + (None, 1, "eager", [True] * 4, True), + (None, 2, "eager", [True] * 4, True), + (None, 3, "eager", [True] * 4, True), # - None by default (with Inductor) - (None, 3, True, [False] * 4, False), - (None, 4, True, [False] * 4, False), - # - All by default (without Inductor) - (None, 3, False, [True] * 4, True), - (None, 4, False, [True] * 4, True), + (None, 0, "inductor", [True] * 4, True), + # - None by default (with Inductor) + (None, 1, "inductor", [False] * 4, False), + (None, 2, "inductor", [False] * 4, False), + (None, 3, "inductor", [False] * 4, False), # Explicitly enabling/disabling # # Default: all # # All but SiluAndMul - ("+rms_norm,-silu_and_mul", 0, True, [1, 0, 1, 1], True), + ("+rms_norm,-silu_and_mul", 0, "inductor", [1, 0, 1, 1], True), # Only ReLU3 - ("none,-rms_norm,+relu3", 1, False, [0, 0, 0, 1], False), + ("none,-rms_norm,+relu3", 1, "eager", [0, 0, 0, 1], False), # All but SiluAndMul - ("all,-silu_and_mul", 2, True, [1, 0, 1, 1], True), + ("all,-silu_and_mul", 2, "inductor", [1, 0, 1, 1], True), # All but ReLU3 (even if ReLU2 is on) - ("-relu3,+relu2", 3, False, [1, 1, 1, 0], True), + ("-relu3,+relu2", 3, "eager", [1, 1, 1, 0], True), # RMSNorm and SiluAndMul - ("none,-relu3,+rms_norm,+silu_and_mul", 4, 
False, [1, 1, 0, 0], False), + ("none,-relu3,+rms_norm,+silu_and_mul", 3, "eager", [1, 1, 0, 0], False), # All but RMSNorm - ("-rms_norm", 3, False, [0, 1, 1, 1], True), + ("-rms_norm", 3, "eager", [0, 1, 1, 1], True), # # Default: none # # Only ReLU3 - ("-silu_and_mul,+relu3", 3, True, [0, 0, 0, 1], False), + ("none,+relu3", 3, "inductor", [0, 0, 0, 1], False), # All but RMSNorm - ("all,-rms_norm", 4, True, [0, 1, 1, 1], True), + ("all,-rms_norm", 3, "inductor", [0, 1, 1, 1], True), ], ) def test_enabled_ops( env: str | None, torch_level: int, - use_inductor: bool, + backend: str, ops_enabled: list[int], default_on: bool, ): custom_ops = env.split(",") if env else [] vllm_config = VllmConfig( compilation_config=CompilationConfig( - use_inductor=bool(use_inductor), level=torch_level, custom_ops=custom_ops + backend=backend, level=torch_level, custom_ops=custom_ops ) ) with set_current_vllm_config(vllm_config): diff --git a/tests/models/language/pooling/test_multi_vector_retrieval.py b/tests/models/language/pooling/test_multi_vector_retrieval.py new file mode 100644 index 0000000000000..302f2df135579 --- /dev/null +++ b/tests/models/language/pooling/test_multi_vector_retrieval.py @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch +from transformers import AutoModel + +from tests.models.utils import check_embeddings_close + + +@pytest.mark.parametrize( + "model", + ["BAAI/bge-m3"], +) +@pytest.mark.parametrize("dtype", ["half"]) +@torch.inference_mode +def test_embed_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str): + with vllm_runner( + model, + runner="pooling", + max_model_len=None, + ) as vllm_model: + vllm_outputs = vllm_model.token_embed(example_prompts) + + with hf_runner( + model, + auto_cls=AutoModel, + ) as hf_model: + tokenizer = hf_model.tokenizer + hf_outputs = [] + for prompt in example_prompts: + inputs = tokenizer([prompt], return_tensors="pt") + inputs = hf_model.wrap_device(inputs) + output = hf_model.model(**inputs) + embedding = output.last_hidden_state[0].float() + # normal + hf_outputs.append(embedding.cpu()) + + for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): + check_embeddings_close( + embeddings_0_lst=hf_output, + embeddings_1_lst=vllm_output, + name_0="hf", + name_1="vllm", + tol=1e-2, + ) diff --git a/tests/models/language/pooling/test_pooler_config_init_behaviour.py b/tests/models/language/pooling/test_pooler_config_init_behaviour.py index 674bf02b7b98b..55663ee3f1b41 100644 --- a/tests/models/language/pooling/test_pooler_config_init_behaviour.py +++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py @@ -93,7 +93,7 @@ def test_embed_models_using_normalize( ], ) @pytest.mark.parametrize("dtype", ["half"]) -def test_reward_models_using_softmax( +def test_reward_models_using_activation( hf_runner, vllm_runner, example_prompts, @@ -104,22 +104,64 @@ def test_reward_models_using_softmax( model, max_model_len=1024, dtype=dtype, - pooler_config=PoolerConfig(softmax=False), + pooler_config=PoolerConfig(activation=False), ) as vllm_model: - wo_softmax = vllm_model.encode(example_prompts) + wo_activation = vllm_model.reward(example_prompts) with vllm_runner( - model, max_model_len=1024, dtype=dtype, pooler_config=PoolerConfig(softmax=True) + model, + max_model_len=1024, + dtype=dtype, + pooler_config=PoolerConfig(activation=True), ) as vllm_model: - w_softmax = vllm_model.encode(example_prompts) + w_activation = 
vllm_model.reward(example_prompts) - for wo, w in zip(wo_softmax, w_softmax): + for wo, w in zip(wo_activation, w_activation): wo = torch.tensor(wo) w = torch.tensor(w) assert not torch.allclose(wo, w, atol=1e-2), ( - "pooler_config softmax is not working" + "pooler_config activation is not working" ) assert torch.allclose(softmax(wo), w, atol=1e-2), ( - "w_softmax should be close to softmax(wo_softmax)." + "w_activation should be close to activation(wo_activation)." + ) + + +@pytest.mark.parametrize( + "model", + [ + "intfloat/multilingual-e5-small", + ], +) +@pytest.mark.parametrize("dtype", ["half"]) +def test_multi_vector_retrieval_models_using_normalize( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, +) -> None: + with vllm_runner( + model, + max_model_len=512, + dtype=dtype, + pooler_config=PoolerConfig(normalize=False), + ) as vllm_model: + wo_normalize = vllm_model.token_embed(example_prompts) + + with vllm_runner( + model, + max_model_len=512, + dtype=dtype, + pooler_config=PoolerConfig(normalize=True), + ) as vllm_model: + w_normalize = vllm_model.token_embed(example_prompts) + + for wo, w in zip(wo_normalize, w_normalize): + assert not torch.allclose(wo, w, atol=1e-2), ( + "pooler_config normalize is not working" + ) + assert torch.allclose(F.normalize(wo, p=2, dim=-1), w, atol=1e-2), ( + "w_normal should be close to normal(wo_normal)." ) diff --git a/tests/models/language/pooling/test_token_classification.py b/tests/models/language/pooling/test_token_classification.py index 784d9fc312679..2dfc0072126bc 100644 --- a/tests/models/language/pooling/test_token_classification.py +++ b/tests/models/language/pooling/test_token_classification.py @@ -19,7 +19,7 @@ def test_bert_models( dtype: str, ) -> None: with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.encode(example_prompts) + vllm_outputs = vllm_model.token_classify(example_prompts) with hf_runner( model, dtype=dtype, auto_cls=AutoModelForTokenClassification @@ -50,7 +50,7 @@ def test_modernbert_models( dtype: str, ) -> None: with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.encode(example_prompts) + vllm_outputs = vllm_model.token_classify(example_prompts) with hf_runner( model, dtype=dtype, auto_cls=AutoModelForTokenClassification diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 0572898368d6d..af7dad079a9b3 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -17,7 +17,7 @@ from transformers import ( ) from vllm.platforms import current_platform -from vllm.utils import identity +from vllm.utils.func import identity from ....conftest import ( IMAGE_ASSETS, @@ -707,8 +707,6 @@ VLM_TEST_SETTINGS = { max_num_seqs=2, vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output, prompt_path_encoder=model_utils.qwen_prompt_path_encoder, - # FIXME: https://github.com/huggingface/transformers/issues/38358 - marks=[pytest.mark.skip("Model initialization fails")], ), "qwen2_vl": VLMTestInfo( models=["Qwen/Qwen2-VL-2B-Instruct"], diff --git a/tests/models/multimodal/pooling/test_intern_vit.py b/tests/models/multimodal/pooling/test_intern_vit.py index b474e851319ae..74e30c4307fac 100644 --- a/tests/models/multimodal/pooling/test_intern_vit.py +++ b/tests/models/multimodal/pooling/test_intern_vit.py @@ -38,7 +38,7 @@ def run_intern_vit_test( config.norm_type = "rms_norm" hf_model = 
AutoModel.from_pretrained( - model, torch_dtype=torch_dtype, trust_remote_code=True + model, dtype=torch_dtype, trust_remote_code=True ).to("cuda") hf_outputs_per_image = [ hf_model(pixel_value.to("cuda")).last_hidden_state diff --git a/tests/models/multimodal/pooling/test_prithvi_mae.py b/tests/models/multimodal/pooling/test_prithvi_mae.py index abf4150a91329..62154b0834878 100644 --- a/tests/models/multimodal/pooling/test_prithvi_mae.py +++ b/tests/models/multimodal/pooling/test_prithvi_mae.py @@ -39,7 +39,7 @@ def _run_test( max_num_seqs=32, default_torch_num_threads=1, ) as vllm_model: - vllm_model.encode(prompt) + vllm_model.llm.encode(prompt, pooling_task="token_classify") MODELS = ["mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"] diff --git a/tests/models/multimodal/pooling/test_radio.py b/tests/models/multimodal/pooling/test_radio.py index 80f594021ca8a..414e99a71e7b0 100644 --- a/tests/models/multimodal/pooling/test_radio.py +++ b/tests/models/multimodal/pooling/test_radio.py @@ -45,7 +45,7 @@ def run_radio_test( hf_model = AutoModel.from_pretrained( model_id, config=config, - torch_dtype=torch_dtype, + dtype=torch_dtype, trust_remote_code=True, ).to("cuda") hf_model.eval() diff --git a/tests/models/registry.py b/tests/models/registry.py index fbc11c2ddfd4c..617dc30691aa8 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -76,9 +76,6 @@ class _HfExamplesInfo: trust_remote_code: bool = False """The ``trust_remote_code`` level required to load the model.""" - v0_only: bool = False - """The model is only available with the vLLM V0 engine.""" - hf_overrides: dict[str, Any] = field(default_factory=dict) """The ``hf_overrides`` required to load the model.""" @@ -265,7 +262,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}), "GPTBigCodeForCausalLM": _HfExamplesInfo( "bigcode/starcoder", - extras={"tiny": "bigcode/tiny_starcoder_py"}, + extras={ + "tiny": "bigcode/tiny_starcoder_py", + "santacoder": "bigcode/gpt_bigcode-santacoder", + }, min_transformers_version="4.55.1", transformers_version_reason="HF model broken in 4.55.0", ), @@ -694,7 +694,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo( "MiniMaxAI/MiniMax-VL-01", trust_remote_code=True, - v0_only=True, ), "Mistral3ForConditionalGeneration": _HfExamplesInfo( "mistralai/Mistral-Small-3.1-24B-Instruct-2503", @@ -752,6 +751,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Qwen/Qwen-VL", extras={"chat": "Qwen/Qwen-VL-Chat"}, trust_remote_code=True, + max_transformers_version="4.53.3", + transformers_version_reason="Use of deprecated imports which have been removed.", # noqa: E501 hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}, ), "Qwen2AudioForConditionalGeneration": _HfExamplesInfo( diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index f501798ffa36b..80bee3d8cf86c 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -88,13 +88,15 @@ def can_initialize( # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config return 1, 0, scheduler_kv_cache_config + if model_arch == "MiniMaxVL01ForConditionalGeneration": + pytest.skip( + "pickle error when loading `transformers.models.auto.CONFIG_MAPPING`" + ) + with ( patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1), monkeypatch.context() as m, ): - if model_info.v0_only: - # NOTE(woosuk): skip the test for V0-only models - return if model_arch == 
"GptOssForCausalLM": # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when @@ -132,8 +134,6 @@ def can_initialize( @pytest.mark.parametrize("model_arch", MINIMAL_MODEL_ARCH_LIST) def test_can_initialize_small_subset(model_arch: str, monkeypatch: pytest.MonkeyPatch): """Test initializing small subset of supported models""" - if model_arch == "Lfm2ForCausalLM": - pytest.skip("Skipping until test supports V1-only models") can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS) @@ -144,8 +144,6 @@ def test_can_initialize_large_subset(model_arch: str, monkeypatch: pytest.Monkey This test covers the complement of the tests covered in the "small subset" test. """ - if model_arch == "Lfm2ForCausalLM": - pytest.skip("Skipping until test supports V1-only models") can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS) diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py index d1dae587d38eb..98245cdf0c984 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py @@ -30,7 +30,7 @@ class MyGemma2Embedding(nn.Module): self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), + "token_embed": Pooler.for_token_embed(pooler_config), "embed": Pooler.for_embed(pooler_config), } ) diff --git a/tests/plugins_tests/test_io_processor_plugins.py b/tests/plugins_tests/test_io_processor_plugins.py index 912b32755e80f..936f27fb69bc6 100644 --- a/tests/plugins_tests/test_io_processor_plugins.py +++ b/tests/plugins_tests/test_io_processor_plugins.py @@ -93,7 +93,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str): out_data_format="b64_json", ) - pooling_params = PoolingParams(task="encode", softmax=False) + pooling_params = PoolingParams(activation=False) with vllm_runner( model_name, @@ -108,8 +108,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str): io_processor_plugin="prithvi_to_tiff", ) as llm_runner: pooler_output = llm_runner.get_llm().encode( - img_prompt, - pooling_params=pooling_params, + img_prompt, pooling_params=pooling_params, pooling_task="token_classify" ) output = pooler_output[0].outputs diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index ef7164c8813da..5aeb002238cf9 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -697,7 +697,8 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4): @pytest.mark.parametrize( "args", [ - ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4), + # TODO: Enable once model is available again + # ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4), ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4), ], ) diff --git a/tests/reasoning/test_deepseekv3_reasoning_parser.py b/tests/reasoning/test_deepseekv3_reasoning_parser.py new file mode 100644 index 0000000000000..3d12f3e5b30e8 --- /dev/null +++ b/tests/reasoning/test_deepseekv3_reasoning_parser.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +from transformers import AutoTokenizer + +from vllm.entrypoints.openai.protocol import 
ChatCompletionRequest, DeltaMessage +from vllm.reasoning import ( + DeepSeekR1ReasoningParser, + DeepSeekV3ReasoningParser, + IdentityReasoningParser, +) + +REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-V3.1" + + +@pytest.fixture(scope="module") +def tokenizer(): + return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) + + +@pytest.mark.parametrize( + "thinking,expected_parser_type", + [ + (True, DeepSeekR1ReasoningParser), + (False, IdentityReasoningParser), + ], +) +def test_parser_selection(tokenizer, thinking, expected_parser_type): + parser = DeepSeekV3ReasoningParser( + tokenizer, chat_template_kwargs={"thinking": thinking} + ) + + assert isinstance(parser._parser, expected_parser_type) + + +def test_identity_reasoning_parser_basic(tokenizer): + parser = IdentityReasoningParser(tokenizer) + + # Test is_reasoning_end always returns True + input_text = "This is some output" + input_tokens = tokenizer.tokenize(input_text) + input_ids = tokenizer.convert_tokens_to_ids(input_tokens) + assert parser.is_reasoning_end(input_ids) is True + + # Test extract_content_ids returns all input_ids + assert parser.extract_content_ids(input_ids) == input_ids + + # Test extract_reasoning_content returns (None, model_output) + request = ChatCompletionRequest(model="test-model", messages=[], temperature=1.0) + reasoning, content = parser.extract_reasoning_content(input_text, request) + assert reasoning is None + assert content == input_text + + # Test extract_reasoning_content_streaming returns DeltaMessage or None + result = parser.extract_reasoning_content_streaming( + previous_text="", + current_text="Hello world", + delta_text="Hello world", + previous_token_ids=[], + current_token_ids=input_ids, + delta_token_ids=input_ids, + ) + assert isinstance(result, DeltaMessage) + assert result.content == "Hello world" + + # If delta_text is empty, should return None + result_none = parser.extract_reasoning_content_streaming( + previous_text="Hello world", + current_text="Hello world", + delta_text="", + previous_token_ids=input_ids, + current_token_ids=input_ids, + delta_token_ids=[], + ) + assert result_none is None diff --git a/tests/test_envs.py b/tests/test_envs.py index 62d529c363608..023767505f108 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -6,7 +6,54 @@ from unittest.mock import patch import pytest -from vllm.envs import env_list_with_choices, env_with_choices +import vllm.envs as envs +from vllm.envs import ( + enable_envs_cache, + env_list_with_choices, + env_with_choices, + environment_variables, +) + + +def test_getattr_without_cache(monkeypatch: pytest.MonkeyPatch): + assert envs.VLLM_HOST_IP == "" + assert envs.VLLM_PORT is None + monkeypatch.setenv("VLLM_HOST_IP", "1.1.1.1") + monkeypatch.setenv("VLLM_PORT", "1234") + assert envs.VLLM_HOST_IP == "1.1.1.1" + assert envs.VLLM_PORT == 1234 + # __getattr__ is not decorated with functools.cache + assert not hasattr(envs.__getattr__, "cache_info") + + +def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("VLLM_HOST_IP", "1.1.1.1") + monkeypatch.setenv("VLLM_PORT", "1234") + # __getattr__ is not decorated with functools.cache + assert not hasattr(envs.__getattr__, "cache_info") + + # Enable envs cache and ignore ongoing environment changes + enable_envs_cache() + + # __getattr__ is not decorated with functools.cache + assert hasattr(envs.__getattr__, "cache_info") + start_hits = envs.__getattr__.cache_info().hits + + # 2 more hits due to VLLM_HOST_IP and VLLM_PORT accesses + assert envs.VLLM_HOST_IP == 
"1.1.1.1" + assert envs.VLLM_PORT == 1234 + assert envs.__getattr__.cache_info().hits == start_hits + 2 + + # All environment variables are cached + for environment_variable in environment_variables: + envs.__getattr__(environment_variable) + assert envs.__getattr__.cache_info().hits == start_hits + 2 + len( + environment_variables + ) + + # Reset envs.__getattr__ back to none-cached version to + # avoid affecting other tests + envs.__getattr__ = envs.__getattr__.__wrapped__ class TestEnvWithChoices: diff --git a/tests/test_pooling_params.py b/tests/test_pooling_params.py index e3561ac3a577e..e73d7efc1483a 100644 --- a/tests/test_pooling_params.py +++ b/tests/test_pooling_params.py @@ -1,10 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass + import pytest from tests.models.utils import EmbedModelInfo from vllm import PoolingParams -from vllm.config import ModelConfig +from vllm.config import ModelConfig, PoolerConfig EMBEDDING_MODELS = [ EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), @@ -15,6 +17,15 @@ EMBEDDING_MODELS = [ ), ] +classify_parameters = ["activation"] +embed_parameters = ["dimensions", "normalize"] +step_pooling_parameters = ["step_tag_id", "returned_token_ids"] + + +@dataclass() +class MockModelConfig: + pooler_config: PoolerConfig + def test_task(): pooling_params = PoolingParams() @@ -24,25 +35,27 @@ def test_task(): pooling_params.verify(task="score") with pytest.raises(ValueError): - pooling_params.verify(task="encode") + pooling_params.verify(task="classify") def test_embed(): task = "embed" + model_config = MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS")) + pooling_params = PoolingParams(normalize=None) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) pooling_params = PoolingParams(normalize=True) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) pooling_params = PoolingParams(normalize=False) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) - invalid_parameters = ["activation", "softmax"] + invalid_parameters = classify_parameters + step_pooling_parameters for p in invalid_parameters: with pytest.raises(ValueError): pooling_params = PoolingParams(**{p: True}) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) @pytest.mark.parametrize("model_info", EMBEDDING_MODELS) @@ -73,35 +86,71 @@ def test_embed_dimensions(model_info: EmbedModelInfo): @pytest.mark.parametrize("task", ["score", "classify"]) def test_classify(task): + model_config = MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS")) + pooling_params = PoolingParams(activation=None) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) pooling_params = PoolingParams(activation=True) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) pooling_params = PoolingParams(activation=False) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) - invalid_parameters = ["dimensions", "normalize", "softmax"] + invalid_parameters = embed_parameters + step_pooling_parameters for p in invalid_parameters: with pytest.raises(ValueError): pooling_params = PoolingParams(**{p: True}) - pooling_params.verify(task=task) + pooling_params.verify(task=task, 
model_config=model_config) -def test_encode(): - task = "encode" - pooling_params = PoolingParams(softmax=None) - pooling_params.verify(task=task) +@pytest.mark.parametrize("pooling_type", ["ALL", "STEP"]) +def test_token_embed(pooling_type: str): + task = "token_embed" + model_config = MockModelConfig( + pooler_config=PoolerConfig(pooling_type=pooling_type) + ) - pooling_params = PoolingParams(softmax=True) - pooling_params.verify(task=task) + pooling_params = PoolingParams(normalize=None) + pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(softmax=False) - pooling_params.verify(task=task) + pooling_params = PoolingParams(normalize=True) + pooling_params.verify(task=task, model_config=model_config) + + pooling_params = PoolingParams(normalize=False) + pooling_params.verify(task=task, model_config=model_config) + + invalid_parameters = classify_parameters + if pooling_type != "STEP": + invalid_parameters = classify_parameters + step_pooling_parameters - invalid_parameters = ["dimensions", "normalize", "activation"] for p in invalid_parameters: with pytest.raises(ValueError): pooling_params = PoolingParams(**{p: True}) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) + + +@pytest.mark.parametrize("pooling_type", ["ALL", "STEP"]) +def test_token_classify(pooling_type: str): + task = "token_classify" + model_config = MockModelConfig( + pooler_config=PoolerConfig(pooling_type=pooling_type) + ) + + pooling_params = PoolingParams(activation=None) + pooling_params.verify(task=task, model_config=model_config) + + pooling_params = PoolingParams(activation=True) + pooling_params.verify(task=task, model_config=model_config) + + pooling_params = PoolingParams(activation=False) + pooling_params.verify(task=task, model_config=model_config) + + invalid_parameters = embed_parameters + if pooling_type != "STEP": + invalid_parameters = embed_parameters + step_pooling_parameters + + for p in invalid_parameters: + with pytest.raises(ValueError): + pooling_params = PoolingParams(**{p: True}) + pooling_params.verify(task=task, model_config=model_config) diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py index b4f0989b1b19c..93ef1049fc07e 100644 --- a/tests/tool_use/test_qwen3coder_tool_parser.py +++ b/tests/tool_use/test_qwen3coder_tool_parser.py @@ -40,7 +40,7 @@ def qwen3_xml_tool_parser(qwen3_tokenizer): return Qwen3XMLToolParser(qwen3_tokenizer) -@pytest.fixture(params=["original", "xml"]) +@pytest.fixture(params=["xml"]) def qwen3_tool_parser_parametrized(qwen3_tool_parser, qwen3_xml_tool_parser, request): """Parameterized fixture that provides both parser types for testing""" if request.param == "original": @@ -664,6 +664,9 @@ def test_extract_tool_calls_streaming( # Verify we got all expected tool calls assert len(tool_states) == len(expected_tool_calls) + assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == len( + expected_tool_calls + ) # Verify each tool call for idx, expected_tool in enumerate(expected_tool_calls): @@ -780,9 +783,10 @@ fahrenheit # Verify content was streamed assert "Let me check the weather for you:" in other_content - # Verify we got the tool call assert len(tool_states) == 1 + assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1 + state = tool_states[0] assert state["id"] is not None assert state["type"] == "function" @@ -892,3 +896,83 @@ def test_extract_tool_calls_complex_type_with_single_quote( args = 
json.loads(extracted_tool_calls.tool_calls[0].function.arguments) assert args["obj_param"] == {"key": "value"} + + +def test_extract_tool_calls_streaming_missing_opening_tag( + qwen3_tool_parser_parametrized, qwen3_tokenizer, sample_tools +): + """Test streaming with missing opening tag + + This tests that the streaming parser correctly handles + tool calls that start directly with <function=...> instead of the <tool_call> opening tag + """ + model_output = """I'll check the weather for you. + +<function=get_current_weather> +<parameter=city> +Dallas +</parameter> +<parameter=state> +TX +</parameter> +<parameter=unit> +fahrenheit +</parameter> +</function> +</tool_call>""" + + request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools) + + other_content = "" + tool_states = {} + + for delta_message in stream_delta_message_generator( + qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request + ): + if delta_message.content: + other_content += delta_message.content + + if delta_message.tool_calls: + for tool_call in delta_message.tool_calls: + idx = tool_call.index + + if idx not in tool_states: + tool_states[idx] = { + "id": None, + "name": None, + "arguments": "", + "type": None, + } + + if tool_call.id: + tool_states[idx]["id"] = tool_call.id + + if tool_call.type: + assert tool_call.type == "function" + tool_states[idx]["type"] = tool_call.type + + if tool_call.function: + if tool_call.function.name: + tool_states[idx]["name"] = tool_call.function.name + + if tool_call.function.arguments is not None: + tool_states[idx]["arguments"] += tool_call.function.arguments + + # Verify content was streamed + assert "I'll check the weather for you." in other_content + + # Verify we got the tool call + assert len(tool_states) == 1 + assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1 + + state = tool_states[0] + assert state["id"] is not None + assert state["type"] == "function" + assert state["name"] == "get_current_weather" + + # Verify arguments were parsed correctly despite missing opening tag + assert state["arguments"] is not None + args = json.loads(state["arguments"]) + assert args["city"] == "Dallas" + assert args["state"] == "TX" + assert args["unit"] == "fahrenheit" diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index 102e5ddf16d6d..cf455ff3edbd3 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -3,7 +3,7 @@ import pytest -from vllm.config import CompilationLevel +from vllm.config import CompilationMode from ..utils import compare_two_settings @@ -21,13 +21,13 @@ def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch): "--max-model-len=256", "--max-num-seqs=32", "--enforce-eager", - f"-O{CompilationLevel.DYNAMO_ONCE}", + f"-O{CompilationMode.DYNAMO_TRACE_ONCE}", ], arg2=[ "--max-model-len=256", "--max-num-seqs=32", "--enforce-eager", - f"-O{CompilationLevel.DYNAMO_AS_IS}", + f"-O{CompilationMode.STOCK_TORCH_COMPILE}", ], env1={}, env2={}, diff --git a/tests/utils.py b/tests/utils.py index 8fee507084382..5bfdf703390ee 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -157,7 +157,7 @@ class RemoteOpenAIServer: self.host = None self.port = None else: - self.host = str(args.host or "localhost") + self.host = str(args.host or "127.0.0.1") self.port = int(args.port) self.show_hidden_metrics = args.show_hidden_metrics_for_version is not None diff --git a/tests/utils_/test_func_utils.py b/tests/utils_/test_func_utils.py new file mode 100644 index 0000000000000..147a396994596 --- /dev/null +++ b/tests/utils_/test_func_utils.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa + +import pytest + +from vllm.utils.func import deprecate_kwargs, supports_kw + +from ..utils import error_on_warning + + +def test_deprecate_kwargs_always(): + @deprecate_kwargs("old_arg", is_deprecated=True) + def dummy(*, old_arg: object = None, new_arg: object = None): + pass + + with pytest.warns(DeprecationWarning, match="'old_arg'"): + dummy(old_arg=1) + + with error_on_warning(DeprecationWarning): + dummy(new_arg=1) + + +def test_deprecate_kwargs_never(): + @deprecate_kwargs("old_arg", is_deprecated=False) + def dummy(*, old_arg: object = None, new_arg: object = None): + pass + + with error_on_warning(DeprecationWarning): + dummy(old_arg=1) + + with error_on_warning(DeprecationWarning): + dummy(new_arg=1) + + +def test_deprecate_kwargs_dynamic(): + is_deprecated = True + + @deprecate_kwargs("old_arg", is_deprecated=lambda: is_deprecated) + def dummy(*, old_arg: object = None, new_arg: object = None): + pass + + with pytest.warns(DeprecationWarning, match="'old_arg'"): + dummy(old_arg=1) + + with error_on_warning(DeprecationWarning): + dummy(new_arg=1) + + is_deprecated = False + + with error_on_warning(DeprecationWarning): + dummy(old_arg=1) + + with error_on_warning(DeprecationWarning): + dummy(new_arg=1) + + +def test_deprecate_kwargs_additional_message(): + @deprecate_kwargs("old_arg", is_deprecated=True, additional_message="abcd") + def dummy(*, old_arg: object = None, new_arg: object = None): + pass + + with pytest.warns(DeprecationWarning, match="abcd"): + dummy(old_arg=1) + + +@pytest.mark.parametrize( + ("callable", "kw_name", "requires_kw_only", "allow_var_kwargs", "is_supported"), + [ + # Tests for positional argument support + (lambda foo: None, "foo", True, True, False), + (lambda foo: None, "foo", False, True, True), + # Tests for positional or keyword / keyword only + (lambda foo=100: None, "foo", True, True, False), + (lambda *, foo: None, "foo", False, True, True), + # Tests to make sure the names of variadic params are NOT supported + (lambda *args: None, "args", False, True, False), + (lambda **kwargs: None, "kwargs", False, True, False), + # Tests for if we allow var kwargs to add support + (lambda foo: None, "something_else", False, True, False), + (lambda foo, **kwargs: None, "something_else", False, True, True), + (lambda foo, **kwargs: None, "kwargs", True, True, False), + (lambda foo, **kwargs: None, "foo", True, True, False), + ], +) +def test_supports_kw( + callable, kw_name, requires_kw_only, allow_var_kwargs, is_supported +): + assert ( + supports_kw( + callable=callable, + kw_name=kw_name, + requires_kw_only=requires_kw_only, + allow_var_kwargs=allow_var_kwargs, + ) + == is_supported + ) diff --git a/tests/utils_/test_jsontree.py b/tests/utils_/test_jsontree.py new file mode 100644 index 0000000000000..0af2751b2638c --- /dev/null +++ b/tests/utils_/test_jsontree.py @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.utils.jsontree import json_count_leaves + + +def test_json_count_leaves(): + """Test json_count_leaves function from jsontree utility.""" + + # Single leaf values + assert json_count_leaves(42) == 1 + assert json_count_leaves("hello") == 1 + assert json_count_leaves(None) == 1 + + # Empty containers + assert json_count_leaves([]) == 0 + assert json_count_leaves({}) == 0 + assert json_count_leaves(()) == 0 + + # Flat structures + assert json_count_leaves([1, 2, 3]) == 3 + assert json_count_leaves({"a": 1, "b": 2}) == 2 + assert 
json_count_leaves((1, 2, 3)) == 3 + + # Nested structures + nested_dict = {"a": 1, "b": {"c": 2, "d": 3}} + assert json_count_leaves(nested_dict) == 3 + + nested_list = [1, [2, 3], 4] + assert json_count_leaves(nested_list) == 4 + + mixed_nested = {"list": [1, 2], "dict": {"x": 3}, "value": 4} + assert json_count_leaves(mixed_nested) == 4 diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index 308629ab05834..b4883a4fea31a 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -30,7 +30,6 @@ from vllm.utils import ( bind_kv_cache, common_broadcastable_dtype, current_stream, - deprecate_kwargs, get_open_port, get_tcp_uri, is_lossless_cast, @@ -42,12 +41,11 @@ from vllm.utils import ( sha256, split_host_port, split_zmq_path, - supports_kw, swap_dict_values, unique_filepath, ) -from ..utils import create_new_process_for_each_test, error_on_warning +from ..utils import create_new_process_for_each_test @pytest.mark.asyncio @@ -83,61 +81,6 @@ async def test_merge_async_iterators(): raise AssertionError() from e -def test_deprecate_kwargs_always(): - @deprecate_kwargs("old_arg", is_deprecated=True) - def dummy(*, old_arg: object = None, new_arg: object = None): - pass - - with pytest.warns(DeprecationWarning, match="'old_arg'"): - dummy(old_arg=1) - - with error_on_warning(DeprecationWarning): - dummy(new_arg=1) - - -def test_deprecate_kwargs_never(): - @deprecate_kwargs("old_arg", is_deprecated=False) - def dummy(*, old_arg: object = None, new_arg: object = None): - pass - - with error_on_warning(DeprecationWarning): - dummy(old_arg=1) - - with error_on_warning(DeprecationWarning): - dummy(new_arg=1) - - -def test_deprecate_kwargs_dynamic(): - is_deprecated = True - - @deprecate_kwargs("old_arg", is_deprecated=lambda: is_deprecated) - def dummy(*, old_arg: object = None, new_arg: object = None): - pass - - with pytest.warns(DeprecationWarning, match="'old_arg'"): - dummy(old_arg=1) - - with error_on_warning(DeprecationWarning): - dummy(new_arg=1) - - is_deprecated = False - - with error_on_warning(DeprecationWarning): - dummy(old_arg=1) - - with error_on_warning(DeprecationWarning): - dummy(new_arg=1) - - -def test_deprecate_kwargs_additional_message(): - @deprecate_kwargs("old_arg", is_deprecated=True, additional_message="abcd") - def dummy(*, old_arg: object = None, new_arg: object = None): - pass - - with pytest.warns(DeprecationWarning, match="abcd"): - dummy(old_arg=1) - - def test_get_open_port(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv("VLLM_PORT", "5678") @@ -299,7 +242,7 @@ def test_dict_args(parser): "val2", "--hf-overrides.key2.key4", "val3", - # Test compile config and compilation level + # Test compile config and compilation mode "-O.use_inductor=true", "-O.backend", "custom", @@ -352,7 +295,7 @@ def test_dict_args(parser): }, } assert parsed_args.compilation_config == { - "level": 1, + "mode": 1, "use_inductor": True, "backend": "custom", "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"], @@ -367,7 +310,7 @@ def test_duplicate_dict_args(caplog_vllm, parser): "--hf-overrides.key1", "val2", "-O1", - "-O.level", + "-O.mode", "2", "-O3", ] @@ -375,45 +318,12 @@ def test_duplicate_dict_args(caplog_vllm, parser): parsed_args = parser.parse_args(args) # Should be the last value assert parsed_args.hf_overrides == {"key1": "val2"} - assert parsed_args.compilation_config == {"level": 3} + assert parsed_args.compilation_config == {"mode": 3} assert len(caplog_vllm.records) == 1 assert "duplicate" in caplog_vllm.text 
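The assertions in the new tests/utils_/test_jsontree.py above fully pin down the leaf-counting semantics. A small recursive model that satisfies them is sketched here; it only illustrates the behaviour under test, it is not the actual vllm.utils.jsontree code, and count_json_leaves is a hypothetical name.

def count_json_leaves(value):
    # Dicts, lists and tuples are treated as containers; every other value,
    # including None and strings, counts as a single leaf.
    if isinstance(value, dict):
        return sum(count_json_leaves(v) for v in value.values())
    if isinstance(value, (list, tuple)):
        return sum(count_json_leaves(v) for v in value)
    return 1


# Mirrors the mixed_nested case above: four leaves in total.
assert count_json_leaves({"list": [1, 2], "dict": {"x": 3}, "value": 4}) == 4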
assert "--hf-overrides.key1" in caplog_vllm.text - assert "-O.level" in caplog_vllm.text - - -@pytest.mark.parametrize( - "callable,kw_name,requires_kw_only,allow_var_kwargs,is_supported", - [ - # Tests for positional argument support - (lambda foo: None, "foo", True, True, False), - (lambda foo: None, "foo", False, True, True), - # Tests for positional or keyword / keyword only - (lambda foo=100: None, "foo", True, True, False), - (lambda *, foo: None, "foo", False, True, True), - # Tests to make sure the names of variadic params are NOT supported - (lambda *args: None, "args", False, True, False), - (lambda **kwargs: None, "kwargs", False, True, False), - # Tests for if we allow var kwargs to add support - (lambda foo: None, "something_else", False, True, False), - (lambda foo, **kwargs: None, "something_else", False, True, True), - (lambda foo, **kwargs: None, "kwargs", True, True, False), - (lambda foo, **kwargs: None, "foo", True, True, False), - ], -) -def test_supports_kw( - callable, kw_name, requires_kw_only, allow_var_kwargs, is_supported -): - assert ( - supports_kw( - callable=callable, - kw_name=kw_name, - requires_kw_only=requires_kw_only, - allow_var_kwargs=allow_var_kwargs, - ) - == is_supported - ) + assert "-O.mode" in caplog_vllm.text @create_new_process_for_each_test() @@ -863,36 +773,6 @@ def test_join_host_port(): assert join_host_port("::1", 5555) == "[::1]:5555" -def test_json_count_leaves(): - """Test json_count_leaves function from jsontree utility.""" - from vllm.utils.jsontree import json_count_leaves - - # Single leaf values - assert json_count_leaves(42) == 1 - assert json_count_leaves("hello") == 1 - assert json_count_leaves(None) == 1 - - # Empty containers - assert json_count_leaves([]) == 0 - assert json_count_leaves({}) == 0 - assert json_count_leaves(()) == 0 - - # Flat structures - assert json_count_leaves([1, 2, 3]) == 3 - assert json_count_leaves({"a": 1, "b": 2}) == 2 - assert json_count_leaves((1, 2, 3)) == 3 - - # Nested structures - nested_dict = {"a": 1, "b": {"c": 2, "d": 3}} - assert json_count_leaves(nested_dict) == 3 - - nested_list = [1, [2, 3], 4] - assert json_count_leaves(nested_list) == 4 - - mixed_nested = {"list": [1, 2], "dict": {"x": 3}, "value": 4} - assert json_count_leaves(mixed_nested) == 4 - - def test_convert_ids_list_to_tokens(): tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") token_ids = tokenizer.encode("Hello, world!") diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py index 35f7c61458f2d..f41f63ed2af46 100644 --- a/tests/v1/attention/test_mla_backends.py +++ b/tests/v1/attention/test_mla_backends.py @@ -1,6 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests for v1 MLA backends without GPUModelRunner dependency.""" +"""Tests for v1 MLA backends without GPUModelRunner dependency. + +Known Issues: +- FLASH_ATTN_MLA backend occasionally produces NaN values in + test_backend_correctness[mixed_small] when run after + test_backend_correctness[small_prefill], but passes when run alone. 
+""" import pytest import torch @@ -14,6 +20,8 @@ from tests.v1.attention.utils import ( ) from vllm import _custom_ops as ops from vllm.attention.backends.registry import _Backend +from vllm.attention.ops.flashmla import is_flashmla_dense_supported +from vllm.config.vllm import set_current_vllm_config from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.kv_cache_interface import FullAttentionSpec @@ -29,6 +37,10 @@ BACKENDS_TO_TEST = [ if not torch.cuda.is_available() or torch.cuda.get_device_properties(0).major < 10: BACKENDS_TO_TEST.remove(_Backend.CUTLASS_MLA) +# Remove FLASHMLA from the list if not supported +if not is_flashmla_dense_supported()[0]: + BACKENDS_TO_TEST.remove(_Backend.FLASHMLA) + torch.manual_seed(42) @@ -66,6 +78,12 @@ BATCH_SPECS = { "large_prefill": BatchSpec(seq_lens=[4096] * 8, query_lens=[32] * 8), "single_decode": BatchSpec(seq_lens=[1024], query_lens=[1]), "single_prefill": BatchSpec(seq_lens=[1024], query_lens=[64]), + "spec_decode_small": BatchSpec( + seq_lens=[128, 256, 512, 1024], query_lens=[4, 4, 4, 4] + ), + "spec_decode_medium": BatchSpec( + seq_lens=[512, 1024, 2048, 512, 1024, 2048], query_lens=[8, 8, 8, 8, 8, 8] + ), } @@ -239,61 +257,64 @@ def run_attention_backend( builder_cls, impl_cls = try_get_attention_backend(backend) - # Build metadata - builder = builder_cls(kv_cache_spec, layer_names, vllm_config, device) - attn_metadata = builder.build( - common_prefix_len=0, - common_attn_metadata=common_attn_metadata, - ) + # Set the current vllm config so that get_current_vllm_config() works + # in the backend implementations + with set_current_vllm_config(vllm_config): + # Build metadata + builder = builder_cls(kv_cache_spec, layer_names, vllm_config, device) + attn_metadata = builder.build( + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + ) - # Instantiate MLA implementation - num_heads = vllm_config.model_config.get_num_attention_heads( - vllm_config.parallel_config - ) - num_kv_heads = vllm_config.model_config.get_num_kv_heads( - vllm_config.parallel_config - ) - head_size = vllm_config.model_config.get_head_size() - scale = 1.0 / (head_size**0.5) - impl = impl_cls( - num_heads=num_heads, - head_size=head_size, - scale=scale, - num_kv_heads=num_kv_heads, - alibi_slopes=None, - sliding_window=None, - kv_cache_dtype="auto", - logits_soft_cap=None, - attn_type="decoder", - kv_sharing_target_layer_name=None, - q_lora_rank=None, - kv_lora_rank=kv_lora_rank, - qk_nope_head_dim=qk_nope_head_dim, - qk_rope_head_dim=qk_rope_head_dim, - qk_head_dim=qk_nope_head_dim + qk_rope_head_dim, - v_head_dim=v_head_dim, - kv_b_proj=mock_kv_b_proj, - ) + # Instantiate MLA implementation + num_heads = vllm_config.model_config.get_num_attention_heads( + vllm_config.parallel_config + ) + num_kv_heads = vllm_config.model_config.get_num_kv_heads( + vllm_config.parallel_config + ) + head_size = vllm_config.model_config.get_head_size() + scale = 1.0 / (head_size**0.5) + impl = impl_cls( + num_heads=num_heads, + head_size=head_size, + scale=scale, + num_kv_heads=num_kv_heads, + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype="auto", + logits_soft_cap=None, + attn_type="decoder", + kv_sharing_target_layer_name=None, + q_lora_rank=None, + kv_lora_rank=kv_lora_rank, + qk_nope_head_dim=qk_nope_head_dim, + qk_rope_head_dim=qk_rope_head_dim, + qk_head_dim=qk_nope_head_dim + qk_rope_head_dim, + v_head_dim=v_head_dim, + kv_b_proj=mock_kv_b_proj, + ) - # Process weights to 
create W_UK_T and W_UV attributes needed by MLA - act_dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype) - impl.process_weights_after_loading(act_dtype) + # Process weights to create W_UK_T and W_UV attributes needed by MLA + act_dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype) + impl.process_weights_after_loading(act_dtype) - # Create mock layer and output buffer - mock_layer = MockAttentionLayer(device) - num_tokens = query.shape[0] - output = torch.empty( - num_tokens, num_heads * v_head_dim, dtype=query.dtype, device=query.device - ) + # Create mock layer and output buffer + mock_layer = MockAttentionLayer(device) + num_tokens = query.shape[0] + output = torch.empty( + num_tokens, num_heads * v_head_dim, dtype=query.dtype, device=query.device + ) - # Run forward pass - # NOTE: The query, key, and value are already shaped correctly - # in the calling test function. - output = impl.forward( - mock_layer, query, kv_c, k_pe, kv_cache, attn_metadata, output=output - ) + # Run forward pass + # NOTE: The query, key, and value are already shaped correctly + # in the calling test function. + output = impl.forward( + mock_layer, query, kv_c, k_pe, kv_cache, attn_metadata, output=output + ) - return output + return output @pytest.mark.parametrize( @@ -309,6 +330,8 @@ def run_attention_backend( "large_prefill", "single_decode", "single_prefill", + "spec_decode_small", + "spec_decode_medium", ], ) @pytest.mark.parametrize("model", ["deepseek-ai/DeepSeek-V2-Lite-Chat"]) @@ -328,10 +351,39 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): simulated paged KV cache. 5. Comparing the vLLM backend's output to the ground-truth SDPA output. """ + from vllm.v1.attention.backends.mla.common import QueryLenSupport + batch_spec = BATCH_SPECS[batch_spec_name] - vllm_config = create_vllm_config( - model_name=model, max_model_len=max(batch_spec.seq_lens), num_gpu_blocks=2048 + is_spec_decode_test = batch_spec_name.startswith("spec_decode") + spec_decode_backends = {_Backend.FLASH_ATTN_MLA, _Backend.FLASHMLA} + + block_size = 16 + required_blocks = sum( + (seq_len + block_size - 1) // block_size for seq_len in batch_spec.seq_lens ) + # Add 1 for null block at index 0, and some buffer + num_gpu_blocks = required_blocks + 1 + 100 + + vllm_config = create_vllm_config( + model_name=model, + max_model_len=max(batch_spec.seq_lens), + num_gpu_blocks=num_gpu_blocks, + block_size=block_size, + ) + + # For spec decode tests, add a speculative_config to set the reorder_batch_threshold + if is_spec_decode_test: + from vllm.config import SpeculativeConfig + + # Get the query length from the batch spec (they should all be uniform) + query_len = batch_spec.query_lens[0] + # Set num_speculative_tokens to query_len - 1 + # (since threshold is 1 + num_spec_tokens) + # Use ngram method which doesn't require a draft model + vllm_config.speculative_config = SpeculativeConfig( + method="ngram", num_speculative_tokens=query_len - 1 + ) + device = torch.device("cuda:0") kv_cache_spec = create_standard_kv_cache_spec(vllm_config) @@ -395,11 +447,37 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): # K_PE (rope component): [s_len, 1, qk_rope_head_dim] k_pe_full = torch.randn(s_len, 1, qk_rope_head_dim, dtype=dtype, device=device) - # Determine if this is decode or prefill + # Determine if this sequence uses the decode pipeline or prefill + # pipeline for each backend + # NOTE: For spec decode tests with uniform query_len > 1, backends that + # support spec 
decode (FLASH_ATTN_MLA with varlen support, FLASHMLA with + # uniform support) will use the decode pipeline (MQA-style), while + # backends that only support single-token queries will use the prefill + # pipeline (MHA-style). This ensures the reference implementation + # matches each backend's actual decode/prefill pipeline path. is_decode = [] - for i, backend in enumerate(BACKENDS_TO_TEST): + for backend_idx, backend in enumerate(BACKENDS_TO_TEST): builder_cls, _ = try_get_attention_backend(backend) - is_decode.append(q_len <= builder_cls.reorder_batch_threshold) + if is_spec_decode_test: + query_len_support = getattr( + builder_cls, "query_len_support", QueryLenSupport.SINGLE_ONLY + ) + supports_spec = query_len_support != QueryLenSupport.SINGLE_ONLY + is_decode.append(supports_spec) + else: + threshold = getattr(builder_cls, "reorder_batch_threshold", None) + query_len_support = getattr( + builder_cls, "query_len_support", QueryLenSupport.SINGLE_ONLY + ) + within_threshold = q_len <= threshold if threshold else False + if ( + within_threshold + and query_len_support == QueryLenSupport.UNIFORM + and i > 0 + ): + first_q_len = query_lens[0] + within_threshold = q_len == first_q_len + is_decode.append(within_threshold) # Split q into nope and rope components q_nope, q_pe = q_c.split([qk_nope_head_dim, qk_rope_head_dim], dim=-1) @@ -478,11 +556,11 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): sdpa_out_i_prefill = sdpa_out_i_prefill.transpose(1, 2).squeeze(0) sdpa_out_i_prefill = sdpa_out_i_prefill.flatten(start_dim=-2) - for i, backend in enumerate(BACKENDS_TO_TEST): - if is_decode[i]: - all_sdpa_outputs[i].append(sdpa_out_i_decode) + for backend_idx, backend in enumerate(BACKENDS_TO_TEST): + if is_decode[backend_idx]: + all_sdpa_outputs[backend_idx].append(sdpa_out_i_decode) else: - all_sdpa_outputs[i].append(sdpa_out_i_prefill) + all_sdpa_outputs[backend_idx].append(sdpa_out_i_prefill) # Inputs for vLLM MLA backends are just the new tokens all_q_vllm.append(q_c) @@ -497,9 +575,9 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): query_vllm = torch.cat(all_q_vllm, dim=0) kv_c_vllm = torch.cat(all_kv_c_vllm, dim=0) k_pe_vllm = torch.cat(all_k_pe_vllm, dim=0) - sdpa_outputs = [] - for i, backend in enumerate(BACKENDS_TO_TEST): - sdpa_outputs.append(torch.cat(all_sdpa_outputs[i], dim=0)) + sdpa_outputs = {} + for backend_idx, backend in enumerate(BACKENDS_TO_TEST): + sdpa_outputs[backend] = torch.cat(all_sdpa_outputs[backend_idx], dim=0) # Create mock kv_b_proj using the same weights as reference implementation from vllm.model_executor.layers.linear import ColumnParallelLinear @@ -516,7 +594,7 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): kv_b_proj_weight = kv_b_proj_weight.view( kv_lora_rank, num_q_heads * (qk_nope_head_dim + v_head_dim) ) - mock_kv_b_proj.weight = torch.nn.Parameter(kv_b_proj_weight.T) + mock_kv_b_proj.weight = torch.nn.Parameter(kv_b_proj_weight.T, requires_grad=False) # Create metadata using original batch spec common_attn_metadata = create_common_attn_metadata( @@ -537,7 +615,11 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): ) # 4. 
Run vLLM backends and compare - for i, backend_name in enumerate(BACKENDS_TO_TEST): + for backend_idx, backend_name in enumerate(BACKENDS_TO_TEST): + # Skip backends that don't support spec decode for spec decode tests + if is_spec_decode_test and backend_name not in spec_decode_backends: + continue + backend_output = run_attention_backend( backend_name, kv_cache_spec, @@ -556,14 +638,17 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): mock_kv_b_proj, ) + # Use backend_idx to get the correct SDPA output for this backend + expected_output = sdpa_outputs[backend_name] + # Check shape and dtype consistency - assert backend_output.shape == sdpa_outputs[i].shape, ( + assert backend_output.shape == expected_output.shape, ( f"[{backend_name}] shape {backend_output.shape} != " - f"SDPA shape {sdpa_outputs[i].shape}" + f"SDPA shape {expected_output.shape}" ) - assert backend_output.dtype == sdpa_outputs[i].dtype, ( + assert backend_output.dtype == expected_output.dtype, ( f"[{backend_name}] dtype {backend_output.dtype} != " - f"SDPA dtype {sdpa_outputs[i].dtype}" + f"SDPA dtype {expected_output.dtype}" ) assert torch.isfinite(backend_output).all(), ( @@ -574,12 +659,12 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): rtol = 1e-2 atol = 5e-1 - max_diff = torch.max(torch.abs(backend_output - sdpa_outputs[i])).item() + max_diff = torch.max(torch.abs(backend_output - expected_output)).item() max_rel_diff = torch.max( - torch.abs(backend_output - sdpa_outputs[i]) / torch.abs(sdpa_outputs[i]) + torch.abs(backend_output - expected_output) / torch.abs(expected_output) ).item() all_close = torch.allclose( - backend_output, sdpa_outputs[i], rtol=rtol, atol=atol + backend_output, expected_output, rtol=rtol, atol=atol ) assert all_close, ( diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 76408fba2e169..aaac2deb12ac2 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -30,7 +30,6 @@ from vllm.v1.kv_cache_interface import ( from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager -from vllm.v1.structured_output.request import StructuredOutputRequest from .utils import EOS_TOKEN_ID, create_requests, create_scheduler @@ -335,10 +334,10 @@ def test_stop_via_update_from_output(): requests[0].request_id: [], requests[1].request_id: [10], }, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -383,10 +382,10 @@ def test_stop_via_update_from_output(): requests[0].request_id: [10, 42], requests[1].request_id: [13], }, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -429,10 +428,10 @@ def test_stop_via_update_from_output(): requests[0].request_id: [10, 11], requests[1].request_id: [], }, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -470,10 +469,10 @@ def test_stop_via_update_from_output(): total_num_scheduled_tokens=3, scheduled_encoder_inputs={}, scheduled_spec_decode_tokens={requests[0].request_id: 
[EOS_TOKEN_ID, 10]}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -1941,7 +1940,6 @@ def test_schedule_skip_tokenizer_init_structured_output_request(): sampling_params=sampling_params, pooling_params=None, eos_token_id=EOS_TOKEN_ID, - structured_output_request=StructuredOutputRequest(sampling_params), ) scheduler.add_request(request) output = scheduler.schedule() diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py index 59841a446db3e..02fa27e3f05f7 100644 --- a/tests/v1/cudagraph/test_cudagraph_dispatch.py +++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py @@ -11,7 +11,7 @@ from vllm.compilation.cuda_graph import CUDAGraphWrapper from vllm.compilation.monitor import set_cudagraph_capturing_enabled from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, ParallelConfig, SchedulerConfig, @@ -42,7 +42,7 @@ def _create_vllm_config( mock_config.parallel_config = ParallelConfig() # Mimic the behavior of VllmConfig.__post_init__() - if compilation_config.level == CompilationLevel.PIECEWISE: + if compilation_config.mode == CompilationMode.VLLM_COMPILE: compilation_config.set_splitting_ops_for_v1() return mock_config @@ -50,23 +50,23 @@ def _create_vllm_config( class TestCudagraphDispatcher: @pytest.mark.parametrize( - "case_id,cudagraph_mode_str,compilation_level", + "case_id,cudagraph_mode_str,compilation_mode", [ # Test case 0: Full CG for mixed batches, no separate routine - (0, "FULL", CompilationLevel.NO_COMPILATION), + (0, "FULL", CompilationMode.NONE), # Test case 1: Full CG for uniform batches, piecewise for mixed - (1, "FULL_AND_PIECEWISE", CompilationLevel.NO_COMPILATION), + (1, "FULL_AND_PIECEWISE", CompilationMode.NONE), # Test case 2: Full CG for uniform batches, no CG for mixed - (2, "FULL_DECODE_ONLY", CompilationLevel.NO_COMPILATION), - # Test case 3: Piecewise for all - (3, "PIECEWISE", CompilationLevel.PIECEWISE), + (2, "FULL_DECODE_ONLY", CompilationMode.NONE), + # Test case 3: PIECEWISE for all + (3, "PIECEWISE", CompilationMode.VLLM_COMPILE), ], ) - def test_dispatcher(self, cudagraph_mode_str, compilation_level): + def test_dispatcher(self, cudagraph_mode_str, compilation_mode): # Setup dispatcher comp_config = CompilationConfig( cudagraph_mode=cudagraph_mode_str, - level=compilation_level, + mode=compilation_mode, cudagraph_capture_sizes=[1, 8], ) @@ -242,7 +242,7 @@ class TestCudagraphIntegration: def setup_method(self): # only FULL mode for non-uniform batches self.comp_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, cudagraph_mode="FULL", cudagraph_capture_sizes=[10, 20], ) diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py index 8c8148ae20948..818ae1d7ba677 100644 --- a/tests/v1/cudagraph/test_cudagraph_mode.py +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -10,7 +10,7 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear from tests.v1.attention.utils import full_cg_backend_configs as backend_configs from vllm import LLM -from vllm.config import CompilationConfig +from vllm.config import CompilationConfig, CompilationMode from vllm.platforms import current_platform @@ -73,7 +73,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte gpu_memory_utilization=0.45, 
max_model_len=1024, compilation_config=CompilationConfig( - level=3, cudagraph_mode=cudagraph_mode + mode=CompilationMode.VLLM_COMPILE, cudagraph_mode=cudagraph_mode ), ) llm.generate(["Hello, my name is"] * 10) @@ -90,32 +90,27 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte ) -# test cudagraph_mode with different compilation level. -# (backend_name, cudagraph_mode, compilation_level, supported) +# test cudagraph_mode with different compilation mode. +# (backend_name, cudagraph_mode, compilation_mode, supported) combo_cases_2 = [ - ("FA2", "FULL", 0, True), # no compilation + full cudagraph - ("FA2", "FULL", 3, True), # piecewise compilation + full cudagraph - ("FA2", "PIECEWISE", 0, False), # no compilation + piecewise cudagraph - ("FA2", "PIECEWISE", 3, True), # piecewise compilation + piecewise cudagraph - ( - "FA2", - "FULL_AND_PIECEWISE", - 0, - False, - ), # piecewise cudagraph not supported without piecewise compilation - ("FA2", "FULL_AND_PIECEWISE", 3, True), - ("FA2", "FULL_DECODE_ONLY", 0, True), - ("FA2", "FULL_DECODE_ONLY", 3, True), - ("FA2", "NONE", 0, True), # no compilation + no cudagraph - ("FA2", "NONE", 3, True), # piecewise compilation + no cudagraph + ("FA2", "FULL", CompilationMode.NONE, True), + ("FA2", "FULL", CompilationMode.VLLM_COMPILE, True), + ("FA2", "PIECEWISE", CompilationMode.NONE, False), + ("FA2", "PIECEWISE", CompilationMode.VLLM_COMPILE, True), + ("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, False), + ("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True), + ("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True), + ("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True), + ("FA2", "NONE", CompilationMode.NONE, True), + ("FA2", "NONE", CompilationMode.VLLM_COMPILE, True), ] @pytest.mark.parametrize( - "backend_name,cudagraph_mode,compilation_level,supported", combo_cases_2 + "backend_name,cudagraph_mode,compilation_mode,supported", combo_cases_2 ) def test_cudagraph_compilation_combo(combo_case): - backend_name, cudagraph_mode, compilation_level, supported = combo_case + backend_name, cudagraph_mode, compilation_mode, supported = combo_case env_vars = backend_configs[backend_name].env_vars @@ -130,7 +125,7 @@ def test_cudagraph_compilation_combo(combo_case): gpu_memory_utilization=0.45, max_model_len=1024, compilation_config=CompilationConfig( - level=compilation_level, cudagraph_mode=cudagraph_mode + mode=compilation_mode, cudagraph_mode=cudagraph_mode ), ) llm.generate(["Hello, my name is"] * 10) diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py index 89e5f26ac627f..f2c6d1c1fd1a4 100644 --- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py +++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py @@ -7,7 +7,7 @@ import pytest import torch from vllm import LLM, SamplingParams -from vllm.config import CompilationConfig, CompilationLevel +from vllm.config import CompilationConfig, CompilationMode from vllm.distributed import cleanup_dist_env_and_memory from ...utils import fork_new_process_for_each_test @@ -75,9 +75,9 @@ def test_kv_sharing_fast_prefill( # This allows vLLM compilation backend to handle allocating and # managing buffers for cudagraph cudagraph_copy_inputs=True, - level=CompilationLevel.PIECEWISE + mode=CompilationMode.VLLM_COMPILE if not enforce_eager - else CompilationLevel.NO_COMPILATION, + else CompilationMode.NONE, ) with monkeypatch.context() as m: diff --git a/tests/v1/e2e/test_pooling_chunked_prefill.py 
b/tests/v1/e2e/test_pooling_chunked_prefill.py new file mode 100644 index 0000000000000..a196e359920de --- /dev/null +++ b/tests/v1/e2e/test_pooling_chunked_prefill.py @@ -0,0 +1,167 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch.nn as nn + +from vllm.platforms import current_platform + +prompt = """ +Generals gathered in their masses +Just like witches at black masses +Evil minds that plot destruction +Sorcerer of death's construction +In the fields, the bodies burning +As the war machine keeps turning +Death and hatred to mankind +Poisoning their brainwashed minds +Oh, Lord, yeah + +Politicians hide themselves away +They only started the war +Why should they go out to fight? +They leave that all to the poor, yeah +Time will tell on their power minds +Making war just for fun +Treating people just like pawns in chess +Wait till their judgment day comes, yeah + +Now, in darkness, world stops turning +Ashes where their bodies burning +No more war pigs have the power +Hand of God has struck the hour +Day of Judgment, God is calling +On their knees, the war pigs crawling +Begging mercies for their sins +Satan, laughing, spreads his wings +Oh, Lord, yeah +""" + + +class WrapperPooler(nn.Module): + def __init__(self, pooler): + super().__init__() + self.pooler = pooler + self.chunks = [] + + def get_pooling_updates(self, task): + return self.pooler.get_pooling_updates(task) + + def forward( + self, + hidden_states, + pooling_metadata, + ): + self.chunks.append(hidden_states.shape[0]) + return self.pooler(hidden_states, pooling_metadata) + + +def inject_pooler(self): + model = self.get_model() + wrapper = WrapperPooler(model.pooler) + model.pooler = wrapper + + +def retrieve_chunks(self): + model = self.get_model() + chunks = model.pooler.chunks + model.pooler.chunks = [] + return chunks + + +@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available") +def test_pooling_chunked_prefill(vllm_runner, monkeypatch): + """Test chunked prefill for pooling models with LastPool.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + model_id = "Qwen/Qwen3-Embedding-0.6B" + + chunk_size = 10 + + # Set chunking parameters to force chunked prefill + # Note: Chunked prefill is automatically handled by vLLM + # internally based on the model size and prompt + with vllm_runner( + model_id, + runner="pooling", + long_prefill_token_threshold=chunk_size, + tensor_parallel_size=1, + enforce_eager=True, + enable_chunked_prefill=True, + ) as llm: + llm.get_llm().llm_engine.collective_rpc(inject_pooler) + + tokenizer = llm.get_llm().get_tokenizer() + tokens = tokenizer(prompt)["input_ids"] + prompt_len = len(tokens) + full_chunks, last_chunk = divmod(prompt_len, chunk_size) + expected_chunks = [chunk_size] * full_chunks + if last_chunk: + expected_chunks.append(last_chunk) + llm.embed([prompt]) + chunks = llm.get_llm().llm_engine.collective_rpc(retrieve_chunks)[0] + + # Check that PoolerWrapper was called and chunks were received + assert len(chunks) > 1 + assert chunks == expected_chunks + + # Disable chunked prefill + with vllm_runner( + model_id, + runner="pooling", + tensor_parallel_size=1, + enforce_eager=True, + ) as llm: + llm.get_llm().llm_engine.collective_rpc(inject_pooler) + llm.embed([prompt]) + chunks = llm.get_llm().llm_engine.collective_rpc(retrieve_chunks)[0] + + # Check that PoolerWrapper was called and no chunks were received + assert len(chunks) == 
1 + assert chunks[0] == prompt_len + + +@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available") +def test_pooling_prefix_cache(vllm_runner, monkeypatch): + """Test prefix caching for pooling models with LastPool.""" + + verses = prompt.split("\n\n") + + with monkeypatch.context() as m: + m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + model_id = "Qwen/Qwen3-Embedding-0.6B" + + with vllm_runner( + model_id, + runner="pooling", + enable_prefix_caching=True, + tensor_parallel_size=1, + enforce_eager=True, + ) as llm: + llm.get_llm().llm_engine.collective_rpc(inject_pooler) + tokenizer = llm.get_llm().get_tokenizer() + + prompt1 = "\n\n".join([verses[0], verses[1]]) + prompt2 = "\n\n".join([verses[0], verses[2]]) + tokens1 = tokenizer(prompt1)["input_ids"] + tokens2 = tokenizer(prompt2)["input_ids"] + prompt1_len = len(tokens1) + prompt2_len = len(tokens2) + + llm.embed([prompt1]) + chunks = llm.get_llm().llm_engine.collective_rpc(retrieve_chunks)[0] + + assert len(chunks) == 1 + assert chunks[0] == prompt1_len + + llm.embed([prompt2]) + chunks = llm.get_llm().llm_engine.collective_rpc(retrieve_chunks)[0] + + assert len(chunks) == 1 + assert chunks[0] <= prompt1_len + assert chunks[0] < prompt2_len + + cache_config = llm.get_llm().llm_engine.cache_config + print(f"{cache_config=}") + # Prefixes are cached in blocks + assert (prompt2_len - chunks[0]) % cache_config.block_size == 0 diff --git a/tests/v1/engine/conftest.py b/tests/v1/engine/conftest.py index c5c5d35b83c3e..283a76dab6723 100644 --- a/tests/v1/engine/conftest.py +++ b/tests/v1/engine/conftest.py @@ -6,6 +6,7 @@ import torch from transformers import AutoTokenizer from tests.v1.engine.utils import ( + FULL_STRINGS, NUM_PROMPT_LOGPROBS_UNDER_TEST, NUM_SAMPLE_LOGPROBS_UNDER_TEST, PROMPT_LEN, @@ -18,8 +19,6 @@ from vllm.engine.arg_utils import EngineArgs from ...distributed.conftest import publisher_config, random_port # noqa: F401 -from tests.v1.engine.utils import FULL_STRINGS # isort: skip - EngineCoreSampleLogprobsType = list[tuple[torch.Tensor, torch.Tensor]] EngineCorePromptLogprobsType = tuple[torch.Tensor, torch.Tensor] diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 8f715c085b5d1..b9fa553142781 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -17,7 +17,12 @@ from vllm.platforms import current_platform from vllm.sampling_params import RequestOutputKind from vllm.utils import set_default_torch_num_threads from vllm.v1.engine.async_llm import AsyncLLM -from vllm.v1.metrics.loggers import LoggingStatLogger +from vllm.v1.metrics.loggers import ( + AggregatedLoggingStatLogger, + LoggingStatLogger, + PerEngineStatLoggerAdapter, + PrometheusStatLogger, +) if not current_platform.is_cuda(): pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True) @@ -384,6 +389,12 @@ class MockLoggingStatLogger(LoggingStatLogger): self.log = MagicMock() +class MockAggregatedStatLogger(AggregatedLoggingStatLogger): + def __init__(self, vllm_config: VllmConfig, engine_indexes: list[int]): + super().__init__(vllm_config, engine_indexes) + self.log = MagicMock() + + @pytest.mark.asyncio async def test_customize_loggers(monkeypatch): """Test that we can customize the loggers.
@@ -401,10 +412,45 @@ async def test_customize_loggers(monkeypatch): await engine.do_log_stats() - stat_loggers = engine.logger_manager.per_engine_logger_dict - assert len(stat_loggers) == 1 - assert len(stat_loggers[0]) == 2 # LoggingStatLogger + MockLoggingStatLogger - stat_loggers[0][0].log.assert_called_once() + stat_loggers = engine.logger_manager.stat_loggers + assert ( + len(stat_loggers) == 3 + ) # MockLoggingStatLogger + LoggingStatLogger + PrometheusStatLogger + print(f"{stat_loggers=}") + stat_loggers[0].per_engine_stat_loggers[0].log.assert_called_once() + assert isinstance(stat_loggers[1], PerEngineStatLoggerAdapter) + assert isinstance(stat_loggers[1].per_engine_stat_loggers[0], LoggingStatLogger) + assert isinstance(stat_loggers[2], PrometheusStatLogger) + + +@pytest.mark.asyncio +async def test_customize_aggregated_loggers(monkeypatch): + """Test that we can customize the aggregated loggers. + If a customized logger is provided at the init, it should + be added to the default loggers. + """ + + with monkeypatch.context() as m, ExitStack() as after: + m.setenv("VLLM_USE_V1", "1") + + with set_default_torch_num_threads(1): + engine = AsyncLLM.from_engine_args( + TEXT_ENGINE_ARGS, + stat_loggers=[MockLoggingStatLogger, MockAggregatedStatLogger], + ) + after.callback(engine.shutdown) + + await engine.do_log_stats() + + stat_loggers = engine.logger_manager.stat_loggers + assert len(stat_loggers) == 4 + # MockLoggingStatLogger + MockAggregatedStatLogger + # + LoggingStatLogger + PrometheusStatLogger + stat_loggers[0].per_engine_stat_loggers[0].log.assert_called_once() + stat_loggers[1].log.assert_called_once() + assert isinstance(stat_loggers[2], PerEngineStatLoggerAdapter) + assert isinstance(stat_loggers[2].per_engine_stat_loggers[0], LoggingStatLogger) + assert isinstance(stat_loggers[3], PrometheusStatLogger) @pytest.mark.asyncio(scope="module") diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index 3bf722900df37..ed6154462bb2b 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -19,11 +19,18 @@ done echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE" +DECODER_KV_LAYOUT=${DECODER_KV_LAYOUT:-"HND"} # Default to HND, optional NHD +if [[ "$DECODER_KV_LAYOUT" == "NHD" ]]; then + KV_CONFIG_HETERO_LAYOUT=',"enable_permute_local_kv":"True"' +else + KV_CONFIG_HETERO_LAYOUT='' +fi + # Build the kv-transfer-config once if [[ "$KV_BUFFER_DEVICE" == "cuda" ]]; then - KV_CONFIG='{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + KV_CONFIG='{"kv_connector":"NixlConnector","kv_role":"kv_both"'${KV_CONFIG_HETERO_LAYOUT}'}' else - KV_CONFIG="{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"$KV_BUFFER_DEVICE\"}" + KV_CONFIG="{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"$KV_BUFFER_DEVICE\""${KV_CONFIG_HETERO_LAYOUT}"}" fi # Models to run @@ -117,6 +124,7 @@ run_tests_for_model() { # Build the command with or without model-specific args BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID \ + VLLM_KV_CACHE_LAYOUT='HND' \ UCX_NET_DEVICES=all \ VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT \ vllm serve $model_name \ @@ -157,6 +165,7 @@ run_tests_for_model() { # Build the command with or without model-specific args BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID \ + VLLM_KV_CACHE_LAYOUT=$DECODER_KV_LAYOUT \ UCX_NET_DEVICES=all \
VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT \ vllm serve $model_name \ diff --git a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py index 0bb67b574fa14..b5c8f378be182 100644 --- a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py +++ b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py @@ -26,7 +26,7 @@ def _make_empty_scheduler_output(): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, kv_connector_metadata=SharedStorageConnectorMetadata(), ) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 71f5d4b2b0fd9..869e80a1af88c 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -286,9 +286,12 @@ def test_prompt_less_than_block_size(): class FakeNixlConnectorWorker(NixlConnectorWorker): REMOTE_ENGINE_ID = "remote_engine" - def __init__(self, *args, hand_shake_latency: float = 1.8, **kwargs): + def __init__( + self, *args, hand_shake_latency: float = 1.8, kv_cache_layout="HND", **kwargs + ): super().__init__(*args, **kwargs) self._hand_shake_latency = hand_shake_latency + self.kv_cache_layout = kv_cache_layout def _nixl_handshake( self, host: str, port: int, remote_tp_size: int, expected_engine_id: str @@ -564,10 +567,63 @@ class TestNixlHandshake: # We don't check layout for homogeneous TP and MLA for now, as the # whole block is moved. - worker.add_remote_agent(meta, remote_tp_size=2) + with pytest.raises(RuntimeError): + # mismatched layout is expected to fail + worker.add_remote_agent(meta, remote_tp_size=2) with pytest.raises(AssertionError): worker.add_remote_agent(meta, remote_tp_size=1) + @patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", + FakeNixlWrapper, + ) + def test_handshake_succeed_on_kv_cache_layout_mismatch_with_experimental( + self, dist_init + ): + """ + Verify that adding a remote agent succeeds despite a kv_cache_layout + mismatch when enable_permute_local_kv is enabled. + This test is only relevant for heterogeneous TP.
+ """ + vllm_config = create_vllm_config(enable_permute_local_kv=True) + + # Mock TP world size to 2 to force heterogeneous TP when + # remote_tp_size=1 + with patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.get_tensor_model_parallel_world_size", # noqa: E501 + return_value=2, + ): + # Initialize connector and worker (with fake NIXL wrapper) + connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector.connector_worker = FakeNixlConnectorWorker( + vllm_config, + connector.engine_id, + hand_shake_latency=0, + kv_cache_layout="NHD", + ) + worker = connector.connector_worker + + # Minimal local registration params used by add_remote_agent + worker.slot_size_per_layer = [2048] + worker.block_len_per_layer = [2048 * worker.block_size] + worker.num_blocks = 1 + worker.dst_num_blocks[worker.engine_id] = worker.num_blocks + + # Metadata with different kv_cache_layout than local worker + meta = NixlAgentMetadata( + engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID, + agent_metadata=FakeNixlWrapper.AGENT_METADATA, + kv_caches_base_addr=[0], + num_blocks=1, + # prefill TP=1, decode TP=2, remote block_lens is double to local + block_lens=[i * 2 for i in worker.block_len_per_layer], + attn_backend_name=worker.backend_name, + kv_cache_layout="HND", + ) + + # We don't check layout for homogeneous TP and MLA for now, as the + # whole block is moved. + worker.add_remote_agent(meta, remote_tp_size=1) + # NOTE: resource cleanup in mp backend is a bit finicky, so the order in which # we put here is important. First run ray, it will clean up the resources, then @@ -783,6 +839,75 @@ def test_multi_kv_connector_stats_aggregation(): assert kv_connector_stats["FooConnector"].data["num_foo_transfers"] == 6 +@patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", + FakeNixlWrapper, +) +def test_scheduler_kv_connector_stats_aggregation(): + """Test scheduler and worker KV connector stats aggregation.""" + from vllm.v1.core.sched.output import SchedulerOutput + + scheduler = create_scheduler(create_vllm_config()) + + # Worker stats with transfer metrics + worker_stats = NixlKVConnectorStats() + worker_stats.record_transfer(get_default_xfer_telemetry()) + worker_stats.data["remote_tokens"] = [] + + # Scheduler stats with custom metric (needs dummy transfer to avoid being skipped) + scheduler_stats = NixlKVConnectorStats() + scheduler_stats.data.update( + { # dummy transfer just for testing, to bypass is_empty() check + "transfer_duration": [0], + "post_duration": [0], + "bytes_transferred": [0], + "num_descriptors": [0], + "remote_tokens": [128], + } + ) + + # Mock the scheduler connector's stats method + scheduler.connector.get_kv_connector_stats = lambda: MultiKVConnectorStats( + data={"NixlConnector": scheduler_stats} + ) + + model_output = ModelRunnerOutput( + req_ids=["req_0"], + req_id_to_index={"req_0": 0}, + sampled_token_ids=[[123]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[None], + kv_connector_output=KVConnectorOutput( + kv_connector_stats=MultiKVConnectorStats( + data={"NixlConnector": worker_stats} + ) + ), + ) + scheduler_output = SchedulerOutput( + scheduled_new_reqs=[], + scheduled_cached_reqs=None, + num_scheduled_tokens={"req_0": 1}, + total_num_scheduled_tokens=1, + scheduled_spec_decode_tokens={}, + scheduled_encoder_inputs={}, + num_common_prefix_blocks=[0], + finished_req_ids=set(), + free_encoder_mm_hashes=set(), + structured_output_request_ids={}, + grammar_bitmask=None, + ) + + engine_core_outputs = 
scheduler.update_from_output(scheduler_output, model_output) + + final_stats = next( + iter(engine_core_outputs.values()) + ).scheduler_stats.kv_connector_stats + nixl_stats = final_stats["NixlConnector"] + assert nixl_stats.num_successful_transfers == 2 + assert nixl_stats.data["remote_tokens"] == [128] + + @pytest.mark.parametrize("distributed_executor_backend", ["ray", None]) @patch( "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index b07fd0536a436..e7f505d55e7a4 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -83,6 +83,7 @@ def create_vllm_config( block_size: int = 16, max_model_len: int = 10000, enable_chunked_prefill: bool = True, + enable_permute_local_kv: bool = False, ) -> VllmConfig: """Initialize VllmConfig For Testing.""" scheduler_config = SchedulerConfig( @@ -108,6 +109,7 @@ def create_vllm_config( kv_transfer_config = KVTransferConfig( kv_connector="NixlConnector", kv_role="kv_both", + enable_permute_local_kv=enable_permute_local_kv, ) return VllmConfig( scheduler_config=scheduler_config, diff --git a/tests/v1/metrics/test_engine_logger_apis.py b/tests/v1/metrics/test_engine_logger_apis.py index bf780b1f36adf..6dd5b2b069c09 100644 --- a/tests/v1/metrics/test_engine_logger_apis.py +++ b/tests/v1/metrics/test_engine_logger_apis.py @@ -54,7 +54,7 @@ async def test_async_llm_replace_default_loggers(log_stats_enabled_engine_args): engine = AsyncLLM.from_engine_args( log_stats_enabled_engine_args, stat_loggers=[RayPrometheusStatLogger] ) - assert isinstance(engine.logger_manager.prometheus_logger, RayPrometheusStatLogger) + assert isinstance(engine.logger_manager.stat_loggers[0], RayPrometheusStatLogger) engine.shutdown() @@ -73,9 +73,11 @@ async def test_async_llm_add_to_default_loggers(log_stats_enabled_engine_args): disabled_log_engine_args, stat_loggers=[DummyStatLogger] ) - assert len(engine.logger_manager.per_engine_logger_dict[0]) == 1 + assert len(engine.logger_manager.stat_loggers) == 2 + assert len(engine.logger_manager.stat_loggers[0].per_engine_stat_loggers) == 1 assert isinstance( - engine.logger_manager.per_engine_logger_dict[0][0], DummyStatLogger + engine.logger_manager.stat_loggers[0].per_engine_stat_loggers[0], + DummyStatLogger, ) # log_stats is still True, since custom stat loggers are used diff --git a/tests/v1/tpu/test_topk_topp_sampler.py b/tests/v1/tpu/test_topk_topp_sampler.py index c2fc24442c7cd..c6634395bb167 100644 --- a/tests/v1/tpu/test_topk_topp_sampler.py +++ b/tests/v1/tpu/test_topk_topp_sampler.py @@ -8,10 +8,7 @@ import torch_xla from vllm.platforms import current_platform from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p - -# isort: off from vllm.v1.sample.tpu.sampler import apply_top_k_top_p as apply_top_k_top_p_tpu -# isort: on if not current_platform.is_tpu(): pytest.skip("This test needs a TPU.", allow_module_level=True) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index df9fcdc37fa37..e471174ef6744 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -89,10 +89,10 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), 
free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -168,10 +168,10 @@ def test_update_states_request_finished(model_runner): total_num_scheduled_tokens=0, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids={req_id}, free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -198,10 +198,10 @@ def test_update_states_request_resumed(model_runner): total_num_scheduled_tokens=0, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -225,10 +225,10 @@ def test_update_states_request_resumed(model_runner): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -256,10 +256,10 @@ def test_update_states_no_changes(model_runner): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -291,10 +291,10 @@ def test_update_states_request_unscheduled(model_runner): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 817cd7f10c1c6..fe52f565c8a86 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -146,10 +146,10 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -212,10 +212,10 @@ def test_update_states_request_finished(model_runner, dist_init): total_num_scheduled_tokens=0, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids={req_id}, free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -244,10 +244,10 @@ def test_update_states_request_resumed(model_runner, dist_init): total_num_scheduled_tokens=0, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -273,10 +273,10 @@ def test_update_states_request_resumed(model_runner, dist_init): total_num_scheduled_tokens=1, 
scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -366,10 +366,10 @@ def test_update_states_no_changes(model_runner, dist_init): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -403,10 +403,10 @@ def test_update_states_request_unscheduled(model_runner, dist_init): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index 7fdfdb37a0c0f..a3aa546347255 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -28,6 +28,7 @@ FILES = [ "vllm/assets", "vllm/distributed", "vllm/entrypoints", + "vllm/executor", "vllm/inputs", "vllm/logging_utils", "vllm/multimodal", @@ -44,7 +45,6 @@ SEPARATE_GROUPS = [ "vllm/attention", "vllm/compilation", "vllm/engine", - "vllm/executor", "vllm/inputs", "vllm/lora", "vllm/model_executor", diff --git a/vllm/assets/video.py b/vllm/assets/video.py index a4e67ca0b63e3..277c8ea1bf0d7 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -5,7 +5,6 @@ from dataclasses import dataclass from functools import lru_cache from typing import Any, ClassVar, Literal -import cv2 import numpy as np import numpy.typing as npt from huggingface_hub import hf_hub_download @@ -43,6 +42,8 @@ def download_video_asset(filename: str) -> str: def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: + import cv2 + cap = cv2.VideoCapture(path) if not cap.isOpened(): raise ValueError(f"Could not open video file {path}") @@ -78,6 +79,8 @@ def video_to_pil_images_list(path: str, num_frames: int = -1) -> list[Image.Imag def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]: + import cv2 + cap = cv2.VideoCapture(path) if not cap.isOpened(): raise ValueError(f"Could not open video file {path}") diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 929c3b6a4906b..16c5799f7d0be 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -38,10 +38,6 @@ from vllm.utils import GiB_bytes, direct_register_custom_op logger = init_logger(__name__) USE_XFORMERS_OPS = None -try: - tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe,) -except AttributeError: - tag_cudagraph_unsafe = () # type: ignore[assignment] def check_xformers_availability(): @@ -346,7 +342,7 @@ class Attention(nn.Module, AttentionLayerBase): if self.use_output: output_shape = output_shape if output_shape is not None else query.shape - output = torch.zeros(output_shape, dtype=output_dtype, device=query.device) + output = torch.empty(output_shape, dtype=output_dtype, device=query.device) hidden_size = output_shape[-1] # Reshape the query, key, and value tensors. 
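The torch.zeros -> torch.empty change for the attention output buffer in vllm/attention/layer.py above presumably relies on the backend contract that a buffer handed to an attention implementation with accept_output_buffer support is fully overwritten, so pre-zeroing it is redundant work. A self-contained illustration of that contract follows; the toy kernel, names, and shapes are made up for illustration and are not vLLM's API.

import torch


def toy_kernel_with_output_buffer(query: torch.Tensor, output: torch.Tensor) -> None:
    # Stands in for a backend that accepts an output buffer: it writes every
    # element of `output`, so the buffer's initial contents never leak into
    # the result.
    output.copy_(query * 2.0)


query = torch.randn(8, 64)
out = torch.empty_like(query)  # uninitialized is fine; no zero-fill needed
toy_kernel_with_output_buffer(query, out)
assert torch.equal(out, query * 2.0)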
# NOTE(woosuk): We do this outside the custom op to minimize the @@ -591,6 +587,7 @@ class MLAAttention(nn.Module, AttentionLayerBase): prefix: str = "", use_sparse: bool = False, indexer: object | None = None, + **extra_impl_args, ): super().__init__() self.num_heads = num_heads @@ -643,6 +640,7 @@ class MLAAttention(nn.Module, AttentionLayerBase): v_head_dim=self.v_head_dim, kv_b_proj=kv_b_proj, indexer=indexer, + **extra_impl_args, ) self.use_direct_call = not current_platform.opaque_attention_op() @@ -705,7 +703,7 @@ class MLAAttention(nn.Module, AttentionLayerBase): self.calc_kv_scales(q, kv_c_normed, k_pe) if self.attn_backend.accept_output_buffer: - output = torch.zeros(output_shape, dtype=q.dtype, device=q.device) + output = torch.empty(output_shape, dtype=q.dtype, device=q.device) self.impl.forward( self, q, @@ -722,7 +720,7 @@ class MLAAttention(nn.Module, AttentionLayerBase): ) else: if self.attn_backend.accept_output_buffer: - output = torch.zeros(output_shape, dtype=q.dtype, device=q.device) + output = torch.empty(output_shape, dtype=q.dtype, device=q.device) torch.ops.vllm.unified_mla_attention_with_output( q, kv_c_normed, @@ -879,7 +877,6 @@ direct_register_custom_op( op_name="unified_attention", op_func=unified_attention, fake_impl=unified_attention_fake, - tags=tag_cudagraph_unsafe, ) @@ -931,7 +928,6 @@ direct_register_custom_op( op_func=unified_attention_with_output, mutates_args=["output", "output_block_scale"], fake_impl=unified_attention_with_output_fake, - tags=tag_cudagraph_unsafe, ) diff --git a/vllm/attention/ops/common.py b/vllm/attention/ops/common.py index 1234e1b2e46a8..b6b7ecd2552a7 100644 --- a/vllm/attention/ops/common.py +++ b/vllm/attention/ops/common.py @@ -173,6 +173,7 @@ def cp_lse_ag_out_rs( cp_attn_lse: torch.Tensor, cp_group: GroupCoordinator, ctx: CPTritonContext = None, + return_lse=False, ): """ cp_attn_out: [ B, H, D ] @@ -192,8 +193,15 @@ def cp_lse_ag_out_rs( cp_attn_lse = cp_attn_lse.contiguous() lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses) - out, _ = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx) + out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx) + assert out.is_contiguous() out = cp_group.reduce_scatter(out, dim=1) + + if return_lse: + cp_num_heads = lse.shape[1] // cp_group.world_size + cp_rank = cp_group.rank_in_group + lse = lse[:, cp_num_heads * cp_rank : cp_num_heads * (cp_rank + 1)] + return out, lse return out diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 331d31c1d0e63..20a15bbc31e38 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -572,6 +572,7 @@ class RandomDataset(BenchmarkDataset): # Ensure the lower bound for output length is at least 1 to # prevent sampling 0 tokens. 
output_low = max(output_low, 1) + output_high = max(output_high, 1) if input_low > input_high: raise ValueError( @@ -638,6 +639,112 @@ class RandomDataset(BenchmarkDataset): return prompt, total_input_len, token_mismatch +# ----------------------------------------------------------------------------- +# Random Dataset Implementation (Synthetic Data) +# ----------------------------------------------------------------------------- + + +class RandomDatasetForReranking(RandomDataset): + """ + Random dataset specialized for the needs of scoring: + - Batches of inputs + - Inputs composed of pairs + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + request_id_prefix: str = "", + range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO, + input_len: int = RandomDataset.DEFAULT_INPUT_LEN, + batchsize: int = 1, + is_reranker: bool = True, + **kwargs, + ) -> list[SampleRequest]: + n_sep_tokens = int(is_reranker) + + query_len_param = (input_len // 2) - n_sep_tokens if is_reranker else input_len + + query_lens, _, query_offsets = self.get_sampling_params( + 1, range_ratio, query_len_param, 0, tokenizer + ) + + query_len = int(query_lens[0]) + + if not is_reranker: + assert num_requests > 1 and batchsize > 1 + num_requests -= 1 + batchsize -= 1 + doc_len_param = input_len + else: + doc_len_param = input_len - query_len - n_sep_tokens + + doc_lens, _, doc_offsets = self.get_sampling_params( + num_requests, range_ratio, doc_len_param, 0, tokenizer + ) + vocab_size = tokenizer.vocab_size + + query_prompt, query_input_len, token_mismatch_total = ( + self.generate_token_sequence( + tokenizer=tokenizer, + prefix_token_ids=[], + prefix_len=0, + vocab_size=vocab_size, + input_len=query_len, + offset=int(query_offsets[0]), + index=0, + ) + ) + + requests = [] + for i in range(num_requests): + prompt, total_input_len, token_mismatch = self.generate_token_sequence( # noqa: E501 + tokenizer=tokenizer, + prefix_token_ids=[], + prefix_len=0, + vocab_size=vocab_size, + input_len=int(doc_lens[i]), + offset=int(doc_offsets[i]), + index=i + 1, + ) + token_mismatch_total += token_mismatch + requests.append((prompt, total_input_len)) + + batch_requests = [] + # Create batched requests + for i in range(0, num_requests, batchsize): + batch = requests[i : i + batchsize] + query_contrib = ( + (query_input_len + n_sep_tokens) * len(batch) + if is_reranker + else query_input_len + ) + batch_requests.append( + SampleRequest( + prompt=[query_prompt] + [req[0] for req in batch], + prompt_len=query_contrib + sum(req[1] for req in batch), + expected_output_len=0, + request_id=request_id_prefix + str(i // batchsize), + ) + ) + + if token_mismatch_total != 0: + logger.warning( + "Across all generated prompts, there were %d %s tokens " + "than expected after decoding and re-encoding. 
This is " + "expected due to the imperfect nature of the sampling " + "procedure.", + abs(token_mismatch_total), + "more" if token_mismatch_total > 0 else "fewer", + ) + + return batch_requests + + # ----------------------------------------------------------------------------- # MultiModalDataset Implementation # ----------------------------------------------------------------------------- @@ -1149,6 +1256,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "sonnet", "random", "random-mm", + "random-rerank", "hf", "custom", "prefix_repetition", @@ -1292,6 +1400,14 @@ def add_dataset_parser(parser: FlexibleArgumentParser): default=1, help=("Batch size for random sampling. Only used for embeddings benchmark."), ) + random_group.add_argument( + "--no-reranker", + action="store_true", + help=( + "Whether the model supports reranking natively." + " Only used for reranker benchmark." + ), + ) # random multimodal dataset options random_mm_group = parser.add_argument_group( @@ -1678,6 +1794,19 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: request_id_prefix=args.request_id_prefix, no_oversample=args.no_oversample, ), + "random-rerank": lambda: RandomDatasetForReranking( + random_seed=args.seed, + dataset_path=args.dataset_path, + disable_shuffle=args.disable_shuffle, + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + input_len=args.random_input_len, + range_ratio=args.random_range_ratio, + request_id_prefix=args.request_id_prefix, + batchsize=args.random_batch_size, + is_reranker=not args.no_reranker, + ), "prefix_repetition": lambda: PrefixRepetitionRandomDataset( random_seed=args.seed, dataset_path=args.dataset_path, @@ -2850,13 +2979,14 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset): requests = [] token_mismatch_total = 0 for _ in range(num_prefixes): - prefix_tokens = _generate_exact_length_tokens(prefix_len) + prefix_tokens, prefix_mismatch = _generate_exact_length_tokens(prefix_len) + token_mismatch_total += prefix_mismatch for _ in range(prompts_per_prefix): - suffix_tokens, token_mistmatch = _generate_exact_length_tokens( + suffix_tokens, suffix_mismatch = _generate_exact_length_tokens( suffix_len ) - token_mismatch_total += token_mistmatch + token_mismatch_total += suffix_mismatch combined_tokens = prefix_tokens + suffix_tokens prompt = tokenizer.decode(combined_tokens) prompt_len = len(combined_tokens) diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index 2e5c100a3031d..4f427a31b9ee1 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -64,7 +64,7 @@ class StreamedResponseHandler: class RequestFuncInput: """The input for the request function.""" - prompt: str + prompt: str | list[str] api_url: str prompt_len: int output_len: int @@ -484,7 +484,7 @@ async def async_request_openai_audio( return output -async def _run_openai_embeddings( +async def _run_pooling_request( session: aiohttp.ClientSession, api_url: str, payload: dict[str, Any], @@ -497,7 +497,7 @@ async def _run_openai_embeddings( try: async with session.post(url=api_url, headers=headers, json=payload) as response: if response.status == 200: - output.latency = time.perf_counter() - st + output.ttft = output.latency = time.perf_counter() - st data = await response.json() output.success = True output.generated_text = "" @@ -536,7 +536,43 @@ async def async_request_openai_embeddings( } _update_headers_common(headers, request_func_input) - return await 
_run_openai_embeddings( + return await _run_pooling_request( session, api_url, payload=payload, headers=headers, pbar=pbar, ) + + +async def async_request_vllm_rerank( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: tqdm | None = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + _validate_api_url(api_url, "vLLM score API", "rerank") + + assert ( + isinstance(request_func_input.prompt, list) + and len(request_func_input.prompt) > 1 + ) + + payload = { + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, + "query": request_func_input.prompt[0], + "documents": request_func_input.prompt[1:], + } + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + _update_headers_common(headers, request_func_input) + + return await _run_pooling_request( session, api_url, payload=payload, headers=headers, pbar=pbar, ) @@ -572,7 +608,7 @@ async def async_request_openai_embeddings_chat( } _update_headers_common(headers, request_func_input) - return await _run_openai_embeddings( + return await _run_pooling_request( session, api_url, payload=payload, @@ -685,7 +721,7 @@ async def async_request_infinity_embeddings( } _update_headers_common(headers, request_func_input) - return await _run_openai_embeddings( + return await _run_pooling_request( session, api_url, payload=payload, @@ -722,6 +758,7 @@ ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = { "infinity-embeddings": async_request_infinity_embeddings, "infinity-embeddings-clip": async_request_infinity_embeddings_clip, # (Infinity embedding server does not support vlm2vec) + "vllm-rerank": async_request_vllm_rerank, } OPENAI_COMPATIBLE_BACKENDS = [ diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index c52e384a40023..3c85a1e8fdd9e 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1230,6 +1230,15 @@ def add_cli_args(parser: argparse.ArgumentParser): "the ready check will be skipped.", ) + parser.add_argument( + "--extra-body", + help="A JSON string representing extra body parameters to include " + "in each request. " + 'Example: \'{"chat_template_kwargs":{"enable_thinking":false}}\'', + type=json.loads, + default=None, + ) + def main(args: argparse.Namespace) -> dict[str, Any]: return asyncio.run(main_async(args)) @@ -1330,6 +1339,9 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: else: sampling_params = {} + extra_body = args.extra_body or {} + extra_body = {**sampling_params, **extra_body} + # Avoid GC processing "static" data - reduce pause times. gc.collect() gc.freeze() @@ -1355,7 +1367,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: max_concurrency=args.max_concurrency, lora_modules=args.lora_modules, extra_headers=headers, - extra_body=sampling_params, + extra_body=extra_body, ramp_up_strategy=args.ramp_up_strategy, ramp_up_start_rps=args.ramp_up_start_rps, ramp_up_end_rps=args.ramp_up_end_rps, diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 01c6824ac91f8..ad111a1ebd5be 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -251,7 +251,7 @@ def run_hf( disable_detokenize: bool = False, ) -> float: llm = AutoModelForCausalLM.from_pretrained( - model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code + model, dtype=torch.float16, trust_remote_code=trust_remote_code ) if llm.config.model_type == "llama": # To enable padding in the HF backend.
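The benchmark changes above wire the `random-rerank` dataset to a `vllm-rerank` request function whose JSON body is `{"model": ..., "query": ..., "documents": [...]}`: the first element of `prompt` is treated as the query and the remaining elements as the documents, and `_validate_api_url` only requires the target URL to end in `rerank`. The short sketch below shows what such a request looks like when sent by hand; it mirrors the payload construction in `async_request_vllm_rerank`, but the base URL `http://localhost:8000/rerank`, the bearer token, and the model name are assumptions for illustration rather than values taken from the diff.

# Illustrative sketch: send one rerank request with the same payload shape
# built by async_request_vllm_rerank. URL, token, and model name are assumed.
import json
import urllib.request


def build_rerank_payload(model: str, prompt: list[str]) -> dict:
    # prompt[0] is the query, prompt[1:] are the candidate documents,
    # matching the convention used by RandomDatasetForReranking.
    assert isinstance(prompt, list) and len(prompt) > 1
    return {"model": model, "query": prompt[0], "documents": prompt[1:]}


def post_rerank(api_url: str, payload: dict, token: str = "EMPTY") -> dict:
    request = urllib.request.Request(
        api_url,
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {token}",
        },
    )
    with urllib.request.urlopen(request) as response:
        return json.loads(response.read().decode("utf-8"))


if __name__ == "__main__":
    payload = build_rerank_payload(
        "BAAI/bge-reranker-base",  # placeholder model name
        ["what is vllm?", "vLLM is an inference engine.", "Bananas are yellow."],
    )
    print(post_rerank("http://localhost:8000/rerank", payload))

In the benchmark itself the same payload goes through `_run_pooling_request`, so a rerank request records latency and TTFT in the same way as the embedding backends.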
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index ea1963e5f2cd3..dbd3122cb3d03 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -41,7 +41,7 @@ logger = init_logger(__name__) def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: - if compilation_config.use_inductor: + if compilation_config.backend == "inductor": # Use standalone compile only if requested, version is new enough, # and the symbol actually exists in this PyTorch build. if ( @@ -55,6 +55,10 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: logger.debug("Using InductorAdaptor") return InductorAdaptor() else: + assert compilation_config.backend == "eager", ( + "Custom backends not supported with CompilationMode.VLLM_COMPILE" + ) + logger.debug("Using EagerAdaptor") return EagerAdaptor() @@ -477,7 +481,7 @@ def set_model_tag(tag: str): class VllmBackend: """The compilation backend for `torch.compile` with vLLM. - It is used for compilation level of `CompilationLevel.PIECEWISE`, + It is used for compilation mode of `CompilationMode.VLLM_COMPILE`, where we customize the compilation. The major work of this backend is to split the graph into diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index 1dc8888607f54..7c85c89bcd7ac 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -431,8 +431,15 @@ class AsyncTPPass(VllmPatternMatcherPass): self.dump_patterns(config, self.patterns) - def is_applicable_for_shape(self, shape: int | None) -> bool: - # only do replace for specific shapes + def is_applicable(self, shape: int | None) -> bool: + # This pass is applied on top of the sequence parallelism pass. + # It inherits the same applicability condition as `SequenceParallelismPass`. + # See `SequenceParallelismPass.is_applicable` for more details. + if ( + not self.compilation_config.splitting_ops + or self.compilation_config.use_inductor_graph_partition + ): + return True tp_size = get_tensor_model_parallel_world_size() return shape is not None and shape % tp_size == 0 diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 4553007027e39..e2369a635ad1f 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -575,7 +575,7 @@ class InductorAdaptor(CompilerInterface): Because it is re-entrant, we always set it (even if entering via Dynamo and the context was already entered). We might want to revisit if it - should be set at a different level of compilation. + should be set at a different mode of compilation. This is likely a bug in PyTorch: public APIs should not rely on manually setting up internal contexts. 
But we also rely on non-public diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py index 9e8de831bcb29..20918099f169d 100644 --- a/vllm/compilation/counter.py +++ b/vllm/compilation/counter.py @@ -27,8 +27,8 @@ class CompilationCounter: num_cache_entries_updated: int = 0 # The number of standalone_compile compiled artifacts saved num_compiled_artifacts_saved: int = 0 - # Number of times a model was loaded with CompilationLevel.DYNAMO_AS_IS - dynamo_as_is_count: int = 0 + # Number of times a model was loaded with CompilationMode.STOCK_TORCH_COMPILE + stock_torch_compile_count: int = 0 def clone(self) -> "CompilationCounter": return copy.deepcopy(self) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index fe19d4e851294..20d4681e2c789 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -18,7 +18,7 @@ from torch._dynamo.symbolic_convert import InliningInstructionTranslator import vllm.envs as envs from vllm.compilation.counter import compilation_counter from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import CompilationLevel, VllmConfig, set_current_vllm_config +from vllm.config import CompilationMode, VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.sequence import IntermediateTensors from vllm.utils import resolve_obj_by_qualname, supports_dynamo @@ -233,11 +233,11 @@ def _support_torch_compile( old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) self.vllm_config = vllm_config enable_compile = enable_if is None or enable_if(vllm_config) - # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner + # for CompilationMode.STOCK_TORCH_COMPILE , the upper level model runner # will handle the compilation, so we don't need to do anything here. 
self.do_not_compile = ( - vllm_config.compilation_config.level - in [CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS] + vllm_config.compilation_config.mode + in [CompilationMode.NONE, CompilationMode.STOCK_TORCH_COMPILE] or not supports_dynamo() or _should_ignore_torch_compile(self.__class__) or not enable_compile @@ -247,7 +247,7 @@ def _support_torch_compile( compilation_counter.num_models_seen += 1 TorchCompileWrapperWithCustomDispatcher.__init__( - self, compilation_level=vllm_config.compilation_config.level + self, compilation_mode=vllm_config.compilation_config.mode ) cls.__init__ = __init__ diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index b9ec3cf6c5edb..4b263fa6f5a2b 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -96,7 +96,7 @@ class InductorPass(CustomGraphPass): encoded = json.dumps(dict_, sort_keys=True).encode("utf-8") return hashlib.sha256(encoded).hexdigest() - def is_applicable_for_shape(self, shape: int | None): + def is_applicable(self, shape: int | None): return True diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index d3c437795fabb..1e6d0e79228b0 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -3,7 +3,7 @@ import time -from vllm.config import CompilationConfig, CompilationLevel, VllmConfig +from vllm.config import CompilationConfig, CompilationMode, VllmConfig from vllm.logger import init_logger logger = init_logger(__name__) @@ -18,7 +18,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig): compilation_config: CompilationConfig = vllm_config.compilation_config path = vllm_config.compile_debug_dump_path() - if compilation_config.level == CompilationLevel.PIECEWISE and path: + if compilation_config.mode == CompilationMode.VLLM_COMPILE and path: import depyf path.mkdir(parents=True, exist_ok=True) @@ -29,7 +29,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig): def end_monitoring_torch_compile(vllm_config: VllmConfig): compilation_config: CompilationConfig = vllm_config.compilation_config - if compilation_config.level == CompilationLevel.PIECEWISE: + if compilation_config.mode == CompilationMode.VLLM_COMPILE: logger.info( "torch.compile takes %.2f s in total", compilation_config.compilation_time ) diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py index 5ea1b30860f59..cea4f9a816377 100644 --- a/vllm/compilation/partition_rules.py +++ b/vllm/compilation/partition_rules.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib +import logging from typing import TYPE_CHECKING from torch._library.utils import lookup_op @@ -38,8 +39,16 @@ def resolve_defined_ops(op_names: list[str]) -> list["torch._ops.OpOverload"]: resolved.append(lookup_op(op_name)) except Exception: # Skip operators that don't exist (e.g., model-specific ops) - logger.warning( - "Failed to resolve operator for Inductor partition: %s", op_name + # Do not warn for attention ops, warn for others + # (most likely manually specified) + from vllm.config import CompilationConfig + + logger.log( + logging.DEBUG + if op_name in CompilationConfig._attention_ops + else logging.WARNING, + "Failed to resolve operator for CUDAGraph partition: %s", + op_name, ) continue diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index e323fa1f77349..55fe235e2d2c1 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py 
@@ -71,9 +71,11 @@ class PostGradPassManager(CustomGraphPass): shape = get_pass_context().runtime_shape for pass_ in self.passes: - if pass_.is_applicable_for_shape(shape): + if pass_.is_applicable(shape): pass_(graph) VllmInductorPass.dump_prefix += 1 + else: + logger.debug("Skipping %s with shape %s", pass_, shape) # post-cleanup goes before fix_functionalization # because it requires a functional graph diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py index 8ff530cebd82d..31624a8fdcc0f 100644 --- a/vllm/compilation/sequence_parallelism.py +++ b/vllm/compilation/sequence_parallelism.py @@ -482,7 +482,25 @@ class SequenceParallelismPass(VllmPatternMatcherPass): ).register(self.patterns) self.dump_patterns(config, self.patterns) - def is_applicable_for_shape(self, shape: int | None) -> bool: + def is_applicable(self, shape: int | None) -> bool: + # When sequence parallelism is enabled, the residual tensor from RMSNorm + # needs to be split along the sequence dimension. However, this dimension + # is symbolic during piecewise compilation, and splitting symbolic shapes + # is not supported. + # + # This pass is therefore only applied when the sequence dimension is + # concrete: + # 1. In full-graph compilation mode (no Dynamo splitting ops are used). + # For this case we always pad num_tokens to be a multiple of + # tensor_parallel_size, so there's no need to check shape % tp_size == 0. + # 2. For specific shape provided during compilation (e.g., from + # `compile_sizes`), which must be divisible by the tensor-parallel + # size. + if ( + not self.compilation_config.splitting_ops + or self.compilation_config.use_inductor_graph_partition + ): + return True tp_size = get_tensor_model_parallel_world_size() return shape is not None and shape % tp_size == 0 diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py index ad83e7b3e0c2e..beac928b5d718 100644 --- a/vllm/compilation/vllm_inductor_pass.py +++ b/vllm/compilation/vllm_inductor_pass.py @@ -3,6 +3,7 @@ import functools import operator import time +import weakref from typing import ClassVar import regex as re @@ -28,6 +29,7 @@ class VllmInductorPass(InductorPass): """Keep track of pass index for debug dump ordering.""" def __init__(self, config: VllmConfig): + self.compilation_config = weakref.proxy(config.compilation_config) self.pass_config = config.compilation_config.pass_config self.model_dtype = config.model_config.dtype if config.model_config else None self.device = config.device_config.device if config.device_config else None diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index b4a0d89af0d6d..4b10c85209f63 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -11,7 +11,7 @@ from types import CodeType import torch import vllm.envs as envs -from vllm.config import CompilationLevel, CUDAGraphMode, get_current_vllm_config +from vllm.config import CompilationMode, CUDAGraphMode, get_current_vllm_config from vllm.logger import init_logger logger = init_logger(__name__) @@ -31,7 +31,7 @@ class TorchCompileWrapperWithCustomDispatcher: """ def __init__( - self, compiled_callable: Callable | None = None, compilation_level: int = 0 + self, compiled_callable: Callable | None = None, compilation_mode: int = 0 ): vllm_config = get_current_vllm_config() self.vllm_config = vllm_config @@ -72,7 +72,7 @@ class TorchCompileWrapperWithCustomDispatcher: # subclasses can use this to switch between the custom dispatcher # and the 
default Dynamo guard mechanism. self.use_custom_dispatcher: bool = ( - compilation_level >= CompilationLevel.DYNAMO_ONCE + compilation_mode >= CompilationMode.DYNAMO_TRACE_ONCE ) def aot_compile(self, *args, **kwargs): @@ -85,7 +85,7 @@ class TorchCompileWrapperWithCustomDispatcher: return self.compiled_callable.aot_compile((args, kwargs)) def __call__(self, *args, **kwargs): - """Implement the dispatch logic here, beyond the torch.compile level. + """Implement the dispatch logic here, beyond the torch.compile mode. NOTE: this function can have additional arguments beyond the forward method, for directly dispatching to the compiled code. """ diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 6a0197d044dcd..7f1cc52024205 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -4,7 +4,7 @@ from vllm.config.cache import CacheConfig from vllm.config.compilation import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, PassConfig, ) @@ -49,7 +49,7 @@ __all__ = [ "CacheConfig", # From vllm.config.compilation "CompilationConfig", - "CompilationLevel", + "CompilationMode", "CUDAGraphMode", "PassConfig", # From vllm.config.device diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 657c430049f86..a34fb0bf920c0 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -15,6 +15,7 @@ from pydantic.dataclasses import dataclass from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass from vllm.config.utils import config from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname if TYPE_CHECKING: @@ -25,12 +26,20 @@ else: logger = init_logger(__name__) -class CompilationLevel: - # constants for the levels of the compilation process - NO_COMPILATION = 0 - DYNAMO_AS_IS = 1 - DYNAMO_ONCE = 2 - PIECEWISE = 3 +class CompilationMode: + """The compilation approach used for torch.compile-based compilation of the + model.""" + + NONE = 0 + """No torch.compile compilation is applied, model runs in fully eager pytorch mode. + The model runs as-is.""" + STOCK_TORCH_COMPILE = 1 + """The standard `torch.compile` compilation pipeline.""" + DYNAMO_TRACE_ONCE = 2 + """Single Dynamo trace through the model, avoiding recompilation.""" + VLLM_COMPILE = 3 + """Custom vLLM Inductor-based backend with caching, piecewise compilation, + shape specialization, and custom passes.""" class CUDAGraphMode(enum.Enum): @@ -133,7 +142,7 @@ class CompilationConfig: """Configuration for compilation. It has three parts: - Top-level Compilation control: - - [`level`][vllm.config.CompilationConfig.level] + - [`mode`][vllm.config.CompilationConfig.mode] - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path] - [`cache_dir`][vllm.config.CompilationConfig.cache_dir] - [`backend`][vllm.config.CompilationConfig.backend] @@ -170,14 +179,26 @@ class CompilationConfig: # Top-level Compilation control level: int | None = None - """The level of compilation: + """ + Level is deprecated and will be removed in the next release, + either 0.12.0 or 0.11.2 whichever is soonest. + Please use mode. Currently all levels are mapped to mode. + """ + # Top-level Compilation control + mode: int | None = None + """The compilation approach used for torch.compile-based compilation of the + model. - - None: If None, we will select the default compilation level. - For V1 engine this is 3, for V0 engine this is 0. - - 0: no compilation. 
- - 1: dynamo as is. - - 2: dynamo once. - - 3: piecewise compilation.""" + - None: If None, we will select the default compilation mode. + For V1 engine this is 3. + - 0: NONE: No torch.compile compilation is applied, model runs in fully + eager pytorch mode. The model runs as-is. + - 1: STOCK_TORCH_COMPILE: The standard `torch.compile` compilation pipeline. + - 2: DYNAMO_TRACE_ONCE: Single Dynamo trace through the model, avoiding + recompilation by removing guards. + Requires no dynamic-shape-dependent control-flow. + - 3: VLLM_COMPILE: Custom vLLM Inductor-based backend with caching, + piecewise compilation, shape specialization, and custom passes.""" debug_dump_path: Path | None = None """The path to dump the debug information.""" cache_dir: str = "" @@ -187,16 +208,22 @@ class CompilationConfig: backend: str = "" """The backend for compilation. It needs to be a string: - - "" (empty string): use the default backend. + - "" (empty string): use the default backend ("inductor" on CUDA-alike + platforms). - "eager"/"openxla"/...: use the specified backend registered in PyTorch. - "full.module.name": a qualified name which can be used to import the backend function. We use string to avoid serialization issues when using compilation in a - distributed setting. When the compilation level is 1 or 2, the backend is + distributed setting. When the compilation mode is 1 or 2, the backend is used for the compilation directly (it sees the whole graph). When the - compilation level is 3, the backend is used for the piecewise compilation - (it sees a part of the graph).""" + compilation mode is 3, the backend is used for the piecewise compilation + (it sees a part of the graph). The backend can not be custom for compilation + mode 3, i.e. the backend must be either eager or inductor. Furthermore, + compilation is only piecewise if splitting ops is set accordingly and + use_inductor_graph_partition is off. Note that the default options for + splitting ops are sufficient for piecewise compilation. + """ custom_ops: list[str] = field(default_factory=list) """Fine-grained control over which custom ops to enable/disable. Use 'all' to enable all, 'none' to disable all. Also specify a list of custom op @@ -207,7 +234,7 @@ class CompilationConfig: - 'none,+op1,+op2' to enable only op1 and op2 By default, all custom ops are enabled when running without Inductor and - disabled when running with Inductor: level>=PIECEWISE and use_inductor=True. + disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True. Inductor generates (fused) Triton kernels for disabled custom ops.""" splitting_ops: list[str] | None = None """A list of ops to exclude from cudagraphs, used in piecewise compilation. @@ -229,8 +256,12 @@ class CompilationConfig: If empty list [], no ops are excluded (suitable for full cudagraphs).""" # Inductor capture - use_inductor: bool = True - """Whether to use inductor compilation: + use_inductor: bool | None = None + """ + Whether to use inductor compilation. + + This flag is deprecated and will be removed in the next release 0.12.0. + Please use the 'backend' option instead. - False: inductor compilation is not used. graph runs in eager (custom_ops enabled by default). @@ -238,7 +269,11 @@ class CompilationConfig: One graph for symbolic shape and one graph per size in compile_sizes are compiled using configurations in inductor_compile_config. - This setting is ignored if level