Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2026-03-16 14:17:16 +08:00)

Commit 8935ca208d: Merge branch 'main' into woosuk/test-router

.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml (new file, +12)

@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.419
+  - name: "exact_match,flexible-extract"
+    value: 0.416
+limit: 1000
+num_fewshot: 5

@@ -0,0 +1,11 @@
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    value: 0.90
+limit: 100
+num_fewshot: 0

@@ -0,0 +1,11 @@
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+backend: "vllm-vlm"
+tasks:
+- name: "mmlu_pro"
+  metrics:
+  - name: "exact_match,custom-extract"
+    value: 0.80
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5

@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+# For vllm script, with -t option (tensor parallel size)
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"

@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
+
+model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    value: 0.855
+limit: 2500
+num_fewshot: 0

.buildkite/lm-eval-harness/configs/models-large-h100.txt (new file, +1)

@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml

@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml

.buildkite/lm-eval-harness/configs/models-mm-small.txt (new file, +1)

@@ -0,0 +1 @@
+Qwen2.5-VL-7B-Instruct.yaml

.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh (new executable file, +44)

@@ -0,0 +1,44 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on chartqa for vllm.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install lm-eval==0.4.9
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on ChartQA using multimodal vllm."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our correctness tests in vllm's CI."
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -l    - limit number of samples to run"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:l:t:" OPT; do
+  case ${OPT} in
+    m )
+        MODEL="$OPTARG"
+        ;;
+    l )
+        LIMIT="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm-vlm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
+  --tasks chartqa \
+  --batch_size auto \
+  --apply_chat_template \
+  --limit $LIMIT

.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh (mode changed from normal file to executable file, 0 lines changed)

@@ -0,0 +1,50 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
+# We use this for fp8, which HF does not support.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our automated nm-test-accuracy workflow"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -l    - limit number of samples to run"
+    echo "  -f    - number of fewshot samples to use"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:b:l:f:t:" OPT; do
+  case ${OPT} in
+    m )
+        MODEL="$OPTARG"
+        ;;
+    b )
+        BATCH_SIZE="$OPTARG"
+        ;;
+    l )
+        LIMIT="$OPTARG"
+        ;;
+    f )
+        FEWSHOT="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
+  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+  --batch_size auto

@@ -19,21 +19,27 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
     max_model_len = eval_config.get("max_model_len", 4096)
+    batch_size = eval_config.get("batch_size", "auto")
+    backend = eval_config.get("backend", "vllm")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len}"
+        f"max_model_len={max_model_len},"
     )
     results = lm_eval.simple_evaluate(
-        model="vllm",
+        model=backend,
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        batch_size="auto",
+        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+        # text models. however, this is regressing measured strict-match for
+        # existing text models in CI, so only apply it for mm.
+        apply_chat_template=backend == "vllm-vlm",
+        batch_size=batch_size,
     )
     return results
 

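For context, the YAML configs added earlier in this change are what feeds this harness. A minimal sketch of the comparison they drive, assuming the `launch_lm_eval` defined above and an illustrative config path (this is not the actual test body):

```python
import yaml

RTOL = 0.08  # relative tolerance, as defined at the top of this file

def check_metrics(results, eval_config):
    # Compare each expected metric from a YAML config (like the ones added
    # above) against the value lm_eval.simple_evaluate() reported.
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            measured = results["results"][task["name"]][metric["name"]]
            assert abs(measured - metric["value"]) <= RTOL * metric["value"]

with open("configs/Meta-Llama-3-8B-QQQ.yaml") as f:  # illustrative path
    eval_config = yaml.safe_load(f)
check_metrics(launch_lm_eval(eval_config, tp_size=1), eval_config)
```
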
@@ -63,7 +63,7 @@ steps:
 
 - label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
   timeout_in_minutes: 10
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:

@@ -353,7 +353,7 @@ steps:
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 
 - label: V1 Test others (CPU) # 5 mins
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:

@@ -459,6 +459,7 @@ steps:
   - pytest -v -s compile/test_fusion_all_reduce.py
   - pytest -v -s compile/test_decorator.py
   - pytest -v -s compile/test_noop_elimination.py
+  - pytest -v -s compile/test_aot_compile.py
 
 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30

@@ -487,14 +488,14 @@ steps:
 
 - label: Kernels Core Operation Test # 48min
   timeout_in_minutes: 75
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
   - csrc/
   - tests/kernels/core
   commands:
-  - pytest -v -s kernels/core
+  - pytest -v -s kernels/core kernels/test_top_k_per_row.py
 
 - label: Kernels Attention Test %N # 23min
   timeout_in_minutes: 35

@@ -603,7 +604,8 @@ steps:
   # since torchao nightly is only compatible with torch nightly currently
   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
   # we can only upgrade after this is resolved
-  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
+  # TODO(jerryzh168): resolve the above comment
+  - uv pip install --system torchao==0.13.0
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
 
 - label: LM Eval Small Models # 53min

@@ -631,7 +633,7 @@ steps:
 
 - label: OpenAI-Compatible Tool Use # 23 min
   timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   fast_check: false

@@ -527,7 +527,8 @@ steps:
   # since torchao nightly is only compatible with torch nightly currently
   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
   # we can only upgrade after this is resolved
-  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
+  # TODO(jerryzh168): resolve the above comment
+  - uv pip install --system torchao==0.13.0
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
 
 - label: LM Eval Small Models # 53min

@@ -733,6 +734,16 @@ steps:
   - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
   - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
 
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
 - label: Multi-Modal Models Test (Extended) 1
   mirror_hardwares: [amdexperimental]
   optional: true

.github/CODEOWNERS (3 lines changed)

@@ -5,9 +5,7 @@
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/model_executor/layers/fused_moe @mgoin
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn

@@ -26,7 +24,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
 
 # vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/attention @LucasWilkinson
 /vllm/v1/attention/backends/flashinfer.py @mgoin
 /vllm/v1/attention/backends/triton_attn.py @tdoublep

@@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
     else:
         ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     block_quant_shape = get_weight_block_size_safety(config)

@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
     topk = config.num_experts_per_tok
 
     hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     use_customized_permute = args.use_customized_permute

@@ -22,10 +22,10 @@ else()
     CONFIGURE_COMMAND ""
     BUILD_COMMAND ""
   )
-  FetchContent_Populate(qutlass)
-  set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
 endif()
 
+FetchContent_Populate(qutlass)
+
 if(NOT qutlass_SOURCE_DIR)
   message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
 endif()

@@ -2,6 +2,7 @@
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
 #include "core/batch_invariant.hpp"
+#include "quantization/vectorization_utils.cuh"
 
 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>
@@ -18,11 +19,22 @@ __global__ void rms_norm_kernel(
     const float epsilon, const int num_tokens, const int hidden_size) {
   __shared__ float s_variance;
   float variance = 0.0f;
+  const scalar_t* input_row = input + blockIdx.x * input_stride;
 
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    const float x = (float)input[blockIdx.x * input_stride + idx];
+  constexpr int VEC_SIZE = 8;
+  auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
+#pragma unroll
+    for (int i = 0; i < VEC_SIZE; ++i) {
+      float x = static_cast<float>(vec.val[i]);
+      variance += x * x;
+    }
+  };
+  auto scalar_op = [&variance](const scalar_t& val) {
+    float x = static_cast<float>(val);
     variance += x * x;
-  }
+  };
+  vllm::vectorize_read_with_alignment<VEC_SIZE>(
+      input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
 
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;

@@ -10,6 +10,7 @@
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
 #include "core/batch_invariant.hpp"
+#include "quantization/vectorization_utils.cuh"
 
 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>
@@ -28,10 +29,22 @@ __global__ void rms_norm_static_fp8_quant_kernel(
   __shared__ float s_variance;
   float variance = 0.0f;
 
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    const float x = (float)input[blockIdx.x * input_stride + idx];
+  const scalar_t* input_row = input + blockIdx.x * input_stride;
+
+  constexpr int VEC_SIZE = 8;
+  auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
+#pragma unroll
+    for (int i = 0; i < VEC_SIZE; ++i) {
+      float x = static_cast<float>(vec.val[i]);
+      variance += x * x;
+    }
+  };
+  auto scalar_op = [&variance](const scalar_t& val) {
+    float x = static_cast<float>(val);
     variance += x * x;
-  }
+  };
+  vllm::vectorize_read_with_alignment<VEC_SIZE>(
+      input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
 
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;

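For intuition about the pattern used in both kernels above: `vectorize_read_with_alignment` processes the bulk of each row as fixed-size vectors and falls back to a scalar loop for the unaligned tail. A plain-Python sketch of that split (the names here are ours, not vLLM's):

```python
VEC_SIZE = 8  # mirrors the constexpr in the kernels above

def sum_of_squares(row):
    """Vectorized-read sketch: whole vectors first, scalar tail second."""
    acc = 0.0
    n_vec = (len(row) // VEC_SIZE) * VEC_SIZE
    for start in range(0, n_vec, VEC_SIZE):  # plays the role of vec_op
        for x in row[start:start + VEC_SIZE]:
            acc += float(x) * float(x)
    for x in row[n_vec:]:                    # plays the role of scalar_op
        acc += float(x) * float(x)
    return acc
```
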
@@ -58,12 +58,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc
 
 ```python
 from vllm import LLM
-from vllm.config import CompilationConfig, CompilationLevel
+from vllm.config import CompilationConfig, CompilationMode
 
 llm = LLM(
     model="meta-llama/Llama-3.1-8B-Instruct",
     compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
+        mode=CompilationMode.VLLM_COMPILE,
         # By default, it goes up to max_num_seqs
         cudagraph_capture_sizes=[1, 2, 4, 8, 16],
     ),

@@ -167,7 +167,7 @@ class AttentionCGSupport(enum.Enum):
     """NO CUDA Graphs support"""
 ```
 
-Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
+Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
 
 The following table lists backends that support full CUDA Graphs at the time of writing.
 

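A loose sketch of the downgrade policy described in that paragraph (illustrative pseudocode under assumed string-valued modes, not vLLM's actual `initialize_cudagraph_capture`):

```python
def resolve_cudagraph_mode(requested: str, min_capability: str) -> str:
    # Downgrade FULL mode when the weakest attention backend cannot keep up,
    # as described in the documentation paragraph above.
    if requested == "FULL":
        if min_capability == "UNIFORM_BATCH":
            return "FULL_AND_PIECEWISE"
        if min_capability == "NEVER":
            return "PIECEWISE"
    return requested
```
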
@@ -202,7 +202,7 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG")
 import vllm
 from vllm.config import CUDAGraphMode
 
-compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
+compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
 model = vllm.LLM(
     model="meta-llama/Llama-3.1-8B-Instruct",
     dtype="auto",

@@ -22,13 +22,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
 from awq import AutoAWQForCausalLM
 from transformers import AutoTokenizer
 
-model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
-quant_path = 'mistral-instruct-v0.2-awq'
-quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
+model_path = "mistralai/Mistral-7B-Instruct-v0.2"
+quant_path = "mistral-instruct-v0.2-awq"
+quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
 
 # Load model
 model = AutoAWQForCausalLM.from_pretrained(
-    model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+    model_path,
+    low_cpu_mem_usage=True,
+    use_cache=False,
 )
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 

@@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from auto_round import AutoRound
 
 model_name = "Qwen/Qwen3-0.6B"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 bits, group_size, sym = 4, 128, True

@@ -34,7 +34,7 @@ llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
     trust_remote_code=True,
-    quantization="bitblas"
+    quantization="bitblas",
 )
 ```
 

@@ -53,6 +53,6 @@ llm = LLM(
     dtype=torch.float16,
     trust_remote_code=True,
     quantization="bitblas",
-    max_model_len=1024
+    max_model_len=1024,
 )
 ```

@@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit"
 llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
-    trust_remote_code=True
+    trust_remote_code=True,
 )
 ```
 

@@ -43,7 +43,7 @@ llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
     trust_remote_code=True,
-    quantization="bitsandbytes"
+    quantization="bitsandbytes",
 )
 ```
 

@@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
@@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio
 
 # Configure the simple PTQ quantization
 recipe = QuantizationModifier(
-    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
+    targets="Linear",
+    scheme="FP8_DYNAMIC",
+    ignore=["lm_head"],
+)
 
 # Apply the quantization algorithm.
 oneshot(model=model, recipe=recipe)

@@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint:
 conversation = [
     {
         "role": "system",
-        "content": "You are a helpful assistant"
+        "content": "You are a helpful assistant",
     },
     {
         "role": "user",
-        "content": "Hello"
+        "content": "Hello",
     },
     {
         "role": "assistant",
-        "content": "Hello! How can I assist you today?"
+        "content": "Hello! How can I assist you today?",
     },
     {
         "role": "user",
@@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
 # Create an LLM.
-llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-          tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+llm = LLM(
+    model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
+    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+)
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.chat(conversation, sampling_params)

@@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
 calibration_dataset = load_dataset(
     "allenai/c4",
     data_files="en/c4-train.00001-of-01024.json.gz",
-    split="train"
+    split="train",
 ).select(range(1024))["text"]
 
 quant_config = QuantizeConfig(bits=4, group_size=128)

@@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```

@@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y
     },
     ignore=["lm_head"],
     update_size=NUM_CALIBRATION_SAMPLES,
-    dampening_frac=0.01
+    dampening_frac=0.01,
 )
 ```
 

@@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```

@@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
 from vllm import LLM, SamplingParams
 
 def main():
-
     model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
-    # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
+
+    # Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
     llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
 
     sampling_params = SamplingParams(temperature=0.8, top_p=0.9)

@@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization:
 from vllm import LLM, SamplingParams
 
 sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
-llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
-          kv_cache_dtype="fp8",
-          calculate_kv_scales=True)
+llm = LLM(
+    model="meta-llama/Llama-2-7b-chat-hf",
+    kv_cache_dtype="fp8",
+    calculate_kv_scales=True,
+)
 prompt = "London is the capital of"
 out = llm.generate(prompt, sampling_params)[0].outputs[0].text
 print(out)

@@ -80,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models
 
 # Select model and load it
 MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
 # Select calibration dataset

@@ -48,7 +48,9 @@ to fetch model and tokenizer.
 MAX_SEQ_LEN = 512
 
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    dtype="auto",
 )
 model.eval()
 

@@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
 dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
 text_data = dataset["text"][:NUM_CALIBRATION_DATA]
 
-tokenized_outputs = tokenizer(text_data, return_tensors="pt",
-                              padding=True, truncation=True, max_length=MAX_SEQ_LEN)
-calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
-                              batch_size=BATCH_SIZE, drop_last=True)
+tokenized_outputs = tokenizer(
+    text_data,
+    return_tensors="pt",
+    padding=True,
+    truncation=True,
+    max_length=MAX_SEQ_LEN,
+)
+calib_dataloader = DataLoader(
+    tokenized_outputs['input_ids'],
+    batch_size=BATCH_SIZE,
+    drop_last=True,
+)
 ```
 
 ### 3. Set the Quantization Configuration

@@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
     load_quant_algo_config_from_file)
 
 # Define fp8/per-tensor/static spec.
-FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
-                                           is_dynamic=False).to_quantization_spec()
+FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
+    observer_method="min_max",
+    is_dynamic=False,
+).to_quantization_spec()
 
 # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
-global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
-                                         weight=FP8_PER_TENSOR_SPEC)
+global_quant_config = QuantizationConfig(
+    input_tensors=FP8_PER_TENSOR_SPEC,
+    weight=FP8_PER_TENSOR_SPEC,
+)
 
 # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
 KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
 kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
-kv_cache_quant_config = {name :
-    QuantizationConfig(input_tensors=global_quant_config.input_tensors,
-                       weight=global_quant_config.weight,
-                       output_tensors=KV_CACHE_SPEC)
-    for name in kv_cache_layer_names_for_llama}
+kv_cache_quant_config = {
+    name: QuantizationConfig(
+        input_tensors=global_quant_config.input_tensors,
+        weight=global_quant_config.weight,
+        output_tensors=KV_CACHE_SPEC,
+    )
+    for name in kv_cache_layer_names_for_llama
+}
 layer_quant_config = kv_cache_quant_config.copy()
 
 # Define algorithm config by config file.
-LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
-    'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
+LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json"
 algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
 
 EXCLUDE_LAYERS = ["lm_head"]

@@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
     layer_quant_config=layer_quant_config,
     kv_cache_quant_config=kv_cache_quant_config,
     exclude=EXCLUDE_LAYERS,
-    algo_config=algo_config)
+    algo_config=algo_config,
+)
 ```
 
 ### 4. Quantize the Model and Export

@@ -165,8 +182,11 @@ for more exporting format details.
 EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
 exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
 with torch.no_grad():
-    exporter.export_safetensors_model(freezed_model,
-        quant_config=quant_config, tokenizer=tokenizer)
+    exporter.export_safetensors_model(
+        freezed_model,
+        quant_config=quant_config,
+        tokenizer=tokenizer,
+    )
 ```
 
 ### 5. Evaluation in vLLM

@@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
 # Create an LLM.
-llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
-          kv_cache_dtype='fp8',quantization='quark')
+llm = LLM(
+    model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
+    kv_cache_dtype="fp8",
+    quantization="quark",
+)
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)

@@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
 quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
 quantized_model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    torch_dtype="auto",
+    dtype="auto",
     device_map="auto",
     quantization_config=quantization_config
 )

@@ -11,6 +11,7 @@ vLLM currently supports the following reasoning models:
 | Model Series | Parser Name | Structured Output Support | Tool Calling |
 |--------------|-------------|------------------|-------------|
 | [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
+| [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ |
 | [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
 | [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
 | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
@@ -20,8 +21,9 @@ vLLM currently supports the following reasoning models:
 | [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
 
 !!! note
-    IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
+    IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
     The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
+    DeepSeek-V3.1 tool calling is supported in non-thinking mode.
 
 ## Quickstart
 

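For context, opting into reasoning for these models goes through `chat_template_kwargs`. A sketch using the OpenAI client, where the endpoint and model name are placeholders for a local `vllm serve` deployment:

```python
from openai import OpenAI

# Placeholder endpoint/model for a local `vllm serve` instance.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="ibm-granite/granite-3.2-8b-instruct",
    messages=[{"role": "user", "content": "What is 17 * 24?"}],
    # Reasoning is disabled by default for Granite 3.2 and DeepSeek-V3.1,
    # so it has to be requested explicitly, per the note above.
    extra_body={"chat_template_kwargs": {"thinking": True}},
)
print(response.choices[0].message.content)
```
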
@@ -352,6 +352,16 @@ Supported models:
 
 Flags: `--tool-call-parser qwen3_xml`
 
+### Olmo 3 Models (`olmo3`)
+
+Olmo 3 models output tool calls in a format that is very similar to the one expected by the `pythonic` parser (see below), with a few differences. Each tool call is a pythonic string, but the parallel tool calls are newline-delimited, and the calls are wrapped within XML tags as `<function_calls>..</function_calls>`. In addition, the parser also allows JSON boolean and null literals (`true`, `false`, and `null`) in addition to the pythonic ones (`True`, `False`, and `None`).
+
+Supported models:
+
+* TODO (will be updated after Olmo 3 release)
+
+Flags: `--tool-call-parser olmo3`
+
 ### Models with Pythonic Tool Calls (`pythonic`)
 
 A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.

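To make the Olmo 3 format concrete, here is a hypothetical completion in that shape, with a naive parsing sketch. The tool names and arguments are invented, and a real parser would not rely on blind string replacement for the JSON literals:

```python
import ast

# Hypothetical model output: newline-delimited pythonic calls inside
# <function_calls> tags, using JSON literals (true/null) as allowed above.
output = """<function_calls>
get_weather(city="Seattle", detailed=true)
get_time(city="Seattle", utc=null)
</function_calls>"""

body = output.split("<function_calls>")[1].split("</function_calls>")[0]
# Naively map JSON literals onto their pythonic equivalents before parsing.
for js, py in (("true", "True"), ("false", "False"), ("null", "None")):
    body = body.replace(js, py)

for line in filter(None, (ln.strip() for ln in body.splitlines())):
    call = ast.parse(line, mode="eval").body  # one pythonic call per line
    args = {kw.arg: ast.literal_eval(kw.value) for kw in call.keywords}
    print(call.func.id, args)
```
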
@@ -23,7 +23,46 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
 
---8<-- "docs/getting_started/installation/cpu/build.inc.md:extra-information"
+First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
+
+```bash
+sudo apt-get update -y
+sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof
+sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+```
+
+Second, clone the vLLM project:
+
+```bash
+git clone https://github.com/vllm-project/vllm.git vllm_source
+cd vllm_source
+```
+
+Third, install required dependencies:
+
+```bash
+uv pip install -r requirements/cpu-build.txt --torch-backend cpu
+uv pip install -r requirements/cpu.txt --torch-backend cpu
+```
+
+??? console "pip"
+    ```bash
+    pip install --upgrade pip
+    pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
+    pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+    ```
+
+Finally, build and install vLLM:
+
+```bash
+VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
+```
+
+If you want to develop vLLM, install it in editable mode instead.
+
+```bash
+VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
+```
+
 Testing has been conducted on AWS Graviton3 instances for compatibility.
 

@@ -1,44 +0,0 @@
-# --8<-- [start:extra-information]
-
-First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
-
-```bash
-sudo apt-get update -y
-sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof
-sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
-```
-
-Second, clone the vLLM project:
-
-```bash
-git clone https://github.com/vllm-project/vllm.git vllm_source
-cd vllm_source
-```
-
-Third, install required dependencies:
-
-```bash
-uv pip install -r requirements/cpu-build.txt --torch-backend cpu
-uv pip install -r requirements/cpu.txt --torch-backend cpu
-```
-
-??? console "pip"
-    ```bash
-    pip install --upgrade pip
-    pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
-    pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
-    ```
-
-Finally, build and install vLLM:
-
-```bash
-VLLM_TARGET_DEVICE=cpu python setup.py install
-```
-
-If you want to develop vLLM, install it in editable mode instead.
-
-```bash
-VLLM_TARGET_DEVICE=cpu python setup.py develop
-```
-
-# --8<-- [end:extra-information]

@@ -194,8 +194,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
     api_key=openai_api_key,
     base_url=openai_api_base,
 )
-completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
-                                       prompt="San Francisco is a")
+completion = client.completions.create(
+    model="Qwen/Qwen2.5-1.5B-Instruct",
+    prompt="San Francisco is a",
+)
 print("Completion result:", completion)
 ```
 

@@ -239,7 +241,7 @@ Alternatively, you can use the `openai` Python package:
     messages=[
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": "Tell me a joke."},
-    ]
+    ],
 )
 print("Chat response:", chat_response)
 ```

@@ -22,6 +22,11 @@ sys.modules["vllm._C"] = MagicMock()
 class PydanticMagicMock(MagicMock):
     """`MagicMock` that's able to generate pydantic-core schemas."""
 
+    def __init__(self, *args, **kwargs):
+        name = kwargs.pop("name", None)
+        super().__init__(*args, **kwargs)
+        self.__spec__ = importlib.machinery.ModuleSpec(name, None)
+
     def __get_pydantic_core_schema__(self, source_type, handler):
         return core_schema.any_schema()
 
@@ -42,7 +47,9 @@ def auto_mock(module, attr, max_mocks=50):
             raise e
         except ModuleNotFoundError as e:
             logger.info("Mocking %s for argparse doc generation", e.name)
-            sys.modules[e.name] = PydanticMagicMock()
+            sys.modules[e.name] = PydanticMagicMock(name=e.name)
+        except Exception as e:
+            logger.warning("Failed to import %s.%s: %s", module, attr, e)
 
     raise ImportError(
         f"Failed to import {module}.{attr} after mocking {max_mocks} imports"

@@ -60,7 +60,7 @@ from vllm import LLM
 llm = LLM(
     "s3://my-bucket/vllm/facebook/opt-125m/v1",
     load_format="tensorizer",
-    enable_lora=True
+    enable_lora=True,
 )
 ```
 
@@ -97,6 +97,6 @@ llm = LLM(
     "s3://my-bucket/vllm/facebook/opt-125m/v1",
     load_format="tensorizer",
     enable_lora=True,
-    model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}}
+    model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}},
 )
 ```

@@ -98,15 +98,15 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
 conversation = [
     {
         "role": "system",
-        "content": "You are a helpful assistant"
+        "content": "You are a helpful assistant",
     },
     {
         "role": "user",
-        "content": "Hello"
+        "content": "Hello",
     },
     {
         "role": "assistant",
-        "content": "Hello! How can I assist you today?"
+        "content": "Hello! How can I assist you today?",
     },
     {
         "role": "user",

@@ -130,8 +130,10 @@ It is designed for embedding models and cross-encoder models. Embedding models u
 from vllm import LLM
 
 llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling")
-(output,) = llm.score("What is the capital of France?",
-                      "The capital of Brazil is Brasilia.")
+(output,) = llm.score(
+    "What is the capital of France?",
+    "The capital of Brazil is Brasilia.",
+)
 
 score = output.outputs.score
 print(f"Score: {score}")

@ -209,7 +211,7 @@ For models that support Matryoshka Embeddings but not recognized by vLLM, please
|
|||||||
|
|
||||||
Here is an example to serve a model with Matryoshka Embeddings enabled.
|
Here is an example to serve a model with Matryoshka Embeddings enabled.
|
||||||
|
|
||||||
```text
|
```bash
|
||||||
vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}'
|
vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}'
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -220,11 +222,15 @@ You can change the output dimensions of embedding models that support Matryoshka
|
|||||||
```python
|
```python
|
||||||
from vllm import LLM, PoolingParams
|
from vllm import LLM, PoolingParams
|
||||||
|
|
||||||
llm = LLM(model="jinaai/jina-embeddings-v3",
|
llm = LLM(
|
||||||
runner="pooling",
|
model="jinaai/jina-embeddings-v3",
|
||||||
trust_remote_code=True)
|
runner="pooling",
|
||||||
outputs = llm.embed(["Follow the white rabbit."],
|
trust_remote_code=True,
|
||||||
pooling_params=PoolingParams(dimensions=32))
|
)
|
||||||
|
outputs = llm.embed(
|
||||||
|
["Follow the white rabbit."],
|
||||||
|
pooling_params=PoolingParams(dimensions=32),
|
||||||
|
)
|
||||||
print(outputs[0].outputs)
|
print(outputs[0].outputs)
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -234,13 +240,13 @@ A code example can be found here: <gh-file:examples/offline_inference/pooling/em
|
|||||||
|
|
||||||
Use the following command to start vllm server.
|
Use the following command to start vllm server.
|
||||||
|
|
||||||
```text
|
```bash
|
||||||
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
|
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
|
||||||
```
|
```
|
||||||
|
|
||||||
You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter.
|
You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter.
|
||||||
|
|
||||||
```text
|
```bash
|
||||||
curl http://127.0.0.1:8000/v1/embeddings \
|
curl http://127.0.0.1:8000/v1/embeddings \
|
||||||
-H 'accept: application/json' \
|
-H 'accept: application/json' \
|
||||||
-H 'Content-Type: application/json' \
|
-H 'Content-Type: application/json' \
|
||||||
|
|||||||
@ -278,8 +278,8 @@ https_proxy=http://your.proxy.server:port vllm serve <model_name>
|
|||||||
```python
|
```python
|
||||||
import os
|
import os
|
||||||
|
|
||||||
os.environ['http_proxy'] = 'http://your.proxy.server:port'
|
os.environ["http_proxy"] = "http://your.proxy.server:port"
|
||||||
os.environ['https_proxy'] = 'http://your.proxy.server:port'
|
os.environ["https_proxy"] = "http://your.proxy.server:port"
|
||||||
```
|
```
|
||||||
|
|
||||||
### ModelScope
|
### ModelScope
|
||||||
|
|||||||
47
docs/serving/context_parallel_deployment.md
Normal file
47
docs/serving/context_parallel_deployment.md
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
# Context Parallel Deployment
|
||||||
|
|
||||||
|
Context parallel mainly solves the problem of serving long context requests. As prefill and decode present quite different characteristics and have quite different SLO (service level objectives), we need to implement context parallel separately for them. The major considerations are:
|
||||||
|
|
||||||
|
- For long context prefill, we need to control the TTFT (time to first token) by amortizing the computation time of the prefill across query tokens.
|
||||||
|
- For long context decode, we need more space for KV cache to increase the batchsize (and hence the throughput).
|
||||||
|
|
||||||
|
## Prefill Context Parallel
|
||||||
|
|
||||||
|
During prefill, for a long request with `T` new tokens, we need to compute query/key/value tensors for these new tokens. Say we have `N` GPUs, we can split the request into `N` chunks, and each GPU computes one chunk of the query/key/value tensors.
|
||||||
|
|
||||||
|
Depending on the use case, there're two possible strategies:
|
||||||
|
|
||||||
|
1. Partial query, full key/value: If the request token length is moderately long (we can afford holding the full key/value tensors), and the goal is to accelerate the prefill (and amortize the computation time of the prefill across query tokens), then we can gather the key/value tensors from all GPUs and let each GPU compute the attention output corresponding to the query tokens of its chunk.
|
||||||
|
2. Partial query, partial key/value: If the request token length is too long, we cannot afford holding the full key/value tensors anymore, then we can only compute one chunk of query/key/value tensors for each GPU, and use techniques like [ring-attention](http://arxiv.org/abs/2310.01889) to send/recv key/value tensors chunk by chunk.
|
||||||
|
|
||||||
|
Both approaches are under active development.
|
||||||
|
|
||||||
|
## Decode Context Parallel
|
||||||
|
|
||||||
|
Due to the auto-regressive nature of decoding, every decoding step needs to compute a small amount of query tokens w.r.t. a large number of key/value tokens stored in the paged KV cache. The core of decode context parallel is how to shard the KV cache across GPUs.
|
||||||
|
|
||||||
|
For a model with `H` kv-heads, a request with `T` tokens in the context needs to store `H * T` key/value tensors in the KV cache.
|
||||||
|
|
||||||
|
1. If one GPU can hold them all, and the performance is good enough, then no parallelization is needed.
|
||||||
|
2. If one GPU cannot hold them all, or we want to hold more requests in the KV cache, we can first shard the KV cache along the `H` dimension, that's the plain tensor parallel sharding. It's as simple as adding `-tp <num_gpus>` to the command line.
|
||||||
|
3. Since `H` is limited (determined by the model architecture), when we continue to increase the tensor parallel size, the KV cache for each GPU will be duplicated for `tp_size / H` times. Of course, duplication is not good for efficiency. Then we need to add decode context parallel to further shard the KV cache along the `T` dimension. This is as simple as adding `-dcp <size>` to the command line. Note that `size` does not increase the number of GPUs we need to launch, but just reduces the KV cache duplication. The dcp size should lie in the range of `[1, tp_size/H]`. With larger dcp size, the KV cache duplication is reduced, but the communication overhead increases.
|
||||||
|
|
||||||
|
Theoretically, it is possible to extend the dcp size beyond `tp_size / H` to further shard the KV cache and accelerate the decoding phase. However, since the number of query tokens is limited in decoding, it's unclear what should we do for the remaining `dcp_size - tp_size / H` GPUs for non-attention layers. For the sake of simplicity, dcp size is upper bounded by `tp_size / H`. If you want to further accelerate the decoding phase, you can consider increasing the `tp_size` first, and then increasing the dcp size.
|
||||||
|
|
||||||
|
Note that kv cache can grow during decoding, and the sharding strategy needs to be carefully implemented. We use an interleaving strategy to shard the KV cache along the `T` dimension, so that kv cache for future tokens can be naturally sharded along the `T` dimension. This is proposed by [Chao Hong from Moonshot](https://github.com/youzhedian), and also explained in details in [this paper](http://arxiv.org/abs/2507.07120).
|
||||||
|
|
||||||
|
Case study:
|
||||||
|
|
||||||
|
For DeepSeek-R1, we have 1 kv-head when MLA is enabled. The typical single-node deployment with `-tp 8` causes 8x KV cache duplication. We can consider adding `-dcp 8` to reduce the KV cache duplication.
|
||||||
|
|
||||||
|
For Kimi-K2, the architecture is similar to DeepSeek-R1, but with more parameters. When we deploy it with `-tp 16`, the KV cache duplication is 16x. We can add `-dcp 16` to completely remove the KV cache duplication, at the cost of more communication overhead. We can also add `-dcp 8` to reduce the KV cache duplication to 2x. Although it still duplicates the KV cache twice, the communication overhead is smaller since the DCP communication only happens inside one node.
|
||||||
|
|
||||||
|
For Qwen3-235B-A22B, we have 4 kv-heads. When we deploy it with `-tp 8`, the KV cache duplication is 2x. Then we can add `-dcp 2` to remove the KV cache duplication.
|
||||||
|
|
||||||
|
In short, for decode context parallel, try to increase `-tp` size until you get satisfactory performance, and then add `-dcp` to reduce the KV cache duplication.
|
||||||
|
|
||||||
|
Decode context parallel is supported in vLLM, for both MLA and GQA models. Some attention backends also support the combination of decode context parallel and MTP (multi-token prediction) to further accelerate the decoding phase.
|
||||||
|
|
||||||
|
## Technical Discussions
|
||||||
|
|
||||||
|
The main discussions happen in the `#sig-context-parallel` channel of [vLLM Slack](https://slack.vllm.ai/).
|
||||||
@ -243,10 +243,10 @@ try:
|
|||||||
"remote_engine_id": None, # Will be populated by vLLM
|
"remote_engine_id": None, # Will be populated by vLLM
|
||||||
"remote_block_ids": None, # Will be populated by vLLM
|
"remote_block_ids": None, # Will be populated by vLLM
|
||||||
"remote_host": None, # Will be populated by vLLM
|
"remote_host": None, # Will be populated by vLLM
|
||||||
"remote_port": None # Will be populated by vLLM
|
"remote_port": None, # Will be populated by vLLM
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
extra_headers={"X-Request-Id": request_id}
|
extra_headers={"X-Request-Id": request_id},
|
||||||
)
|
)
|
||||||
|
|
||||||
print("-" * 50)
|
print("-" * 50)
|
||||||
@ -262,7 +262,7 @@ try:
|
|||||||
extra_body={
|
extra_body={
|
||||||
"kv_transfer_params": prefill_response.kv_transfer_params # Pass KV cache info
|
"kv_transfer_params": prefill_response.kv_transfer_params # Pass KV cache info
|
||||||
},
|
},
|
||||||
extra_headers={"X-Request-Id": request_id} # Same request ID
|
extra_headers={"X-Request-Id": request_id}, # Same request ID
|
||||||
)
|
)
|
||||||
|
|
||||||
print("-" * 50)
|
print("-" * 50)
|
||||||
|
|||||||
@ -15,13 +15,15 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`
|
|||||||
```python
|
```python
|
||||||
from langchain_community.llms import VLLM
|
from langchain_community.llms import VLLM
|
||||||
|
|
||||||
llm = VLLM(model="mosaicml/mpt-7b",
|
llm = VLLM(
|
||||||
trust_remote_code=True, # mandatory for hf models
|
model="mosaicml/mpt-7b",
|
||||||
max_new_tokens=128,
|
trust_remote_code=True, # mandatory for hf models
|
||||||
top_k=10,
|
max_new_tokens=128,
|
||||||
top_p=0.95,
|
top_k=10,
|
||||||
temperature=0.8,
|
top_p=0.95,
|
||||||
# tensor_parallel_size=... # for distributed inference
|
temperature=0.8,
|
||||||
|
# for distributed inference
|
||||||
|
# tensor_parallel_size=...,
|
||||||
)
|
)
|
||||||
|
|
||||||
print(llm("What is the capital of France ?"))
|
print(llm("What is the capital of France ?"))
|
||||||
|
|||||||
@ -24,8 +24,8 @@ To call the server, in your preferred text editor, create a script that uses an
|
|||||||
completion = client.chat.completions.create(
|
completion = client.chat.completions.create(
|
||||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||||
messages=[
|
messages=[
|
||||||
{"role": "user", "content": "Hello!"}
|
{"role": "user", "content": "Hello!"},
|
||||||
]
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
print(completion.choices[0].message)
|
print(completion.choices[0].message)
|
||||||
@ -101,8 +101,13 @@ both a `type` and a `text` field. An example is provided below:
|
|||||||
completion = client.chat.completions.create(
|
completion = client.chat.completions.create(
|
||||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||||
messages=[
|
messages=[
|
||||||
{"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]}
|
{
|
||||||
]
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -130,11 +135,11 @@ Or directly merge them into the JSON payload if you are using HTTP call directly
|
|||||||
completion = client.chat.completions.create(
|
completion = client.chat.completions.create(
|
||||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||||
messages=[
|
messages=[
|
||||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"},
|
||||||
],
|
],
|
||||||
extra_body={
|
extra_body={
|
||||||
"structured_outputs": {"choice": ["positive", "negative"]}
|
"structured_outputs": {"choice": ["positive", "negative"]},
|
||||||
}
|
},
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -149,11 +154,11 @@ with `--enable-request-id-headers`.
|
|||||||
completion = client.chat.completions.create(
|
completion = client.chat.completions.create(
|
||||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||||
messages=[
|
messages=[
|
||||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"},
|
||||||
],
|
],
|
||||||
extra_headers={
|
extra_headers={
|
||||||
"x-request-id": "sentiment-classification-00001",
|
"x-request-id": "sentiment-classification-00001",
|
||||||
}
|
},
|
||||||
)
|
)
|
||||||
print(completion._request_id)
|
print(completion._request_id)
|
||||||
|
|
||||||
@ -162,7 +167,7 @@ with `--enable-request-id-headers`.
|
|||||||
prompt="A robot may not injure a human being",
|
prompt="A robot may not injure a human being",
|
||||||
extra_headers={
|
extra_headers={
|
||||||
"x-request-id": "completion-test",
|
"x-request-id": "completion-test",
|
||||||
}
|
},
|
||||||
)
|
)
|
||||||
print(completion._request_id)
|
print(completion._request_id)
|
||||||
```
|
```
|
||||||
@ -403,7 +408,7 @@ The Transcriptions API supports uploading audio files in various formats includi
|
|||||||
model="openai/whisper-large-v3-turbo",
|
model="openai/whisper-large-v3-turbo",
|
||||||
file=audio_file,
|
file=audio_file,
|
||||||
language="en",
|
language="en",
|
||||||
response_format="verbose_json"
|
response_format="verbose_json",
|
||||||
)
|
)
|
||||||
|
|
||||||
print(transcription.text)
|
print(transcription.text)
|
||||||
@ -812,22 +817,22 @@ You can pass multi-modal inputs to scoring models by passing `content` including
|
|||||||
"model": "jinaai/jina-reranker-m0",
|
"model": "jinaai/jina-reranker-m0",
|
||||||
"text_1": "slm markdown",
|
"text_1": "slm markdown",
|
||||||
"text_2": {
|
"text_2": {
|
||||||
"content": [
|
"content": [
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "image_url",
|
||||||
"image_url": {
|
"image_url": {
|
||||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
|
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "image_url",
|
||||||
"image_url": {
|
"image_url": {
|
||||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
|
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
]
|
],
|
||||||
}
|
|
||||||
},
|
},
|
||||||
|
},
|
||||||
)
|
)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
response_json = response.json()
|
response_json = response.json()
|
||||||
|
|||||||
@ -95,7 +95,7 @@ def parse_args():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--compilation-config",
|
"--compilation-config",
|
||||||
type=int,
|
type=int,
|
||||||
help=("Compilation optimization (O) level 0-3."),
|
help=("Compilation optimization (O) mode 0-3."),
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--quantization",
|
"--quantization",
|
||||||
|
|||||||
@ -152,7 +152,9 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
url = s3_client.generate_presigned_url(
|
url = s3_client.generate_presigned_url(
|
||||||
ClientMethod=client_method, Params=method_parameters, ExpiresIn=expires_in
|
ClientMethod=client_method,
|
||||||
|
Params=method_parameters,
|
||||||
|
ExpiresIn=expires_in,
|
||||||
)
|
)
|
||||||
except ClientError:
|
except ClientError:
|
||||||
raise
|
raise
|
||||||
@ -161,10 +163,16 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
|
|||||||
|
|
||||||
s3_client = boto3.client("s3")
|
s3_client = boto3.client("s3")
|
||||||
input_url = generate_presigned_url(
|
input_url = generate_presigned_url(
|
||||||
s3_client, "get_object", {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, 3600
|
s3_client,
|
||||||
|
"get_object",
|
||||||
|
{"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"},
|
||||||
|
expires_in=3600,
|
||||||
)
|
)
|
||||||
output_url = generate_presigned_url(
|
output_url = generate_presigned_url(
|
||||||
s3_client, "put_object", {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, 3600
|
s3_client,
|
||||||
|
"put_object",
|
||||||
|
{"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"},
|
||||||
|
expires_in=3600,
|
||||||
)
|
)
|
||||||
print(f"{input_url=}")
|
print(f"{input_url=}")
|
||||||
print(f"{output_url=}")
|
print(f"{output_url=}")
|
||||||
|
|||||||
@ -26,6 +26,12 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
|
|||||||
python examples/offline_inference/pooling/embed_matryoshka_fy.py
|
python examples/offline_inference/pooling/embed_matryoshka_fy.py
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Multi vector retrieval usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python examples/offline_inference/pooling/multi_vector_retrieval.py
|
||||||
|
```
|
||||||
|
|
||||||
## Named Entity Recognition (NER) usage
|
## Named Entity Recognition (NER) usage
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
56
examples/offline_inference/pooling/multi_vector_retrieval.py
Normal file
56
examples/offline_inference/pooling/multi_vector_retrieval.py
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
from argparse import Namespace
|
||||||
|
|
||||||
|
from vllm import LLM, EngineArgs
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
parser = FlexibleArgumentParser()
|
||||||
|
parser = EngineArgs.add_cli_args(parser)
|
||||||
|
# Set example specific arguments
|
||||||
|
parser.set_defaults(
|
||||||
|
model="BAAI/bge-m3",
|
||||||
|
runner="pooling",
|
||||||
|
enforce_eager=True,
|
||||||
|
)
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main(args: Namespace):
|
||||||
|
# Sample prompts.
|
||||||
|
prompts = [
|
||||||
|
"Hello, my name is",
|
||||||
|
"The president of the United States is",
|
||||||
|
"The capital of France is",
|
||||||
|
"The future of AI is",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Create an LLM.
|
||||||
|
# You should pass runner="pooling" for embedding models
|
||||||
|
llm = LLM(**vars(args))
|
||||||
|
|
||||||
|
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
|
||||||
|
outputs = llm.embed(prompts)
|
||||||
|
|
||||||
|
# Print the outputs.
|
||||||
|
print("\nGenerated Outputs:\n" + "-" * 60)
|
||||||
|
for prompt, output in zip(prompts, outputs):
|
||||||
|
embeds = output.outputs.embedding
|
||||||
|
print(len(embeds))
|
||||||
|
|
||||||
|
# Generate embedding for each token. The output is a list of PoolingRequestOutput.
|
||||||
|
outputs = llm.encode(prompts, pooling_task="token_embed")
|
||||||
|
|
||||||
|
# Print the outputs.
|
||||||
|
print("\nGenerated Outputs:\n" + "-" * 60)
|
||||||
|
for prompt, output in zip(prompts, outputs):
|
||||||
|
multi_vector = output.outputs.data
|
||||||
|
print(multi_vector.shape)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
args = parse_args()
|
||||||
|
main(args)
|
||||||
@ -40,7 +40,7 @@ def main():
|
|||||||
model_impl="terratorch",
|
model_impl="terratorch",
|
||||||
)
|
)
|
||||||
|
|
||||||
pooling_params = PoolingParams(task="encode", softmax=False)
|
pooling_params = PoolingParams(task="token_classify", activation=False)
|
||||||
pooler_output = llm.encode(
|
pooler_output = llm.encode(
|
||||||
img_prompt,
|
img_prompt,
|
||||||
pooling_params=pooling_params,
|
pooling_params=pooling_params,
|
||||||
|
|||||||
@ -18,6 +18,12 @@ python examples/online_serving/pooling/embedding_embed_dtype_client.py
|
|||||||
python examples/online_serving/pooling/jinaai_rerank_client.py
|
python examples/online_serving/pooling/jinaai_rerank_client.py
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Multi vector retrieval usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python examples/online_serving/pooling/multi_vector_retrieval_client.py
|
||||||
|
```
|
||||||
|
|
||||||
## Named Entity Recognition (NER) usage
|
## Named Entity Recognition (NER) usage
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
@ -0,0 +1,54 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example online usage of Pooling API for multi vector retrieval.
|
||||||
|
|
||||||
|
Run `vllm serve <model> --runner pooling`
|
||||||
|
to start up the server in vLLM. e.g.
|
||||||
|
|
||||||
|
vllm serve BAAI/bge-m3
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
|
||||||
|
headers = {"User-Agent": "Test Client"}
|
||||||
|
response = requests.post(api_url, headers=headers, json=prompt)
|
||||||
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument("--host", type=str, default="localhost")
|
||||||
|
parser.add_argument("--port", type=int, default=8000)
|
||||||
|
parser.add_argument("--model", type=str, default="BAAI/bge-m3")
|
||||||
|
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
|
||||||
|
api_url = f"http://{args.host}:{args.port}/pooling"
|
||||||
|
model_name = args.model
|
||||||
|
|
||||||
|
prompts = [
|
||||||
|
"Hello, my name is",
|
||||||
|
"The president of the United States is",
|
||||||
|
"The capital of France is",
|
||||||
|
"The future of AI is",
|
||||||
|
]
|
||||||
|
prompt = {"model": model_name, "input": prompts}
|
||||||
|
|
||||||
|
pooling_response = post_http_request(prompt=prompt, api_url=api_url)
|
||||||
|
for output in pooling_response.json()["data"]:
|
||||||
|
multi_vector = torch.tensor(output["data"])
|
||||||
|
print(multi_vector.shape)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
args = parse_args()
|
||||||
|
main(args)
|
||||||
@ -84,7 +84,7 @@ directly to load models:
|
|||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
"s3://my-bucket/vllm/facebook/opt-125m/v1",
|
"s3://my-bucket/vllm/facebook/opt-125m/v1",
|
||||||
load_format="tensorizer"
|
load_format="tensorizer",
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@ -107,7 +107,6 @@ markers = [
|
|||||||
"distributed: run this test only in distributed GPU tests",
|
"distributed: run this test only in distributed GPU tests",
|
||||||
"skip_v1: do not run this test with v1",
|
"skip_v1: do not run this test with v1",
|
||||||
"optional: optional tests that are automatically skipped, include --optional to run them",
|
"optional: optional tests that are automatically skipped, include --optional to run them",
|
||||||
"extra_server_args: extra arguments to pass to the server fixture",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.ty.src]
|
[tool.ty.src]
|
||||||
|
|||||||
@ -7,7 +7,7 @@ requests >= 2.26.0
|
|||||||
tqdm
|
tqdm
|
||||||
blake3
|
blake3
|
||||||
py-cpuinfo
|
py-cpuinfo
|
||||||
transformers >= 4.55.2
|
transformers >= 4.56.0
|
||||||
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
|
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
|
||||||
protobuf # Required by LlamaTokenizer.
|
protobuf # Required by LlamaTokenizer.
|
||||||
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
|
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
|
||||||
|
|||||||
@ -11,6 +11,7 @@ from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
|
|||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.config import CompilationConfig
|
from vllm.config import CompilationConfig
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
from vllm.utils import is_torch_equal_or_newer
|
||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
|
@contextlib.contextmanager
|
||||||
@ -32,13 +33,13 @@ def temporary_environ(env_vars):
|
|||||||
os.environ[k] = v
|
os.environ[k] = v
|
||||||
|
|
||||||
|
|
||||||
test_params_full_cudagraph = []
|
model_backends_full_cudagraph = []
|
||||||
|
|
||||||
# deepseek-ai/DeepSeek-V2-Lite with MLA
|
# deepseek-ai/DeepSeek-V2-Lite with MLA
|
||||||
MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"]
|
MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"]
|
||||||
for mla_backend in MLA_backends:
|
for mla_backend in MLA_backends:
|
||||||
test_params_full_cudagraph.append(
|
model_backends_full_cudagraph.append(
|
||||||
pytest.param(("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend]))
|
("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend])
|
||||||
)
|
)
|
||||||
|
|
||||||
# Qwen/Qwen2-1.5B-Instruct with other backends
|
# Qwen/Qwen2-1.5B-Instruct with other backends
|
||||||
@ -46,14 +47,18 @@ other_backend_configs = [
|
|||||||
backend_configs[c] for c in backend_configs if c not in MLA_backends
|
backend_configs[c] for c in backend_configs if c not in MLA_backends
|
||||||
]
|
]
|
||||||
for backend_config in other_backend_configs:
|
for backend_config in other_backend_configs:
|
||||||
test_params_full_cudagraph.append(
|
model_backends_full_cudagraph.append(("Qwen/Qwen2-1.5B-Instruct", backend_config))
|
||||||
pytest.param(("Qwen/Qwen2-1.5B-Instruct", backend_config))
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="class")
|
@pytest.fixture(scope="class")
|
||||||
def llm_pair(request):
|
def llm_pair(request):
|
||||||
model, backend_config = request.param
|
model, backend_config, use_inductor_graph_partition = request.param
|
||||||
|
backend_config.comp_config["use_inductor_graph_partition"] = (
|
||||||
|
use_inductor_graph_partition
|
||||||
|
)
|
||||||
|
|
||||||
|
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||||
|
pytest.skip("Inductor graph partition only supported in torch>=2.9")
|
||||||
|
|
||||||
# Dynamically skip test if GPU capability is not met
|
# Dynamically skip test if GPU capability is not met
|
||||||
if (
|
if (
|
||||||
@ -104,7 +109,15 @@ def llm_pair(request):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("llm_pair", test_params_full_cudagraph, indirect=True)
|
@pytest.mark.parametrize(
|
||||||
|
"llm_pair",
|
||||||
|
[
|
||||||
|
pytest.param((model, backend_config, use_inductor_graph_partition))
|
||||||
|
for model, backend_config in model_backends_full_cudagraph
|
||||||
|
for use_inductor_graph_partition in [True, False]
|
||||||
|
],
|
||||||
|
indirect=True,
|
||||||
|
)
|
||||||
class TestFullCUDAGraph:
|
class TestFullCUDAGraph:
|
||||||
"""
|
"""
|
||||||
Use a class such that an llm pair is constructed once for all
|
Use a class such that an llm pair is constructed once for all
|
||||||
|
|||||||
@ -5,6 +5,7 @@ Test (piecewise) compilation with a simple model where multiple submodules
|
|||||||
are compiled and graph captured separately.
|
are compiled and graph captured separately.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
@ -13,12 +14,13 @@ from vllm.compilation.counter import compilation_counter
|
|||||||
from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile
|
from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile
|
||||||
from vllm.config import (
|
from vllm.config import (
|
||||||
CompilationConfig,
|
CompilationConfig,
|
||||||
CompilationLevel,
|
CompilationMode,
|
||||||
CUDAGraphMode,
|
CUDAGraphMode,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
set_current_vllm_config,
|
set_current_vllm_config,
|
||||||
)
|
)
|
||||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||||
|
from vllm.utils import is_torch_equal_or_newer
|
||||||
|
|
||||||
# This import automatically registers `torch.ops.silly.attention`
|
# This import automatically registers `torch.ops.silly.attention`
|
||||||
from .. import silly_attention # noqa: F401
|
from .. import silly_attention # noqa: F401
|
||||||
@ -190,16 +192,21 @@ def run_model(
|
|||||||
return output.cpu()
|
return output.cpu()
|
||||||
|
|
||||||
|
|
||||||
def test_multi_graph_piecewise_compile_outputs_equal():
|
@pytest.mark.parametrize("use_inductor_graph_partition", [False, True])
|
||||||
|
def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
|
||||||
|
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||||
|
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
||||||
|
|
||||||
outputs = []
|
outputs = []
|
||||||
|
|
||||||
# piecewise compile
|
# vllmcompile compile
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
use_cudagraph=True,
|
use_cudagraph=True,
|
||||||
splitting_ops=["silly::attention"],
|
splitting_ops=["silly::attention"],
|
||||||
cudagraph_capture_sizes=[1, 2],
|
cudagraph_capture_sizes=[1, 2],
|
||||||
|
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
||||||
@ -220,23 +227,31 @@ def test_multi_graph_piecewise_compile_outputs_equal():
|
|||||||
# static tensor addresses
|
# static tensor addresses
|
||||||
inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda()
|
inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda()
|
||||||
|
|
||||||
with compilation_counter.expect(
|
if use_inductor_graph_partition:
|
||||||
num_graphs_seen=2, # two graphs for the model
|
# Splitting happens at Inductor lowering level,
|
||||||
num_piecewise_graphs_seen=6,
|
# total piecewise fx graphs is equal to total graphs
|
||||||
|
num_piecewise_fx = 2
|
||||||
|
num_piecewise_capturable_fx = 2
|
||||||
|
else:
|
||||||
# attn_one, attn_two each has 3 piecewise graphs
|
# attn_one, attn_two each has 3 piecewise graphs
|
||||||
# (pre attn, post attn, silly_attention) each
|
# (pre attn, post attn, silly_attention) each
|
||||||
num_piecewise_capturable_graphs_seen=4,
|
num_piecewise_fx = 6
|
||||||
# attn_one, attn_two has pre attn and post attn each, total=4
|
# attn_one, attn_two has pre attn and post attn each, total=4
|
||||||
num_backend_compilations=4, # num_piecewise_capturable_graphs_seen
|
num_piecewise_capturable_fx = 4
|
||||||
num_cudagraph_captured=8,
|
|
||||||
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
with compilation_counter.expect(
|
||||||
|
num_graphs_seen=2, # two graphs for the model
|
||||||
|
num_piecewise_graphs_seen=num_piecewise_fx,
|
||||||
|
num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
|
||||||
|
num_backend_compilations=num_piecewise_capturable_fx,
|
||||||
|
num_cudagraph_captured=8, # num_cudagraph_sizes * num_partitions
|
||||||
):
|
):
|
||||||
outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
|
outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
|
||||||
|
|
||||||
# no compile or cudagraph
|
# no compile or cudagraph
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.NO_COMPILATION,
|
mode=CompilationMode.NONE,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
cudagraph_runtime_mode = CUDAGraphMode.NONE
|
cudagraph_runtime_mode = CUDAGraphMode.NONE
|
||||||
@ -265,9 +280,10 @@ def test_multi_graph_piecewise_compile_outputs_equal():
|
|||||||
# piecewise compile without CUDA graph
|
# piecewise compile without CUDA graph
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
use_cudagraph=False,
|
use_cudagraph=False,
|
||||||
splitting_ops=["silly::attention"],
|
splitting_ops=["silly::attention"],
|
||||||
|
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
||||||
@ -286,9 +302,9 @@ def test_multi_graph_piecewise_compile_outputs_equal():
|
|||||||
|
|
||||||
with compilation_counter.expect(
|
with compilation_counter.expect(
|
||||||
num_graphs_seen=2,
|
num_graphs_seen=2,
|
||||||
num_piecewise_graphs_seen=6,
|
num_piecewise_graphs_seen=num_piecewise_fx,
|
||||||
num_piecewise_capturable_graphs_seen=4,
|
num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
|
||||||
num_backend_compilations=4,
|
num_backend_compilations=num_piecewise_capturable_fx,
|
||||||
num_cudagraph_captured=0, # no cudagraph captured
|
num_cudagraph_captured=0, # no cudagraph captured
|
||||||
):
|
):
|
||||||
outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
|
outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
|
||||||
|
|||||||
@ -13,7 +13,7 @@ from vllm.compilation.counter import compilation_counter
|
|||||||
from vllm.compilation.decorators import support_torch_compile
|
from vllm.compilation.decorators import support_torch_compile
|
||||||
from vllm.config import (
|
from vllm.config import (
|
||||||
CompilationConfig,
|
CompilationConfig,
|
||||||
CompilationLevel,
|
CompilationMode,
|
||||||
CUDAGraphMode,
|
CUDAGraphMode,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
set_current_vllm_config,
|
set_current_vllm_config,
|
||||||
@ -61,7 +61,7 @@ def _run_simple_model(
|
|||||||
):
|
):
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
use_cudagraph=True,
|
use_cudagraph=True,
|
||||||
use_inductor=use_inductor,
|
use_inductor=use_inductor,
|
||||||
splitting_ops=splitting_ops,
|
splitting_ops=splitting_ops,
|
||||||
|
|||||||
@ -9,6 +9,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are
|
|||||||
initialized randomly with a fixed seed.
|
initialized randomly with a fixed seed.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from copy import deepcopy
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
@ -20,12 +21,13 @@ from vllm.compilation.counter import compilation_counter
|
|||||||
from vllm.compilation.decorators import support_torch_compile
|
from vllm.compilation.decorators import support_torch_compile
|
||||||
from vllm.config import (
|
from vllm.config import (
|
||||||
CompilationConfig,
|
CompilationConfig,
|
||||||
CompilationLevel,
|
CompilationMode,
|
||||||
CUDAGraphMode,
|
CUDAGraphMode,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
set_current_vllm_config,
|
set_current_vllm_config,
|
||||||
)
|
)
|
||||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||||
|
from vllm.utils import is_torch_equal_or_newer
|
||||||
|
|
||||||
# This import automatically registers `torch.ops.silly.attention`
|
# This import automatically registers `torch.ops.silly.attention`
|
||||||
from .. import silly_attention # noqa: F401
|
from .. import silly_attention # noqa: F401
|
||||||
@ -257,27 +259,13 @@ def tractable_computation(
|
|||||||
|
|
||||||
|
|
||||||
@torch.inference_mode
|
@torch.inference_mode
|
||||||
def run_model(
|
def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor:
|
||||||
llama_config, use_compile: bool, backend: str, split_attn: bool = False
|
# Start with a fresh copy to make sure there's no cache dir sharing
|
||||||
) -> torch.Tensor:
|
compile_config = deepcopy(compile_config)
|
||||||
if use_compile:
|
cudagraph_runtime_mode = compile_config.cudagraph_mode
|
||||||
compilation_config = CompilationConfig(
|
|
||||||
level=CompilationLevel.PIECEWISE,
|
|
||||||
use_cudagraph=True,
|
|
||||||
backend=backend,
|
|
||||||
cudagraph_capture_sizes=[1, 2],
|
|
||||||
)
|
|
||||||
if split_attn:
|
|
||||||
compilation_config.splitting_ops = ["silly::attention"]
|
|
||||||
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
|
||||||
else:
|
|
||||||
compilation_config = CompilationConfig(
|
|
||||||
level=CompilationLevel.NO_COMPILATION,
|
|
||||||
)
|
|
||||||
cudagraph_runtime_mode = CUDAGraphMode.NONE
|
|
||||||
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
compilation_config=compilation_config, additional_config=llama_config
|
compilation_config=compile_config, additional_config=llama_config
|
||||||
)
|
)
|
||||||
with set_current_vllm_config(vllm_config):
|
with set_current_vllm_config(vllm_config):
|
||||||
model = (
|
model = (
|
||||||
@ -338,8 +326,25 @@ def run_model(
|
|||||||
return output.cpu()
|
return output.cpu()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("backend", ["inductor", "eager"])
|
@pytest.mark.parametrize(
|
||||||
def test_toy_llama(backend: str):
|
"backend, use_inductor_graph_partition",
|
||||||
|
[
|
||||||
|
("eager", False), # No inductor
|
||||||
|
("inductor", False), # Inductor, Dynamo partition
|
||||||
|
("inductor", True), # Inductor, Inductor partition
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_toy_llama(
|
||||||
|
backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path
|
||||||
|
):
|
||||||
|
# We disable the vLLM compile cache into a new tmp dir for 2 reasons:
|
||||||
|
# 1. To make sure we can properly track the number of Inductor compilations.
|
||||||
|
# 2. Inductor partitioning does not play nicely with Autograd cache (below)
|
||||||
|
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
|
||||||
|
|
||||||
|
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||||
|
pytest.skip("Inductor graph partition only supported in torch>=2.9")
|
||||||
|
|
||||||
# compare output with and without piecewise compilation
|
# compare output with and without piecewise compilation
|
||||||
|
|
||||||
llama_config = LlamaConfig(
|
llama_config = LlamaConfig(
|
||||||
@ -350,6 +355,32 @@ def test_toy_llama(backend: str):
|
|||||||
hidden_size=128, mlp_size=256, vocab_size=128, num_layers=2, tractable_init=True
|
hidden_size=128, mlp_size=256, vocab_size=128, num_layers=2, tractable_init=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
compile_config_no_compile = CompilationConfig(
|
||||||
|
level=CompilationMode.NONE,
|
||||||
|
cudagraph_mode=CUDAGraphMode.NONE,
|
||||||
|
backend="eager",
|
||||||
|
)
|
||||||
|
|
||||||
|
compile_config_no_split = CompilationConfig(
|
||||||
|
level=CompilationMode.VLLM_COMPILE,
|
||||||
|
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||||
|
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||||
|
backend=backend,
|
||||||
|
cudagraph_capture_sizes=[1, 2],
|
||||||
|
)
|
||||||
|
|
||||||
|
# FIXME(luka/boyuan): the graph from the previous test case
|
||||||
|
# (no inductor partition) gets cached by AotAutograd so then the
|
||||||
|
# compilation with inductor partitioning incorrectly loads an unpartitioned
|
||||||
|
# graph and never partitions. I think this is a bug with custom inductor
|
||||||
|
# partitioning but does not affect vLLM more generally as vLLM uses its own
|
||||||
|
# cache (which takes inductor partitioning into account).
|
||||||
|
if use_inductor_graph_partition:
|
||||||
|
compile_config_no_split.inductor_compile_config["force_disable_caches"] = True
|
||||||
|
|
||||||
|
compile_config_split = deepcopy(compile_config_no_split)
|
||||||
|
compile_config_split.splitting_ops = ["silly::attention"]
|
||||||
|
|
||||||
outputs = []
|
outputs = []
|
||||||
with compilation_counter.expect(
|
with compilation_counter.expect(
|
||||||
num_graphs_seen=0,
|
num_graphs_seen=0,
|
||||||
@ -358,8 +389,9 @@ def test_toy_llama(backend: str):
|
|||||||
num_backend_compilations=0,
|
num_backend_compilations=0,
|
||||||
num_cudagraph_captured=0,
|
num_cudagraph_captured=0,
|
||||||
):
|
):
|
||||||
outputs.append(run_model(llama_config, backend="eager", use_compile=False))
|
outputs.append(run_model(llama_config, compile_config_no_compile))
|
||||||
run_model(tractable_config, backend="eager", use_compile=False)
|
|
||||||
|
run_model(tractable_config, compile_config_no_compile)
|
||||||
|
|
||||||
if backend == "inductor":
|
if backend == "inductor":
|
||||||
kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0}
|
kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0}
|
||||||
@ -367,35 +399,34 @@ def test_toy_llama(backend: str):
|
|||||||
kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
|
kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
|
||||||
|
|
||||||
with compilation_counter.expect(
|
with compilation_counter.expect(
|
||||||
# One graph for the model
|
num_graphs_seen=1, # one graph for the model
|
||||||
num_graphs_seen=1,
|
|
||||||
num_piecewise_graphs_seen=1,
|
num_piecewise_graphs_seen=1,
|
||||||
num_piecewise_capturable_graphs_seen=1,
|
num_piecewise_capturable_graphs_seen=1,
|
||||||
# num_piecewise_capturable_graphs_seen
|
num_backend_compilations=1, # num_piecewise_capturable_graphs_seen
|
||||||
num_backend_compilations=1,
|
|
||||||
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
|
||||||
num_cudagraph_captured=2,
|
num_cudagraph_captured=2,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
outputs.append(run_model(llama_config, backend=backend, use_compile=True))
|
outputs.append(run_model(llama_config, compile_config_no_split))
|
||||||
run_model(tractable_config, backend=backend, use_compile=True)
|
|
||||||
|
run_model(tractable_config, compile_config_no_split)
|
||||||
|
|
||||||
|
if use_inductor_graph_partition:
|
||||||
|
num_piecewise_fx = 1
|
||||||
|
num_piecewise_capturable_fx = 1
|
||||||
|
else:
|
||||||
|
num_piecewise_fx = 2 * llama_config.num_layers + 1
|
||||||
|
num_piecewise_capturable_fx = 1 + llama_config.num_layers
|
||||||
|
|
||||||
with compilation_counter.expect(
|
with compilation_counter.expect(
|
||||||
num_graphs_seen=1, # one graph for the model
|
num_graphs_seen=1, # one graph for the model
|
||||||
num_piecewise_graphs_seen=2 * llama_config.num_layers + 1, # 2 * num_layers + 1
|
num_piecewise_graphs_seen=num_piecewise_fx,
|
||||||
num_piecewise_capturable_graphs_seen=1
|
num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
|
||||||
+ llama_config.num_layers, # 1 + num_layers
|
num_backend_compilations=num_piecewise_capturable_fx,
|
||||||
num_backend_compilations=1
|
# num_cudagraph_sizes * num_partitions
|
||||||
+ llama_config.num_layers, # num_piecewise_capturable_graphs_seen
|
num_cudagraph_captured=2 * (1 + llama_config.num_layers),
|
||||||
num_cudagraph_captured=2
|
|
||||||
* (
|
|
||||||
1 + llama_config.num_layers
|
|
||||||
), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
|
||||||
):
|
):
|
||||||
outputs.append(
|
outputs.append(run_model(llama_config, compile_config_split))
|
||||||
run_model(llama_config, backend=backend, use_compile=True, split_attn=True)
|
run_model(tractable_config, compile_config_split)
|
||||||
)
|
|
||||||
run_model(tractable_config, backend=backend, use_compile=True, split_attn=True)
|
|
||||||
|
|
||||||
for i in range(1, len(outputs)):
|
for i in range(1, len(outputs)):
|
||||||
assert torch.allclose(outputs[0], outputs[i])
|
assert torch.allclose(outputs[0], outputs[i])
|
||||||
@ -427,14 +458,14 @@ def benchmark():
|
|||||||
for piecewise in [False, True]:
|
for piecewise in [False, True]:
|
||||||
if piecewise:
|
if piecewise:
|
||||||
compilation_config = CompilationConfig(
|
compilation_config = CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
use_cudagraph=True,
|
use_cudagraph=True,
|
||||||
splitting_ops=["silly::attention"],
|
splitting_ops=["silly::attention"],
|
||||||
cudagraph_capture_sizes=cudagraph_sizes,
|
cudagraph_capture_sizes=cudagraph_sizes,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
compilation_config = CompilationConfig(
|
compilation_config = CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
cudagraph_capture_sizes=cudagraph_sizes,
|
cudagraph_capture_sizes=cudagraph_sizes,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -62,5 +62,4 @@ direct_register_custom_op(
|
|||||||
mutates_args=["out"],
|
mutates_args=["out"],
|
||||||
fake_impl=silly_attention_fake,
|
fake_impl=silly_attention_fake,
|
||||||
target_lib=silly_lib,
|
target_lib=silly_lib,
|
||||||
tags=(torch._C.Tag.cudagraph_unsafe,),
|
|
||||||
)
|
)
|
||||||
|
|||||||
@ -10,7 +10,7 @@ import torch
|
|||||||
from vllm.compilation.decorators import support_torch_compile
|
from vllm.compilation.decorators import support_torch_compile
|
||||||
from vllm.config import (
|
from vllm.config import (
|
||||||
CompilationConfig,
|
CompilationConfig,
|
||||||
CompilationLevel,
|
CompilationMode,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
set_current_vllm_config,
|
set_current_vllm_config,
|
||||||
)
|
)
|
||||||
@ -38,7 +38,7 @@ class CompiledMod(torch.nn.Module):
|
|||||||
def make_vllm_config() -> VllmConfig:
|
def make_vllm_config() -> VllmConfig:
|
||||||
return VllmConfig(
|
return VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
level=CompilationMode.VLLM_COMPILE,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -10,6 +10,7 @@ import vllm.envs as envs
|
|||||||
from vllm.compilation.collective_fusion import AsyncTPPass
|
from vllm.compilation.collective_fusion import AsyncTPPass
|
||||||
from vllm.config import (
|
from vllm.config import (
|
||||||
CompilationConfig,
|
CompilationConfig,
|
||||||
|
CompilationMode,
|
||||||
DeviceConfig,
|
DeviceConfig,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
PassConfig,
|
PassConfig,
|
||||||
@ -400,7 +401,7 @@ def test_async_tp_pass_correctness(
|
|||||||
common_args.append("--enforce-eager")
|
common_args.append("--enforce-eager")
|
||||||
|
|
||||||
compilation_config = {
|
compilation_config = {
|
||||||
"level": 3,
|
"mode": CompilationMode.VLLM_COMPILE,
|
||||||
"compile_sizes": [2, 4, 8],
|
"compile_sizes": [2, 4, 8],
|
||||||
"splitting_ops": [],
|
"splitting_ops": [],
|
||||||
"pass_config": {"enable_async_tp": async_tp_enabled},
|
"pass_config": {"enable_async_tp": async_tp_enabled},
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import dataclasses
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.config import CompilationLevel
|
from vllm.config import CompilationMode
|
||||||
from vllm.utils import cuda_device_count_stateless
|
from vllm.utils import cuda_device_count_stateless
|
||||||
|
|
||||||
from ..utils import compare_all_settings
|
from ..utils import compare_all_settings
|
||||||
@ -21,7 +21,7 @@ class TestSetting:
|
|||||||
|
|
||||||
|
|
||||||
# we cannot afford testing the full Cartesian product
|
# we cannot afford testing the full Cartesian product
|
||||||
# of all models and all levels
|
# of all models and all modes
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"test_setting",
|
"test_setting",
|
||||||
[
|
[
|
||||||
@ -121,15 +121,13 @@ def test_compile_correctness(
|
|||||||
all_args: list[list[str]] = []
|
all_args: list[list[str]] = []
|
||||||
all_envs: list[dict[str, str] | None] = []
|
all_envs: list[dict[str, str] | None] = []
|
||||||
|
|
||||||
for comp_level in [
|
for comp_mode in [
|
||||||
CompilationLevel.DYNAMO_AS_IS,
|
CompilationMode.STOCK_TORCH_COMPILE,
|
||||||
CompilationLevel.DYNAMO_ONCE,
|
CompilationMode.DYNAMO_TRACE_ONCE,
|
||||||
CompilationLevel.PIECEWISE,
|
CompilationMode.VLLM_COMPILE,
|
||||||
]:
|
]:
|
||||||
for level in [CompilationLevel.NO_COMPILATION, comp_level]:
|
for mode in [CompilationMode.NONE, comp_mode]:
|
||||||
all_args.append(
|
all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=inductor"])
|
||||||
final_args + [f"-O.level={level}", "-O.backend=inductor"]
|
|
||||||
)
|
|
||||||
|
|
||||||
# inductor will change the output, so we only compare if the output
|
# inductor will change the output, so we only compare if the output
|
||||||
# is close, not exactly the same.
|
# is close, not exactly the same.
|
||||||
@ -142,13 +140,13 @@ def test_compile_correctness(
|
|||||||
all_envs.clear()
|
all_envs.clear()
|
||||||
all_args.clear()
|
all_args.clear()
|
||||||
|
|
||||||
for level in [
|
for mode in [
|
||||||
CompilationLevel.NO_COMPILATION,
|
CompilationMode.NONE,
|
||||||
CompilationLevel.DYNAMO_AS_IS,
|
CompilationMode.STOCK_TORCH_COMPILE,
|
||||||
CompilationLevel.DYNAMO_ONCE,
|
CompilationMode.DYNAMO_TRACE_ONCE,
|
||||||
CompilationLevel.PIECEWISE,
|
CompilationMode.VLLM_COMPILE,
|
||||||
]:
|
]:
|
||||||
all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"])
|
all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=eager"])
|
||||||
all_envs.append({})
|
all_envs.append({})
|
||||||
all_envs.append({})
|
all_envs.append({})
|
||||||
|
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import pytest
|
|||||||
|
|
||||||
from vllm.compilation.counter import compilation_counter
|
from vllm.compilation.counter import compilation_counter
|
||||||
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
|
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
|
||||||
from vllm.config.compilation import CompilationLevel
|
from vllm.config.compilation import CompilationMode
|
||||||
from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer
|
from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer
|
||||||
|
|
||||||
|
|
||||||
@ -90,16 +90,16 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
|
|||||||
|
|
||||||
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
|
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
|
||||||
@pytest.mark.forked
|
@pytest.mark.forked
|
||||||
def test_dynamo_as_is(vllm_runner, monkeypatch):
|
def test_stock_torch_compile(vllm_runner, monkeypatch):
|
||||||
# Disable multiprocessing so that the counter is in the same process
|
# Disable multiprocessing so that the counter is in the same process
|
||||||
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||||
|
|
||||||
with (
|
with (
|
||||||
compilation_counter.expect(dynamo_as_is_count=1),
|
compilation_counter.expect(stock_torch_compile_count=1),
|
||||||
# loading the model causes compilation (if enabled) to happen
|
# loading the model causes compilation (if enabled) to happen
|
||||||
vllm_runner(
|
vllm_runner(
|
||||||
"facebook/opt-125m",
|
"facebook/opt-125m",
|
||||||
compilation_config={"level": 1},
|
compilation_config={"mode": CompilationMode.STOCK_TORCH_COMPILE},
|
||||||
gpu_memory_utilization=0.4,
|
gpu_memory_utilization=0.4,
|
||||||
) as _,
|
) as _,
|
||||||
):
|
):
|
||||||
@ -112,11 +112,11 @@ def test_no_compilation(vllm_runner, monkeypatch):
|
|||||||
# Disable multiprocessing so that the counter is in the same process
|
# Disable multiprocessing so that the counter is in the same process
|
||||||
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||||
with (
|
with (
|
||||||
compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0),
|
compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
|
||||||
# loading the model causes compilation (if enabled) to happen
|
# loading the model causes compilation (if enabled) to happen
|
||||||
vllm_runner(
|
vllm_runner(
|
||||||
"facebook/opt-125m",
|
"facebook/opt-125m",
|
||||||
compilation_config={"level": 0},
|
compilation_config={"mode": CompilationMode.NONE},
|
||||||
gpu_memory_utilization=0.4,
|
gpu_memory_utilization=0.4,
|
||||||
) as _,
|
) as _,
|
||||||
):
|
):
|
||||||
@ -130,7 +130,7 @@ def test_enforce_eager(vllm_runner, monkeypatch):
|
|||||||
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||||
|
|
||||||
with (
|
with (
|
||||||
compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0),
|
compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
|
||||||
# loading the model causes compilation (if enabled) to happen
|
# loading the model causes compilation (if enabled) to happen
|
||||||
vllm_runner(
|
vllm_runner(
|
||||||
"facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4
|
"facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4
|
||||||
@ -151,7 +151,7 @@ def test_splitting_ops_dynamic():
|
|||||||
if is_torch_equal_or_newer("2.9.0.dev"):
|
if is_torch_equal_or_newer("2.9.0.dev"):
|
||||||
config = VllmConfig(
|
config = VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
level=CompilationMode.VLLM_COMPILE,
|
||||||
use_inductor_graph_partition=True,
|
use_inductor_graph_partition=True,
|
||||||
splitting_ops=["vllm::unified_attention"],
|
splitting_ops=["vllm::unified_attention"],
|
||||||
)
|
)
|
||||||
@ -163,7 +163,7 @@ def test_splitting_ops_dynamic():
|
|||||||
# When attn_fusion pass enabled, splitting_ops now default to attention ops.
|
# When attn_fusion pass enabled, splitting_ops now default to attention ops.
|
||||||
config = VllmConfig(
|
config = VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
level=CompilationMode.VLLM_COMPILE,
|
||||||
pass_config={"enable_attn_fusion": True, "enable_noop": True},
|
pass_config={"enable_attn_fusion": True, "enable_noop": True},
|
||||||
custom_ops=["+quant_fp8"],
|
custom_ops=["+quant_fp8"],
|
||||||
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||||
@ -178,7 +178,7 @@ def test_splitting_ops_dynamic():
|
|||||||
if is_torch_equal_or_newer("2.9.0.dev"):
|
if is_torch_equal_or_newer("2.9.0.dev"):
|
||||||
config = VllmConfig(
|
config = VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
level=CompilationMode.VLLM_COMPILE,
|
||||||
use_inductor_graph_partition=True,
|
use_inductor_graph_partition=True,
|
||||||
pass_config={"enable_attn_fusion": True, "enable_noop": True},
|
pass_config={"enable_attn_fusion": True, "enable_noop": True},
|
||||||
custom_ops=["+quant_fp8"],
|
custom_ops=["+quant_fp8"],
|
||||||
|
|||||||
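The hunks above and below apply one mechanical migration: the CompilationLevel enum and its level= keyword become CompilationMode and mode=. A minimal sketch of the new-style configuration, assuming the names shown in these hunks are all importable from vllm.config:

from vllm.config import CompilationConfig, CompilationMode, VllmConfig

config = VllmConfig(
    compilation_config=CompilationConfig(
        # formerly: level=CompilationLevel.PIECEWISE
        mode=CompilationMode.VLLM_COMPILE,
        use_inductor_graph_partition=True,
        splitting_ops=["vllm::unified_attention"],
    )
)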
@ -1,5 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
@ -8,12 +9,13 @@ from vllm.compilation.decorators import ignore_torch_compile, support_torch_comp
|
|||||||
from vllm.config import (
|
from vllm.config import (
|
||||||
CacheConfig,
|
CacheConfig,
|
||||||
CompilationConfig,
|
CompilationConfig,
|
||||||
CompilationLevel,
|
CompilationMode,
|
||||||
CUDAGraphMode,
|
CUDAGraphMode,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
set_current_vllm_config,
|
set_current_vllm_config,
|
||||||
)
|
)
|
||||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||||
|
from vllm.utils import is_torch_equal_or_newer
|
||||||
|
|
||||||
# This import automatically registers `torch.ops.silly.attention`
|
# This import automatically registers `torch.ops.silly.attention`
|
||||||
from . import silly_attention # noqa: F401
|
from . import silly_attention # noqa: F401
|
||||||
@ -65,18 +67,40 @@ def run_model(
|
|||||||
return output.cpu()
|
return output.cpu()
|
||||||
|
|
||||||
|
|
||||||
def test_ignore_torch_compile_decorator():
|
@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
|
||||||
|
def test_ignore_torch_compile_decorator(use_inductor_graph_partition, monkeypatch):
|
||||||
|
# disable compile cache so that we can count the number of compilations
|
||||||
|
# appropriately
|
||||||
|
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
|
||||||
|
|
||||||
|
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||||
|
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
||||||
|
|
||||||
# piecewise
|
# piecewise
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
use_cudagraph=True,
|
use_cudagraph=True,
|
||||||
splitting_ops=["silly::attention"],
|
splitting_ops=["silly::attention"],
|
||||||
cudagraph_capture_sizes=[1, 2],
|
cudagraph_capture_sizes=[1, 2],
|
||||||
|
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
||||||
|
|
||||||
|
expected_num_graphs_seen = 1
|
||||||
|
expected_num_cudagraph_captured = (
|
||||||
|
4 # num_cudagraph_sizes * num cudagraphs to capture
|
||||||
|
)
|
||||||
|
if use_inductor_graph_partition:
|
||||||
|
expected_num_piecewise_graphs_seen = 1
|
||||||
|
expected_num_piecewise_capturable_graphs_seen = 1
|
||||||
|
expected_num_backend_compilations = 1
|
||||||
|
else:
|
||||||
|
expected_num_piecewise_graphs_seen = 3
|
||||||
|
expected_num_piecewise_capturable_graphs_seen = 2
|
||||||
|
expected_num_backend_compilations = 2
|
||||||
|
|
||||||
@support_torch_compile
|
@support_torch_compile
|
||||||
class A(nn.Module):
|
class A(nn.Module):
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -103,12 +127,11 @@ def test_ignore_torch_compile_decorator():
|
|||||||
|
|
||||||
# A has support_torch_compile
|
# A has support_torch_compile
|
||||||
with compilation_counter.expect(
|
with compilation_counter.expect(
|
||||||
num_graphs_seen=1,
|
num_graphs_seen=expected_num_graphs_seen,
|
||||||
num_piecewise_graphs_seen=3,
|
num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
|
||||||
num_piecewise_capturable_graphs_seen=2,
|
num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
|
||||||
num_backend_compilations=2,
|
num_backend_compilations=expected_num_backend_compilations,
|
||||||
num_cudagraph_captured=4,
|
num_cudagraph_captured=expected_num_cudagraph_captured,
|
||||||
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
|
||||||
):
|
):
|
||||||
run_model(vllm_config, mod_A, cudagraph_runtime_mode)
|
run_model(vllm_config, mod_A, cudagraph_runtime_mode)
|
||||||
|
|
||||||
@ -130,12 +153,11 @@ def test_ignore_torch_compile_decorator():
|
|||||||
|
|
||||||
# C's support_torch_compile should override B's ignore_torch_compile
|
# C's support_torch_compile should override B's ignore_torch_compile
|
||||||
with compilation_counter.expect(
|
with compilation_counter.expect(
|
||||||
num_graphs_seen=1,
|
num_graphs_seen=expected_num_graphs_seen,
|
||||||
num_piecewise_graphs_seen=3,
|
num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
|
||||||
num_piecewise_capturable_graphs_seen=2,
|
num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
|
||||||
num_backend_compilations=2,
|
num_backend_compilations=expected_num_backend_compilations,
|
||||||
num_cudagraph_captured=4,
|
num_cudagraph_captured=expected_num_cudagraph_captured,
|
||||||
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
|
||||||
):
|
):
|
||||||
run_model(vllm_config, mod_C, cudagraph_runtime_mode)
|
run_model(vllm_config, mod_C, cudagraph_runtime_mode)
|
||||||
|
|
||||||
@ -178,16 +200,25 @@ class A(nn.Module):
|
|||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def test_conditional_compile_enable_if():
|
@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
|
||||||
|
def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch):
|
||||||
|
# disable compile cache so that we can count the number of compilations
|
||||||
|
# appropriately
|
||||||
|
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
|
||||||
|
|
||||||
|
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||||
|
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
||||||
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
cache_config=CacheConfig(
|
cache_config=CacheConfig(
|
||||||
kv_sharing_fast_prefill=True,
|
kv_sharing_fast_prefill=True,
|
||||||
),
|
),
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
use_cudagraph=True,
|
use_cudagraph=True,
|
||||||
splitting_ops=["silly::attention"],
|
splitting_ops=["silly::attention"],
|
||||||
cudagraph_capture_sizes=[1, 2],
|
cudagraph_capture_sizes=[1, 2],
|
||||||
|
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
||||||
@ -195,17 +226,26 @@ def test_conditional_compile_enable_if():
|
|||||||
with set_current_vllm_config(vllm_config):
|
with set_current_vllm_config(vllm_config):
|
||||||
mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda()
|
mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda()
|
||||||
|
|
||||||
|
if use_inductor_graph_partition:
|
||||||
|
expected_num_piecewise_graphs_seen = 2
|
||||||
|
expected_num_piecewise_capturable_graphs_seen = 2
|
||||||
|
expected_num_backend_compilations = 2
|
||||||
|
else:
|
||||||
|
expected_num_piecewise_graphs_seen = 6
|
||||||
|
expected_num_piecewise_capturable_graphs_seen = 4
|
||||||
|
expected_num_backend_compilations = 4
|
||||||
|
|
||||||
# A has support_torch_compile but enable_if fn returns False
|
# A has support_torch_compile but enable_if fn returns False
|
||||||
# enable_if will be True for B, so we expect mod1 and mod2
|
# enable_if will be True for B, so we expect mod1 and mod2
|
||||||
# to be compiled
|
# to be compiled
|
||||||
with compilation_counter.expect(
|
with compilation_counter.expect(
|
||||||
num_graphs_seen=2,
|
num_graphs_seen=2,
|
||||||
num_piecewise_graphs_seen=6,
|
num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
|
||||||
# 3 piecewise graphs per instance of B()
|
# 3 piecewise graphs per instance of B()
|
||||||
num_piecewise_capturable_graphs_seen=4,
|
num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
|
||||||
num_backend_compilations=4,
|
num_backend_compilations=expected_num_backend_compilations,
|
||||||
num_cudagraph_captured=8,
|
num_cudagraph_captured=8,
|
||||||
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
# num_cudagraph_sizes * num cudagraphable graphs to capture
|
||||||
):
|
):
|
||||||
run_model(vllm_config, mod_A, cudagraph_runtime_mode)
|
run_model(vllm_config, mod_A, cudagraph_runtime_mode)
|
||||||
|
|
||||||
@ -216,23 +256,34 @@ def test_conditional_compile_enable_if():
|
|||||||
kv_sharing_fast_prefill=False,
|
kv_sharing_fast_prefill=False,
|
||||||
),
|
),
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
use_cudagraph=True,
|
use_cudagraph=True,
|
||||||
splitting_ops=["silly::attention"],
|
splitting_ops=["silly::attention"],
|
||||||
cudagraph_capture_sizes=[1, 2],
|
cudagraph_capture_sizes=[1, 2],
|
||||||
|
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
with set_current_vllm_config(vllm_config):
|
with set_current_vllm_config(vllm_config):
|
||||||
mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda()
|
mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda()
|
||||||
|
|
||||||
|
if use_inductor_graph_partition:
|
||||||
|
expected_num_piecewise_graphs_seen = 1
|
||||||
|
expected_num_piecewise_capturable_graphs_seen = 1
|
||||||
|
expected_num_backend_compilations = 1
|
||||||
|
else:
|
||||||
|
# 3 attn ops and 4 non-attn ops
|
||||||
|
expected_num_piecewise_graphs_seen = 7
|
||||||
|
expected_num_piecewise_capturable_graphs_seen = 4
|
||||||
|
expected_num_backend_compilations = 4
|
||||||
|
|
||||||
with compilation_counter.expect(
|
with compilation_counter.expect(
|
||||||
num_graphs_seen=1,
|
num_graphs_seen=1,
|
||||||
num_piecewise_graphs_seen=7,
|
num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
|
||||||
# 3 attn ops and 4 non-attn ops
|
# 3 attn ops and 4 non-attn ops
|
||||||
num_piecewise_capturable_graphs_seen=4,
|
num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
|
||||||
num_backend_compilations=4,
|
num_backend_compilations=expected_num_backend_compilations,
|
||||||
num_cudagraph_captured=8,
|
num_cudagraph_captured=8,
|
||||||
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
# num_cudagraph_sizes * num cudagraphable graphs to capture
|
||||||
):
|
):
|
||||||
run_model(vllm_config, mod_A, cudagraph_runtime_mode)
|
run_model(vllm_config, mod_A, cudagraph_runtime_mode)
|
||||||
|
|||||||
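The expected-count branches introduced above follow simple arithmetic. A sketch of it for the single-attention-op module A, under the assumption (consistent with the comments in this file) that eager piecewise splitting cuts the graph at every splitting op and skips capturing the attention pieces:

num_attn_ops = 1
num_cudagraph_sizes = 2  # cudagraph_capture_sizes=[1, 2]

piecewise_graphs = 2 * num_attn_ops + 1        # 3: pre, attn, post
capturable = piecewise_graphs - num_attn_ops   # 2: the attn piece is skipped
backend_compilations = capturable              # 2: each capturable piece compiles once
cudagraphs_captured = num_cudagraph_sizes * capturable  # 4

assert (piecewise_graphs, capturable, cudagraphs_captured) == (3, 2, 4)

# With use_inductor_graph_partition=True the split happens inside inductor,
# so the compiler front-end sees one graph (1 piecewise, 1 capturable,
# 1 compilation) while the number of cudagraphs captured stays 4.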
@ -12,7 +12,7 @@ from tests.quantization.utils import is_quant_method_supported
|
|||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.attention.backends.registry import _Backend
|
from vllm.attention.backends.registry import _Backend
|
||||||
from vllm.attention.selector import global_force_attn_backend_context_manager
|
from vllm.attention.selector import global_force_attn_backend_context_manager
|
||||||
from vllm.config import CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig
|
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils import is_torch_equal_or_newer
|
from vllm.utils import is_torch_equal_or_newer
|
||||||
|
|
||||||
@ -80,22 +80,22 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"optimization_level",
|
"compilation_mode",
|
||||||
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
|
[CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE],
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("model_info", models_list(all=True))
|
@pytest.mark.parametrize("model_info", models_list(all=True))
|
||||||
@create_new_process_for_each_test()
|
@create_new_process_for_each_test()
|
||||||
def test_full_graph(
|
def test_full_graph(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
model_info: tuple[str, dict[str, Any]],
|
model_info: tuple[str, dict[str, Any]],
|
||||||
optimization_level: int,
|
compilation_mode: int,
|
||||||
):
|
):
|
||||||
model, model_kwargs = model_info
|
model, model_kwargs = model_info
|
||||||
|
|
||||||
with monkeypatch.context():
|
with monkeypatch.context():
|
||||||
print(f"MODEL={model}")
|
print(f"MODEL={model}")
|
||||||
|
|
||||||
run_model(optimization_level, model, model_kwargs)
|
run_model(compilation_mode, model, model_kwargs)
|
||||||
|
|
||||||
|
|
||||||
# TODO(luka) add other supported compilation config scenarios here
|
# TODO(luka) add other supported compilation config scenarios here
|
||||||
@ -104,7 +104,7 @@ def test_full_graph(
|
|||||||
[
|
[
|
||||||
# additional compile sizes, only some of the models
|
# additional compile sizes, only some of the models
|
||||||
(
|
(
|
||||||
CompilationConfig(level=CompilationLevel.PIECEWISE, compile_sizes=[1, 2]),
|
CompilationConfig(mode=CompilationMode.VLLM_COMPILE, compile_sizes=[1, 2]),
|
||||||
model,
|
model,
|
||||||
)
|
)
|
||||||
for model in models_list(all=False)
|
for model in models_list(all=False)
|
||||||
@ -113,7 +113,7 @@ def test_full_graph(
|
|||||||
# RMSNorm + quant fusion, only 8-bit quant models
|
# RMSNorm + quant fusion, only 8-bit quant models
|
||||||
(
|
(
|
||||||
CompilationConfig(
|
CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
custom_ops=["+rms_norm"],
|
custom_ops=["+rms_norm"],
|
||||||
pass_config=PassConfig(enable_fusion=True, enable_noop=True),
|
pass_config=PassConfig(enable_fusion=True, enable_noop=True),
|
||||||
),
|
),
|
||||||
@ -125,7 +125,8 @@ def test_full_graph(
|
|||||||
# Test depyf integration works
|
# Test depyf integration works
|
||||||
(
|
(
|
||||||
CompilationConfig(
|
CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir()
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
|
debug_dump_path=tempfile.gettempdir(),
|
||||||
),
|
),
|
||||||
("facebook/opt-125m", {}),
|
("facebook/opt-125m", {}),
|
||||||
),
|
),
|
||||||
@ -134,7 +135,7 @@ def test_full_graph(
|
|||||||
# graph inductor partition
|
# graph inductor partition
|
||||||
(
|
(
|
||||||
CompilationConfig(
|
CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
# inductor graph partition uses
|
# inductor graph partition uses
|
||||||
# torch._C.Tag.cudagraph_unsafe to specify splitting ops
|
# torch._C.Tag.cudagraph_unsafe to specify splitting ops
|
||||||
use_inductor_graph_partition=True,
|
use_inductor_graph_partition=True,
|
||||||
@ -164,10 +165,10 @@ def test_custom_compile_config(
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"optimization_level",
|
"compilation_mode",
|
||||||
[CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE],
|
[CompilationMode.NONE, CompilationMode.VLLM_COMPILE],
|
||||||
)
|
)
|
||||||
def test_fp8_kv_scale_compile(optimization_level: int):
|
def test_fp8_kv_scale_compile(compilation_mode: int):
|
||||||
model = "Qwen/Qwen2-0.5B"
|
model = "Qwen/Qwen2-0.5B"
|
||||||
model_kwargs = {
|
model_kwargs = {
|
||||||
"quantization": "fp8",
|
"quantization": "fp8",
|
||||||
@ -175,7 +176,7 @@ def test_fp8_kv_scale_compile(optimization_level: int):
|
|||||||
"calculate_kv_scales": True,
|
"calculate_kv_scales": True,
|
||||||
"max_model_len": 512,
|
"max_model_len": 512,
|
||||||
}
|
}
|
||||||
run_model(optimization_level, model, model_kwargs)
|
run_model(compilation_mode, model, model_kwargs)
|
||||||
|
|
||||||
|
|
||||||
def test_inductor_graph_partition_attn_fusion(caplog_vllm):
|
def test_inductor_graph_partition_attn_fusion(caplog_vllm):
|
||||||
@ -184,7 +185,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm):
|
|||||||
|
|
||||||
model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
|
model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
|
||||||
compilation_config = CompilationConfig(
|
compilation_config = CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
use_inductor_graph_partition=True,
|
use_inductor_graph_partition=True,
|
||||||
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||||
custom_ops=["+quant_fp8"],
|
custom_ops=["+quant_fp8"],
|
||||||
|
|||||||
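For orientation, a minimal sketch of the kind of invocation run_model(...) in this file is assumed to perform: build an engine under the requested compilation mode and generate once. The model name and sampling values are placeholders taken from elsewhere in the suite:

from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig, CompilationMode

llm = LLM(
    model="facebook/opt-125m",
    compilation_config=CompilationConfig(mode=CompilationMode.VLLM_COMPILE),
)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=8))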
@ -13,7 +13,7 @@ from vllm.compilation.fusion import (
|
|||||||
)
|
)
|
||||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||||
from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig
|
from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig
|
||||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||||
GroupShape,
|
GroupShape,
|
||||||
@ -114,7 +114,7 @@ def test_fusion_rmsnorm_quant(
|
|||||||
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
custom_ops=["+rms_norm", "+quant_fp8"],
|
custom_ops=["+rms_norm", "+quant_fp8"],
|
||||||
pass_config=PassConfig(enable_fusion=True, enable_noop=True),
|
pass_config=PassConfig(enable_fusion=True, enable_noop=True),
|
||||||
)
|
)
|
||||||
|
|||||||
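The fusion test above keeps the unfused custom ops enabled so the pattern matcher can see them; enable_noop rides along because no-op elimination clears the reshape clutter that would otherwise break the fusion match (an assumption based on how the passes are combined here). A condensed sketch of the configuration it builds:

config = VllmConfig(
    compilation_config=CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        custom_ops=["+rms_norm", "+quant_fp8"],  # leave ops unfused for matching
        pass_config=PassConfig(enable_fusion=True, enable_noop=True),
    )
)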
@ -12,7 +12,7 @@ from vllm.compilation.noop_elimination import NoOpEliminationPass
|
|||||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||||
from vllm.config import (
|
from vllm.config import (
|
||||||
CompilationConfig,
|
CompilationConfig,
|
||||||
CompilationLevel,
|
CompilationMode,
|
||||||
DeviceConfig,
|
DeviceConfig,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
PassConfig,
|
PassConfig,
|
||||||
@ -219,7 +219,7 @@ def all_reduce_fusion_pass_on_test_model(
|
|||||||
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm", "+quant_fp8"]
|
mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"]
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
vllm_config.compilation_config.pass_config = PassConfig(
|
vllm_config.compilation_config.pass_config = PassConfig(
|
||||||
|
|||||||
@ -19,7 +19,7 @@ from vllm.compilation.post_cleanup import PostCleanupPass
|
|||||||
from vllm.config import (
|
from vllm.config import (
|
||||||
CacheConfig,
|
CacheConfig,
|
||||||
CompilationConfig,
|
CompilationConfig,
|
||||||
CompilationLevel,
|
CompilationMode,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
PassConfig,
|
PassConfig,
|
||||||
SchedulerConfig,
|
SchedulerConfig,
|
||||||
@ -321,7 +321,7 @@ def test_attention_quant_pattern(
|
|||||||
),
|
),
|
||||||
scheduler_config=SchedulerConfig(max_num_seqs=1024),
|
scheduler_config=SchedulerConfig(max_num_seqs=1024),
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
custom_ops=["+quant_fp8"],
|
custom_ops=["+quant_fp8"],
|
||||||
use_inductor_graph_partition=use_inductor_graph_partition,
|
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||||
),
|
),
|
||||||
@ -421,7 +421,9 @@ def test_attention_quant_pattern(
|
|||||||
]
|
]
|
||||||
if any(attn_fusion_supported):
|
if any(attn_fusion_supported):
|
||||||
# Check quantization ops in the graph before and after fusion
|
# Check quantization ops in the graph before and after fusion
|
||||||
test_backend.check_before_ops([QUANT_OPS[quant_key]], fully_replaced=True)
|
# Note: fully_replaced=False because query quant ops remain in graph.
|
||||||
|
# Only output quant ops are fused into attention.
|
||||||
|
test_backend.check_before_ops([QUANT_OPS[quant_key]], fully_replaced=False)
|
||||||
|
|
||||||
# access the underlying `AttnFusionPass` on the `LazyInitPass`
|
# access the underlying `AttnFusionPass` on the `LazyInitPass`
|
||||||
assert attn_pass.pass_.matched_count == sum(attn_fusion_supported)
|
assert attn_pass.pass_.matched_count == sum(attn_fusion_supported)
|
||||||
|
|||||||
@ -6,7 +6,7 @@ import torch
|
|||||||
|
|
||||||
import vllm
|
import vllm
|
||||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||||
from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig
|
from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig
|
||||||
|
|
||||||
from .backend import TestBackend
|
from .backend import TestBackend
|
||||||
|
|
||||||
@ -50,7 +50,7 @@ def test_noop_elimination(dtype, num_tokens, hidden_size, buffer_size):
|
|||||||
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
pass_config=PassConfig(enable_noop=True),
|
pass_config=PassConfig(enable_noop=True),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@ -98,7 +98,7 @@ def test_non_noop_slice_preserved():
|
|||||||
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationLevel.PIECEWISE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
pass_config=PassConfig(enable_noop=True),
|
pass_config=PassConfig(enable_noop=True),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|||||||
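A tiny illustration of what the no-op elimination pass above targets, and of the genuine slice the second test guards (shapes are arbitrary):

import torch

x = torch.randn(8, 16)
y = x[0:8]  # spans the whole dimension: a no-op, safe to eliminate
z = x[0:4]  # genuine slice: must be preserved (test_non_noop_slice_preserved)
assert torch.equal(y, x) and z.shape == (4, 16)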
@ -5,7 +5,7 @@
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
|
from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
|
||||||
from vllm.config import CompilationLevel
|
from vllm.config import CompilationMode
|
||||||
|
|
||||||
|
|
||||||
class MyMod(torch.nn.Module):
|
class MyMod(torch.nn.Module):
|
||||||
@ -20,7 +20,7 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
|
|||||||
self.model = model
|
self.model = model
|
||||||
compiled_callable = torch.compile(self.forward, backend="eager")
|
compiled_callable = torch.compile(self.forward, backend="eager")
|
||||||
super().__init__(
|
super().__init__(
|
||||||
compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE
|
compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE
|
||||||
)
|
)
|
||||||
|
|
||||||
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
|
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
|
||||||
|
|||||||
@ -334,7 +334,7 @@ class HfRunner:
|
|||||||
trust_remote_code=trust_remote_code,
|
trust_remote_code=trust_remote_code,
|
||||||
)
|
)
|
||||||
self.device = self.get_default_device()
|
self.device = self.get_default_device()
|
||||||
self.dtype = torch_dtype = _get_and_verify_dtype(
|
self.dtype = dtype = _get_and_verify_dtype(
|
||||||
self.model_name,
|
self.model_name,
|
||||||
self.config,
|
self.config,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
@ -342,7 +342,7 @@ class HfRunner:
|
|||||||
)
|
)
|
||||||
|
|
||||||
model_kwargs = model_kwargs if model_kwargs is not None else {}
|
model_kwargs = model_kwargs if model_kwargs is not None else {}
|
||||||
model_kwargs.setdefault("torch_dtype", torch_dtype)
|
model_kwargs.setdefault("dtype", dtype)
|
||||||
|
|
||||||
if is_sentence_transformer:
|
if is_sentence_transformer:
|
||||||
# Lazy init required for AMD CI
|
# Lazy init required for AMD CI
|
||||||
@ -388,7 +388,7 @@ class HfRunner:
|
|||||||
if not skip_tokenizer_init:
|
if not skip_tokenizer_init:
|
||||||
self.tokenizer = AutoTokenizer.from_pretrained(
|
self.tokenizer = AutoTokenizer.from_pretrained(
|
||||||
model_name,
|
model_name,
|
||||||
torch_dtype=torch_dtype,
|
dtype=dtype,
|
||||||
trust_remote_code=trust_remote_code,
|
trust_remote_code=trust_remote_code,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -398,7 +398,7 @@ class HfRunner:
|
|||||||
|
|
||||||
self.processor = AutoProcessor.from_pretrained(
|
self.processor = AutoProcessor.from_pretrained(
|
||||||
model_name,
|
model_name,
|
||||||
torch_dtype=torch_dtype,
|
dtype=dtype,
|
||||||
trust_remote_code=trust_remote_code,
|
trust_remote_code=trust_remote_code,
|
||||||
)
|
)
|
||||||
if skip_tokenizer_init:
|
if skip_tokenizer_init:
|
||||||
@ -1011,8 +1011,12 @@ class VllmRunner:
|
|||||||
req_outputs = self.llm.embed(inputs, *args, **kwargs)
|
req_outputs = self.llm.embed(inputs, *args, **kwargs)
|
||||||
return [req_output.outputs.embedding for req_output in req_outputs]
|
return [req_output.outputs.embedding for req_output in req_outputs]
|
||||||
|
|
||||||
def encode(self, prompts: list[str]) -> list[list[float]]:
|
def token_embed(self, prompts: list[str]) -> list[list[float]]:
|
||||||
req_outputs = self.llm.encode(prompts)
|
req_outputs = self.llm.encode(prompts, pooling_task="token_embed")
|
||||||
|
return [req_output.outputs.data for req_output in req_outputs]
|
||||||
|
|
||||||
|
def token_classify(self, prompts: list[str]) -> list[list[float]]:
|
||||||
|
req_outputs = self.llm.encode(prompts, pooling_task="token_classify")
|
||||||
return [req_output.outputs.data for req_output in req_outputs]
|
return [req_output.outputs.data for req_output in req_outputs]
|
||||||
|
|
||||||
def reward(self, prompts: list[str]) -> list[list[float]]:
|
def reward(self, prompts: list[str]) -> list[list[float]]:
|
||||||
|
|||||||
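The former catch-all encode() helper is split into task-specific wrappers above. A hedged usage sketch, assuming a loaded pooling model (the model name is a placeholder):

from vllm import LLM

llm = LLM(model="BAAI/bge-small-en-v1.5")  # placeholder pooling model
outs = llm.encode(["hello world"], pooling_task="token_embed")
per_token = outs[0].outputs.data  # one vector per input token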
@ -15,6 +15,7 @@ from typing import Literal, NamedTuple
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from vllm.config.compilation import CompilationMode
|
||||||
from vllm.config.model import RunnerOption
|
from vllm.config.model import RunnerOption
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
@ -234,7 +235,7 @@ def _compare_sp(
|
|||||||
common_args.append("--skip-tokenizer-init")
|
common_args.append("--skip-tokenizer-init")
|
||||||
|
|
||||||
compilation_config = {
|
compilation_config = {
|
||||||
"level": 3,
|
"mode": CompilationMode.VLLM_COMPILE,
|
||||||
"custom_ops": ["+rms_norm"],
|
"custom_ops": ["+rms_norm"],
|
||||||
"compile_sizes": [4, 8],
|
"compile_sizes": [4, 8],
|
||||||
"pass_config": {
|
"pass_config": {
|
||||||
|
|||||||
@ -226,30 +226,30 @@ def test_compilation_config():
|
|||||||
|
|
||||||
# set to O0
|
# set to O0
|
||||||
args = parser.parse_args(["-O0"])
|
args = parser.parse_args(["-O0"])
|
||||||
assert args.compilation_config.level == 0
|
assert args.compilation_config.mode == 0
|
||||||
|
|
||||||
# set to O 1 (space)
|
# set to O 1 (space)
|
||||||
args = parser.parse_args(["-O", "1"])
|
args = parser.parse_args(["-O", "1"])
|
||||||
assert args.compilation_config.level == 1
|
assert args.compilation_config.mode == 1
|
||||||
|
|
||||||
# set to O 2 (equals)
|
# set to O 2 (equals)
|
||||||
args = parser.parse_args(["-O=2"])
|
args = parser.parse_args(["-O=2"])
|
||||||
assert args.compilation_config.level == 2
|
assert args.compilation_config.mode == 2
|
||||||
|
|
||||||
# set to O.level 3
|
# set to O.mode 3
|
||||||
args = parser.parse_args(["-O.level", "3"])
|
args = parser.parse_args(["-O.mode", "3"])
|
||||||
assert args.compilation_config.level == 3
|
assert args.compilation_config.mode == 3
|
||||||
|
|
||||||
# set to string form of a dict
|
# set to string form of a dict
|
||||||
args = parser.parse_args(
|
args = parser.parse_args(
|
||||||
[
|
[
|
||||||
"-O",
|
"-O",
|
||||||
'{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
|
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
|
||||||
'"use_inductor": false}',
|
'"use_inductor": false}',
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
assert (
|
assert (
|
||||||
args.compilation_config.level == 3
|
args.compilation_config.mode == 3
|
||||||
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
|
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
|
||||||
and not args.compilation_config.use_inductor
|
and not args.compilation_config.use_inductor
|
||||||
)
|
)
|
||||||
@ -258,12 +258,12 @@ def test_compilation_config():
|
|||||||
args = parser.parse_args(
|
args = parser.parse_args(
|
||||||
[
|
[
|
||||||
"--compilation-config="
|
"--compilation-config="
|
||||||
'{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
|
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
|
||||||
'"use_inductor": true}',
|
'"use_inductor": true}',
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
assert (
|
assert (
|
||||||
args.compilation_config.level == 3
|
args.compilation_config.mode == 3
|
||||||
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
|
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
|
||||||
and args.compilation_config.use_inductor
|
and args.compilation_config.use_inductor
|
||||||
)
|
)
|
||||||
|
|||||||
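The numeric assertions above rely on CompilationMode being an integer enum whose values line up with the -O0 … -O3 shorthand. A sketch of that assumed mapping (the two intermediate member names are inferred from identifiers elsewhere in this commit):

from vllm.config import CompilationMode

assert CompilationMode.NONE == 0
assert CompilationMode.STOCK_TORCH_COMPILE == 1
assert CompilationMode.DYNAMO_TRACE_ONCE == 2
assert CompilationMode.VLLM_COMPILE == 3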
@ -53,21 +53,34 @@ def base64_encoded_audio() -> dict[str, str]:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def dummy_messages_from_audio_url(
|
||||||
|
audio_urls: str | list[str],
|
||||||
|
content_text: str = "What's happening in this audio?",
|
||||||
|
):
|
||||||
|
if isinstance(audio_urls, str):
|
||||||
|
audio_urls = [audio_urls]
|
||||||
|
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
*(
|
||||||
|
{"type": "audio_url", "audio_url": {"url": audio_url}}
|
||||||
|
for audio_url in audio_urls
|
||||||
|
),
|
||||||
|
{"type": "text", "text": content_text},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||||
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
|
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
|
||||||
async def test_single_chat_session_audio(
|
async def test_single_chat_session_audio(
|
||||||
client: openai.AsyncOpenAI, model_name: str, audio_url: str
|
client: openai.AsyncOpenAI, model_name: str, audio_url: str
|
||||||
):
|
):
|
||||||
messages = [
|
messages = dummy_messages_from_audio_url(audio_url)
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
{"type": "audio_url", "audio_url": {"url": audio_url}},
|
|
||||||
{"type": "text", "text": "What's happening in this audio?"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
# test single completion
|
# test single completion
|
||||||
chat_completion = await client.chat.completions.create(
|
chat_completion = await client.chat.completions.create(
|
||||||
@ -138,20 +151,9 @@ async def test_single_chat_session_audio_base64encoded(
|
|||||||
audio_url: str,
|
audio_url: str,
|
||||||
base64_encoded_audio: dict[str, str],
|
base64_encoded_audio: dict[str, str],
|
||||||
):
|
):
|
||||||
messages = [
|
messages = dummy_messages_from_audio_url(
|
||||||
{
|
f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
|
||||||
"role": "user",
|
)
|
||||||
"content": [
|
|
||||||
{
|
|
||||||
"type": "audio_url",
|
|
||||||
"audio_url": {
|
|
||||||
"url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" # noqa: E501
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{"type": "text", "text": "What's happening in this audio?"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
# test single completion
|
# test single completion
|
||||||
chat_completion = await client.chat.completions.create(
|
chat_completion = await client.chat.completions.create(
|
||||||
@ -252,15 +254,7 @@ async def test_single_chat_session_input_audio(
|
|||||||
async def test_chat_streaming_audio(
|
async def test_chat_streaming_audio(
|
||||||
client: openai.AsyncOpenAI, model_name: str, audio_url: str
|
client: openai.AsyncOpenAI, model_name: str, audio_url: str
|
||||||
):
|
):
|
||||||
messages = [
|
messages = dummy_messages_from_audio_url(audio_url)
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
{"type": "audio_url", "audio_url": {"url": audio_url}},
|
|
||||||
{"type": "text", "text": "What's happening in this audio?"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
# test single completion
|
# test single completion
|
||||||
chat_completion = await client.chat.completions.create(
|
chat_completion = await client.chat.completions.create(
|
||||||
@ -365,18 +359,7 @@ async def test_chat_streaming_input_audio(
|
|||||||
async def test_multi_audio_input(
|
async def test_multi_audio_input(
|
||||||
client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str]
|
client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str]
|
||||||
):
|
):
|
||||||
messages = [
|
messages = dummy_messages_from_audio_url(audio_urls)
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
*(
|
|
||||||
{"type": "audio_url", "audio_url": {"url": audio_url}}
|
|
||||||
for audio_url in audio_urls
|
|
||||||
),
|
|
||||||
{"type": "text", "text": "What's happening in this audio?"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
if len(audio_urls) > MAXIMUM_AUDIOS:
|
if len(audio_urls) > MAXIMUM_AUDIOS:
|
||||||
with pytest.raises(openai.BadRequestError): # test multi-audio input
|
with pytest.raises(openai.BadRequestError): # test multi-audio input
|
||||||
|
|||||||
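For reference, what dummy_messages_from_audio_url above evaluates to for a single URL (the URL itself is a placeholder):

dummy_messages_from_audio_url("https://example.com/a.wav")
# -> [
#     {
#         "role": "user",
#         "content": [
#             {"type": "audio_url", "audio_url": {"url": "https://example.com/a.wav"}},
#             {"type": "text", "text": "What's happening in this audio?"},
#         ],
#     }
# ]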
@ -55,21 +55,34 @@ def base64_encoded_video() -> dict[str, str]:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def dummy_messages_from_video_url(
|
||||||
|
video_urls: str | list[str],
|
||||||
|
content_text: str = "What's in this video?",
|
||||||
|
):
|
||||||
|
if isinstance(video_urls, str):
|
||||||
|
video_urls = [video_urls]
|
||||||
|
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
*(
|
||||||
|
{"type": "video_url", "video_url": {"url": video_url}}
|
||||||
|
for video_url in video_urls
|
||||||
|
),
|
||||||
|
{"type": "text", "text": content_text},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||||
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
|
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
|
||||||
async def test_single_chat_session_video(
|
async def test_single_chat_session_video(
|
||||||
client: openai.AsyncOpenAI, model_name: str, video_url: str
|
client: openai.AsyncOpenAI, model_name: str, video_url: str
|
||||||
):
|
):
|
||||||
messages = [
|
messages = dummy_messages_from_video_url(video_url)
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
{"type": "video_url", "video_url": {"url": video_url}},
|
|
||||||
{"type": "text", "text": "What's in this video?"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
# test single completion
|
# test single completion
|
||||||
chat_completion = await client.chat.completions.create(
|
chat_completion = await client.chat.completions.create(
|
||||||
@ -137,15 +150,7 @@ async def test_error_on_invalid_video_url_type(
|
|||||||
async def test_single_chat_session_video_beamsearch(
|
async def test_single_chat_session_video_beamsearch(
|
||||||
client: openai.AsyncOpenAI, model_name: str, video_url: str
|
client: openai.AsyncOpenAI, model_name: str, video_url: str
|
||||||
):
|
):
|
||||||
messages = [
|
messages = dummy_messages_from_video_url(video_url)
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
{"type": "video_url", "video_url": {"url": video_url}},
|
|
||||||
{"type": "text", "text": "What's in this video?"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
chat_completion = await client.chat.completions.create(
|
chat_completion = await client.chat.completions.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
@ -172,20 +177,9 @@ async def test_single_chat_session_video_base64encoded(
|
|||||||
video_url: str,
|
video_url: str,
|
||||||
base64_encoded_video: dict[str, str],
|
base64_encoded_video: dict[str, str],
|
||||||
):
|
):
|
||||||
messages = [
|
messages = dummy_messages_from_video_url(
|
||||||
{
|
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
|
||||||
"role": "user",
|
)
|
||||||
"content": [
|
|
||||||
{
|
|
||||||
"type": "video_url",
|
|
||||||
"video_url": {
|
|
||||||
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{"type": "text", "text": "What's in this video?"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
# test single completion
|
# test single completion
|
||||||
chat_completion = await client.chat.completions.create(
|
chat_completion = await client.chat.completions.create(
|
||||||
@ -231,20 +225,10 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
|
|||||||
video_url: str,
|
video_url: str,
|
||||||
base64_encoded_video: dict[str, str],
|
base64_encoded_video: dict[str, str],
|
||||||
):
|
):
|
||||||
messages = [
|
messages = dummy_messages_from_video_url(
|
||||||
{
|
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
|
||||||
"role": "user",
|
)
|
||||||
"content": [
|
|
||||||
{
|
|
||||||
"type": "video_url",
|
|
||||||
"video_url": {
|
|
||||||
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{"type": "text", "text": "What's in this video?"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
chat_completion = await client.chat.completions.create(
|
chat_completion = await client.chat.completions.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
messages=messages,
|
messages=messages,
|
||||||
@ -265,15 +249,7 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
|
|||||||
async def test_chat_streaming_video(
|
async def test_chat_streaming_video(
|
||||||
client: openai.AsyncOpenAI, model_name: str, video_url: str
|
client: openai.AsyncOpenAI, model_name: str, video_url: str
|
||||||
):
|
):
|
||||||
messages = [
|
messages = dummy_messages_from_video_url(video_url)
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
{"type": "video_url", "video_url": {"url": video_url}},
|
|
||||||
{"type": "text", "text": "What's in this video?"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
# test single completion
|
# test single completion
|
||||||
chat_completion = await client.chat.completions.create(
|
chat_completion = await client.chat.completions.create(
|
||||||
@ -318,18 +294,7 @@ async def test_chat_streaming_video(
|
|||||||
async def test_multi_video_input(
|
async def test_multi_video_input(
|
||||||
client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
|
client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
|
||||||
):
|
):
|
||||||
messages = [
|
messages = dummy_messages_from_video_url(video_urls)
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
*(
|
|
||||||
{"type": "video_url", "video_url": {"url": video_url}}
|
|
||||||
for video_url in video_urls
|
|
||||||
),
|
|
||||||
{"type": "text", "text": "What's in this video?"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
if len(video_urls) > MAXIMUM_VIDEOS:
|
if len(video_urls) > MAXIMUM_VIDEOS:
|
||||||
with pytest.raises(openai.BadRequestError): # test multi-video input
|
with pytest.raises(openai.BadRequestError): # test multi-video input
|
||||||
|
|||||||
@ -78,6 +78,27 @@ def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def dummy_messages_from_image_url(
|
||||||
|
image_urls: str | list[str],
|
||||||
|
content_text: str = "What's in this image?",
|
||||||
|
):
|
||||||
|
if isinstance(image_urls, str):
|
||||||
|
image_urls = [image_urls]
|
||||||
|
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
*(
|
||||||
|
{"type": "image_url", "image_url": {"url": image_url}}
|
||||||
|
for image_url in image_urls
|
||||||
|
),
|
||||||
|
{"type": "text", "text": content_text},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def get_hf_prompt_tokens(model_name, content, image_url):
|
def get_hf_prompt_tokens(model_name, content, image_url):
|
||||||
processor = AutoProcessor.from_pretrained(
|
processor = AutoProcessor.from_pretrained(
|
||||||
model_name, trust_remote_code=True, num_crops=4
|
model_name, trust_remote_code=True, num_crops=4
|
||||||
@ -107,15 +128,7 @@ async def test_single_chat_session_image(
|
|||||||
client: openai.AsyncOpenAI, model_name: str, image_url: str
|
client: openai.AsyncOpenAI, model_name: str, image_url: str
|
||||||
):
|
):
|
||||||
content_text = "What's in this image?"
|
content_text = "What's in this image?"
|
||||||
messages = [
|
messages = dummy_messages_from_image_url(image_url, content_text)
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
{"type": "image_url", "image_url": {"url": image_url}},
|
|
||||||
{"type": "text", "text": content_text},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
max_completion_tokens = 10
|
max_completion_tokens = 10
|
||||||
# test single completion
|
# test single completion
|
||||||
@ -188,15 +201,8 @@ async def test_error_on_invalid_image_url_type(
|
|||||||
async def test_single_chat_session_image_beamsearch(
|
async def test_single_chat_session_image_beamsearch(
|
||||||
client: openai.AsyncOpenAI, model_name: str, image_url: str
|
client: openai.AsyncOpenAI, model_name: str, image_url: str
|
||||||
):
|
):
|
||||||
messages = [
|
content_text = "What's in this image?"
|
||||||
{
|
messages = dummy_messages_from_image_url(image_url, content_text)
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
{"type": "image_url", "image_url": {"url": image_url}},
|
|
||||||
{"type": "text", "text": "What's in this image?"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
chat_completion = await client.chat.completions.create(
|
chat_completion = await client.chat.completions.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
@ -226,20 +232,10 @@ async def test_single_chat_session_image_base64encoded(
|
|||||||
base64_encoded_image: dict[str, str],
|
base64_encoded_image: dict[str, str],
|
||||||
):
|
):
|
||||||
content_text = "What's in this image?"
|
content_text = "What's in this image?"
|
||||||
messages = [
|
messages = dummy_messages_from_image_url(
|
||||||
{
|
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
|
||||||
"role": "user",
|
content_text,
|
||||||
"content": [
|
)
|
||||||
{
|
|
||||||
"type": "image_url",
|
|
||||||
"image_url": {
|
|
||||||
"url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{"type": "text", "text": content_text},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
max_completion_tokens = 10
|
max_completion_tokens = 10
|
||||||
# test single completion
|
# test single completion
|
||||||
@ -293,20 +289,10 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
|
|||||||
raw_image_url = TEST_IMAGE_ASSETS[image_idx]
|
raw_image_url = TEST_IMAGE_ASSETS[image_idx]
|
||||||
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
|
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
|
||||||
|
|
||||||
messages = [
|
messages = dummy_messages_from_image_url(
|
||||||
{
|
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
|
||||||
"role": "user",
|
)
|
||||||
"content": [
|
|
||||||
{
|
|
||||||
"type": "image_url",
|
|
||||||
"image_url": {
|
|
||||||
"url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{"type": "text", "text": "What's in this image?"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
chat_completion = await client.chat.completions.create(
|
chat_completion = await client.chat.completions.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
messages=messages,
|
messages=messages,
|
||||||
@ -326,15 +312,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
|
|||||||
async def test_chat_streaming_image(
|
async def test_chat_streaming_image(
|
||||||
client: openai.AsyncOpenAI, model_name: str, image_url: str
|
client: openai.AsyncOpenAI, model_name: str, image_url: str
|
||||||
):
|
):
|
||||||
messages = [
|
messages = dummy_messages_from_image_url(image_url)
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
{"type": "image_url", "image_url": {"url": image_url}},
|
|
||||||
{"type": "text", "text": "What's in this image?"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
# test single completion
|
# test single completion
|
||||||
chat_completion = await client.chat.completions.create(
|
chat_completion = await client.chat.completions.create(
|
||||||
@ -381,18 +359,7 @@ async def test_chat_streaming_image(
|
|||||||
async def test_multi_image_input(
|
async def test_multi_image_input(
|
||||||
client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]
|
client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]
|
||||||
):
|
):
|
||||||
messages = [
|
messages = dummy_messages_from_image_url(image_urls)
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
*(
|
|
||||||
{"type": "image_url", "image_url": {"url": image_url}}
|
|
||||||
for image_url in image_urls
|
|
||||||
),
|
|
||||||
{"type": "text", "text": "What's in this image?"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
if len(image_urls) > MAXIMUM_IMAGES:
|
if len(image_urls) > MAXIMUM_IMAGES:
|
||||||
with pytest.raises(openai.BadRequestError): # test multi-image input
|
with pytest.raises(openai.BadRequestError): # test multi-image input
|
||||||
|
|||||||
243
tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
Normal file
243
tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
Normal file
@ -0,0 +1,243 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from tests.entrypoints.openai.tool_parsers.utils import (
|
||||||
|
run_tool_extraction,
|
||||||
|
run_tool_extraction_streaming,
|
||||||
|
)
|
||||||
|
from vllm.entrypoints.openai.protocol import FunctionCall
|
||||||
|
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
|
||||||
|
|
||||||
|
# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
|
||||||
|
SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"
|
||||||
|
SIMPLE_FUNCTION_CALL = FunctionCall(
|
||||||
|
name="get_weather",
|
||||||
|
arguments='{"city": "San Francisco", "metric": "celsius"}',
|
||||||
|
)
|
||||||
|
MORE_TYPES_FUNCTION_OUTPUT = (
|
||||||
|
"register_user(name='John Doe', "
|
||||||
|
"age=37, "
|
||||||
|
"address={'city': 'San Francisco', 'state': 'CA'}, "
|
||||||
|
"role=None, "
|
||||||
|
"passed_test=True, "
|
||||||
|
"aliases=['John', 'Johnny'])"
|
||||||
|
)
|
||||||
|
MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS = (
|
||||||
|
"register_user(name='John Doe', "
|
||||||
|
"age=37, "
|
||||||
|
"address={'city': 'San Francisco', 'state': 'CA'}, "
|
||||||
|
"role=null, "
|
||||||
|
"passed_test=true, "
|
||||||
|
"aliases=['John', 'Johnny'])"
|
||||||
|
)
|
||||||
|
MORE_TYPES_FUNCTION_CALL = FunctionCall(
|
||||||
|
name="register_user",
|
||||||
|
arguments='{"name": "John Doe", '
|
||||||
|
'"age": 37, '
|
||||||
|
'"address": {"city": "San Francisco", "state": "CA"}, '
|
||||||
|
'"role": null, '
|
||||||
|
'"passed_test": true, '
|
||||||
|
'"aliases": ["John", "Johnny"]}',
|
||||||
|
)
|
||||||
|
PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()"
|
||||||
|
PARAMETERLESS_FUNCTION_CALL = FunctionCall(
|
||||||
|
name="get_weather",
|
||||||
|
arguments="{}",
|
||||||
|
)
|
||||||
|
EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})"
|
||||||
|
EMPTY_DICT_FUNCTION_CALL = FunctionCall(
|
||||||
|
name="do_something_cool",
|
||||||
|
arguments='{"additional_data": {}}',
|
||||||
|
)
|
||||||
|
EMPTY_LIST_FUNCTION_OUTPUT = "do_something_cool(steps=[])"
|
||||||
|
EMPTY_LIST_FUNCTION_CALL = FunctionCall(
|
||||||
|
name="do_something_cool",
|
||||||
|
arguments='{"steps": []}',
|
||||||
|
)
|
||||||
|
ESCAPED_STRING_FUNCTION_OUTPUT = (
|
||||||
|
r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')"
|
||||||
|
)
|
||||||
|
ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
|
||||||
|
name="get_weather",
|
||||||
|
arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("streaming", [True, False])
|
||||||
|
def test_no_tool_call(streaming: bool):
|
||||||
|
mock_tokenizer = MagicMock()
|
||||||
|
tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(mock_tokenizer)
|
||||||
|
model_output = "How can I help you today?"
|
||||||
|
|
||||||
|
content, tool_calls = run_tool_extraction(
|
||||||
|
tool_parser, model_output, streaming=streaming
|
||||||
|
)
|
||||||
|
|
||||||
|
assert content == model_output
|
||||||
|
assert len(tool_calls) == 0
|
||||||
|
|
||||||
|
|
||||||
|
TEST_CASES = [
|
||||||
|
pytest.param(
|
||||||
|
True,
|
||||||
|
f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}</function_calls>",
|
||||||
|
[SIMPLE_FUNCTION_CALL],
|
||||||
|
id="simple_streaming",
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
False,
|
||||||
|
f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}</function_calls>",
|
||||||
|
[SIMPLE_FUNCTION_CALL],
|
||||||
|
id="simple_nonstreaming",
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
True,
|
||||||
|
f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
|
||||||
|
[MORE_TYPES_FUNCTION_CALL],
|
||||||
|
id="more_types_streaming",
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
False,
|
||||||
|
f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
|
||||||
|
[MORE_TYPES_FUNCTION_CALL],
|
||||||
|
id="more_types_nonstreaming",
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
True,
|
||||||
|
f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS}</function_calls>",
|
||||||
|
[MORE_TYPES_FUNCTION_CALL],
|
||||||
|
id="more_types_streaming_json_literals",
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
False,
|
||||||
|
f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS}</function_calls>",
|
||||||
|
[MORE_TYPES_FUNCTION_CALL],
|
||||||
|
id="more_types_nonstreaming_json_literals",
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
True,
|
||||||
|
f"<function_calls>{PARAMETERLESS_FUNCTION_OUTPUT}</function_calls>",
|
||||||
|
[PARAMETERLESS_FUNCTION_CALL],
|
||||||
|
id="parameterless_streaming",
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
False,
|
||||||
|
f"<function_calls>{PARAMETERLESS_FUNCTION_OUTPUT}</function_calls>",
|
||||||
|
[PARAMETERLESS_FUNCTION_CALL],
|
||||||
|
id="parameterless_nonstreaming",
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
True,
|
||||||
|
f"<function_calls>{EMPTY_DICT_FUNCTION_OUTPUT}</function_calls>",
|
||||||
|
[EMPTY_DICT_FUNCTION_CALL],
|
||||||
|
id="empty_dict_streaming",
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
False,
|
||||||
|
f"<function_calls>{EMPTY_DICT_FUNCTION_OUTPUT}</function_calls>",
|
||||||
|
[EMPTY_DICT_FUNCTION_CALL],
|
||||||
|
id="empty_dict_nonstreaming",
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
True,
|
||||||
|
f"<function_calls>{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
|
||||||
|
[EMPTY_LIST_FUNCTION_CALL],
|
||||||
|
id="empty_list_streaming",
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
False,
|
||||||
|
f"<function_calls>{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
|
||||||
|
[EMPTY_LIST_FUNCTION_CALL],
|
||||||
|
id="empty_list_nonstreaming",
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
True,
|
||||||
|
f"<function_calls>{ESCAPED_STRING_FUNCTION_OUTPUT}</function_calls>",
|
||||||
|
[ESCAPED_STRING_FUNCTION_CALL],
|
||||||
|
id="escaped_string_streaming",
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
False,
|
||||||
|
f"<function_calls>{ESCAPED_STRING_FUNCTION_OUTPUT}</function_calls>",
|
||||||
|
[ESCAPED_STRING_FUNCTION_CALL],
|
||||||
|
id="escaped_string_nonstreaming",
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
True,
|
||||||
|
f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}\n{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
|
||||||
|
[SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
|
||||||
|
id="parallel_calls_streaming",
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
False,
|
||||||
|
f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}\n{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
|
||||||
|
[SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
|
||||||
|
id="parallel_calls_nonstreaming",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES)
|
||||||
|
def test_tool_call(
|
||||||
|
streaming: bool, model_output: str, expected_tool_calls: list[FunctionCall]
|
||||||
|
):
|
||||||
|
mock_tokenizer = MagicMock()
|
||||||
|
tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(mock_tokenizer)
|
||||||
|
|
||||||
|
content, tool_calls = run_tool_extraction(
|
||||||
|
tool_parser, model_output, streaming=streaming
|
||||||
|
)
|
||||||
|
|
||||||
|
assert content is None
|
||||||
|
assert len(tool_calls) == len(expected_tool_calls)
|
||||||
|
for actual, expected in zip(tool_calls, expected_tool_calls):
|
||||||
|
assert actual.type == "function"
|
||||||
|
assert actual.function == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_streaming_tool_call_with_large_steps():
|
||||||
|
mock_tokenizer = MagicMock()
|
||||||
|
tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(mock_tokenizer)
|
||||||
|
model_output_deltas = [
|
||||||
|
"<function_calls>get_weather(city='San",
|
||||||
|
" Francisco', metric='celsius')\n"
|
||||||
|
f"{PARAMETERLESS_FUNCTION_OUTPUT}\n"
|
||||||
|
f"{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
|
||||||
|
]
|
||||||
|
|
||||||
|
reconstructor = run_tool_extraction_streaming(
|
||||||
|
tool_parser, model_output_deltas, assert_one_tool_per_delta=False
|
||||||
|
)
|
||||||
|
|
||||||
|
assert reconstructor.other_content == ""
|
||||||
|
assert len(reconstructor.tool_calls) == 3
|
||||||
|
assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL
|
||||||
|
assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL
|
||||||
|
assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("streaming", [False])
|
||||||
|
def test_regex_timeout_handling(streaming: bool):
|
||||||
|
"""test regex timeout is handled gracefully"""
|
||||||
|
mock_tokenizer = MagicMock()
|
||||||
|
tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(mock_tokenizer)
|
||||||
|
|
||||||
|
fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2
|
||||||
|
|
||||||
|
# create a mock regex that raises TimeoutError
|
||||||
|
mock_regex = MagicMock()
|
||||||
|
mock_regex.match.side_effect = TimeoutError("Regex timeout")
|
||||||
|
|
||||||
|
with patch.object(tool_parser, "TOOL_CALL_REGEX", mock_regex):
|
||||||
|
content, tool_calls = run_tool_extraction(
|
||||||
|
tool_parser, fake_problematic_input, streaming=streaming
|
||||||
|
)
|
||||||
|
|
||||||
|
# should treat as regular text when regex times out
|
||||||
|
assert content == fake_problematic_input
|
||||||
|
assert len(tool_calls) == 0
|
||||||
|
mock_regex.match.assert_called_once()
|
||||||
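Reviewer note: the cases above drive the pythonic Olmo 3 format, where the model emits calls like get_weather(city='San Francisco') inside <function_calls> tags and the parser converts each into a FunctionCall with JSON-encoded arguments. A rough standalone sketch of that flow, bypassing the run_tool_extraction helpers (passing request=None is an assumption borrowed from the mocked setup here, not a documented contract):

from unittest.mock import MagicMock

from vllm.entrypoints.openai.tool_parsers import ToolParserManager

parser = ToolParserManager.get_tool_parser("olmo3")(MagicMock())
text = "<function_calls>get_weather(city='San Francisco', metric='celsius')</function_calls>"
# Non-streaming extraction; the request argument is unused by this parser in the tests.
result = parser.extract_tool_calls(text, request=None)
print(result.tools_called, [tc.function.name for tc in result.tool_calls])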
@ -63,7 +63,7 @@ def test_encode_api(llm: LLM):
     # chunked prefill does not support all pooling
     err_msg = "pooling_task must be one of.+"
     with pytest.raises(ValueError, match=err_msg):
-        llm.encode(prompts, use_tqdm=False)
+        llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)


 def test_score_api(llm: LLM):
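Reviewer note: encode() now requires the caller to name the pooling task instead of inferring one; asking for a task the loaded pooler does not support raises the ValueError matched above. A minimal sketch of the new call shape (the model name is illustrative, not taken from this diff):

from vllm import LLM

llm = LLM(model="intfloat/multilingual-e5-small", runner="pooling")
# "embed" pools one vector per prompt; "token_embed" and "token_classify"
# return one row per token instead.
outputs = llm.encode(["What is the capital of France?"], pooling_task="embed", use_tqdm=False)
print(outputs[0].outputs.data.shape)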
@ -35,6 +35,13 @@ def llm():
     cleanup_dist_env_and_memory()


+@pytest.mark.skip_global_cleanup
+def test_encode_api(llm: LLM):
+    outputs = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False)
+    multi_vector = outputs[0].outputs.data
+    assert multi_vector.shape == (11, 384)
+
+
 def test_pooling_params(llm: LLM):
     def get_outputs(normalize):
         outputs = llm.embed(
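Reviewer note: the (11, 384) assertion pins down the shape contract of token-level pooling: pooling_task="token_embed" returns a matrix with one 384-dimensional row per token (11 tokens for this prompt and tokenizer), not a single pooled vector. Continuing the sketch above (reusing its hypothetical llm handle):

# Shape sketch, assuming an encoder with hidden_size=384 and an 11-token prompt.
out = llm.encode(["The chef prepared a delicious meal."], pooling_task="token_embed", use_tqdm=False)
token_matrix = out[0].outputs.data  # (num_tokens, hidden_size), e.g. (11, 384)
sentence_vector = token_matrix.mean(dim=0)  # callers may pool further themselves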
@ -57,20 +57,24 @@ def test_multiple_pooling_params(llm: LLM):
     ]

     # Multiple PoolingParams should be matched with each prompt
-    outputs = llm.encode(PROMPTS, pooling_params=pooling_params)
+    outputs = llm.encode(PROMPTS, pooling_params=pooling_params, pooling_task="embed")
     assert len(PROMPTS) == len(outputs)

     # Exception raised, if the size of params does not match the size of prompts
     with pytest.raises(ValueError):
-        outputs = llm.encode(PROMPTS, pooling_params=pooling_params[:3])
+        outputs = llm.encode(
+            PROMPTS, pooling_params=pooling_params[:3], pooling_task="embed"
+        )

     # Single PoolingParams should be applied to every prompt
     single_pooling_params = PoolingParams()
-    outputs = llm.encode(PROMPTS, pooling_params=single_pooling_params)
+    outputs = llm.encode(
+        PROMPTS, pooling_params=single_pooling_params, pooling_task="embed"
+    )
     assert len(PROMPTS) == len(outputs)

     # pooling_params is None, default params should be applied
-    outputs = llm.encode(PROMPTS, pooling_params=None)
+    outputs = llm.encode(PROMPTS, pooling_params=None, pooling_task="embed")
     assert len(PROMPTS) == len(outputs)
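Reviewer note: the matching rules being re-tested with the explicit task are unchanged: pooling_params may be a list with one entry per prompt, a single PoolingParams broadcast to all prompts, or None for defaults, and a length mismatch raises ValueError. A hedged sketch (model name illustrative):

from vllm import LLM, PoolingParams

llm = LLM(model="intfloat/multilingual-e5-small", runner="pooling")
prompts = ["first", "second", "third"]

# One params object per prompt, matched positionally.
outputs = llm.encode(prompts, pooling_params=[PoolingParams() for _ in prompts], pooling_task="embed")
assert len(outputs) == len(prompts)

# A single object is broadcast to every prompt.
outputs = llm.encode(prompts, pooling_params=PoolingParams(), pooling_task="embed")
assert len(outputs) == len(prompts)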
@ -36,22 +36,23 @@ def llm():
     cleanup_dist_env_and_memory()


-@pytest.mark.skip_global_cleanup
 def test_pooling_params(llm: LLM):
-    def get_outputs(softmax):
+    def get_outputs(activation):
         outputs = llm.reward(
-            prompts, pooling_params=PoolingParams(softmax=softmax), use_tqdm=False
+            prompts, pooling_params=PoolingParams(activation=activation), use_tqdm=False
         )
         return torch.cat([x.outputs.data for x in outputs])

-    default = get_outputs(softmax=None)
-    w_softmax = get_outputs(softmax=True)
-    wo_softmax = get_outputs(softmax=False)
+    default = get_outputs(activation=None)
+    w_activation = get_outputs(activation=True)
+    wo_activation = get_outputs(activation=False)

-    assert torch.allclose(default, w_softmax, atol=1e-2), "Default should use softmax."
-    assert not torch.allclose(w_softmax, wo_softmax, atol=1e-2), (
-        "wo_softmax should not use softmax."
-    )
-    assert torch.allclose(softmax(wo_softmax), w_softmax, atol=1e-2), (
-        "w_softmax should be close to softmax(wo_softmax)."
-    )
+    assert torch.allclose(default, w_activation, atol=1e-2), (
+        "Default should use activation."
+    )
+    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
+        "wo_activation should not use activation."
+    )
+    assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
+        "w_activation should be close to activation(wo_activation)."
+    )
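Reviewer note: this file tracks the rename of the PoolingParams flag softmax to activation; the tested semantics are unchanged, since softmax is simply the activation this reward model's head applies. Sketch of the renamed parameter (the model name is an illustrative assumption):

import torch

from vllm import LLM, PoolingParams

llm = LLM(model="internlm/internlm2-1_8b-reward", runner="pooling", trust_remote_code=True)
raw = llm.reward(["hi"], pooling_params=PoolingParams(activation=False), use_tqdm=False)
act = llm.reward(["hi"], pooling_params=PoolingParams(activation=True), use_tqdm=False)
# With activation=False, callers can re-apply the head's activation themselves.
print(torch.allclose(torch.softmax(raw[0].outputs.data, dim=-1), act[0].outputs.data, atol=1e-2))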
@ -17,6 +17,7 @@ from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import (
     EMBED_DTYPE_TO_TORCH_DTYPE,
     EmbeddingResponse,
+    PoolingResponse,
 )
 from vllm.transformers_utils.tokenizer import get_tokenizer

@ -509,3 +510,20 @@ async def test_normalize(server: RemoteOpenAIServer, model_name: str):
     assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
         "w_normal should be close to normal(wo_normal)."
     )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_pooling(server: RemoteOpenAIServer, model_name: str):
+    input_text = ["The chef prepared a delicious meal."]
+
+    response = requests.post(
+        server.url_for("pooling"),
+        json={"model": model_name, "input": input_text, "encoding_format": "float"},
+    )
+
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert len(poolings.data) == 1
+    assert len(poolings.data[0].data) == 11
+    assert len(poolings.data[0].data[0]) == 384
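Reviewer note: the new test_pooling exercises the server's /pooling route, which returns token-level matrices (here 1 prompt x 11 tokens x 384 dims for the embedding model). A request sketch against a locally running server (host, port, and model are assumptions):

import requests

# Assumes an OpenAI-compatible vLLM pooling server is listening on localhost:8000.
resp = requests.post(
    "http://localhost:8000/pooling",
    json={
        "model": "intfloat/multilingual-e5-small",
        "input": ["The chef prepared a delicious meal."],
        "encoding_format": "float",
    },
)
data = resp.json()["data"]
print(len(data), len(data[0]["data"]), len(data[0]["data"][0]))  # prompts, tokens, dims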
@ -7,7 +7,7 @@ import torch
 import torch.nn.functional as F

 from tests.utils import RemoteOpenAIServer
-from vllm.entrypoints.openai.protocol import RerankResponse
+from vllm.entrypoints.openai.protocol import PoolingResponse, RerankResponse

 MODEL_NAME = "BAAI/bge-reranker-base"
 DTYPE = "bfloat16"
@ -159,3 +159,20 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str):
     assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
         "w_activation should be close to activation(wo_activation)."
     )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_pooling(server: RemoteOpenAIServer, model_name: str):
+    input_text = ["The chef prepared a delicious meal."]
+
+    response = requests.post(
+        server.url_for("pooling"),
+        json={"model": model_name, "input": input_text, "encoding_format": "float"},
+    )
+
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert len(poolings.data) == 1
+    assert len(poolings.data[0].data) == 11
+    assert len(poolings.data[0].data[0]) == 1
@ -6,7 +6,11 @@ from unittest.mock import MagicMock, patch
 import pytest
 from openai_harmony import Author, Message, Role, StreamState, TextContent

-from vllm.entrypoints.context import HarmonyContext, StreamingHarmonyContext
+from vllm.entrypoints.context import (
+    HarmonyContext,
+    StreamingHarmonyContext,
+    TurnMetrics,
+)
 from vllm.outputs import CompletionOutput, RequestOutput


@ -101,8 +105,12 @@ def test_single_turn_token_counting():

     # Verify internal state tracking
     assert not context.is_first_turn
-    assert context.previous_turn.input_tokens == 5
-    assert context.previous_turn.output_tokens == 3
+    assert len(context.all_turn_metrics) == 1
+    previous_turn = context.all_turn_metrics[0]
+    assert previous_turn.input_tokens == 5
+    assert previous_turn.output_tokens == 3
+    assert previous_turn.cached_input_tokens == 2
+    assert previous_turn.tool_output_tokens == 0


 @pytest.mark.asyncio
@ -156,6 +164,15 @@ async def test_multi_turn_token_counting():
     assert context.num_tool_output_tokens == expected_tool_output
     assert context.num_cached_tokens == 5 + 15

+    # Validate all turn metrics
+    assert len(context.all_turn_metrics) == 3
+    for i, turn in enumerate(context.all_turn_metrics):
+        assert turn.input_tokens == prompt_token_counts[i]
+        assert turn.output_tokens == output_token_counts[i]
+        assert turn.cached_input_tokens == cached_token_counts[i]
+    assert context.all_turn_metrics[1].tool_output_tokens == 7
+    assert context.all_turn_metrics[2].tool_output_tokens == 1
+

 def test_empty_output_tokens():
     """Test behavior when RequestOutput has empty output tokens."""
@ -314,6 +331,10 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
     # Create a streaming context
     context = StreamingHarmonyContext(messages=[], available_tools=["browser"])

+    num_prompt_tokens = [3, 8, 13]
+    num_output_tokens = [3, 3, 2]
+    num_cached_tokens = [0, 3, 8]
+
     # Simulate three turns of conversation:
     # Turn 1: stream tokens one by one, then finish the message
     # Turn 2: new prompt, stream more tokens with a reasoning segment
@ -325,7 +346,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
         create_mock_request_output(
             prompt_token_ids=[1, 2, 3],  # 3 prompt tokens
             output_token_ids=[101],  # Single token
-            num_cached_tokens=0,
+            num_cached_tokens=num_cached_tokens[0],
             finished=False,  # Not end of message yet
         )
     )
@ -370,7 +391,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
             5,
         ],  # 8 tokens (includes previous)
         output_token_ids=[201],
-        num_cached_tokens=3,  # Some tokens cached
+        num_cached_tokens=num_cached_tokens[1],  # Some tokens cached
         finished=False,
     )
 )
@ -422,7 +443,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
             7,
         ],  # 13 tokens
         output_token_ids=[301],
-        num_cached_tokens=8,  # More cached tokens
+        num_cached_tokens=num_cached_tokens[2],  # More cached tokens
         finished=False,
     )
 )
@ -435,10 +456,12 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
     )

     # Final token counts check
-    assert context.num_prompt_tokens == 3 + 8 + 13  # All prompts
-    assert context.num_output_tokens == 3 + 3 + 2  # All outputs
+    assert context.num_prompt_tokens == sum(num_prompt_tokens)  # All prompts
+    assert context.num_output_tokens == sum(num_output_tokens)  # All outputs
     assert context.num_reasoning_tokens == 3  # Unchanged from second turn
-    assert context.num_cached_tokens == 3 + 8  # Accumulated cached tokens
+    assert context.num_cached_tokens == sum(
+        num_cached_tokens
+    )  # Accumulated cached tokens

     # Additional tool tokens from third turn
     # Formula: this turn prompt - last turn prompt - last turn output
@ -447,6 +470,15 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
         context.num_tool_output_tokens == expected_tool_tokens + additional_tool_tokens
     )
+
+    # Validate all turn metrics
+    assert len(context.all_turn_metrics) == 3
+    for i, turn in enumerate(context.all_turn_metrics):
+        assert turn.input_tokens == num_prompt_tokens[i]
+        assert turn.output_tokens == num_output_tokens[i]
+        assert turn.cached_input_tokens == num_cached_tokens[i]
+    assert context.all_turn_metrics[1].tool_output_tokens == 2
+    assert context.all_turn_metrics[2].tool_output_tokens == 2


 @pytest.mark.asyncio
 async def test_streaming_message_synchronization(mock_parser):
@ -522,3 +554,46 @@ async def test_streaming_message_synchronization(mock_parser):
     assert len(context._messages) == 3
     assert context.num_init_messages == 1
     assert context._messages[2].content[0].text == "Response 4"
+
+
+def test_turn_metrics_copy_and_reset():
+    """Test TurnMetrics copy and reset methods work correctly."""
+    # Create a TurnMetrics with specific values
+    original_metrics = TurnMetrics(
+        input_tokens=10,
+        output_tokens=20,
+        cached_input_tokens=5,
+        tool_output_tokens=3,
+    )
+
+    # Test copy functionality
+    copied_metrics = original_metrics.copy()
+
+    # Verify copy has same values
+    assert copied_metrics.input_tokens == 10
+    assert copied_metrics.output_tokens == 20
+    assert copied_metrics.cached_input_tokens == 5
+    assert copied_metrics.tool_output_tokens == 3
+
+    # Verify they are separate objects
+    assert copied_metrics is not original_metrics
+
+    # Modify copy to ensure independence
+    copied_metrics.input_tokens = 999
+    assert original_metrics.input_tokens == 10  # Original unchanged
+    assert copied_metrics.input_tokens == 999
+
+    # Test reset functionality
+    original_metrics.reset()
+
+    # Verify all fields are reset to zero
+    assert original_metrics.input_tokens == 0
+    assert original_metrics.output_tokens == 0
+    assert original_metrics.cached_input_tokens == 0
+    assert original_metrics.tool_output_tokens == 0
+
+    # Verify copied metrics are unaffected by reset
+    assert copied_metrics.input_tokens == 999
+    assert copied_metrics.output_tokens == 20
+    assert copied_metrics.cached_input_tokens == 5
+    assert copied_metrics.tool_output_tokens == 3
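Reviewer note: the context changes replace the single previous_turn snapshot with an all_turn_metrics history, one entry per completed turn. The asserts imply an interface like the sketch below (an illustration inferred from this diff, not the actual vllm.entrypoints.context definition):

from dataclasses import dataclass, replace


@dataclass
class TurnMetrics:
    """Per-turn token accounting as exercised above (sketch)."""

    input_tokens: int = 0
    output_tokens: int = 0
    cached_input_tokens: int = 0
    tool_output_tokens: int = 0

    def copy(self) -> "TurnMetrics":
        # Independent snapshot, so archiving a turn then resetting is safe.
        return replace(self)

    def reset(self) -> None:
        self.input_tokens = 0
        self.output_tokens = 0
        self.cached_input_tokens = 0
        self.tool_output_tokens = 0


current = TurnMetrics(input_tokens=10, output_tokens=20, cached_input_tokens=5, tool_output_tokens=3)
history = [current.copy()]  # archive the finished turn
current.reset()             # start accounting for the next turn
assert history[0].input_tokens == 10 and current.input_tokens == 0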
@ -12,7 +12,7 @@ from vllm.entrypoints.openai.api_server import (
 from vllm.inputs import TextPrompt
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
-from vllm.utils import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators

 MODEL_PATH = "zai-org/chatglm3-6b"
 LORA_RANK = 64
45
tests/models/language/pooling/test_multi_vector_retrieval.py
Normal file
@ -0,0 +1,45 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from transformers import AutoModel

from tests.models.utils import check_embeddings_close


@pytest.mark.parametrize(
    "model",
    ["BAAI/bge-m3"],
)
@pytest.mark.parametrize("dtype", ["half"])
@torch.inference_mode
def test_embed_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str):
    with vllm_runner(
        model,
        runner="pooling",
        max_model_len=None,
    ) as vllm_model:
        vllm_outputs = vllm_model.token_embed(example_prompts)

    with hf_runner(
        model,
        auto_cls=AutoModel,
    ) as hf_model:
        tokenizer = hf_model.tokenizer
        hf_outputs = []
        for prompt in example_prompts:
            inputs = tokenizer([prompt], return_tensors="pt")
            inputs = hf_model.wrap_device(inputs)
            output = hf_model.model(**inputs)
            embedding = output.last_hidden_state[0].float()
            # normal
            hf_outputs.append(embedding.cpu())

    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
        check_embeddings_close(
            embeddings_0_lst=hf_output,
            embeddings_1_lst=vllm_output,
            name_0="hf",
            name_1="vllm",
            tol=1e-2,
        )
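Reviewer note: this new test asserts multi-vector (late-interaction style) parity: vLLM's token_embed output for BAAI/bge-m3 should match the raw Hugging Face last_hidden_state per token within tol=1e-2. Stripped of the test fixtures, the HF reference side amounts to:

import torch
from transformers import AutoModel, AutoTokenizer

# Fixture-free sketch of the reference computation above.
name = "BAAI/bge-m3"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModel.from_pretrained(name).eval()

with torch.inference_mode():
    batch = tokenizer(["A quick brown fox"], return_tensors="pt")
    reference = model(**batch).last_hidden_state[0].float()  # (num_tokens, hidden)
print(reference.shape)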
@ -93,7 +93,7 @@ def test_embed_models_using_normalize(
     ],
 )
 @pytest.mark.parametrize("dtype", ["half"])
-def test_reward_models_using_softmax(
+def test_reward_models_using_activation(
     hf_runner,
     vllm_runner,
     example_prompts,
@ -104,22 +104,64 @@ def test_reward_models_using_softmax(
         model,
         max_model_len=1024,
         dtype=dtype,
-        pooler_config=PoolerConfig(softmax=False),
+        pooler_config=PoolerConfig(activation=False),
     ) as vllm_model:
-        wo_softmax = vllm_model.encode(example_prompts)
+        wo_activation = vllm_model.reward(example_prompts)

     with vllm_runner(
-        model, max_model_len=1024, dtype=dtype, pooler_config=PoolerConfig(softmax=True)
+        model,
+        max_model_len=1024,
+        dtype=dtype,
+        pooler_config=PoolerConfig(activation=True),
     ) as vllm_model:
-        w_softmax = vllm_model.encode(example_prompts)
+        w_activation = vllm_model.reward(example_prompts)

-    for wo, w in zip(wo_softmax, w_softmax):
+    for wo, w in zip(wo_activation, w_activation):
         wo = torch.tensor(wo)
         w = torch.tensor(w)

         assert not torch.allclose(wo, w, atol=1e-2), (
-            "pooler_config softmax is not working"
+            "pooler_config activation is not working"
         )
         assert torch.allclose(softmax(wo), w, atol=1e-2), (
-            "w_softmax should be close to softmax(wo_softmax)."
+            "w_activation should be close to activation(wo_activation)."
         )
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "intfloat/multilingual-e5-small",
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_multi_vector_retrieval_models_using_normalize(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    with vllm_runner(
+        model,
+        max_model_len=512,
+        dtype=dtype,
+        pooler_config=PoolerConfig(normalize=False),
+    ) as vllm_model:
+        wo_normalize = vllm_model.token_embed(example_prompts)
+
+    with vllm_runner(
+        model,
+        max_model_len=512,
+        dtype=dtype,
+        pooler_config=PoolerConfig(normalize=True),
+    ) as vllm_model:
+        w_normalize = vllm_model.token_embed(example_prompts)
+
+    for wo, w in zip(wo_normalize, w_normalize):
+        assert not torch.allclose(wo, w, atol=1e-2), (
+            "pooler_config normalize is not working"
+        )
+        assert torch.allclose(F.normalize(wo, p=2, dim=-1), w, atol=1e-2), (
+            "w_normal should be close to normal(wo_normal)."
+        )
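Reviewer note: the same rename applies at engine level: PoolerConfig(activation=...) sets the default for reward heads, while PoolerConfig(normalize=...) is the analogous switch for (token) embeddings, which the new normalize test covers. A sketch of both defaults, following the vllm_runner keyword usage above (model names illustrative):

from vllm import LLM
from vllm.config import PoolerConfig

reward_llm = LLM(
    model="internlm/internlm2-1_8b-reward",
    runner="pooling",
    trust_remote_code=True,
    pooler_config=PoolerConfig(activation=False),  # raw head outputs
)
embed_llm = LLM(
    model="intfloat/multilingual-e5-small",
    runner="pooling",
    pooler_config=PoolerConfig(normalize=True),  # unit-norm vectors
)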
@ -19,7 +19,7 @@ def test_bert_models(
     dtype: str,
 ) -> None:
     with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
+        vllm_outputs = vllm_model.token_classify(example_prompts)

     with hf_runner(
         model, dtype=dtype, auto_cls=AutoModelForTokenClassification
@ -50,7 +50,7 @@ def test_modernbert_models(
     dtype: str,
 ) -> None:
     with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
+        vllm_outputs = vllm_model.token_classify(example_prompts)

     with hf_runner(
         model, dtype=dtype, auto_cls=AutoModelForTokenClassification
@ -17,7 +17,7 @@ from transformers import (
 )

 from vllm.platforms import current_platform
-from vllm.utils import identity
+from vllm.utils.func import identity

 from ....conftest import (
     IMAGE_ASSETS,
@ -38,7 +38,7 @@ def run_intern_vit_test(
     config.norm_type = "rms_norm"

     hf_model = AutoModel.from_pretrained(
-        model, torch_dtype=torch_dtype, trust_remote_code=True
+        model, dtype=torch_dtype, trust_remote_code=True
     ).to("cuda")
     hf_outputs_per_image = [
         hf_model(pixel_value.to("cuda")).last_hidden_state
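Reviewer note: the torch_dtype-to-dtype edits here (and in the radio test below) follow Transformers deprecating the torch_dtype keyword of from_pretrained in favor of dtype in recent releases. Sketch:

import torch
from transformers import AutoModel

# Newer Transformers accept `dtype` directly; `torch_dtype` is the deprecated spelling.
model = AutoModel.from_pretrained("bert-base-uncased", dtype=torch.float16)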
@ -39,7 +39,7 @@ def _run_test(
         max_num_seqs=32,
         default_torch_num_threads=1,
     ) as vllm_model:
-        vllm_model.encode(prompt)
+        vllm_model.llm.encode(prompt, pooling_task="token_classify")


 MODELS = ["mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
@ -45,7 +45,7 @@ def run_radio_test(
     hf_model = AutoModel.from_pretrained(
         model_id,
         config=config,
-        torch_dtype=torch_dtype,
+        dtype=torch_dtype,
         trust_remote_code=True,
     ).to("cuda")
     hf_model.eval()
@ -30,7 +30,7 @@ class MyGemma2Embedding(nn.Module):

         self.pooler = DispatchPooler(
             {
-                "encode": Pooler.for_encode(pooler_config),
+                "token_embed": Pooler.for_token_embed(pooler_config),
                 "embed": Pooler.for_embed(pooler_config),
             }
         )
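Reviewer note: the pooler registry migrates from a catch-all "encode" task to an explicit "token_embed" task with its own constructor. A sketch of the resulting dispatch table, assuming the same Pooler helpers used above:

from vllm.model_executor.layers.pooler import DispatchPooler, Pooler

def build_pooler(pooler_config):
    # Route each pooling task name to a dedicated sub-pooler.
    return DispatchPooler(
        {
            "token_embed": Pooler.for_token_embed(pooler_config),  # one vector per token
            "embed": Pooler.for_embed(pooler_config),  # one vector per prompt
        }
    )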
@ -93,7 +93,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
         out_data_format="b64_json",
     )

-    pooling_params = PoolingParams(task="encode", softmax=False)
+    pooling_params = PoolingParams(activation=False)

     with vllm_runner(
         model_name,
@ -108,8 +108,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
         io_processor_plugin="prithvi_to_tiff",
     ) as llm_runner:
         pooler_output = llm_runner.get_llm().encode(
-            img_prompt,
-            pooling_params=pooling_params,
+            img_prompt, pooling_params=pooling_params, pooling_task="token_classify"
         )
         output = pooler_output[0].outputs
@ -697,7 +697,8 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
 @pytest.mark.parametrize(
     "args",
     [
-        ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4),
+        # TODO: Enable once model is available again
+        # ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4),
         ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4),
     ],
 )