Merge remote-tracking branch 'origin/main' into refactor-fp8-linear

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
This commit is contained in:
vllmellm 2025-11-18 08:03:30 +00:00
commit 9d94f5d4f0
289 changed files with 10926 additions and 2798 deletions

View File

@ -59,7 +59,7 @@ while true; do
fi
done
echo "--- Pulling container"
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"
@ -177,13 +177,13 @@ if [[ -z "$render_gid" ]]; then
exit 1
fi
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
# assign job count as the number of shards used
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
# assign job count as the number of shards used
commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
# assign shard-id for each shard
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
echo "Shard ${GPU} commands:$commands_gpu"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \

View File

@ -46,6 +46,6 @@ docker run \
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py
'

View File

@ -226,6 +226,27 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
- label: Distributed Tests (8 GPUs) # 4min
timeout_in_minutes: 10
mirror_hardwares: [amdexperimental]
agent_pool: mi325_8
# grade: Blocking
gpu: h100
num_gpus: 8
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- examples/offline_inference/torchrun_dp_example.py
- vllm/config/parallel.py
- vllm/distributed/
- vllm/v1/engine/llm_engine.py
- vllm/v1/executor/uniproc_executor.py
- vllm/v1/worker/gpu_worker.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
#- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and dp=4 with ep
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: EPLB Algorithm Test # 5min
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
@ -238,11 +259,11 @@ steps:
commands:
- pytest -v -s distributed/test_eplb_algo.py
- label: EPLB Execution Test # 5min
- label: EPLB Execution Test # 10min
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
timeout_in_minutes: 15
timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@ -250,6 +271,7 @@ steps:
- tests/distributed/test_eplb_execute.py
commands:
- pytest -v -s distributed/test_eplb_execute.py
- pytest -v -s distributed/test_eplb_spec_decode.py
- label: Metrics, Tracing Test # 12min
timeout_in_minutes: 20
@ -273,7 +295,7 @@ steps:
- label: Regression Test # 7min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
grade: Blocking
source_file_dependencies:
@ -288,7 +310,7 @@ steps:
timeout_in_minutes: 40
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
#grade: Blocking
# grade: Blocking
source_file_dependencies:
- vllm/
- tests/engine
@ -337,6 +359,7 @@ steps:
- tests/v1
commands:
# split the test to avoid interference
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/kv_offload
@ -344,7 +367,7 @@ steps:
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
- pytest -v -s v1/spec_decode
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_lmcache_integration.py
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_request.py
@ -353,6 +376,20 @@ steps:
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
# TODO: Add the "V1 Test attetion (MI300)" test group
- label: V1 Test attention (H100) # 10min
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
timeout_in_minutes: 30
gpu: h100
source_file_dependencies:
- vllm/v1/attention
- tests/v1/attention
commands:
- pytest -v -s v1/attention
- label: V1 Test others (CPU) # 5 mins
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
@ -479,10 +516,11 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s compile/test_multimodal_compile.py
- pytest -v -s compile/piecewise/
- label: PyTorch Fullgraph Test # 22min
timeout_in_minutes: 35
- label: PyTorch Fullgraph Test # 27min
timeout_in_minutes: 40
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
@ -491,8 +529,23 @@ steps:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_full_graph.py
- pytest -v -s compile/test_fusions_e2e.py
- pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
# Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a -
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
- label: Cudagraph test
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
source_file_dependencies:
- tests/v1/cudagraph
- vllm/v1/cudagraph_dispatcher.py
- vllm/config/compilation.py
- vllm/compilation
commands:
- pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
- pytest -v -s v1/cudagraph/test_cudagraph_mode.py
- label: Kernels Core Operation Test # 48min
timeout_in_minutes: 75
@ -544,6 +597,8 @@ steps:
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
- vllm/distributed/device_communicators/
- vllm/envs.py
- vllm/config
commands:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
@ -562,10 +617,13 @@ steps:
- label: Model Executor Test # 23min
timeout_in_minutes: 35
torch_nightly: true
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- vllm/engine/arg_utils.py
- vllm/config/model.py
- vllm/model_executor
- tests/model_executor
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
@ -861,9 +919,10 @@ steps:
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
timeout_in_minutes: 70
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
timeout_in_minutes: 15
# grade: Blocking
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
@ -934,6 +993,7 @@ steps:
- label: Transformers Nightly Models Test
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/"
optional: true
commands:
@ -961,11 +1021,16 @@ steps:
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py
- vllm/v1/attention/backends/mla/flashinfer_mla.py
- vllm/platforms/cuda.py
- vllm/attention/selector.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
@ -1002,12 +1067,39 @@ steps:
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusions_e2e.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
- label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
optional: true
num_gpus: 2
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusions_e2e.py
- tests/compile/test_full_graph.py
commands:
- nvidia-smi
# Run all e2e fusion tests
- pytest -v -s tests/compile/test_fusions_e2e.py
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
- label: ROCm GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
agent_pool: mi325_1
mirror_hardwares: [amdproduction]
optional: true # run on nightlies
source_file_dependencies:
- tests/evals/gpt_oss
@ -1016,7 +1108,7 @@ steps:
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Blackwell Quantized MoE Test
timeout_in_minutes: 60
@ -1253,6 +1345,7 @@ steps:
- label: NixlConnector PD accuracy tests (Distributed) # 30min
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 4
@ -1267,6 +1360,9 @@ steps:
##### A100 test #####
- label: Distributed Tests (A100) # optional
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
gpu: a100
optional: true
num_gpus: 4
@ -1281,6 +1377,9 @@ steps:
- pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
gpu: a100
optional: true
num_gpus: 4
@ -1292,8 +1391,27 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
##### H100 test #####
- label: LM Eval Large Models (H100) # optional
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
##### H200 test #####
- label: Distributed Tests (H200) # optional
mirror_hardwares: [amdexperimental]
agent_pool: mi325_2
# grade: Blocking
gpu: h200
optional: true
working_dir: "/vllm-workspace/"
@ -1305,6 +1423,7 @@ steps:
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####
- label: Distributed Tests (B200) # optional
@ -1315,6 +1434,7 @@ steps:
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
- pytest -v -s tests/v1/distributed/test_dbo.py
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
@ -1330,3 +1450,27 @@ steps:
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Accuracy
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020

View File

@ -445,6 +445,7 @@ steps:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_graph_partition.py
- pytest -v -s compile/test_config.py
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
@ -477,10 +478,11 @@ steps:
- vllm/
- tests/compile
commands:
# fp8 kv scales not supported on sm89, tested on Blackwell instead
- pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
# Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a -
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
- label: Cudagraph test
timeout_in_minutes: 20
@ -924,7 +926,7 @@ steps:
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- label: Blackwell Fusion Tests # 30 min
- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
@ -945,7 +947,9 @@ steps:
- pytest -v -s tests/compile/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40
@ -968,8 +972,6 @@ steps:
- nvidia-smi
# Run all e2e fusion tests
- pytest -v -s tests/compile/test_fusions_e2e.py
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
@ -1265,7 +1267,8 @@ steps:
- pytest -v -s tests/compile/test_async_tp.py
- pytest -v -s tests/compile/test_sequence_parallelism.py
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
- pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py

18
.github/CODEOWNERS vendored
View File

@ -3,8 +3,8 @@
# This lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep
@ -20,15 +20,15 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
@ -36,11 +36,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/vllm/v1/offloading @ApostaC
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
/.buildkite/lm-eval-harness @mgoin
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @aarnphm @NickLucche
/tests/evals @mgoin
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
@ -49,7 +49,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
@ -57,7 +57,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC
# Transformers backend
# Transformers modeling backend
/vllm/model_executor/models/transformers @hmellor
/tests/models/test_transformers.py @hmellor

76
.github/workflows/macos-smoke-test.yml vendored Normal file
View File

@ -0,0 +1,76 @@
name: macOS Apple Silicon Smoke Test
on:
workflow_dispatch: # Manual trigger
jobs:
macos-m1-smoke-test:
runs-on: macos-latest
timeout-minutes: 20
steps:
- uses: actions/checkout@v4
- uses: astral-sh/setup-uv@v7
with:
enable-cache: true
cache-dependency-glob: |
requirements/**/*.txt
pyproject.toml
python-version: '3.12'
- name: Install dependencies
run: |
uv pip install -r requirements/cpu-build.txt
uv pip install -r requirements/cpu.txt
- name: Build vLLM
run: uv pip install -v -e .
env:
CMAKE_BUILD_PARALLEL_LEVEL: 4
- name: Verify installation
run: |
python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
python -c "import torch; print(f'PyTorch: {torch.__version__}')"
- name: Smoke test vllm serve
timeout-minutes: 10
run: |
# Start server in background
vllm serve Qwen/Qwen3-0.6B \
--max-model-len=2048 \
--load-format=dummy \
--enforce-eager \
--port 8000 &
SERVER_PID=$!
# Wait for server to start
for i in {1..30}; do
if curl -s http://localhost:8000/health > /dev/null; then
echo "Server started successfully"
break
fi
if [ "$i" -eq 30 ]; then
echo "Server failed to start"
kill "$SERVER_PID"
exit 1
fi
sleep 2
done
# Test health endpoint
curl -f http://localhost:8000/health
# Test completion
curl -f http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen3-0.6B",
"prompt": "Hello",
"max_tokens": 5
}'
# Cleanup
kill "$SERVER_PID"

View File

@ -3,10 +3,9 @@ MD007:
MD013: false
MD024:
siblings_only: true
MD031:
list_items: false
MD033: false
MD045: false
MD046: false
MD051: false
MD052: false
MD053: false
MD059: false

View File

@ -861,7 +861,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
# Hadacore kernels
cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
if(HADACORE_ARCHS)
set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
set_gencode_flags_for_srcs(

View File

@ -0,0 +1,380 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark to measure the performance overhead of VLLM_BATCH_INVARIANT mode.
This benchmark runs the same workload twice:
1. With VLLM_BATCH_INVARIANT=0 (baseline)
2. With VLLM_BATCH_INVARIANT=1 (batch invariant mode)
And reports the timing and throughput metrics for comparison.
Environment variables:
VLLM_BENCH_MODEL: Model to benchmark (default: "Qwen/Qwen3-1.7B")
VLLM_BENCH_TP_SIZE: Tensor parallel size (default: 1, use 8 for deepseek)
VLLM_BENCH_BATCH_SIZE: Max batch size (default: 128)
VLLM_BENCH_NUM_TRIALS: Number of trials to run (default: 5)
VLLM_BENCH_MIN_PROMPT: Min prompt length in words (default: 1024)
VLLM_BENCH_MAX_PROMPT: Max prompt length in words (default: 2048)
VLLM_BENCH_MAX_TOKENS: Max tokens to generate (default: 128)
VLLM_BENCH_TEMPERATURE: Temperature for sampling (default: 0.0)
VLLM_BENCH_GPU_MEMORY_UTILIZATION: GPU memory utilization (default: 0.4)
VLLM_BENCH_MAX_MODEL_LEN: Max model length (default: 5120)
VLLM_BENCH_BACKEND: Attention backend (default: FLASH_ATTN)
Example usage:
# Benchmark qwen3 (default)
python benchmarks/benchmark_batch_invariance.py
# Benchmark deepseek with 8 GPUs
VLLM_BENCH_MODEL="deepseek-ai/DeepSeek-V3" VLLM_BENCH_TP_SIZE=8 \\
python benchmarks/benchmark_batch_invariance.py
# Quick test with fewer trials
VLLM_BENCH_NUM_TRIALS=2 VLLM_BENCH_BATCH_SIZE=32 \\
python benchmarks/benchmark_batch_invariance.py
"""
import contextlib
import os
import random
import time
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
"""Generate a random prompt for benchmarking."""
prompt_templates = [
"Question: What is the capital of France?\nAnswer: The capital of France is",
"Q: How does photosynthesis work?\nA: Photosynthesis is the process by which",
"User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is",
"Once upon a time in a distant galaxy, there lived",
"The old man walked slowly down the street, remembering",
"In the year 2157, humanity finally discovered",
"To implement a binary search tree in Python, first we need to",
"The algorithm works by iterating through the array and",
"Here's how to optimize database queries using indexing:",
"The Renaissance was a period in European history that",
"Climate change is caused by several factors including",
"The human brain contains approximately 86 billion neurons which",
"I've been thinking about getting a new laptop because",
"Yesterday I went to the store and bought",
"My favorite thing about summer is definitely",
]
base_prompt = random.choice(prompt_templates)
if max_words < min_words:
max_words = min_words
target_words = random.randint(min_words, max_words)
if target_words > 50:
padding_text = (
" This is an interesting topic that deserves more explanation. "
* (target_words // 50)
)
base_prompt = base_prompt + padding_text
return base_prompt
def run_benchmark_with_batch_invariant(
model: str,
tp_size: int,
max_batch_size: int,
num_trials: int,
min_prompt: int,
max_prompt: int,
max_tokens: int,
temperature: float,
gpu_mem_util: float,
max_model_len: int,
backend: str,
batch_invariant: bool,
seed: int = 12345,
) -> dict:
"""
Run the benchmark with the specified configuration.
Returns a dict with timing and throughput metrics.
"""
random.seed(seed)
# Set environment variables
os.environ["VLLM_ATTENTION_BACKEND"] = backend
if batch_invariant:
os.environ["VLLM_BATCH_INVARIANT"] = "1"
else:
os.environ["VLLM_BATCH_INVARIANT"] = "0"
print(f"\n{'=' * 80}")
print(f"BENCHMARK: VLLM_BATCH_INVARIANT={int(batch_invariant)}")
print(f" Model: {model}")
print(f" TP Size: {tp_size}")
print(f" Backend: {backend}")
print(f" Max Batch Size: {max_batch_size}")
print(f" Trials: {num_trials}")
print(f" Max Tokens: {max_tokens}")
print(f"{'=' * 80}\n")
sampling = SamplingParams(
temperature=temperature,
top_p=0.95,
max_tokens=max_tokens,
seed=20240919,
)
needle_prompt = "There once was a "
llm = None
try:
# Create LLM engine
start_init = time.perf_counter()
llm = LLM(
model=model,
max_num_seqs=max_batch_size,
gpu_memory_utilization=gpu_mem_util,
max_model_len=max_model_len,
dtype="bfloat16",
tensor_parallel_size=tp_size,
enable_prefix_caching=False,
)
init_time = time.perf_counter() - start_init
print(f"Engine initialization time: {init_time:.2f}s\n")
# Generate baseline
print("Generating baseline (warmup)...")
baseline_out = llm.generate([needle_prompt], sampling)
assert len(baseline_out) == 1
baseline_text = baseline_out[0].outputs[0].text
print(f"Baseline output: '{baseline_text[:50]}...'\n")
# Run trials and measure timing
trial_times: list[float] = []
total_tokens = 0
total_prompts = 0
for trial in range(num_trials):
# Create a batch
prompts: list[str] = []
batch_size = random.randint(max_batch_size // 2, max_batch_size)
needle_pos = random.randint(0, batch_size - 1)
for i in range(batch_size):
if i == needle_pos:
prompts.append(needle_prompt)
else:
prompts.append(_random_prompt(min_prompt, max_prompt))
# Measure time for this trial
start_time = time.perf_counter()
outputs = llm.generate(prompts, sampling)
trial_time = time.perf_counter() - start_time
trial_times.append(trial_time)
total_prompts += len(prompts)
# Count tokens
for output in outputs:
if output.outputs:
total_tokens += len(output.outputs[0].token_ids)
print(
f"Trial {trial + 1}/{num_trials}: "
f"batch_size={batch_size}, "
f"time={trial_time:.2f}s"
)
# Verify needle output still matches
needle_output = outputs[needle_pos]
assert needle_output.prompt == needle_prompt
# Compute statistics
avg_time = sum(trial_times) / len(trial_times)
min_time = min(trial_times)
max_time = max(trial_times)
throughput = total_tokens / sum(trial_times)
prompts_per_sec = total_prompts / sum(trial_times)
print(f"\n{'=' * 80}")
print("RESULTS:")
print(f" Average time per trial: {avg_time:.2f}s")
print(f" Min time: {min_time:.2f}s")
print(f" Max time: {max_time:.2f}s")
print(f" Total tokens generated: {total_tokens}")
print(f" Total prompts processed: {total_prompts}")
print(f" Throughput: {throughput:.2f} tokens/s")
print(f" Prompts/s: {prompts_per_sec:.2f}")
print(f"{'=' * 80}\n")
return {
"init_time": init_time,
"avg_time": avg_time,
"min_time": min_time,
"max_time": max_time,
"total_tokens": total_tokens,
"total_prompts": total_prompts,
"throughput": throughput,
"prompts_per_sec": prompts_per_sec,
"trial_times": trial_times,
}
finally:
# Cleanup
if llm is not None:
with contextlib.suppress(Exception):
llm.shutdown()
def main():
# Check platform support
if not (current_platform.is_cuda() and current_platform.has_device_capability(90)):
print("ERROR: Requires CUDA and >= Hopper (SM90)")
print(f"Current platform: {current_platform.device_type}")
if current_platform.is_cuda():
print(f"Device capability: {current_platform.get_device_capability()}")
return 1
# Read configuration from environment
model = os.getenv("VLLM_BENCH_MODEL", "Qwen/Qwen3-1.7B")
tp_size = int(os.getenv("VLLM_BENCH_TP_SIZE", "1"))
max_batch_size = int(os.getenv("VLLM_BENCH_BATCH_SIZE", "128"))
num_trials = int(os.getenv("VLLM_BENCH_NUM_TRIALS", "5"))
min_prompt = int(os.getenv("VLLM_BENCH_MIN_PROMPT", "1024"))
max_prompt = int(os.getenv("VLLM_BENCH_MAX_PROMPT", "2048"))
max_tokens = int(os.getenv("VLLM_BENCH_MAX_TOKENS", "128"))
temperature = float(os.getenv("VLLM_BENCH_TEMPERATURE", "0.0"))
gpu_mem_util = float(os.getenv("VLLM_BENCH_GPU_MEMORY_UTILIZATION", "0.4"))
max_model_len = int(os.getenv("VLLM_BENCH_MAX_MODEL_LEN", "5120"))
backend = os.getenv("VLLM_BENCH_BACKEND", "FLASH_ATTN")
print("\n" + "=" * 80)
print("VLLM BATCH INVARIANCE BENCHMARK")
print("=" * 80)
print("\nConfiguration:")
print(f" Model: {model}")
print(f" Tensor Parallel Size: {tp_size}")
print(f" Attention Backend: {backend}")
print(f" Max Batch Size: {max_batch_size}")
print(f" Number of Trials: {num_trials}")
print(f" Prompt Length Range: {min_prompt}-{max_prompt} words")
print(f" Max Tokens to Generate: {max_tokens}")
print(f" Temperature: {temperature}")
print(f" GPU Memory Utilization: {gpu_mem_util}")
print(f" Max Model Length: {max_model_len}")
print("=" * 80)
# Run benchmark WITHOUT batch invariance (baseline)
print("\n" + "=" * 80)
print("PHASE 1: Running WITHOUT batch invariance (baseline)")
print("=" * 80)
baseline_results = run_benchmark_with_batch_invariant(
model=model,
tp_size=tp_size,
max_batch_size=max_batch_size,
num_trials=num_trials,
min_prompt=min_prompt,
max_prompt=max_prompt,
max_tokens=max_tokens,
temperature=temperature,
gpu_mem_util=gpu_mem_util,
max_model_len=max_model_len,
backend=backend,
batch_invariant=False,
)
# Run benchmark WITH batch invariance
print("\n" + "=" * 80)
print("PHASE 2: Running WITH batch invariance")
print("=" * 80)
batch_inv_results = run_benchmark_with_batch_invariant(
model=model,
tp_size=tp_size,
max_batch_size=max_batch_size,
num_trials=num_trials,
min_prompt=min_prompt,
max_prompt=max_prompt,
max_tokens=max_tokens,
temperature=temperature,
gpu_mem_util=gpu_mem_util,
max_model_len=max_model_len,
backend=backend,
batch_invariant=True,
)
# Compare results
print("\n" + "=" * 80)
print("COMPARISON: Batch Invariance vs Baseline")
print("=" * 80)
init_overhead_pct = (
(batch_inv_results["init_time"] - baseline_results["init_time"])
/ baseline_results["init_time"]
* 100
)
time_overhead_pct = (
(batch_inv_results["avg_time"] - baseline_results["avg_time"])
/ baseline_results["avg_time"]
* 100
)
throughput_change_pct = (
(batch_inv_results["throughput"] - baseline_results["throughput"])
/ baseline_results["throughput"]
* 100
)
print("\nInitialization Time:")
print(f" Baseline: {baseline_results['init_time']:.2f}s")
print(f" Batch Invariant: {batch_inv_results['init_time']:.2f}s")
print(f" Overhead: {init_overhead_pct:+.2f}%")
print("\nAverage Trial Time:")
print(f" Baseline: {baseline_results['avg_time']:.2f}s")
print(f" Batch Invariant: {batch_inv_results['avg_time']:.2f}s")
print(f" Overhead: {time_overhead_pct:+.2f}%")
print("\nThroughput (tokens/s):")
print(f" Baseline: {baseline_results['throughput']:.2f}")
print(f" Batch Invariant: {batch_inv_results['throughput']:.2f}")
print(f" Change: {throughput_change_pct:+.2f}%")
print("\nPrompts/s:")
print(f" Baseline: {baseline_results['prompts_per_sec']:.2f}")
print(f" Batch Invariant: {batch_inv_results['prompts_per_sec']:.2f}")
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
if time_overhead_pct > 0:
print(
f"Batch invariance mode adds approximately {time_overhead_pct:.1f}% "
"overhead"
)
else:
print(
f"Batch invariance mode is approximately {-time_overhead_pct:.1f}% "
"faster (unexpected!)"
)
if abs(throughput_change_pct) < 1.0:
print("Throughput difference is negligible (< 1%)")
elif throughput_change_pct < 0:
print(
f"Throughput decreased by {-throughput_change_pct:.1f}% "
"with batch invariance"
)
else:
print(
f"Throughput increased by {throughput_change_pct:.1f}% "
"with batch invariance (unexpected!)"
)
print("=" * 80 + "\n")
return 0
if __name__ == "__main__":
exit(main())

View File

@ -69,7 +69,7 @@ def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:
# Remove the special tokens.
return random.choices(
[v for k, v in vocab.items() if k not in all_special_ids],
[v for v in vocab.values() if v not in all_special_ids],
k=length,
)

View File

@ -561,8 +561,11 @@ async def client_main(
f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}" # noqa: E501
)
random.seed(args.seed)
np.random.seed(args.seed)
# Set unique seed per client (each client runs in its own process)
# Add 1 to ensure no client uses the same seed as the main process
client_seed = args.seed + client_id + 1
random.seed(client_seed)
np.random.seed(client_seed)
# Active conversations
active_convs: ConversationsMap = {}
@ -1490,6 +1493,7 @@ async def main() -> None:
f"Invalid --warmup-percentage={args.warmup_percentage}"
) from None
# Set global seeds for main process
random.seed(args.seed)
np.random.seed(args.seed)

View File

@ -242,7 +242,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
GIT_TAG v52.2.0
GIT_TAG v52.6.0
GIT_SHALLOW TRUE
GIT_PROGRESS TRUE
)
@ -310,7 +310,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.9
GIT_TAG v3.10
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 8e1b01d56210dc72030a2d0d41c2d8d266ba6309
GIT_TAG 58e0626a692f09241182582659e3bf8f16472659
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -5,6 +5,10 @@
#include <type_traits>
#include <cstddef>
#if defined(__APPLE__)
#include <sys/sysctl.h>
#endif
#include "cpu_types.hpp"
#include "scratchpad_manager.h"
#include "cpu_attn_macros.h"
@ -741,9 +745,21 @@ class AttentionScheduler {
static int64_t get_available_l2_size() {
static int64_t size = []() {
#if defined(__APPLE__)
// macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname.
int64_t l2_cache_size = 0;
size_t len = sizeof(l2_cache_size);
if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 &&
l2_cache_size > 0) {
return l2_cache_size >> 1; // use 50% of L2 cache
}
// Fallback if sysctlbyname fails
return 128LL * 1024 >> 1; // use 50% of 128KB
#else
long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
TORCH_CHECK_NE(l2_cache_size, -1);
return l2_cache_size >> 1; // use 50% of L2 cache
#endif
}();
return size;
}
@ -816,15 +832,21 @@ struct VecTypeTrait<float> {
using vec_t = vec_op::FP32Vec16;
};
// ARM only supports BF16 with ARMv8.6-A extension
#if (defined(__aarch64__) && !defined(ARM_BF16_SUPPORT))
#else
template <>
struct VecTypeTrait<c10::BFloat16> {
using vec_t = vec_op::BF16Vec16;
};
#endif
#if !defined(__powerpc__)
template <>
struct VecTypeTrait<c10::Half> {
using vec_t = vec_op::FP16Vec16;
};
#endif
template <typename T>
void print_logits(const char* name, T* ptr, int32_t row, int32_t col,
@ -1586,9 +1608,17 @@ class AttentionMainLoop {
if (use_sink) {
alignas(64) float s_aux_fp32[16];
#if defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
// ARM without native BF16 support: manual conversion
for (int i = 0; i < 16; ++i) {
s_aux_fp32[i] = static_cast<float>(curr_s_aux[i]);
}
#else
// All other platforms have BF16Vec16 available
vec_op::BF16Vec16 vec_bf16(curr_s_aux);
vec_op::FP32Vec16 vec_fp32(vec_bf16);
vec_fp32.save(s_aux_fp32);
#endif
float* __restrict__ curr_sum_buffer = sum_buffer;
float* __restrict__ curr_max_buffer = max_buffer;

View File

@ -100,6 +100,9 @@ void cpu_attention_with_kv_cache(
const torch::Tensor& scheduler_metadata,
const std::optional<torch::Tensor>& s_aux);
// Note: just for avoiding importing errors
void placeholder_op() { TORCH_CHECK(false, "Unimplemented"); }
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// vLLM custom ops
@ -275,6 +278,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"sliding_window_left, SymInt sliding_window_right, Tensor block_table, "
"float softcap, Tensor sheduler_metadata, Tensor? s_aux) -> ()",
&cpu_attention_with_kv_cache);
// placeholders
ops.def("static_scaled_fp8_quant() -> ()", placeholder_op);
ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op);
ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op);
}
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {

View File

@ -37,6 +37,16 @@
#ifdef USE_ROCM
#define FINAL_MASK 0xffffffffffffffffULL
#if defined(HIP_VERSION) && HIP_VERSION < 70000000
// On ROCm versions before 7.0, __syncwarp isn't defined. The below
// implementation is copy/pasted from the implementation in ROCm 7.0
__device__ inline void __syncwarp() {
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
__builtin_amdgcn_wave_barrier();
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
}
#endif
#else
#define FINAL_MASK 0xffffffff
#endif

View File

@ -279,17 +279,17 @@ __device__ __forceinline__ void token_bounds(int32_t n_tokens,
}
template <int BLOCK_COUNT, int SMEM_SIZE_BYTES_Y, typename fp8_type,
int THREADS, typename Idx_t, bool USE_UE8M0, int GROUP_SIZE = 128,
int NUM_STAGES = 3>
typename scale_t, int THREADS, typename Idx_t, bool CEIL_UE8M0,
int GROUP_SIZE = 128, int NUM_STAGES = 3>
__global__ void silu_mul_fp8_quant_deep_gemm_kernel(
const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q,
float* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert,
scale_t* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert,
// sizes
Idx_t E, Idx_t T, Idx_t H,
// strides (in elements)
Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e,
Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t,
Idx_t stride_ys_g, Idx_t stride_counts_e) {
Idx_t stride_ys_g, Idx_t stride_ys_p, Idx_t stride_counts_e) {
#ifndef USE_ROCM
static constexpr int NUM_WARPS = THREADS / WARP_SIZE;
@ -466,9 +466,22 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
__nv_fp8x4_e4m3* y_q_base_ptr =
reinterpret_cast<__nv_fp8x4_e4m3*>(_y_q) + lane_id;
auto y_scale_base_ptr = _y_s + warp_position_scales * stride_ys_g;
Idx_t scale_group_offset = 0;
if constexpr (std::is_same<scale_t, uint8_t>::value) {
// packed int32_t format
int pack_id = warp_position_scales / 4;
int scale_in_pack = warp_position_scales % 4;
scale_group_offset = pack_id * stride_ys_p + scale_in_pack * stride_ys_g;
} else {
scale_group_offset = warp_position_scales * stride_ys_g;
}
scale_t* const y_scale_base_ptr = _y_s + scale_group_offset;
for (auto j = tokens_lower; j < tokens_upper; j++) {
int current_group_id = warp_position_scales; // Running count of which
// group is being processed
const Idx_t base_ys = expert_id * stride_ys_e;
auto y_s_ptr = y_scale_base_ptr + base_ys + token_offset * stride_ys_t;
__nv_fp8x4_e4m3* y_q_ptr =
@ -509,7 +522,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
__nv_bfloat16 y_s = __hmul(warp_max(_y_max2.x), fp8_inv);
if constexpr (USE_UE8M0) {
if constexpr (CEIL_UE8M0) {
y_s = hexp2(hceil(hlog2(y_s)));
}
@ -527,8 +540,24 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
y_q_ptr += WARP_SIZE * stride_yq_h;
if (!lane_id) {
*y_s_ptr = y_s;
y_s_ptr += stride_ys_g;
// Store scales.
if constexpr (std::is_same<scale_t, uint8_t>::value) {
// Packed UE8MO format. Remove Mantissa.
*y_s_ptr = reinterpret_cast<int16_t&>(y_s) >> 7;
bool const jump_pack = (current_group_id + 1) % 4 == 0;
// Minus 3 because we need to get to the first group in the
// next pack.
y_s_ptr += jump_pack ? (stride_ys_p - 3) : stride_ys_g;
} else {
// float32 format
static_assert(std::is_same<scale_t, float>::value);
*y_s_ptr = y_s;
y_s_ptr += stride_ys_g;
}
current_group_id += 1;
}
}
}
@ -573,7 +602,7 @@ void persistent_masked_m_silu_mul_quant(
const at::Tensor& tokens_per_expert, // (E)
at::Tensor& y_q, // (E, T, H) [OUT]
at::Tensor& y_s, // (E, T, H//group_size) [OUT]
bool use_ue8m0) {
bool cast_scale_ue8m0) {
#ifndef USE_ROCM
// This kernel currently only supports H % 128 == 0 and assumes a
@ -583,9 +612,12 @@ void persistent_masked_m_silu_mul_quant(
TORCH_CHECK(input.dtype() == torch::kBFloat16);
TORCH_CHECK(y_q.dtype() == torch::kFloat8_e4m3fn ||
y_q.dtype() == torch::kFloat8_e4m3fnuz);
TORCH_CHECK(y_s.dtype() == torch::kFloat32);
TORCH_CHECK(input.size(-1) % (GROUP_SIZE * 2) == 0);
bool const is_packed_ue8m0 =
(y_s.dtype() == torch::kInt32 && cast_scale_ue8m0);
TORCH_CHECK(y_s.dtype() == torch::kFloat32 || is_packed_ue8m0);
using Idx_t = int64_t;
Idx_t E = input.size(0);
@ -597,15 +629,18 @@ void persistent_masked_m_silu_mul_quant(
Idx_t stride_yq_e = y_q.stride(0);
Idx_t stride_yq_t = y_q.stride(1);
Idx_t stride_yq_h = y_q.stride(2);
Idx_t stride_ys_e = y_s.stride(0);
Idx_t stride_ys_t = y_s.stride(1);
Idx_t stride_ys_g = y_s.stride(2);
Idx_t stride_counts_e = tokens_per_expert.stride(0);
int const NUM_GROUPS = H / GROUP_SIZE;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
#define KERNEL(BLOCK_COUNT, USE_UE8M0, THREAD_COUNT, STAGES) \
// TODO: Get this from cuda_arch ?
static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32;
#define KERNEL(BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T, STRIDE_YS_G, \
STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, STAGES) \
static constexpr int NUM_WARPS = THREAD_COUNT / WARP_SIZE; \
int sms = SILU_V2_BLOCK_COUNT; \
static constexpr int max_shared_mem_bytes = \
@ -615,43 +650,86 @@ void persistent_masked_m_silu_mul_quant(
VLLM_DISPATCH_FP8_TYPES( \
y_q.scalar_type(), "silu_mul_fp8_quant_deep_gemm_kernel", [&] { \
vllm::silu_mul_fp8_quant_deep_gemm_kernel< \
BLOCK_COUNT, max_shared_mem_bytes, fp8_t, THREAD_COUNT, Idx_t, \
USE_UE8M0, GROUP_SIZE, STAGES> \
BLOCK_COUNT, max_shared_mem_bytes, fp8_t, scale_t, THREAD_COUNT, \
Idx_t, CEIL_UE8M0, GROUP_SIZE, STAGES> \
<<<grid, block, max_shared_mem_bytes + (E + 1) * 16, stream>>>( \
reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \
(fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(), \
(fp8_t*)y_q.data_ptr(), \
reinterpret_cast<scale_t*>(y_s.data_ptr()), \
reinterpret_cast<int32_t*>(tokens_per_expert.data_ptr()), E, \
T, H, stride_i_e, stride_i_t, stride_i_h, stride_yq_e, \
stride_yq_t, stride_yq_h, stride_ys_e, stride_ys_t, \
stride_ys_g, stride_counts_e); \
stride_yq_t, stride_yq_h, STRIDE_YS_E, STRIDE_YS_T, \
STRIDE_YS_G, STRIDE_YS_P, stride_counts_e); \
});
static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32;
#define LAUNCH_ON_H(scale_t, STRIDE_YS_E, STRIDE_YS_T, STRIDE_YS_G, \
STRIDE_YS_P, CEIL_UE8M0) \
if (H >= 4096 && (NUM_GROUPS % 8) == 0) { \
/* 8 warp config */ \
static constexpr int NUM_STAGES = 4; \
static constexpr int THREAD_COUNT = 256; \
KERNEL(SILU_V2_BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T, \
STRIDE_YS_G, STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, NUM_STAGES); \
} else { \
/* 1 warp config */ \
static constexpr int THREAD_COUNT = 32; \
KERNEL(SILU_V2_BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T, \
STRIDE_YS_G, STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, 2); \
}
int const NUM_GROUPS = H / GROUP_SIZE;
if (!use_ue8m0) {
if (H >= 4096 && (NUM_GROUPS % 8 == 0)) {
/* 8 warps config */
static constexpr int NUM_STAGES = 4;
static constexpr int THREAD_COUNT = 256;
KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, NUM_STAGES);
} else {
/* 1 warp config */
static constexpr int THREAD_COUNT = 32;
KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, 2);
}
} else {
if (H >= 4096 && (NUM_GROUPS % 8 == 0)) {
/* 8 warps config */
static constexpr int NUM_STAGES = 4;
static constexpr int THREAD_COUNT = 256;
KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, NUM_STAGES);
} else {
/* 1 warp config */
static constexpr int THREAD_COUNT = 32;
KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, 2);
}
Idx_t stride_ys_e = y_s.stride(0);
Idx_t stride_ys_t = y_s.stride(1);
Idx_t stride_ys_g = y_s.stride(2);
Idx_t stride_ys_p = 0;
if (!cast_scale_ue8m0) {
TORCH_CHECK(!is_packed_ue8m0);
LAUNCH_ON_H(float, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p,
false);
return;
}
if (!is_packed_ue8m0) {
// UE8M0 but not packed
LAUNCH_ON_H(float, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p,
true);
return;
}
TORCH_CHECK(cast_scale_ue8m0 && is_packed_ue8m0);
TORCH_CHECK(y_s.dtype() == torch::kInt32);
// Int32 packed ue8m0 scales tensor.
// Let E, T, G be the number to experts, number of tokens and number of groups
// respectively. Let, E = 2, T = 4, G = 6, in this case the int32 scales
// tensor are of shape [1, 4, 2] and stride [8, 1, 4]. The scales are expected
// to be arranged as follows,
// [[T0G0-T0G1-T0G2-T0G3, T0G4-T0G5-X-X,],
// [T1G0-T1G1-T1G2-T1G3, T1G4-T1G5-X-X,]
// [T2G0-T2G1-T2G2-T2G3, T2G4-T2G5-X-X,]
// [T3G0-T3G1-T3G2-T3G3, T3G4-T3G5-X-X,]]
// where, TxGy is the scale ue8m0 scale value of Token x, Group y.
//
// In memory (in bytes) the scale values are arranged as,
// [T0G0, T0G1, T0G2, T0G3, T1G0, T1G2, T1G3, T1G4, T2G0, T2G1, T2G3, T2G4,
// T3G0, T3G1, T3G2, T3G3, T0G4, T0G5, X, X, T1G4, T1G5, X, X, T2G4, T2G5,
// X, X, T3G4, T3G5, X, X]
//
// An Int32 tensor of size [1, 4, 2] and stride [8, 1, 4] can be represented
// as an uint8 tensor of shape [1, 2, 4, 4] and stride [32, 16, 4, 1]. In
// english, ignoring the Experts dimension, the original int32 tensor is
// simply treated as two packed [4, 4] uint8 tensor (or two [4, 1] int32
// tensor). The following strides setting reflects this change. Caveat: This
// means that the G dimension is no longer contiguous. i.e. Note that to move
// from G3 to G4, we need to jump along the packing dimension. The kernel
// handles this case.
stride_ys_e *= sizeof(int32_t);
stride_ys_p = T * sizeof(int32_t); // Packing dimension
stride_ys_t = sizeof(int32_t);
stride_ys_g = 1;
LAUNCH_ON_H(uint8_t, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p,
true);
#endif
}

View File

@ -247,22 +247,6 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
return out;
}
torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
c10::SymInt size_k, c10::SymInt size_n,
int64_t num_bits) {
int const pack_factor = 32 / num_bits;
auto options = torch::TensorOptions()
.dtype(b_q_weight.dtype())
.device(b_q_weight.device());
return torch::empty_symint(
{size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
options);
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("awq_marlin_repack", &awq_marlin_repack);
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) {
m.impl("awq_marlin_repack", &awq_marlin_repack_meta);
}

View File

@ -321,22 +321,6 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
return out;
}
torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
torch::Tensor& perm, c10::SymInt size_k,
c10::SymInt size_n, int64_t num_bits) {
int const pack_factor = 32 / num_bits;
auto options = torch::TensorOptions()
.dtype(b_q_weight.dtype())
.device(b_q_weight.device());
return torch::empty_symint(
{size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
options);
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("gptq_marlin_repack", &gptq_marlin_repack);
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) {
m.impl("gptq_marlin_repack", &gptq_marlin_repack_meta);
}

View File

@ -802,7 +802,7 @@ torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace) {
});
if (numel % 256 != 0) {
out = out.index({torch::indexing::Slice(0, numel / had_size)});
out = out.narrow(0, 0, numel / had_size);
}
if (inplace && out.data_ptr() != x.data_ptr()) {

View File

@ -116,6 +116,26 @@ struct sm90_fp8_config_default {
ClusterShape, KernelSchedule, EpilogueSchedule>>;
};
template <typename InType, typename OutType, bool EnableBias>
struct sm90_fp8_config_M8192_K6144 {
// M >= 8192, K >= 6144
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
using KernelSchedule =
cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum;
using EpilogueSchedule =
typename cutlass::epilogue::TmaWarpSpecializedCooperative;
using TileShape = Shape<_256, _128, _128>;
using ClusterShape = Shape<_2, _1, _1>;
using Cutlass3xGemm = conditional_t<
EnableBias,
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueBias,
TileShape, ClusterShape, KernelSchedule,
EpilogueSchedule>,
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
ClusterShape, KernelSchedule, EpilogueSchedule>>;
};
template <typename InType, typename OutType, bool EnableBias>
struct sm90_fp8_config_M128 {
// M in (64, 128]
@ -273,6 +293,9 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
using Cutlass3xGemmDefault =
typename sm90_fp8_config_default<InType, OutType,
EnableBias>::Cutlass3xGemm;
using Cutlass3xGemmM8192_K6144 =
typename sm90_fp8_config_M8192_K6144<InType, OutType,
EnableBias>::Cutlass3xGemm;
using Cutlass3xGemmM128 =
typename sm90_fp8_config_M128<InType, OutType, EnableBias>::Cutlass3xGemm;
@ -291,6 +314,7 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
uint32_t const m = a.size(0);
uint32_t const n = b.size(1);
uint32_t const k = a.size(1);
if (m <= 16) {
// m in [1, 16]
@ -312,6 +336,9 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
// m in (64, 128]
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM128>(
out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
} else if (m >= 8192 && k >= 6144) {
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM8192_K6144>(
out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
} else {
// m in (128, inf)
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmDefault>(

View File

@ -15,6 +15,17 @@ RUN apt-get update -q -y && apt-get install -q -y \
# Remove sccache
RUN python3 -m pip install --upgrade pip
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
# Install UV
RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
ARG COMMON_WORKDIR
WORKDIR ${COMMON_WORKDIR}
@ -59,13 +70,15 @@ FROM base AS test
RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
# Install vLLM
# Install vLLM using uv (inherited from base stage)
# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
--mount=type=cache,target=/root/.cache/uv \
cd /install \
&& pip install -U -r requirements/rocm.txt \
&& pip install -U -r requirements/rocm-test.txt \
&& uv pip install --system -r requirements/rocm.txt \
&& uv pip install --system -r requirements/rocm-test.txt \
&& pip uninstall -y vllm \
&& pip install *.whl
&& uv pip install --system *.whl
WORKDIR /vllm-workspace
ARG COMMON_WORKDIR
@ -89,14 +102,17 @@ RUN case "$(which python3)" in \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
*) ;; esac
RUN python3 -m pip install --upgrade huggingface-hub[cli]
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system --upgrade huggingface-hub[cli]
# Install vLLM
# Install vLLM using uv (inherited from base stage)
# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
--mount=type=cache,target=/root/.cache/uv \
cd /install \
&& pip install -U -r requirements/rocm.txt \
&& uv pip install --system -r requirements/rocm.txt \
&& pip uninstall -y vllm \
&& pip install *.whl
&& uv pip install --system *.whl
ARG COMMON_WORKDIR

View File

@ -1,4 +1,4 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete
ARG TRITON_BRANCH="57c693b6"
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG PYTORCH_BRANCH="1c57644d"
@ -7,7 +7,7 @@ ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="9716b1b8"
ARG AITER_BRANCH="59bd8ff2"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base
@ -19,6 +19,9 @@ ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx11
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV AITER_ROCM_ARCH=gfx942;gfx950
# Required for RCCL in ROCm7.1
ENV HSA_NO_SCRATCH_RECLAIM=1
ARG PYTHON_VERSION=3.12
RUN mkdir -p /app

View File

@ -14,6 +14,7 @@ RUN apt clean && apt-get update -y && \
libxext6 \
libgl1 \
lsb-release \
libaio-dev \
numactl \
wget \
vim \
@ -68,8 +69,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
RUN python3 -m pip install -e tests/vllm_test_utils
# install nixl from source code
ENV NIXL_VERSION=0.7.0
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/"
RUN --mount=type=cache,target=/root/.cache/pip \
pip uninstall oneccl oneccl-devel -y

View File

@ -46,7 +46,10 @@ nav:
- contributing/model/multimodal.md
- contributing/model/transcription.md
- CI: contributing/ci
- Design Documents: design
- Design Documents:
- Plugins:
- design/*plugin*.md
- design/*
- API Reference:
- api/README.md
- api/vllm

View File

@ -30,8 +30,8 @@ Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at
Where to get started with vLLM depends on the type of user. If you are looking to:
- Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md)
- Build applications with vLLM, we recommend starting with the [User Guide](./usage)
- Build vLLM, we recommend starting with [Developer Guide](./contributing)
- Build applications with vLLM, we recommend starting with the [User Guide](./usage/README.md)
- Build vLLM, we recommend starting with [Developer Guide](./contributing/README.md)
For information about the development of vLLM, see:

View File

@ -4,6 +4,6 @@
--8<-- "docs/cli/json_tip.inc.md"
## Options
## Arguments
--8<-- "docs/argparse/bench_latency.md"
--8<-- "docs/argparse/bench_latency.inc.md"

View File

@ -4,6 +4,6 @@
--8<-- "docs/cli/json_tip.inc.md"
## Options
## Arguments
--8<-- "docs/argparse/bench_serve.md"
--8<-- "docs/argparse/bench_serve.inc.md"

View File

@ -4,6 +4,6 @@
--8<-- "docs/cli/json_tip.inc.md"
## Options
## Arguments
--8<-- "docs/argparse/bench_sweep_plot.md"
--8<-- "docs/argparse/bench_sweep_plot.inc.md"

View File

@ -4,6 +4,6 @@
--8<-- "docs/cli/json_tip.inc.md"
## Options
## Arguments
--8<-- "docs/argparse/bench_sweep_serve.md"
--8<-- "docs/argparse/bench_sweep_serve.inc.md"

View File

@ -4,6 +4,6 @@
--8<-- "docs/cli/json_tip.inc.md"
## Options
## Arguments
--8<-- "docs/argparse/bench_sweep_serve_sla.md"
--8<-- "docs/argparse/bench_sweep_serve_sla.inc.md"

View File

@ -4,6 +4,6 @@
--8<-- "docs/cli/json_tip.inc.md"
## Options
## Arguments
--8<-- "docs/argparse/bench_throughput.md"
--8<-- "docs/argparse/bench_throughput.inc.md"

View File

@ -1,5 +1,5 @@
# vllm chat
## Options
## Arguments
--8<-- "docs/argparse/chat.md"
--8<-- "docs/argparse/chat.inc.md"

View File

@ -1,5 +1,5 @@
# vllm complete
## Options
## Arguments
--8<-- "docs/argparse/complete.md"
--8<-- "docs/argparse/complete.inc.md"

View File

@ -4,6 +4,6 @@
--8<-- "docs/cli/json_tip.inc.md"
## Options
## Arguments
--8<-- "docs/argparse/run-batch.md"
--8<-- "docs/argparse/run-batch.inc.md"

View File

@ -4,6 +4,6 @@
--8<-- "docs/cli/json_tip.inc.md"
## Options
## Arguments
--8<-- "docs/argparse/serve.md"
--8<-- "docs/argparse/serve.inc.md"

View File

@ -5,7 +5,7 @@ The `vllm serve` command is used to launch the OpenAI-compatible server.
## CLI Arguments
The `vllm serve` command is used to launch the OpenAI-compatible server.
To see the available options, take a look at the [CLI Reference](../cli/README.md#options)!
To see the available options, take a look at the [CLI Reference](../cli/README.md)!
## Configuration file

View File

@ -10,8 +10,6 @@ vLLM provides comprehensive benchmarking tools for performance testing and evalu
- **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations
- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development
[Benchmark CLI]: #benchmark-cli
## Benchmark CLI
This section guides you through running benchmark tests with the extensive
@ -985,7 +983,7 @@ each document has close to 512 tokens.
Please note that the `/v1/rerank` is also supported by embedding models. So if you're running
with an embedding model, also set `--no_reranker`. Because in this case the query is
treated as a individual prompt by the server, here we send `random_batch_size - 1` documents
treated as an individual prompt by the server, here we send `random_batch_size - 1` documents
to account for the extra prompt which is the query. The token accounting to report the
throughput numbers correctly is also adjusted.

View File

@ -95,7 +95,7 @@ when manually triggering a build on Buildkite. This branch accomplishes two thin
to warm it up so that future builds are faster.
<p align="center" width="100%">
<img width="60%" src="https://github.com/user-attachments/assets/a8ff0fcd-76e0-4e91-b72f-014e3fdb6b94">
<img width="60%" alt="Buildkite new build popup" src="https://github.com/user-attachments/assets/a8ff0fcd-76e0-4e91-b72f-014e3fdb6b94">
</p>
## Update dependencies

View File

@ -1,7 +1,7 @@
# Summary
!!! important
Many decoder language models can now be automatically loaded using the [Transformers backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first!
Many decoder language models can now be automatically loaded using the [Transformers modeling backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first!
vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance.

View File

@ -249,7 +249,7 @@ No extra registration is required beyond having your model class available via t
## Examples in-tree
- Whisper encoderdecoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py)
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py). Make sure to have installed `mistral-common[audio]`.
- Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
## Test with the API

View File

@ -29,8 +29,8 @@ pip install vllm
- API Path: `/chat/completions`
- Model: `qwen/Qwen1.5-0.5B-Chat`
![](../../assets/deployment/chatbox-settings.png)
![Chatbox settings screen](../../assets/deployment/chatbox-settings.png)
1. Go to `Just chat`, and start to chat:
![](../../assets/deployment/chatbox-chat.png)
![Chatbot chat screen](../../assets/deployment/chatbox-chat.png)

View File

@ -46,12 +46,12 @@ And install [Docker](https://docs.docker.com/engine/install/) and [Docker Compos
- **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat`
- **Completion Mode**: `Completion`
![](../../assets/deployment/dify-settings.png)
![Dify settings screen](../../assets/deployment/dify-settings.png)
1. To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type:
![](../../assets/deployment/dify-create-chatbot.png)
![Dify create chatbot screen](../../assets/deployment/dify-create-chatbot.png)
1. Click the chatbot you just created to open the chat interface and start interacting with the model:
![](../../assets/deployment/dify-chat.png)
![Dify chat screen](../../assets/deployment/dify-chat.png)

View File

@ -156,7 +156,7 @@ In this guide, we demonstrate manual deployment using the [`rednote-hilab/dots.o
## Advanced Deployment Details
With the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLMs optimized inference without additional backend modifications.
With the [Transformers modeling backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLMs optimized inference without additional backend modifications.
Hugging Face Inference Endpoints provides a fully managed environment for serving models via vLLM. You can deploy models without configuring servers, installing dependencies, or managing clusters. Endpoints also support deployment across multiple cloud providers (AWS, Azure, GCP) without the need for separate accounts.
@ -167,4 +167,4 @@ The platform integrates seamlessly with the Hugging Face Hub, allowing you to de
- Explore the [Inference Endpoints](https://endpoints.huggingface.co/catalog) model catalog
- Read the Inference Endpoints [documentation](https://huggingface.co/docs/inference-endpoints/en/index)
- Learn about [Inference Endpoints engines](https://huggingface.co/docs/inference-endpoints/en/engines/vllm)
- Understand the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html)
- Understand the [Transformers modeling backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html)

View File

@ -128,7 +128,7 @@ A [CUDAGraphWrapper][vllm.compilation.cuda_graph.CUDAGraphWrapper] instance wrap
3. Otherwise, i.e., the runtime_mode matches the mode of the wrapper, the wrapper will perform CUDA Graphs capture (if key does not exist, create
a new entry and cache it) or replay (if key exists in the cache).
The above steps are based on the assumption that the CUDA Graphs wrapper would directly trust whats in the forward context (controlled by the dispatcher). This lets us simplify and cenralize the logic, reducing the complexity as well as the risk of mismatched state between the wrappers and the dispatcher. It also allows reusing the wrapper class for both `FULL` and `PIECEWISE` runtime modes. See the implementation [here](https://github.com/vllm-project/vllm/blob/f751e50b7a2aae3110d83ed0d88202fc91b3e78a/vllm/compilation/cuda_graph.py#L106).
The above steps are based on the assumption that the CUDA Graphs wrapper would directly trust whats in the forward context (controlled by the dispatcher). This lets us simplify and centralize the logic, reducing the complexity as well as the risk of mismatched state between the wrappers and the dispatcher. It also allows reusing the wrapper class for both `FULL` and `PIECEWISE` runtime modes. See the implementation [here](https://github.com/vllm-project/vllm/blob/f751e50b7a2aae3110d83ed0d88202fc91b3e78a/vllm/compilation/cuda_graph.py#L106).
#### Nested Wrapper design

View File

@ -19,9 +19,9 @@ The input activation format completely depends on the All2All Dispatch being use
The FusedMoE operation is generally made of multiple operations, in both the Contiguous and Batched variants, as described in the diagrams below
![](../assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png "FusedMoE Non-Batched")
![FusedMoE Non-Batched](../assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png)
![](../assets/design/fused_moe_modular_kernel/fused_moe_batched.png "FusedMoE Batched")
![FusedMoE Batched](../assets/design/fused_moe_modular_kernel/fused_moe_batched.png)
!!! note
The main difference, in terms of operations, between the Batched and Non-Batched cases is the Permute / Unpermute operations. All other operations remain.
@ -57,7 +57,7 @@ The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExperts
The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare`, `prepare_no_receive` and `finalize` functions.
The `prepare` function is responsible for input activation Quantization and All2All Dispatch. If implemented, The `prepare_no_receive` is like `prepare` except it does not wait to receive results from other workers. Instead it returns a "receiver" callback that must be invoked to wait for the final results of worker. It is not required that this method is supported by all `FusedMoEPrepareAndFinalize` classes, but if it is available, it can be used to interleave work with the initial all to all communication, e.g. interleaving shared experts with fused experts. The `finalize` function is responsible for invoking the All2All Combine. Additionally the `finalize` function may or may not do the TopK weight application and reduction (Please refer to the TopKWeightAndReduce section)
![](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png "FusedMoEPrepareAndFinalize Blocks")
![FusedMoEPrepareAndFinalize Blocks](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png)
### FusedMoEPermuteExpertsUnpermute
@ -88,7 +88,7 @@ The core FusedMoE implementation performs a series of operations. It would be in
It is sometimes efficient to perform TopK weight application and Reduction inside the `FusedMoEPermuteExpertsUnpermute::apply()`. Find an example [here](https://github.com/vllm-project/vllm/pull/20228). We have a `TopKWeightAndReduce` abstract class to facilitate such implementations. Please refer to the TopKWeightAndReduce section.
`FusedMoEPermuteExpertsUnpermute::finalize_weight_and_reduce_impl()` returns the `TopKWeightAndReduce` object that the implementation wants the `FusedMoEPrepareAndFinalize::finalize()` to use.
![](../assets/design/fused_moe_modular_kernel/fused_experts_blocks.png "FusedMoEPermuteExpertsUnpermute Blocks")
![FusedMoEPermuteExpertsUnpermute Blocks](../assets/design/fused_moe_modular_kernel/fused_experts_blocks.png)
### FusedMoEModularKernel

View File

@ -0,0 +1,220 @@
# LoRA Resolver Plugins
This directory contains vLLM's LoRA resolver plugins built on the `LoRAResolver` framework.
They automatically discover and load LoRA adapters from a specified local storage path, eliminating the need for manual configuration or server restarts.
## Overview
LoRA Resolver Plugins provide a flexible way to dynamically load LoRA adapters at runtime. When vLLM
receives a request for a LoRA adapter that hasn't been loaded yet, the resolver plugins will attempt
to locate and load the adapter from their configured storage locations. This enables:
- **Dynamic LoRA Loading**: Load adapters on-demand without server restarts
- **Multiple Storage Backends**: Support for filesystem, S3, and custom backends. The built-in `lora_filesystem_resolver` requires a local storage path, but custom resolvers can be implemented to fetch from any source.
- **Automatic Discovery**: Seamless integration with existing LoRA workflows
- **Scalable Deployment**: Centralized adapter management across multiple vLLM instances
## Prerequisites
Before using LoRA Resolver Plugins, ensure the following environment variables are configured:
### Required Environment Variables
1. **`VLLM_ALLOW_RUNTIME_LORA_UPDATING`**: Must be set to `true` or `1` to enable dynamic LoRA loading
```bash
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=true
```
2. **`VLLM_PLUGINS`**: Must include the desired resolver plugins (comma-separated list)
```bash
export VLLM_PLUGINS=lora_filesystem_resolver
```
3. **`VLLM_LORA_RESOLVER_CACHE_DIR`**: Must be set to a valid directory path for filesystem resolver
```bash
export VLLM_LORA_RESOLVER_CACHE_DIR=/path/to/lora/adapters
```
### Optional Environment Variables
- **`VLLM_PLUGINS`**: If not set, all available plugins will be loaded. If set to empty string, no plugins will be loaded.
## Available Resolvers
### lora_filesystem_resolver
The filesystem resolver is installed with vLLM by default and enables loading LoRA adapters from a local directory structure.
#### Setup Steps
1. **Create the LoRA adapter storage directory**:
```bash
mkdir -p /path/to/lora/adapters
```
2. **Set environment variables**:
```bash
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=true
export VLLM_PLUGINS=lora_filesystem_resolver
export VLLM_LORA_RESOLVER_CACHE_DIR=/path/to/lora/adapters
```
3. **Start vLLM server**:
Your base model can be `meta-llama/Llama-2-7b-hf`. Please make sure you set up the Hugging Face token in your env var `export HF_TOKEN=xxx235`.
```bash
python -m vllm.entrypoints.openai.api_server \
--model your-base-model \
--enable-lora
```
#### Directory Structure Requirements
The filesystem resolver expects LoRA adapters to be organized in the following structure:
```text
/path/to/lora/adapters/
├── adapter1/
│ ├── adapter_config.json
│ ├── adapter_model.bin
│ └── tokenizer files (if applicable)
├── adapter2/
│ ├── adapter_config.json
│ ├── adapter_model.bin
│ └── tokenizer files (if applicable)
└── ...
```
Each adapter directory must contain:
- **`adapter_config.json`**: Required configuration file with the following structure:
```json
{
"peft_type": "LORA",
"base_model_name_or_path": "your-base-model-name",
"r": 16,
"lora_alpha": 32,
"target_modules": ["q_proj", "v_proj"],
"bias": "none",
"modules_to_save": null,
"use_rslora": false,
"use_dora": false
}
```
- **`adapter_model.bin`**: The LoRA adapter weights file
#### Usage Example
1. **Prepare your LoRA adapter**:
```bash
# Assuming you have a LoRA adapter in /tmp/my_lora_adapter
cp -r /tmp/my_lora_adapter /path/to/lora/adapters/my_sql_adapter
```
2. **Verify the directory structure**:
```bash
ls -la /path/to/lora/adapters/my_sql_adapter/
# Should show: adapter_config.json, adapter_model.bin, etc.
```
3. **Make a request using the adapter**:
```bash
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "my_sql_adapter",
"prompt": "Generate a SQL query for:",
"max_tokens": 50,
"temperature": 0.1
}'
```
#### How It Works
1. When vLLM receives a request for a LoRA adapter named `my_sql_adapter`
2. The filesystem resolver checks if `/path/to/lora/adapters/my_sql_adapter/` exists
3. If found, it validates the `adapter_config.json` file
4. If the configuration matches the base model and is valid, the adapter is loaded
5. The request is processed normally with the newly loaded adapter
6. The adapter remains available for future requests
## Advanced Configuration
### Multiple Resolvers
You can configure multiple resolver plugins to load adapters from different sources:
'lora_s3_resolver' is an example of a custom resolver you would need to implement
```bash
export VLLM_PLUGINS=lora_filesystem_resolver,lora_s3_resolver
```
All listed resolvers are enabled; at request time, vLLM tries them in order until one succeeds.
### Custom Resolver Implementation
To implement your own resolver plugin:
1. **Create a new resolver class**:
```python
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
from vllm.lora.request import LoRARequest
class CustomResolver(LoRAResolver):
async def resolve_lora(self, base_model_name: str, lora_name: str) -> Optional[LoRARequest]:
# Your custom resolution logic here
pass
```
2. **Register the resolver**:
```python
def register_custom_resolver():
resolver = CustomResolver()
LoRAResolverRegistry.register_resolver("Custom Resolver", resolver)
```
## Troubleshooting
### Common Issues
1. **"VLLM_LORA_RESOLVER_CACHE_DIR must be set to a valid directory"**
- Ensure the directory exists and is accessible
- Check file permissions on the directory
2. **"LoRA adapter not found"**
- Verify the adapter directory name matches the requested model name
- Check that `adapter_config.json` exists and is valid JSON
- Ensure `adapter_model.bin` exists in the directory
3. **"Invalid adapter configuration"**
- Verify `peft_type` is set to "LORA"
- Check that `base_model_name_or_path` matches your base model
- Ensure `target_modules` is properly configured
4. **"LoRA rank exceeds maximum"**
- Check that `r` value in `adapter_config.json` doesn't exceed `max_lora_rank` setting
### Debugging Tips
1. **Enable debug logging**:
```bash
export VLLM_LOGGING_LEVEL=DEBUG
```
2. **Verify environment variables**:
```bash
echo $VLLM_ALLOW_RUNTIME_LORA_UPDATING
echo $VLLM_PLUGINS
echo $VLLM_LORA_RESOLVER_CACHE_DIR
```
3. **Test adapter configuration**:
```bash
python -c "
import json
with open('/path/to/lora/adapters/my_adapter/adapter_config.json') as f:
config = json.load(f)
print('Config valid:', config)
"
```

View File

@ -68,7 +68,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
## Fused MoE Experts Kernels
The are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adatpers so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
The are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`.

View File

@ -5,7 +5,7 @@ You can use vLLM *custom arguments* to pass in arguments which are not part of t
Custom arguments can be useful if, for example, you want to use a [custom logits processor](./custom_logitsprocs.md) without modifying the vLLM source code.
!!! note
Make sure your custom logits processor have implemented `validate_params` for custom arguments. Otherwise invalid custom arguments can cause unexpected behaviour.
Make sure your custom logits processor have implemented `validate_params` for custom arguments. Otherwise, invalid custom arguments can cause unexpected behaviour.
## Offline Custom Arguments

View File

@ -71,7 +71,7 @@ Logits processor `update_state()` implementations should assume the following mo
* **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous
* **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
* **Shrink the batch:** a side effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch
@ -286,7 +286,7 @@ Once you have created a custom subclass (like `WrappedPerReqLogitsProcessor`) wh
## Ways to Load Your Custom Logits Processor in vLLM
Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits logits processors cannot be loaded on-demand for individual requests.
Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits processors cannot be loaded on-demand for individual requests.
This section details different ways of making your logits processor visible to vLLM and triggering vLLM to load your logits processor.
@ -438,7 +438,7 @@ The examples below show how a user would pass a custom argument (`target_token`)
## Best Practices for Writing Custom Logits Processors
Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus it is important to implement these methods efficiently.
Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus, it is important to implement these methods efficiently.
* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity
* For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()`
@ -465,4 +465,4 @@ Once vLLM loads a logits processor during initialization, then vLLM will invoke
* **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class handles this by default
* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method
* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However, the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method

View File

@ -0,0 +1,118 @@
# Interleaved Thinking
## Introduction
Interleaved thinking allows models to reason between tool calls, enabling more sophisticated decision-making after receiving tool results. This feature helps models chain multiple tool calls with reasoning steps in between and make nuanced decisions based on intermediate results.
Important: Interleaved thinking increases token usage and response latency. Consider your budget and performance requirements when enabling this feature.
## How Interleaved Thinking Works
With interleaved thinking, the model can:
- Reason about the results of a tool call before deciding what to do next
- Chain multiple tool calls with reasoning steps in between
- Make more nuanced decisions based on intermediate results
- Provide transparent reasoning for its tool selection process
## Supported Models
vLLM currently supports the following interleaved thinking models:
| Model Series | Reasoning Parser Name |
|--------------|-----------------------|
| moonshotai/Kimi-K2-Thinking | kimi_k2 |
| MiniMaxAI/MiniMax-M2 | minimax_m2 |
## Example Usage
To use interleaved thinking with tool calls, specify a model that supports this feature and enable tool calls in your chat completion request. Here's an example:
??? code
```python
"""
vllm serve MiniMaxAI/MiniMax-M2 \
--tensor-parallel-size 4 \
--tool-call-parser minimax_m2 \
--reasoning-parser minimax_m2 \
--enable-auto-tool-choice
"""
import json
from openai import OpenAI
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
def get_current_weather(location: str, unit: "str"):
"""Get the current weather in a given location"""
if unit == "celsius":
return f"The current temperature in {location} is 22°C."
else:
return f"The current temperature in {location} is 72°F."
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City and state, e.g., 'San Francisco, CA'",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location", "unit"],
},
},
}
]
messages = [{"role": "user", "content": "What's the weather in Fahrenheit like in San Francisco?"}]
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=messages,
tools=tools,
tool_choice="auto",
)
tool_call = response.choices[0].message.tool_calls[0].function
messages.append(
{
"role": "assistant",
"tool_calls": response.choices[0].message.tool_calls,
"reasoning": response.choices[0].message.reasoning, # append reasoning
}
)
# Simulate tool execution
available_tools = {"get_weather": get_current_weather}
completion_tool_calls = response.choices[0].message.tool_calls
for call in completion_tool_calls:
tool_to_call = available_tools[call.function.name]
args = json.loads(call.function.arguments)
result = tool_to_call(**args)
messages.append(
{
"role": "tool",
"content": result,
"tool_call_id": call.id,
"name": call.function.name,
}
)
response_2 = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=messages,
tools=tools,
tool_choice="auto",
)
print(response_2.choices[0].message.content)
```
This example demonstrates how to set up interleaved thinking with tool calls using a weather retrieval function. The model reasons about the tool results before generating the final response.

View File

@ -298,7 +298,7 @@ There are two steps to generate and deploy a mixed precision model quantized wit
Firstly, the layerwise mixed-precision configuration for a given LLM model is searched and then quantized using AMD Quark. We will provide a detailed tutorial with Quark APIs later.
As examples, we provide some ready-to-use quantized mixed precision model to show the usage in vLLM and the accuracy benifits. They are:
As examples, we provide some ready-to-use quantized mixed precision model to show the usage in vLLM and the accuracy benefits. They are:
- amd/Llama-2-70b-chat-hf-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8
- amd/Mixtral-8x7B-Instruct-v0.1-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8

View File

@ -28,10 +28,15 @@ After installation of XCode and the Command Line Tools, which include Apple Clan
```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
uv pip install -r requirements/cpu.txt
uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
uv pip install -e .
```
!!! tip
The `--index-strategy unsafe-best-match` flag is needed to resolve dependencies across multiple package indexes (PyTorch CPU index and PyPI). Without this flag, you may encounter `typing-extensions` version conflicts.
The term "unsafe" refers to the package resolution strategy, not security. By default, `uv` only searches the first index where a package is found to prevent dependency confusion attacks. This flag allows `uv` to search all configured indexes to find the best compatible versions. Since both PyTorch and PyPI are trusted package sources, using this strategy is safe and appropriate for vLLM installation.
!!! note
On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which is currently the only supported device.

View File

@ -104,7 +104,7 @@ Currently, there are no pre-built CPU wheels.
### Which `dtype` should be used?
- Currently vLLM CPU uses model default settings as `dtype`. However, due to unstable float16 support in torch CPU, it is recommended to explicitly set `dtype=bfloat16` if there are any performance or accuracy problem.
- Currently, vLLM CPU uses model default settings as `dtype`. However, due to unstable float16 support in torch CPU, it is recommended to explicitly set `dtype=bfloat16` if there are any performance or accuracy problem.
### How to launch a vLLM service on CPU?

View File

@ -2,7 +2,7 @@
vLLM has experimental support for s390x architecture on IBM Z platform. For now, users must build from source to natively run on IBM Z platform.
Currently the CPU implementation for s390x architecture supports FP32 datatype only.
Currently, the CPU implementation for s390x architecture supports FP32 datatype only.
!!! warning
There are no pre-built wheels or images for this device, so you must build vLLM from source.

View File

@ -83,7 +83,7 @@ uv pip install dist/*.whl
!!! example "Troubleshooting"
- **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`.
- **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed.
- `AMD` requies at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU.
- `AMD` requires at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU.
- If you receive an error such as: `Could not find a version that satisfies the requirement torch==X.Y.Z+cpu+cpu`, consider updating [pyproject.toml](https://github.com/vllm-project/vllm/blob/main/pyproject.toml) to help pip resolve the dependency.
```toml title="pyproject.toml"
[build-system]

View File

@ -1,12 +1,15 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import importlib.metadata
import importlib.util
import logging
import sys
import traceback
from argparse import SUPPRESS, HelpFormatter
from argparse import SUPPRESS, Action, HelpFormatter
from collections.abc import Iterable
from importlib.machinery import ModuleSpec
from pathlib import Path
from typing import Literal
from typing import TYPE_CHECKING, Literal
from unittest.mock import MagicMock, patch
from pydantic_core import core_schema
@ -19,6 +22,11 @@ ARGPARSE_DOC_DIR = ROOT_DIR / "docs/argparse"
sys.path.insert(0, str(ROOT_DIR))
def mock_if_no_torch(mock_module: str, mock: MagicMock):
if not importlib.util.find_spec("torch"):
sys.modules[mock_module] = mock
# Mock custom op code
class MockCustomOp:
@staticmethod
@ -29,18 +37,21 @@ class MockCustomOp:
return decorator
noop = lambda *a, **k: None
sys.modules["vllm._C"] = MagicMock()
sys.modules["vllm.model_executor.custom_op"] = MagicMock(CustomOp=MockCustomOp)
sys.modules["vllm.utils.torch_utils"] = MagicMock(direct_register_custom_op=noop)
mock_if_no_torch("vllm._C", MagicMock())
mock_if_no_torch("vllm.model_executor.custom_op", MagicMock(CustomOp=MockCustomOp))
mock_if_no_torch(
"vllm.utils.torch_utils", MagicMock(direct_register_custom_op=lambda *a, **k: None)
)
# Mock any version checks by reading from compiled CI requirements
with open(ROOT_DIR / "requirements/test.txt") as f:
VERSIONS = dict(line.strip().split("==") for line in f if "==" in line)
importlib.metadata.version = lambda name: VERSIONS.get(name) or "0.0.0"
# Make torch.nn.Parameter safe to inherit from
sys.modules["torch.nn"] = MagicMock(Parameter=object)
mock_if_no_torch("torch.nn", MagicMock(Parameter=object))
class PydanticMagicMock(MagicMock):
@ -49,31 +60,34 @@ class PydanticMagicMock(MagicMock):
def __init__(self, *args, **kwargs):
name = kwargs.pop("name", None)
super().__init__(*args, **kwargs)
self.__spec__ = importlib.machinery.ModuleSpec(name, None)
self.__spec__ = ModuleSpec(name, None)
def __get_pydantic_core_schema__(self, source_type, handler):
return core_schema.any_schema()
def auto_mock(module, attr, max_mocks=100):
def auto_mock(module_name: str, attr: str, max_mocks: int = 100):
"""Function that automatically mocks missing modules during imports."""
logger.info("Importing %s from %s", attr, module)
logger.info("Importing %s from %s", attr, module_name)
for _ in range(max_mocks):
try:
module = importlib.import_module(module_name)
# First treat attr as an attr, then as a submodule
return getattr(
importlib.import_module(module),
attr,
importlib.import_module(f"{module}.{attr}"),
)
if hasattr(module, attr):
return getattr(module, attr)
return importlib.import_module(f"{module_name}.{attr}")
except ModuleNotFoundError as e:
assert e.name is not None
logger.info("Mocking %s for argparse doc generation", e.name)
sys.modules[e.name] = PydanticMagicMock(name=e.name)
except Exception as e:
logger.warning("Failed to import %s.%s: %s", module, attr, e)
except Exception:
logger.exception("Failed to import %s.%s: %s", module_name, attr)
raise ImportError(
f"Failed to import {module}.{attr} after mocking {max_mocks} imports"
f"Failed to import {module_name}.{attr} after mocking {max_mocks} imports"
)
@ -91,21 +105,26 @@ ChatCommand = auto_mock("vllm.entrypoints.cli.openai", "ChatCommand")
CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand")
openai_cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
openai_run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
FlexibleArgumentParser = auto_mock(
"vllm.utils.argparse_utils", "FlexibleArgumentParser"
)
if TYPE_CHECKING:
from vllm.utils.argparse_utils import FlexibleArgumentParser
else:
FlexibleArgumentParser = auto_mock(
"vllm.utils.argparse_utils", "FlexibleArgumentParser"
)
class MarkdownFormatter(HelpFormatter):
"""Custom formatter that generates markdown for argument groups."""
def __init__(self, prog, starting_heading_level=3):
super().__init__(prog, max_help_position=float("inf"), width=float("inf"))
def __init__(self, prog: str, starting_heading_level: int = 3):
super().__init__(prog, max_help_position=sys.maxsize, width=sys.maxsize)
self._section_heading_prefix = "#" * starting_heading_level
self._argument_heading_prefix = "#" * (starting_heading_level + 1)
self._markdown_output = []
def start_section(self, heading):
def start_section(self, heading: str):
if heading not in {"positional arguments", "options"}:
heading_md = f"\n{self._section_heading_prefix} {heading}\n\n"
self._markdown_output.append(heading_md)
@ -113,14 +132,14 @@ class MarkdownFormatter(HelpFormatter):
def end_section(self):
pass
def add_text(self, text):
def add_text(self, text: str):
if text:
self._markdown_output.append(f"{text.strip()}\n\n")
def add_usage(self, usage, actions, groups, prefix=None):
pass
def add_arguments(self, actions):
def add_arguments(self, actions: Iterable[Action]):
for action in actions:
if len(action.option_strings) == 0 or "--help" in action.option_strings:
continue
@ -169,7 +188,7 @@ def create_parser(add_cli_args, **kwargs) -> FlexibleArgumentParser:
# Auto-mock runtime imports
if tb_list := traceback.extract_tb(e.__traceback__):
path = Path(tb_list[-1].filename).relative_to(ROOT_DIR)
auto_mock(module=".".join(path.parent.parts), attr=path.stem)
auto_mock(module_name=".".join(path.parent.parts), attr=path.stem)
return create_parser(add_cli_args, **kwargs)
else:
raise e
@ -209,7 +228,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
# Generate documentation for each parser
for stem, parser in parsers.items():
doc_path = ARGPARSE_DOC_DIR / f"{stem}.md"
doc_path = ARGPARSE_DOC_DIR / f"{stem}.inc.md"
# Specify encoding for building on Windows
with open(doc_path, "w", encoding="utf-8") as f:
f.write(super(type(parser), parser).format_help())

View File

@ -15,9 +15,9 @@ These models are what we list in [supported text models](#list-of-text-only-lang
### Transformers
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers modeling backend".
Currently, the Transformers backend works for the following:
Currently, the Transformers modeling backend works for the following:
- Modalities: embedding models, language models and vision-language models*
- Architectures: encoder-only, decoder-only, mixture-of-experts
@ -25,7 +25,7 @@ Currently, the Transformers backend works for the following:
_*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM:
If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers modeling backend, it will be compatible with the following features of vLLM:
- All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
- Any combination of the following vLLM parallelisation schemes:
@ -44,7 +44,7 @@ llm.apply_model(lambda model: print(type(model)))
If the printed type starts with `Transformers...` then it's using the Transformers model implementation!
If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md).
If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers modeling backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md).
!!! note
For vision-language models, if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance.
@ -53,12 +53,12 @@ If a model has a vLLM implementation but you would prefer to use the Transformer
If a model is neither supported natively by vLLM nor Transformers, it can still be used in vLLM!
For a model to be compatible with the Transformers backend for vLLM it must:
For a model to be compatible with the Transformers modeling backend for vLLM it must:
- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)):
- The model directory must have the correct structure (e.g. `config.json` is present).
- `config.json` must contain `auto_map.AutoModel`.
- be a Transformers backend for vLLM compatible model (see [Writing custom models](#writing-custom-models)):
- be a Transformers modeling backend for vLLM compatible model (see [Writing custom models](#writing-custom-models)):
- Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`).
If the compatible model is:
@ -66,13 +66,13 @@ If the compatible model is:
- on the Hugging Face Model Hub, simply set `trust_remote_code=True` for [offline-inference](../serving/offline_inference.md) or `--trust-remote-code` for the [openai-compatible-server](../serving/openai_compatible_server.md).
- in a local directory, simply pass directory path to `model=<MODEL_DIR>` for [offline-inference](../serving/offline_inference.md) or `vllm serve <MODEL_DIR>` for the [openai-compatible-server](../serving/openai_compatible_server.md).
This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
This means that, with the Transformers modeling backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
#### Writing custom models
This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)).
This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers modeling backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)).
To make your model compatible with the Transformers backend, it needs:
To make your model compatible with the Transformers modeling backend, it needs:
1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`.
- If your model is encoder-only:
@ -134,7 +134,7 @@ Here is what happens in the background when this model is loaded:
1. The config is loaded.
2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
3. `MyModel` is loaded into one of the Transformers backend classes in [vllm/model_executor/models/transformers](../../vllm/model_executor/models/transformers) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
3. `MyModel` is loaded into one of the Transformers modeling backend classes in [vllm/model_executor/models/transformers](../../vllm/model_executor/models/transformers) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
That's it!
@ -182,7 +182,7 @@ To determine whether a given model is natively supported, you can check the `con
If the `"architectures"` field contains a model architecture listed below, then it should be natively supported.
Models do not _need_ to be natively supported to be used in vLLM.
The [Transformers backend](#transformers) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
The [Transformers modeling backend](#transformers) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
!!! tip
The easiest way to check if your model is really supported at runtime is to run the program below:
@ -351,6 +351,7 @@ th {
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|-------------------|----------------------|---------------------------|
| `AfmoeForCausalLM` | Afmoe | TBA | ✅︎ | ✅︎ |
| `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ |
| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ |
| `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ |
@ -451,7 +452,7 @@ th {
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | |
| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ |
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|-------------------|----------------------|---------------------------|
@ -669,7 +670,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR`, etc. | | ✅︎ |
| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ |
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ |
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>E+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ |
| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ |
@ -684,7 +685,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ |
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ |
| `LightOnOCRForConditionalGeneration` | LightOnOCR-1B | T + I<sup>+</sup> | `lightonai/LightOnOCR-1B`, etc | ✅︎ | ✅︎ |
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ |
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎ | ✅︎ |
| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ |
| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ |
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ |
@ -720,7 +721,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ |
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ |
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|
@ -785,6 +786,9 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ |
!!! note
`VoxtralForConditionalGeneration` requires `mistral-common[audio]` to be installed.
### Pooling Models
See [this page](./pooling_models.md) for more information on how to use pooling models.

View File

@ -1,6 +1,6 @@
# Using vLLM
First, vLLM must be [installed](../getting_started/installation/) for your chosen device in either a Python or Docker environment.
First, vLLM must be [installed](../getting_started/installation/README.md) for your chosen device in either a Python or Docker environment.
Then, vLLM supports the following usage patterns:

View File

@ -43,6 +43,7 @@ class ModelRequestData(NamedTuple):
# Voxtral
# Make sure to install mistral-common[audio].
def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.chunk import (

View File

@ -1536,7 +1536,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
mm_processor_kwargs={
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
"fps": [1],
"fps": 1,
},
limit_mm_per_prompt={modality: 1},
)

View File

@ -161,6 +161,7 @@ def main():
{
"role": "assistant",
"tool_calls": chat_completion.choices[0].message.tool_calls,
"reasoning": chat_completion.choices[0].message.reasoning,
}
)

View File

@ -0,0 +1,49 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import httpx
from transformers import AutoTokenizer
GEN_ENDPOINT = "http://localhost:8000/inference/v1/generate"
DUMMY_API_KEY = "empty"
MODEL_NAME = "Qwen/Qwen3-0.6B"
transport = httpx.HTTPTransport()
headers = {"Authorization": f"Bearer {DUMMY_API_KEY}"}
client = httpx.Client(
transport=transport,
base_url=GEN_ENDPOINT,
timeout=600,
headers=headers,
)
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "How many countries are in the EU?"},
]
def main(client):
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
token_ids = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
enable_thinking=False,
)
payload = {
"model": MODEL_NAME,
"token_ids": token_ids,
"sampling_params": {"max_tokens": 24, "temperature": 0.2, "detokenize": False},
"stream": False,
}
resp = client.post(GEN_ENDPOINT, json=payload)
resp.raise_for_status()
data = resp.json()
print(data)
print("-" * 50)
print("Token generation results:")
res = tokenizer.decode(data["choices"][0]["token_ids"])
print(res)
print("-" * 50)
if __name__ == "__main__":
main(client)

View File

@ -31,7 +31,7 @@ partial-json-parser # used for parsing partial JSON outputs
pyzmq >= 25.0.0
msgspec
gguf >= 0.13.0
mistral_common[image,audio] >= 1.8.5
mistral_common[image] >= 1.8.5
opencv-python-headless >= 4.11.0 # required for video IO
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12

View File

@ -8,7 +8,7 @@ packaging>=24.2
setuptools>=77.0.3,<81.0.0
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.8.0; platform_system == "Darwin"
torch==2.9.0; platform_system == "Darwin"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch

View File

@ -9,6 +9,6 @@ torch==2.9.0
torchaudio==2.9.0
# These must be updated alongside torch
torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers==0.0.33; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9
xformers==0.0.33.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.5.2

View File

@ -10,3 +10,7 @@ mkdocs-minify-plugin
regex
ruff
pydantic
# For generating argparse docs.
# Adding requirements here should only be used as a last resort.
msgspec # Need for multiple inheritance involving msgspec.Struct

View File

@ -13,5 +13,5 @@ setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
wheel
jinja2>=3.1.6
amdsmi==6.2.4
amdsmi==6.4.3
timm>=1.0.17

View File

@ -545,7 +545,9 @@ def get_vllm_version() -> str:
# Allow overriding the version. This is useful to build platform-specific
# wheels (e.g. CPU, TPU) without modifying the source.
if env_version := os.getenv("VLLM_VERSION_OVERRIDE"):
return env_version
print(f"Overriding VLLM version with {env_version} from VLLM_VERSION_OVERRIDE")
os.environ["SETUPTOOLS_SCM_PRETEND_VERSION"] = env_version
return get_version(write_to="vllm/_version.py")
version = get_version(write_to="vllm/_version.py")
sep = "+" if "+" not in version else "." # dev versions might contain +

View File

@ -22,6 +22,8 @@ from vllm.config import (
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils.torch_utils import is_torch_equal_or_newer
from ...utils import create_new_process_for_each_test
# This import automatically registers `torch.ops.silly.attention`
from .. import silly_attention # noqa: F401
@ -193,7 +195,14 @@ def run_model(
@pytest.mark.parametrize("use_inductor_graph_partition", [False, True])
def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
@create_new_process_for_each_test("spawn")
def test_multi_graph_piecewise_compile(
use_inductor_graph_partition: bool, use_bytecode_hook: bool, monkeypatch
):
# Set the environment variable for this test
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

View File

@ -21,6 +21,8 @@ from vllm.config import (
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils.torch_utils import is_torch_equal_or_newer
from ...utils import create_new_process_for_each_test
# This import automatically registers `torch.ops.silly.attention`
from ..silly_attention import get_global_counter, reset_global_counter
@ -124,6 +126,7 @@ def _run_simple_model(
@pytest.mark.parametrize("use_inductor", [True, False])
@torch.inference_mode()
@create_new_process_for_each_test("spawn")
def test_simple_piecewise_compile(use_inductor):
_run_simple_model(
splitting_ops=["silly::attention"],

View File

@ -29,6 +29,8 @@ from vllm.config import (
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils.torch_utils import is_torch_equal_or_newer
from ...utils import create_new_process_for_each_test
# This import automatically registers `torch.ops.silly.attention`
from .. import silly_attention # noqa: F401
@ -334,6 +336,7 @@ def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor:
("inductor", True), # Inductor, Inductor partition
],
)
@create_new_process_for_each_test("spawn")
def test_toy_llama(
backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path
):
@ -513,4 +516,8 @@ def benchmark():
if __name__ == "__main__":
benchmark()
# Protect against subprocess reimport when using spawn_new_process_for_each_test
import os
if os.environ.get("RUNNING_IN_SUBPROCESS") != "1":
benchmark()

View File

@ -15,6 +15,9 @@ from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
from vllm.utils.torch_utils import _is_torch_equal_or_newer
# This import automatically registers `torch.ops.silly.attention`
from . import silly_attention # noqa: F401
def test_version():
# Test the version comparison logic using the private function
@ -257,15 +260,6 @@ def test_should_split():
splitting_ops = ["aten::add.Tensor"]
assert not should_split(node, splitting_ops)
@torch.library.custom_op(
"silly::attention",
mutates_args=["out"],
)
def attention(
q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor
) -> None:
out.copy_(q + k + v)
q, k, v, out = [torch.randn(1)] * 4
# supports custom ops as OpOverloadPacket

View File

@ -20,13 +20,22 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer
from ..utils import flat_product, multi_gpu_test
is_blackwell = lambda: current_platform.is_device_capability(100)
"""Are we running on Blackwell, a lot of tests depend on it"""
class Matches(NamedTuple):
attention_fusion: int = 0
allreduce_fusion: int = 0
sequence_parallel: int = 0
async_tp: int = 0
class ModelBackendTestCase(NamedTuple):
model_name: str
model_kwargs: dict[str, Any]
backend: AttentionBackendEnum
attention_fusions: int
allreduce_fusions: int | None = None
matches: Matches
MODELS_FP8: list[ModelBackendTestCase] = []
@ -38,17 +47,33 @@ if current_platform.is_cuda():
ModelBackendTestCase(
# Use smaller model for L40s in CI
model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
model_kwargs=dict(max_model_len=1024),
backend=AttentionBackendEnum.TRITON_ATTN,
attention_fusions=32,
allreduce_fusions=65,
# TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell
# so FI attention+fp8_quant is at least tested once
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
backend=AttentionBackendEnum.FLASHINFER
if is_blackwell()
else AttentionBackendEnum.TRITON_ATTN,
matches=Matches(
attention_fusion=32,
allreduce_fusion=65,
sequence_parallel=65,
async_tp=128,
),
),
ModelBackendTestCase(
model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
backend=AttentionBackendEnum.FLASHINFER,
attention_fusions=48,
allreduce_fusions=96,
# TODO FlashInfer attn broken on Hopper with kvcache=fp8:
# https://github.com/vllm-project/vllm/issues/28568
# TODO FlashInfer attn broken on Blackwell for llama4:
# https://github.com/vllm-project/vllm/issues/28604
backend=AttentionBackendEnum.TRITON_ATTN,
matches=Matches(
attention_fusion=48,
allreduce_fusion=96,
sequence_parallel=96,
async_tp=95, # mlp is moe, no fusion there
),
),
]
@ -57,8 +82,12 @@ if current_platform.is_cuda():
model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
backend=AttentionBackendEnum.FLASHINFER,
attention_fusions=32,
allreduce_fusions=65,
matches=Matches(
attention_fusion=32,
allreduce_fusion=65,
sequence_parallel=65,
async_tp=128,
),
),
]
@ -68,15 +97,23 @@ if current_platform.is_cuda():
model_name="meta-llama/Llama-3.1-8B-Instruct",
model_kwargs=dict(max_model_len=1024),
backend=AttentionBackendEnum.TRITON_ATTN,
attention_fusions=0,
allreduce_fusions=65,
matches=Matches(
attention_fusion=0,
allreduce_fusion=65,
sequence_parallel=65,
async_tp=128,
),
),
ModelBackendTestCase(
model_name="Qwen/Qwen3-30B-A3B",
model_kwargs=dict(max_model_len=1024),
backend=AttentionBackendEnum.TRITON_ATTN,
attention_fusions=0,
allreduce_fusions=97,
matches=Matches(
attention_fusion=0,
allreduce_fusion=97,
sequence_parallel=97,
async_tp=96, # MLP is MoE, half the fusions of dense
),
),
]
@ -86,19 +123,19 @@ elif current_platform.is_rocm():
model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
model_kwargs=dict(max_model_len=1024),
backend=AttentionBackendEnum.TRITON_ATTN,
attention_fusions=32,
matches=Matches(attention_fusion=32),
),
ModelBackendTestCase(
model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
model_kwargs=dict(max_model_len=1024),
backend=AttentionBackendEnum.ROCM_ATTN,
attention_fusions=32,
matches=Matches(attention_fusion=32),
),
ModelBackendTestCase(
model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
model_kwargs=dict(max_model_len=1024),
backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
attention_fusions=32,
matches=Matches(attention_fusion=32),
),
]
@ -106,8 +143,7 @@ CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
@pytest.mark.parametrize(
"model_name, model_kwargs, backend, "
"attention_fusions, allreduce_fusions, custom_ops",
"model_name, model_kwargs, backend, matches, custom_ops",
# Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8
list(flat_product(MODELS_FP8, CUSTOM_OPS_FP8))
# quant_fp4 only has the custom impl
@ -118,15 +154,14 @@ def test_attn_quant(
model_name: str,
model_kwargs: dict[str, Any],
backend: AttentionBackendEnum,
attention_fusions: int,
allreduce_fusions: int,
matches: Matches,
custom_ops: str,
inductor_graph_partition: bool,
caplog_mp_spawn,
monkeypatch,
):
if backend == AttentionBackendEnum.FLASHINFER and (
not current_platform.is_device_capability((10, 0)) or not has_flashinfer()
not is_blackwell() or not has_flashinfer()
):
pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
@ -169,12 +204,12 @@ def test_attn_quant(
with caplog_mp_spawn(logging.DEBUG) as log_holder:
run_model(compilation_config, model_name, **model_kwargs)
matches = re.findall(
log_matches = re.findall(
r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
log_holder.text,
)
assert len(matches) == 1, log_holder.text
assert int(matches[0]) == attention_fusions
assert len(log_matches) == 1, log_holder.text
assert int(log_matches[0]) == matches.attention_fusion
CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"]
@ -187,8 +222,7 @@ def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
"model_name, model_kwargs, backend, "
"attention_fusions, allreduce_fusions, custom_ops",
"model_name, model_kwargs, backend, matches, custom_ops",
# Toggle RMSNorm and QuantFP8 for FP8 models
list(
flat_product(
@ -209,8 +243,7 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
model_name: str,
model_kwargs: dict,
backend: AttentionBackendEnum,
attention_fusions: int,
allreduce_fusions: int,
matches: Matches,
custom_ops: str,
inductor_graph_partition: bool,
caplog_mp_spawn,
@ -219,6 +252,13 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("Inductor graph partition requires torch>=2.9")
if "fp4" in model_name.lower() and not is_blackwell():
pytest.skip("NVFP4 quant requires Blackwell")
if backend == AttentionBackendEnum.FLASHINFER and not is_blackwell():
# FlashInfer attn fusion requires Blackwell
matches = matches._replace(attention_fusion=0)
custom_ops_list = custom_ops.split(",") if custom_ops else []
if inductor_graph_partition:
@ -258,23 +298,135 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
run_model(
compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
)
matches = re.findall(
log_matches = re.findall(
r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
log_holder.text,
)
assert len(matches) == 2, log_holder.text
assert len(log_matches) == 2, log_holder.text
assert int(matches[0]) == attention_fusions
assert int(matches[1]) == attention_fusions
assert int(log_matches[0]) == matches.attention_fusion
assert int(log_matches[1]) == matches.attention_fusion
matches = re.findall(
log_matches = re.findall(
r"collective_fusion.py:\d+] Replaced (\d+) patterns",
log_holder.text,
)
assert len(matches) == 2, log_holder.text
assert len(log_matches) == 2, log_holder.text
assert int(matches[0]) == allreduce_fusions
assert int(matches[1]) == allreduce_fusions
assert int(log_matches[0]) == matches.allreduce_fusion
assert int(log_matches[1]) == matches.allreduce_fusion
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
"model_name, model_kwargs, backend, matches, custom_ops",
# Toggle RMSNorm and QuantFP8 for FP8 models
list(
flat_product(
MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)
)
)
# Toggle RMSNorm for FP4 models and unquant models
+ list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),
)
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
@pytest.mark.skipif(
not current_platform.is_cuda(),
reason="sequence parallel only tested on CUDA",
)
def test_tp2_attn_quant_async_tp(
model_name: str,
model_kwargs: dict,
backend: AttentionBackendEnum,
matches: Matches,
custom_ops: str,
inductor_graph_partition: bool,
caplog_mp_spawn,
monkeypatch,
):
if is_blackwell():
# TODO: https://github.com/vllm-project/vllm/issues/27893
pytest.skip("Blackwell is not supported for AsyncTP pass")
if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("Inductor graph partition requires torch>=2.9")
if "fp4" in model_name.lower() and not is_blackwell():
pytest.skip("NVFP4 quant requires Blackwell")
if backend == AttentionBackendEnum.FLASHINFER:
if not has_flashinfer():
pytest.skip("FlashInfer backend requires flashinfer installed")
if not is_blackwell():
# FlashInfer attn fusion requires Blackwell
matches = matches._replace(attention_fusion=0)
custom_ops_list = custom_ops.split(",") if custom_ops else []
if inductor_graph_partition:
mode = CUDAGraphMode.FULL_AND_PIECEWISE
splitting_ops: list[str] | None = None
else:
mode = CUDAGraphMode.FULL_DECODE_ONLY
splitting_ops = []
# Disable, compile cache to make sure custom passes run.
# Otherwise, we can't verify fusion happened through the logs.
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
# To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
compilation_config = CompilationConfig(
# Testing properties
use_inductor_graph_partition=inductor_graph_partition,
cudagraph_mode=mode,
custom_ops=custom_ops_list,
splitting_ops=splitting_ops,
# Common
level=CompilationMode.VLLM_COMPILE,
pass_config=PassConfig(
enable_attn_fusion=True,
enable_noop=True,
enable_sequence_parallelism=True,
enable_async_tp=True,
),
# Inductor caches custom passes by default as well via uuid
inductor_compile_config={"force_disable_caches": True},
)
with caplog_mp_spawn(logging.DEBUG) as log_holder:
run_model(
compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
)
log_matches = re.findall(
r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
log_holder.text,
)
assert len(log_matches) == 2, log_holder.text
assert int(log_matches[0]) == matches.attention_fusion
assert int(log_matches[1]) == matches.attention_fusion
log_matches = re.findall(
r"sequence_parallelism.py:\d+] Replaced (\d+) patterns",
log_holder.text,
)
assert len(log_matches) == 2, log_holder.text
assert int(log_matches[0]) == matches.sequence_parallel
assert int(log_matches[1]) == matches.sequence_parallel
log_matches = re.findall(
r"collective_fusion.py:\d+] Replaced (\d+) patterns",
log_holder.text,
)
assert len(log_matches) == 2, log_holder.text
assert int(log_matches[0]) == matches.async_tp
assert int(log_matches[1]) == matches.async_tp
def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):

View File

@ -0,0 +1,124 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import operator
import pytest
import torch
from torch.fx.experimental.proxy_tensor import make_fx
from vllm.compilation.backends import split_graph
def test_getitem_moved_to_producer_subgraph():
"""
Test that getitem operations are moved to the same subgraph as their input,
preventing tuple inputs to submodules.
"""
def model_fn(x: torch.Tensor) -> torch.Tensor:
# torch.split returns a tuple, creating real getitem operations
# Should become first submodule that produces tuple
chunks = torch.split(x, x.shape[0] // 2, dim=0)
# Following ops should become second submodule that consumes tuple
result_0 = torch.relu(chunks[0])
result_1 = torch.relu(chunks[1])
return torch.cat([result_0, result_1], dim=0)
x = torch.randn(4, 3)
gm = make_fx(model_fn)(x)
has_getitem = any(
node.op == "call_function" and node.target == operator.getitem
for node in gm.graph.nodes
)
assert has_getitem, "Test setup failed: graph should contain getitem operations"
# Split on tuple producer aten::split
split_ops = ["aten::split.Tensor"]
split_gm, split_items = split_graph(gm, split_ops)
assert len(split_items) == 2, "Graph should be split into 2 submodules"
for split_item in split_items:
submodule = split_item.graph
getitem_on_placeholder = []
for node in submodule.graph.nodes:
if (
node.op == "call_function"
and node.target == operator.getitem
and node.args[0].op == "placeholder"
):
getitem_on_placeholder.append(node)
assert len(getitem_on_placeholder) == 0, (
f"Submodule {split_item.submod_name} has getitem operations on "
f"placeholder nodes: {[n.name for n in getitem_on_placeholder]}. "
"This means tuple inputs were not properly eliminated."
)
new_x = torch.randn(4, 3)
output_original = gm(new_x)
output_split = split_gm(new_x)
assert torch.allclose(output_original, output_split), "Output mismatch"
def test_no_tuple_inputs_with_multiple_consumers():
"""
Test that when a tuple is consumed by multiple split operations,
getitem operations are properly moved to avoid tuple inputs.
"""
def model_fn(x: torch.Tensor) -> torch.Tensor:
# torch.split returns a tuple, creating real getitem operations
# Should become first submodule that produces tuple
chunks = torch.split(x, x.shape[0] // 2, dim=0)
# These should become second submodule consuming tuple
result_1 = torch.relu(chunks[0])
result_2 = torch.relu(chunks[1])
# Artificial graph splitting point to create another
# independent submodule that consumes tuple later
# This would become the third submodule
result_1 = torch.sigmoid(result_1)
# Fourth submodule that consumes tuple
result = torch.cat([chunks[0], chunks[1], result_1, result_2])
return result
x = torch.randn(4, 3)
gm = make_fx(model_fn)(x)
has_getitem = any(
node.op == "call_function" and node.target == operator.getitem
for node in gm.graph.nodes
)
assert has_getitem, "Test setup failed: graph should contain getitem operations"
split_ops = ["aten::split.Tensor", "aten::sigmoid"]
split_gm, split_items = split_graph(gm, split_ops)
assert len(split_items) == 4, "Graph should be split into 4 submodules"
for split_item in split_items:
submodule = split_item.graph
for node in submodule.graph.nodes:
if (
node.op == "call_function"
and node.target == operator.getitem
and node.args[0].op == "placeholder"
):
pytest.fail(
f"Submodule {split_item.submod_name} has getitem on "
f"placeholder {node.args[0].name}, indicating it receives "
"a tuple input"
)
new_x = torch.randn(4, 3)
output_original = gm(new_x)
output_split = split_gm(new_x)
assert torch.allclose(output_original, output_split), "Output mismatch after split"

View File

@ -10,8 +10,8 @@ from vllm.platforms import current_platform
def test_compile():
vllm_config = VllmConfig()
# Default configuration compiles mm encoder
assert vllm_config.compilation_config.compile_mm_encoder
# Default configuration does not compile mm encoder
assert not vllm_config.compilation_config.compile_mm_encoder
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@ -39,7 +39,10 @@ def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch):
"Qwen/Qwen2.5-VL-3B-Instruct",
max_model_len=2048,
gpu_memory_utilization=0.8,
compilation_config={"mode": CompilationMode.VLLM_COMPILE},
compilation_config={
"mode": CompilationMode.VLLM_COMPILE,
"compile_mm_encoder": True,
},
) as _,
):
pass

View File

@ -5,15 +5,15 @@ import pytest
import torch
import vllm.envs as envs
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.compilation.fusion import RMSNormQuantFusionPass
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
from vllm.compilation.fx_utils import find_auto_fn
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass
from vllm.compilation.sequence_parallelism import SequenceParallelismPass
from vllm.compilation.vllm_inductor_pass import VllmInductorPass
from vllm.config import (
CompilationConfig,
CUDAGraphMode,
DeviceConfig,
ModelConfig,
PassConfig,
@ -30,6 +30,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.utils.quant_utils import (
kFp8StaticTensorSym,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables
@ -45,169 +46,157 @@ prompts = [
]
class TestModel(torch.nn.Module):
def __init__(self, hidden_size=16, intermediate_size=32):
class TestAllReduceRMSNormModel(torch.nn.Module):
def __init__(self, hidden_size=16, eps=1e-6):
super().__init__()
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.gate_proj = torch.nn.Parameter(
torch.empty((intermediate_size, hidden_size))
)
self.norm = RMSNorm(intermediate_size, 1e-05)
# Initialize weights
torch.nn.init.normal_(self.gate_proj, std=0.02)
self.eps = eps
self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
self.w = [torch.rand(hidden_size, hidden_size) for _ in range(3)]
def forward(self, hidden_states, residual):
"""
Forward pass implementing the operations in the FX graph
def forward(self, x):
z = torch.relu(x)
x = resid = tensor_model_parallel_all_reduce(z)
y = self.norm[0](x)
Args:
hidden_states: Input tensor
residual: Residual tensor from previous layer
z2 = torch.mm(y, self.w[0])
x2 = tensor_model_parallel_all_reduce(z2)
Returns:
Tuple containing the output tensor
"""
# Reshape input
view = hidden_states.reshape(-1, self.hidden_size)
y2, resid = self.norm[1](x2, resid)
# matrix multiplication
permute = self.gate_proj.permute(1, 0)
mm = torch.mm(view, permute)
z3 = torch.mm(y2, self.w[1])
x3 = tensor_model_parallel_all_reduce(z3)
# Tensor parallel all-reduce
all_reduce = tensor_model_parallel_all_reduce(mm)
y3, resid = self.norm[2](x3, resid)
# layer normalization
norm_output, residual_output = self.norm(all_reduce, residual)
z4 = torch.mm(y3, self.w[2])
x4 = tensor_model_parallel_all_reduce(z4)
return norm_output, residual_output
y4, resid = self.norm[3](x4, resid)
return y4
def ops_in_model_before(self):
return [torch.ops.vllm.all_reduce.default]
def ops_in_model_after(self):
return [
torch.ops.vllm.reduce_scatter.default,
torch.ops.vllm.all_gather.default,
torch.ops.vllm.reduce_scatter.default,
]
def ops_in_model(self):
return [torch.ops._C.fused_add_rms_norm.default]
if RMSNorm.enabled():
return [
torch.ops._C.rms_norm.default,
torch.ops._C.fused_add_rms_norm.default,
]
else:
return []
class TestQuantModel(torch.nn.Module):
class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
quant_key = kFp8StaticTensorSym
def __init__(self, hidden_size=16, intermediate_size=32):
def __init__(self, hidden_size=16, eps=1e-6):
super().__init__()
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.vllm_config = get_current_vllm_config()
self.gate_proj = torch.nn.Parameter(
torch.empty((intermediate_size, hidden_size)), requires_grad=False
)
self.norm = RMSNorm(intermediate_size, 1e-05)
# Initialize weights
torch.nn.init.normal_(self.gate_proj, std=0.02)
self.hidden_size = hidden_size
self.eps = eps
self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
self.w = [
torch.rand(hidden_size, hidden_size)
.to(dtype=current_platform.fp8_dtype())
.t()
for _ in range(3)
]
self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
self.scale = torch.rand(1, dtype=torch.float32)
# Create a weight that is compatible with torch._scaled_mm,
# which expects a column-major layout.
self.w = torch.rand(hidden_size, intermediate_size).to(dtype=FP8_DTYPE).t()
self.wscale = torch.rand(1, dtype=torch.float32)
self.fp8_linear = TestFP8Layer(
self.quant_key, self.quant_key, self.w, self.wscale, self.scale
)
def forward(self, hidden_states, residual):
"""
Forward pass implementing the operations in the FX graph
Args:
hidden_states: Input tensor
residual: Residual tensor from previous layer
Returns:
Tuple containing the output tensor
"""
# Reshape input
view = hidden_states.reshape(-1, self.hidden_size)
# matrix multiplication
permute = self.gate_proj.permute(1, 0)
mm = torch.mm(view, permute)
# Tensor parallel all-reduce
all_reduce = tensor_model_parallel_all_reduce(mm)
# layer normalization
norm_output, residual_output = self.norm(all_reduce, residual)
# scaled_mm with static input quantization
fp8_linear_result = self.fp8_linear(norm_output)
return fp8_linear_result, residual_output
def ops_in_model_before(self):
ops_to_remove = [torch.ops.vllm.all_reduce.default] # Always removed by SP
# The following are only removed if fusion happens
if (
self.vllm_config
and self.vllm_config.compilation_config.pass_config.enable_fusion
):
ops_to_remove.extend(
[
torch.ops._C.fused_add_rms_norm.default,
torch.ops._C.static_scaled_fp8_quant.default,
]
self.fp8_linear_layers = [
TestFP8Layer(
self.quant_key, self.quant_key, self.w[i], self.wscale[i], self.scale[i]
)
return ops_to_remove
for i in range(3)
]
def forward(self, hidden_states):
# avoid having graph input be an arg to a pattern directly
z = torch.relu(hidden_states)
x = resid = tensor_model_parallel_all_reduce(z)
y = self.norm[0](x)
z2 = self.fp8_linear_layers[0](y)
x2 = tensor_model_parallel_all_reduce(z2)
y2, resid = self.norm[1](x2, resid)
z3 = self.fp8_linear_layers[1](y2)
x3 = tensor_model_parallel_all_reduce(z3)
y3, resid = self.norm[2](x3, resid) # use resid here
z4 = self.fp8_linear_layers[2](y3)
x4 = tensor_model_parallel_all_reduce(z4)
y4, resid = self.norm[3](x4, resid) # use resid here
return y4
def ops_in_model_after(self):
ops_to_add = [
torch.ops.vllm.reduce_scatter.default,
return [
torch.ops.vllm.all_gather.default,
torch.ops.vllm.reduce_scatter.default,
]
def ops_in_model_before(self):
return [
torch.ops.vllm.all_reduce.default,
]
# The following is only added if fusion happens
if (
self.vllm_config
and self.vllm_config.compilation_config.pass_config.enable_fusion
):
ops_to_add.append(torch.ops._C.fused_add_rms_norm_static_fp8_quant.default)
return ops_to_add
def ops_in_model(self):
if (
self.vllm_config
and self.vllm_config.compilation_config.pass_config.enable_fusion
):
# If fusion happens, the fused op is the one
# we check for (de)functionalization
if self.vllm_config.compilation_config.pass_config.enable_fusion:
return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default]
else:
# If no fusion, the original ops are checked
elif RMSNorm.enabled():
return [
torch.ops._C.fused_add_rms_norm.default,
# TODO functionalization pass does not handle this yet
# torch.ops._C.static_scaled_fp8_quant.default,
]
elif any(
layer.is_quant_fp8_enabled() for layer in self.fp8_linear_layers
):
return [
torch.ops._C.static_scaled_fp8_quant.default,
]
else:
return []
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("test_model_cls", [TestModel, TestQuantModel])
@pytest.mark.parametrize(
"test_model_cls, custom_ops",
[
(TestAllReduceRMSNormModel, "+rms_norm"),
(TestAllReduceRMSNormModel, "-rms_norm"),
(TestAllReduceRMSNormStaticQuantFP8Model, "+rms_norm,+quant_fp8"),
(TestAllReduceRMSNormStaticQuantFP8Model, "+rms_norm,-quant_fp8"),
(TestAllReduceRMSNormStaticQuantFP8Model, "-rms_norm,+quant_fp8"),
(TestAllReduceRMSNormStaticQuantFP8Model, "-rms_norm,-quant_fp8"),
],
)
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("seq_len", [16])
@pytest.mark.parametrize("hidden_size", [16])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("enable_fusion", [True, False])
@pytest.mark.parametrize("dynamic", [False, True])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
def test_sequence_parallelism_pass(
test_model_cls: type[torch.nn.Module],
custom_ops: str,
batch_size: int,
seq_len: int,
hidden_size: int,
dtype: torch.dtype,
enable_fusion: bool,
dynamic: bool,
):
num_processes = 2
@ -219,11 +208,13 @@ def test_sequence_parallelism_pass(
args=(
num_processes,
test_model_cls,
custom_ops,
batch_size,
seq_len,
hidden_size,
dtype,
enable_fusion,
dynamic,
),
nprocs=nprocs,
)
@ -235,11 +226,13 @@ def sequence_parallelism_pass_on_test_model(
local_rank: int,
world_size: int,
test_model_cls: type[torch.nn.Module],
custom_ops: str,
batch_size: int,
seq_len: int,
hidden_size: int,
dtype: torch.dtype,
enable_fusion: bool,
dynamic: bool,
):
current_platform.seed_everything(0)
@ -263,12 +256,16 @@ def sequence_parallelism_pass_on_test_model(
initialize_model_parallel(tensor_model_parallel_size=world_size)
# configure vllm config for SequenceParallelismPass
custom_ops_list = custom_ops.split(",") if custom_ops else []
compilation_config = CompilationConfig(
splitting_ops=[], # avoid automatic rms_norm enablement
cudagraph_mode=CUDAGraphMode.NONE, # avoid piecewise warnings
custom_ops=custom_ops_list,
pass_config=PassConfig(
enable_sequence_parallelism=True,
enable_fusion=enable_fusion,
enable_noop=True,
)
),
) # NoOp needed for fusion
device_config = DeviceConfig(device=torch.device("cuda"))
@ -288,7 +285,6 @@ def sequence_parallelism_pass_on_test_model(
with set_current_vllm_config(vllm_config):
noop_pass = NoOpEliminationPass(vllm_config)
sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
func_pass = FixFunctionalizationPass(vllm_config)
cleanup_pass = PostCleanupPass(vllm_config)
assert (
sequence_parallelism_pass.compilation_config.splitting_ops
@ -309,38 +305,29 @@ def sequence_parallelism_pass_on_test_model(
passes_for_backend.append(cleanup_pass)
backend_no_func = TestBackend(*passes_for_backend)
backend_func = TestBackend(*passes_for_backend, func_pass)
backend = TestBackend(*passes_for_backend)
model = test_model_cls(hidden_size, hidden_size * 2)
model = test_model_cls(hidden_size)
hidden_states = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
compiled_model_no_func = torch.compile(model, backend=backend_no_func)
compiled_model_no_func(hidden_states, residual)
compiled_model_func = torch.compile(model, backend=backend_func)
compiled_model_func(hidden_states, residual)
if dynamic:
torch._dynamo.mark_dynamic(hidden_states, 0)
assert sequence_parallelism_pass.matched_count == 1
compiled_model = torch.compile(model, backend=backend)
compiled_model(hidden_states)
assert sequence_parallelism_pass.matched_count == 4
# In pre-nodes, all reduce should be there,
# reduce scatter and all gather should not
backend_no_func.check_before_ops(model.ops_in_model_before())
for op in model.ops_in_model_before():
assert backend.op_count(op, before=True) == 4
# In post-nodes, reduce scatter and all gather should be there,
# all reduce should not
backend_no_func.check_after_ops(model.ops_in_model_after())
for op in model.ops_in_model_after():
assert backend.op_count(op, before=False) == 4
# check if the functionalization pass is applied
for op in model.ops_in_model():
find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None
# make sure the ops were all de-functionalized
found = dict()
for node in backend_func.graph_post_pass.nodes:
for op in model.ops_in_model():
if is_func(node, op):
found[op] = True
assert all(found[op] for op in model.ops_in_model())
find_auto_fn(backend.graph_post_pass.nodes, op)

View File

@ -2,59 +2,134 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
import torch
from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
from vllm.config import CompilationMode
from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper
from vllm.config import (
CompilationConfig,
CompilationMode,
VllmConfig,
set_current_vllm_config,
)
class MyMod(torch.nn.Module):
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
if cache is not None:
return x + cache
return x * 2
if x.size()[0] >= 4:
return x * 2
else:
return x * 100
class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
class MyWrapper(TorchCompileWithNoGuardsWrapper):
def __init__(self, model):
self.model = model
compiled_callable = torch.compile(self.forward, backend="eager")
super().__init__(
compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE
super().__init__()
def forward(self, x: torch.Tensor): # type: ignore[override]
# this is the function to be compiled
return self.model(x)
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch):
"""Test basic functionality of TorchCompileWithNoGuardsWrapper."""
# Set the environment variable for this test
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
# Create a proper vLLM config instead of mocking
vllm_config = VllmConfig()
vllm_config.compilation_config = CompilationConfig()
vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
vllm_config.compilation_config.backend = "inductor"
# Test DYNAMO_TRACE_ONCE
with set_current_vllm_config(vllm_config):
torch._dynamo.reset()
mod = MyMod()
wrapper = MyWrapper(mod)
# First call should trigger compilation
x = torch.tensor([1, 2, 3, 4])
torch._dynamo.mark_dynamic(x, 0)
result1 = wrapper(x)
expected1 = torch.tensor([2, 4, 6, 8])
assert torch.allclose(result1, expected1), (
f"Expected {expected1}, got {result1}"
)
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
# this is the function to be compiled
return self.model(x, cache)
# Second call should use compiled code
x2 = torch.tensor([1, 2, 3])
result2 = wrapper(x2)
expected2 = torch.tensor([2, 4, 6])
assert torch.allclose(result2, expected2), (
f"Expected {expected2}, got {result2}"
)
def __call__(self, x: torch.Tensor, cache: torch.Tensor | None = None):
# let torch.compile compile twice
if len(self.compiled_codes) == 2:
dispatch_id = 0 if cache is None else 1
with self.dispatch_to_code(dispatch_id):
return self.forward(x, cache)
else:
return self.compiled_callable(x, cache)
# without the wrapper result would be different.
result3 = mod(x2)
expected3 = torch.tensor([100, 200, 300])
assert torch.allclose(result3, expected3), (
f"Expected {result3}, got {expected3}"
)
def test_torch_compile_wrapper():
mod = MyMod()
wrappers = []
for i in range(3):
torch._dynamo.reset()
# with STOCK_TORCH_COMPILE we do not remove guards.
vllm_config.compilation_config.mode = CompilationMode.STOCK_TORCH_COMPILE
torch._dynamo.reset()
with set_current_vllm_config(vllm_config):
mod = MyMod()
wrapper = MyWrapper(mod)
wrappers.append(wrapper)
x = torch.tensor([1])
wrapper(x, None) # profile run, compile
# create a cache tensor
cache = torch.tensor([2])
wrapper(x, cache) # warm up with cache, recompile
# for new input, dispatch to the compiled code directly
new_x = torch.tensor([3])
assert wrapper(new_x, None).item() == 6 # dispatch to the first compiled code
assert wrapper(new_x, cache).item() == 5 # dispatch to the second compiled code
# First call should trigger compilation
x = torch.tensor([1, 2, 3, 4])
torch._dynamo.mark_dynamic(x, 0)
for wrapper in wrappers:
# make sure they have independent compiled codes
assert len(wrapper.compiled_codes) == 2
result1 = wrapper(x)
expected1 = torch.tensor([2, 4, 6, 8])
assert torch.allclose(result1, expected1), (
f"Expected {expected1}, got {result1}"
)
# Second call should triger another compilation
x2 = torch.tensor([1, 2, 3])
result2 = wrapper(x2)
expected2 = torch.tensor([100, 200, 300])
assert torch.allclose(result2, expected2), (
f"Expected {expected2}, got {result2}"
)
# NO_COMPILATION level not supported.
vllm_config.compilation_config.mode = None
torch._dynamo.reset()
with set_current_vllm_config(vllm_config):
torch._dynamo.reset()
mod = MyMod()
try:
wrapper = MyWrapper(mod)
except Exception:
return
raise AssertionError("expected an exception to be raised")
if __name__ == "__main__":
# Run with both parameter values
class MockMonkeypatch:
def setenv(self, name, value):
os.environ[name] = value
mp = MockMonkeypatch()
print("Testing with VLLM_USE_BYTECODE_HOOK=False")
test_torch_compile_wrapper(False, mp)
print("Testing with VLLM_USE_BYTECODE_HOOK=True")
test_torch_compile_wrapper(True, mp)
print("All tests passed!")

View File

@ -1384,3 +1384,16 @@ def image_urls(request, local_asset_server) -> list[str]:
"""Indirect fixture: takes a list of names, returns list of full URLs."""
names: list[str] = request.param
return [local_asset_server.url_for(name) for name in names]
@pytest.fixture
def disable_deepgemm_ue8m0(monkeypatch):
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
with monkeypatch.context() as monkeypatch_ctx:
monkeypatch_ctx.setenv("VLLM_USE_DEEP_GEMM_E8M0", "0")
is_deep_gemm_e8m0_used.cache_clear()
yield
# Clear cache so the next time it is used it is processed with the
# default VLLM_USE_DEEP_GEMM_E8M0 setting.
is_deep_gemm_e8m0_used.cache_clear()

View File

@ -39,6 +39,7 @@ class ParallelSetup(NamedTuple):
class CPTestOptions(NamedTuple):
multi_node_only: bool
load_format: str | None = None
attn_backend: str | None = None
@dataclass
@ -58,6 +59,7 @@ class CPTestSettings:
multi_node_only: bool = False,
runner: RunnerOption = "auto",
load_format: str | None = None,
attn_backend: str | None = None,
):
parallel_setups = []
for eager_mode_val in [False]:
@ -79,7 +81,9 @@ class CPTestSettings:
distributed_backends=["mp"],
runner=runner,
test_options=CPTestOptions(
multi_node_only=multi_node_only, load_format=load_format
multi_node_only=multi_node_only,
load_format=load_format,
attn_backend=attn_backend,
),
)
@ -117,7 +121,7 @@ def _compare_cp_with_tp(
chunked_prefill,
) = parallel_setup
multi_node_only, load_format = test_options
multi_node_only, load_format, attn_backend = test_options
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_transformers_version(on_fail="skip")
@ -177,6 +181,13 @@ def _compare_cp_with_tp(
if hf_overrides:
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
if not attn_backend:
cp_env = tp_env = {}
else:
cp_env = tp_env = {
"VLLM_ATTENTION_BACKEND": attn_backend,
}
cp_args = [
*common_args,
"--tensor-parallel-size",
@ -205,6 +216,8 @@ def _compare_cp_with_tp(
model_id,
cp_args,
tp_args,
cp_env,
tp_env,
method=method,
max_wait_seconds=720,
)

View File

@ -0,0 +1,437 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Integration tests for MultiprocExecutor at the executor level.
This test directly tests the executor without going through the LLM interface,
focusing on executor initialization, RPC calls, and distributed execution.
"""
import multiprocessing
import os
from tests.utils import multi_gpu_test
from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import get_open_port
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.executor.multiproc_executor import MultiprocExecutor
MODEL = "facebook/opt-125m"
def create_vllm_config(
tensor_parallel_size: int = 1,
pipeline_parallel_size: int = 1,
max_model_len: int = 256,
gpu_memory_utilization: float = 0.3,
distributed_executor_backend: str = "mp",
nnodes: int = 1,
node_rank: int = 0,
master_port: int = 0,
) -> VllmConfig:
"""Create a VllmConfig for testing using EngineArgs."""
engine_args = EngineArgs(
model=MODEL,
tensor_parallel_size=tensor_parallel_size,
pipeline_parallel_size=pipeline_parallel_size,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
)
vllm_config = engine_args.create_engine_config()
# Override distributed node settings if needed
if nnodes > 1 or node_rank > 0:
vllm_config.parallel_config.nnodes = nnodes
vllm_config.parallel_config.node_rank = node_rank
vllm_config.parallel_config.master_port = master_port
if nnodes > 1:
vllm_config.parallel_config.disable_custom_all_reduce = True
return vllm_config
def create_test_scheduler_output(num_requests: int = 1) -> SchedulerOutput:
"""Create a minimal SchedulerOutput for testing."""
# This is a simplified version - in practice you'd need proper
# SchedulerOutput construction based on the actual vLLM v1 API
return SchedulerOutput(
scheduled_new_reqs=[],
scheduled_resumed_reqs=[],
scheduled_running_reqs=[],
num_scheduled_tokens={},
total_num_scheduled_tokens=0,
)
def test_multiproc_executor_initialization():
"""Test that MultiprocExecutor can be initialized with proper config."""
vllm_config = create_vllm_config(
tensor_parallel_size=1,
pipeline_parallel_size=1,
)
# Create executor - this should initialize workers
executor = MultiprocExecutor(vllm_config=vllm_config)
# Verify executor properties
assert executor.world_size == 1, "World size should be 1 for single GPU"
assert executor.local_world_size == 1, "Local world size should be 1"
assert hasattr(executor, "workers"), "Executor should have workers"
assert len(executor.workers) == 1, "Should have 1 worker for single GPU"
# Clean up
executor.shutdown()
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_initialization_tensor_parallel():
"""Test MultiprocExecutor initialization with tensor parallelism."""
vllm_config = create_vllm_config(
tensor_parallel_size=2,
pipeline_parallel_size=1,
)
# Create executor
executor = MultiprocExecutor(vllm_config=vllm_config)
# Verify executor properties
assert executor.world_size == 2, "World size should be 2 for TP=2"
assert executor.local_world_size == 2, "Local world size should be 2"
assert len(executor.workers) == 2, "Should have 2 workers for TP=2"
# Verify output rank calculation
output_rank = executor._get_output_rank()
assert output_rank == 0, "Output rank should be 0 for TP=2, PP=1"
# Clean up
executor.shutdown()
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_collective_rpc():
"""Test collective RPC calls to all workers."""
vllm_config = create_vllm_config(
tensor_parallel_size=2,
pipeline_parallel_size=1,
)
# Create executor
executor = MultiprocExecutor(vllm_config=vllm_config)
try:
# Test check_health RPC - should work without errors
executor.check_health()
# Test that RPC works correctly
# Note: We're just testing that the RPC mechanism works,
# not testing actual model execution here
assert not executor.is_failed, "Executor should not be in failed state"
finally:
# Clean up
executor.shutdown()
def test_multiproc_executor_failure_callback():
"""Test failure callback registration and invocation."""
vllm_config = create_vllm_config(
tensor_parallel_size=1,
pipeline_parallel_size=1,
)
executor = MultiprocExecutor(vllm_config=vllm_config)
try:
# Test callback registration
callback_invoked = []
def test_callback():
callback_invoked.append(True)
# Register callback
executor.register_failure_callback(test_callback)
# Callback should not be invoked yet
assert len(callback_invoked) == 0, "Callback should not be invoked immediately"
# Simulate failure
executor.is_failed = True
# Register another callback - should be invoked immediately
executor.register_failure_callback(test_callback)
assert len(callback_invoked) == 1, (
"Callback should be invoked when executor is failed"
)
finally:
# Clean up
executor.shutdown()
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_worker_monitor():
"""Test that worker monitor is set up correctly."""
vllm_config = create_vllm_config(
tensor_parallel_size=2,
pipeline_parallel_size=1,
)
executor = MultiprocExecutor(vllm_config=vllm_config)
try:
# Verify all worker processes are alive
for worker in executor.workers:
assert worker.proc.is_alive(), f"Worker rank {worker.rank} should be alive"
# Verify executor is not in failed state
assert not executor.is_failed, "Executor should not be in failed state"
finally:
# Clean up
executor.shutdown()
# After shutdown, workers should be terminated
import time
time.sleep(0.5) # Give processes time to terminate
for worker in executor.workers:
assert not worker.proc.is_alive(), (
f"Worker rank {worker.rank} should terminate after shutdown"
)
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_get_response_message_queues():
"""Test message queue retrieval for different ranks."""
vllm_config = create_vllm_config(
tensor_parallel_size=2,
pipeline_parallel_size=1,
)
executor = MultiprocExecutor(vllm_config=vllm_config)
try:
# Get all message queues
all_queues = executor.get_response_mqs()
assert len(all_queues) == 2, "Should have 2 message queues for 2 workers"
# Get message queue for specific rank
rank0_queue = executor.get_response_mqs(unique_reply_rank=0)
assert len(rank0_queue) == 1, "Should have 1 message queue for rank 0"
rank1_queue = executor.get_response_mqs(unique_reply_rank=1)
assert len(rank1_queue) == 1, "Should have 1 message queue for rank 1"
finally:
# Clean up
executor.shutdown()
def test_multiproc_executor_shutdown_cleanup():
"""Test that shutdown properly cleans up resources."""
vllm_config = create_vllm_config(
tensor_parallel_size=1,
pipeline_parallel_size=1,
)
executor = MultiprocExecutor(vllm_config=vllm_config)
# Verify executor is set up
assert hasattr(executor, "workers"), "Executor should have workers"
assert len(executor.workers) > 0, "Should have at least one worker"
# Shutdown
executor.shutdown()
# Verify cleanup
import time
time.sleep(0.5) # Give processes time to terminate
for worker in executor.workers:
assert not worker.proc.is_alive(), "Worker processes should be terminated"
# Verify shutdown event is set
assert executor.shutdown_event.is_set(), "Shutdown event should be set"
# Multiple shutdowns should be safe (idempotent)
executor.shutdown()
executor.shutdown()
@multi_gpu_test(num_gpus=4)
def test_multiproc_executor_pipeline_parallel():
"""Test MultiprocExecutor with pipeline parallelism."""
vllm_config = create_vllm_config(
tensor_parallel_size=2,
pipeline_parallel_size=2,
)
executor = MultiprocExecutor(vllm_config=vllm_config)
try:
# Verify executor properties
assert executor.world_size == 4, "World size should be 4 for TP=2, PP=2"
assert len(executor.workers) == 4, "Should have 4 workers"
# Verify output rank calculation
# For TP=2, PP=2: output should be from the last PP stage (ranks 2-3)
# Specifically rank 2 (first rank of last PP stage)
output_rank = executor._get_output_rank()
assert output_rank == 2, "Output rank should be 2 (first rank of last PP stage)"
# Verify max_concurrent_batches for pipeline parallel
assert executor.max_concurrent_batches == 2, (
"Max concurrent batches should equal PP size"
)
finally:
# Clean up
executor.shutdown()
def test_multiproc_executor_properties():
"""Test various executor properties and configurations."""
vllm_config = create_vllm_config(
tensor_parallel_size=1,
pipeline_parallel_size=1,
)
executor = MultiprocExecutor(vllm_config=vllm_config)
try:
# Test supports_pp property
assert MultiprocExecutor.supports_pp is True, (
"MultiprocExecutor should support pipeline parallelism"
)
# Test world_size calculation
assert executor.world_size == (
executor.parallel_config.tensor_parallel_size
* executor.parallel_config.pipeline_parallel_size
), "World size should equal TP * PP"
# Test local_world_size calculation
assert executor.local_world_size == (
executor.parallel_config.world_size // executor.parallel_config.nnodes
), "Local world size should be world_size / nnodes"
finally:
# Clean up
executor.shutdown()
@multi_gpu_test(num_gpus=4)
def test_multiproc_executor_multi_node():
"""
Test MultiprocExecutor with multi-node configuration.
This simulates 2 nodes with TP=4:
- Node 0 (rank 0): Uses GPUs 0,1 (CUDA_VISIBLE_DEVICES=0,1) with TP=2
- Node 1 (rank 1): Uses GPUs 2,3 (CUDA_VISIBLE_DEVICES=2,3) with TP=2
Total world_size = 4, nnodes = 2
"""
port = get_open_port()
# symm_mem does not work for simulating multi instance in single node
os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
def run_node(node_rank: int, result_queue: multiprocessing.Queue, port: int):
"""Run a single node's executor."""
executor = None
try:
# Set CUDA_VISIBLE_DEVICES for this node
if node_rank == 0:
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
else:
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
# Create config for this node
vllm_config = create_vllm_config(
tensor_parallel_size=4, # Total TP across all nodes
pipeline_parallel_size=1,
nnodes=2, # 2 nodes
node_rank=node_rank,
master_port=port, # same port
)
# Create executor for this node
executor = MultiprocExecutor(vllm_config=vllm_config)
# Verify node-specific properties
assert executor.world_size == 4, (
f"World size should be 4 on node {node_rank}"
)
assert executor.local_world_size == 2, (
f"Local world size should be 2 on node {node_rank}"
)
assert len(executor.workers) == 2, (
f"Should have 2 local workers on node {node_rank}"
)
# Verify worker ranks are correct for this node
expected_ranks = [node_rank * 2, node_rank * 2 + 1]
actual_ranks = sorted([w.rank for w in executor.workers])
assert actual_ranks == expected_ranks, (
f"Node {node_rank} should have workers "
f"with ranks {expected_ranks}, got {actual_ranks}"
)
# Verify all workers are alive
for worker in executor.workers:
assert worker.proc.is_alive(), (
f"Worker rank {worker.rank} should be alive on node {node_rank}"
)
# executor.gen
# Put success result in queue BEFORE shutdown to avoid hanging
result_queue.put({"node": node_rank, "success": True})
import time
time.sleep(2)
executor.shutdown()
except Exception as e:
# Put failure result in queue
result_queue.put({"node": node_rank, "success": False, "error": str(e)})
raise e
finally:
if executor is not None:
executor.shutdown()
# Create a queue to collect results from both processes
result_queue: multiprocessing.Queue[dict[str, int | bool]] = multiprocessing.Queue()
# Start both node processes
processes = []
for node_rank in range(2):
p = multiprocessing.Process(
target=run_node,
args=(node_rank, result_queue, port),
name=f"Node{node_rank}",
)
p.start()
processes.append(p)
# Wait for both processes to complete
all_completed = True
for p in processes:
p.join(timeout=60)
if p.is_alive():
p.terminate()
p.join(timeout=20)
if p.is_alive():
p.kill()
p.join()
all_completed = False
# Check results from both nodes
results: list[dict[str, int | bool]] = []
while len(results) < 2:
try:
result = result_queue.get(timeout=1)
results.append(result)
except Exception:
pass
assert all_completed, "Not all processes completed successfully"
assert len(results) == 2, f"Expected 2 results, got {len(results)}"
assert results[0]["success"], f"Node 0 failed: {results[0]}"
assert results[1]["success"], f"Node 1 failed: {results[1]}"

View File

@ -18,6 +18,7 @@ import pytest
from vllm.config.compilation import CompilationMode
from vllm.config.model import RunnerOption
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer
from ..models.registry import HF_EXAMPLE_MODELS
@ -161,6 +162,7 @@ def _compare_sp(
test_options: SPTestOptions,
num_gpus_available: int,
use_inductor_graph_partition: bool,
enable_async_tp: bool,
*,
method: Literal["generate", "encode"],
is_multimodal: bool,
@ -244,10 +246,10 @@ def _compare_sp(
compilation_config = {
"mode": CompilationMode.VLLM_COMPILE,
"custom_ops": ["+rms_norm"],
"compile_sizes": [4, 8],
"pass_config": {
"enable_sequence_parallelism": True,
"enable_async_tp": enable_async_tp,
"enable_fusion": enable_fusion,
"enable_noop": True,
},
@ -307,6 +309,7 @@ SP_TEST_MODELS = [
],
)
@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
@pytest.mark.parametrize("enable_async_tp", [False]) # TODO: enable async TP
@create_new_process_for_each_test()
def test_tp_sp_generation(
model_id: str,
@ -316,10 +319,19 @@ def test_tp_sp_generation(
test_options: SPTestOptions,
num_gpus_available,
use_inductor_graph_partition: bool,
enable_async_tp: bool,
):
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
# Skip FP8 SP-only test on sm89 (compute capability 8.9)
if (
"fp8" in model_id.lower()
and current_platform.get_device_capability() < (9, 0)
and (not enable_async_tp)
):
pytest.skip("FP8 reduction support begins with sm90 capable devices.")
_compare_sp(
model_id,
parallel_setup,
@ -328,6 +340,7 @@ def test_tp_sp_generation(
test_options,
num_gpus_available,
use_inductor_graph_partition,
enable_async_tp=enable_async_tp,
method="generate",
is_multimodal=False,
)

View File

@ -0,0 +1,262 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import httpx
import pytest
import pytest_asyncio
from transformers import AutoTokenizer
from vllm.config import ModelConfig
from vllm.v1.engine.detokenizer import check_stop_strings
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
GEN_ENDPOINT = "/inference/v1/generate"
def get_vocab_size(model_name):
config = ModelConfig(
model=model_name,
seed=0,
dtype="bfloat16",
)
return config.get_vocab_size()
@pytest.fixture(scope="module")
def tokenizer():
return AutoTokenizer.from_pretrained(MODEL_NAME)
@pytest.fixture(scope="module")
def messages():
return [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "How many countries are in the EU?"},
]
@pytest.fixture(scope="module")
def server(request):
args = [
"--dtype",
"bfloat16",
"--max-model-len",
"1024",
"--enforce-eager",
]
extra_args = getattr(request, "param", None)
if extra_args is not None:
args = args + (
list(extra_args)
if isinstance(extra_args, (list, tuple))
else [str(extra_args)]
)
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server: RemoteOpenAIServer):
transport = httpx.AsyncHTTPTransport(uds=server.uds) if server.uds else None
headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
async with httpx.AsyncClient(
transport=transport,
base_url=server.url_root,
timeout=600,
headers=headers,
) as c:
yield c
@pytest.mark.asyncio
async def test_generate_endpoint(client):
payload = {
"model": MODEL_NAME,
"token_ids": [1, 2, 3],
"sampling_params": {"max_tokens": 5},
"stream": False,
}
resp = await client.post(GEN_ENDPOINT, json=payload)
resp.raise_for_status()
data = resp.json()
assert "choices" in data
@pytest.mark.asyncio
async def test_same_response_as_chat_completions(client, tokenizer, messages):
token_ids = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
enable_thinking=False, # default with Qwen3
)
for ignore_eos in [True, False]:
payload = {
"model": MODEL_NAME,
"token_ids": token_ids,
"sampling_params": {
"max_tokens": 24,
"temperature": 0.0,
# NOTE coordinator will set this to skip detokenization
"detokenize": False,
"ignore_eos": ignore_eos,
},
"stream": False,
}
generate_resp = await client.post(GEN_ENDPOINT, json=payload)
generate_data = generate_resp.json()
generate_res = tokenizer.decode(
generate_data["choices"][0]["token_ids"], skip_special_tokens=True
)
payload = {
"model": MODEL_NAME,
"messages": messages,
"max_tokens": 24,
"temperature": 0.0,
"stream": False,
"ignore_eos": ignore_eos,
"chat_template_kwargs": dict(enable_thinking=False),
}
completions_resp = await client.post("/v1/chat/completions", json=payload)
completions_data = completions_resp.json()
completions_res = completions_data["choices"][0]["message"]["content"]
assert generate_res == completions_res
@pytest.mark.asyncio
async def test_stop_string_workflow(client, tokenizer, messages):
token_ids = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
enable_thinking=False, # default with Qwen3
)
payload = {
"model": MODEL_NAME,
"token_ids": token_ids,
"sampling_params": {
"max_tokens": 24,
"temperature": 0.0,
"detokenize": False,
# stop strings are only supported when detokenize is True.
"stop": ["27 member"],
},
# TODO stream test is much more interesting
"stream": False,
}
with pytest.raises(httpx.HTTPStatusError):
generate_resp = await client.post(GEN_ENDPOINT, json=payload)
generate_resp.raise_for_status()
payload["sampling_params"]["stop"] = None
generate_resp = await client.post(
GEN_ENDPOINT, json=payload, headers={"X-Request-Id": "42"}
)
generate_data = generate_resp.json()
generate_res = tokenizer.decode(
generate_data["choices"][0]["token_ids"], skip_special_tokens=True
)
# NOTE This is under the responsibility of the coordinator
# stop_checker = StopChecker(
# max_model_len=1024, get_tokenizer_for_seq=lambda _: tokenizer
# )
stop_str, truncate_to = check_stop_strings(
generate_res, len(generate_res), ["27 member"], False
)
assert stop_str == "27 member"
# abort request that hit stop string (requires tokens-only mode)
# res = await client.post("/abort_requests", json={"request_ids": ["generate-tokens-42"]}) # noqa: E501
# res.raise_for_status()
generate_res = generate_res[:truncate_to]
# Get stop_str response from chat completions
payload = {
"model": MODEL_NAME,
"messages": messages,
"max_tokens": 24,
"temperature": 0.0,
"stream": False,
"stop": ["27 member"],
"chat_template_kwargs": dict(enable_thinking=False),
}
completions_resp = await client.post("/v1/chat/completions", json=payload)
completions_data = completions_resp.json()
completions_res = completions_data["choices"][0]["message"]["content"]
assert generate_res == completions_res
@pytest.mark.asyncio
@pytest.mark.parametrize(
"server",
[
[
"--enable-lora",
"--lora-modules",
"Alice=charent/self_cognition_Alice",
"Bob=charent/self_cognition_Bob",
"--max-lora-rank",
"64",
"--max-cpu-loras",
"2",
]
],
indirect=True,
)
async def test_generate_with_lora_adapter(client, tokenizer, messages):
# Verify adapters are listed
models_resp = await client.get("/v1/models")
models_resp.raise_for_status()
models = {m["id"] for m in models_resp.json().get("data", [])}
assert {"Alice", "Bob"}.issubset(models)
# Generate using a LoRA adapter by specifying its name as the model
payload = {
"model": "Alice",
"token_ids": [1, 2, 3],
"sampling_params": {"max_tokens": 5},
"stream": False,
}
resp = await client.post(GEN_ENDPOINT, json=payload)
resp.raise_for_status()
data = resp.json()
assert "choices" in data
token_ids = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
enable_thinking=False, # default with Qwen3
)
payload = {
"model": "Alice",
"token_ids": token_ids,
"sampling_params": {
"max_tokens": 24,
"temperature": 0.0,
"detokenize": False,
},
"stream": False,
}
generate_resp = await client.post(GEN_ENDPOINT, json=payload)
generate_data = generate_resp.json()
generate_res = tokenizer.decode(
generate_data["choices"][0]["token_ids"], skip_special_tokens=True
)
payload = {
"model": "Alice",
"messages": messages,
"max_tokens": 24,
"temperature": 0.0,
"stream": False,
"chat_template_kwargs": dict(enable_thinking=False),
}
completions_resp = await client.post("/v1/chat/completions", json=payload)
completions_data = completions_resp.json()
completions_res = completions_data["choices"][0]["message"]["content"]
assert generate_res == completions_res

View File

@ -46,6 +46,16 @@ def test_single_input_classification(server: RemoteOpenAIServer, model_name: str
assert hasattr(output.data[0], "probs")
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_add_special_tokens_false(server: RemoteOpenAIServer, model_name: str):
response = requests.post(
server.url_for("classify"),
json={"model": model_name, "input": "hello", "add_special_tokens": False},
)
response.raise_for_status()
ClassificationResponse.model_validate(response.json())
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_multiple_inputs_classification(server: RemoteOpenAIServer, model_name: str):
input_texts = [

View File

@ -0,0 +1,95 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
import requests
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.openai.protocol import ClassificationResponse
VLM_MODEL_NAME = "muziyongshixin/Qwen2.5-VL-7B-for-VideoCls"
MAXIMUM_VIDEOS = 1
TEST_VIDEO_URL = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
HF_OVERRIDES = {
"text_config": {
"architectures": ["Qwen2_5_VLForSequenceClassification"],
},
}
@pytest.fixture(scope="module")
def server_vlm_classify():
args = [
"--runner",
"pooling",
"--max-model-len",
"5000",
"--enforce-eager",
"--limit-mm-per-prompt",
json.dumps({"video": MAXIMUM_VIDEOS}),
]
with RemoteOpenAIServer(
VLM_MODEL_NAME, args, override_hf_configs=HF_OVERRIDES
) as remote_server:
yield remote_server
@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME])
def test_classify_accepts_chat_text_only(
server_vlm_classify: RemoteOpenAIServer, model_name: str
) -> None:
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Please classify this text request."},
],
}
]
response = requests.post(
server_vlm_classify.url_for("classify"),
json={"model": model_name, "messages": messages},
)
response.raise_for_status()
output = ClassificationResponse.model_validate(response.json())
assert output.object == "list"
assert output.model == model_name
assert len(output.data) == 1
assert len(output.data[0].probs) == 2
assert output.usage.prompt_tokens == 22
@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME])
def test_classify_accepts_chat_video_url(
server_vlm_classify: RemoteOpenAIServer, model_name: str
) -> None:
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Please classify this video."},
{"type": "video_url", "video_url": {"url": TEST_VIDEO_URL}},
],
}
]
response = requests.post(
server_vlm_classify.url_for("classify"),
json={"model": model_name, "messages": messages},
)
response.raise_for_status()
output = ClassificationResponse.model_validate(response.json())
assert output.object == "list"
assert output.model == model_name
assert len(output.data) == 1
assert len(output.data[0].probs) == 2
assert output.usage.prompt_tokens == 4807

View File

@ -83,8 +83,12 @@ async def call_vllm_api(
stop: list[str] | None = None,
url: str | None = None,
seed: int | None = None,
) -> str:
"""Call vLLM's OpenAI-compatible completions endpoint."""
) -> tuple[str, int]:
"""Call vLLM's OpenAI-compatible completions endpoint.
Returns:
Tuple of (response_text, completion_tokens)
"""
data = {
"prompt": prompt,
"temperature": temperature,
@ -98,10 +102,12 @@ async def call_vllm_api(
async with session.post(f"{url}/v1/completions", json=data) as response:
response.raise_for_status()
result = await response.json()
return result["choices"][0]["text"]
text = result["choices"][0]["text"]
completion_tokens = result.get("usage", {}).get("completion_tokens", 0)
return text, completion_tokens
except Exception as e:
print(f"Error calling vLLM API: {e}")
return ""
return "", 0
def evaluate_gsm8k(
@ -146,10 +152,11 @@ def evaluate_gsm8k(
# Run evaluation
async def run_async_evaluation():
states: list[str] = [""] * num_questions
output_tokens: list[int] = [0] * num_questions
async def get_answer(session: aiohttp.ClientSession, i: int) -> str:
async def get_answer(session: aiohttp.ClientSession, i: int) -> tuple[str, int]:
prompt = few_shot_examples + questions[i]
answer = await call_vllm_api(
answer, tokens = await call_vllm_api(
session=session,
prompt=prompt,
temperature=temperature,
@ -159,7 +166,8 @@ def evaluate_gsm8k(
seed=seed,
)
states[i] = answer
return answer
output_tokens[i] = tokens
return answer, tokens
async with aiohttp.ClientSession(
timeout=aiohttp.ClientTimeout(total=600)
@ -167,24 +175,28 @@ def evaluate_gsm8k(
tasks = [get_answer(session, i) for i in range(num_questions)]
await tqdm.gather(*tasks, desc="Evaluating")
return states
return states, output_tokens
print(f"Running GSM8K evaluation: {num_questions} questions, {num_shots}-shot")
tic = time.perf_counter()
states = asyncio.run(run_async_evaluation())
states, output_tokens = asyncio.run(run_async_evaluation())
latency = time.perf_counter() - tic
# Compute metrics
preds = [get_answer_value(state) for state in states]
accuracy = np.mean(np.array(preds) == np.array(labels))
invalid_rate = np.mean(np.array(preds) == INVALID)
total_output_tokens = sum(output_tokens)
tokens_per_second = total_output_tokens / latency if latency > 0 else 0.0
result = {
"accuracy": accuracy,
"invalid_rate": invalid_rate,
"latency": latency,
"questions_per_second": num_questions / latency,
"total_output_tokens": total_output_tokens,
"tokens_per_second": tokens_per_second,
"num_questions": num_questions,
"num_shots": num_shots,
"max_tokens": max_tokens,
@ -236,6 +248,8 @@ def main() -> None:
print(f"Invalid responses: {result['invalid_rate']:.3f}")
print(f"Total latency: {result['latency']:.3f} s")
print(f"Questions per second: {result['questions_per_second']:.3f}")
print(f"Total output tokens: {result['total_output_tokens']}")
print(f"Output tokens per second: {result['tokens_per_second']:.3f}")
# Optional file saving
if args.save_results:

View File

@ -170,6 +170,7 @@ def test_cascade(
logits_soft_cap=soft_cap if soft_cap is not None else 0,
block_table=block_tables,
common_prefix_len=common_prefix_len,
max_num_splits=0, # no max
fa_version=fa_version,
)

View File

@ -40,8 +40,6 @@ NUM_EXPERTS = [8, 64]
TOP_KS = [1, 2, 6]
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
@dataclass

View File

@ -33,8 +33,6 @@ if current_platform.get_device_capability() < (9, 0):
pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
# Test configurations
DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]

View File

@ -18,8 +18,6 @@ if current_platform.get_device_capability() < (7, 0):
pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True)
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
DTYPES = [torch.bfloat16]

View File

@ -42,8 +42,6 @@ MNK_FACTORS = [
]
vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
@dataclasses.dataclass

View File

@ -7,6 +7,7 @@ fp8 block-quantized case.
"""
import dataclasses
from contextlib import contextmanager
import pytest
import torch.distributed
@ -14,6 +15,7 @@ from torch.distributed import ProcessGroup
from typing_extensions import ParamSpec
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.forward_context import set_forward_context
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig,
fp8_w8a8_moe_quant_config,
@ -21,7 +23,11 @@ from vllm.model_executor.layers.fused_moe.config import (
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.platforms import current_platform
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used, is_deep_gemm_supported
from vllm.utils.deep_gemm import (
get_mk_alignment_for_contiguous_layout,
is_deep_gemm_e8m0_used,
is_deep_gemm_supported,
)
from vllm.utils.import_utils import has_deep_ep, has_deep_gemm
from ...utils import multi_gpu_test
@ -57,6 +63,23 @@ requires_deep_gemm = pytest.mark.skipif(
P = ParamSpec("P")
@contextmanager
def with_dp_metadata(M: int, world_size: int):
num_tokens_across_dp = torch.tensor([M] * world_size, device="cpu", dtype=torch.int)
vllm_config = VllmConfig()
vllm_config.parallel_config.data_parallel_size = world_size
vllm_config.parallel_config.enable_expert_parallel = True
with set_forward_context(
None,
vllm_config,
num_tokens=M,
num_tokens_across_dp=num_tokens_across_dp,
):
yield
def next_power_of_2(x):
import math
@ -281,18 +304,21 @@ def deepep_deepgemm_moe_impl(
quant_config=quant_config,
)
out = mk.forward(
hidden_states=test_tensors.rank_tokens,
w1=w1,
w2=w2,
topk_weights=test_tensors.topk_weights,
topk_ids=test_tensors.topk,
inplace=False,
activation="silu",
global_num_experts=num_experts,
expert_map=build_expert_map(),
apply_router_weight_on_input=False,
)
with with_dp_metadata(
M=test_tensors.rank_tokens.size(0), world_size=pgi.world_size
):
out = mk.forward(
hidden_states=test_tensors.rank_tokens,
w1=w1,
w2=w2,
topk_weights=test_tensors.topk_weights,
topk_ids=test_tensors.topk,
inplace=False,
activation="silu",
global_num_experts=num_experts,
expert_map=build_expert_map(),
apply_router_weight_on_input=False,
)
return out
@ -413,19 +439,16 @@ NUM_EXPERTS = [32]
@multi_gpu_test(num_gpus=2)
@requires_deep_ep
@requires_deep_gemm
@pytest.mark.skipif(
is_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM"
)
def test_ht_deepep_deepgemm_moe(
mnk: tuple[int, int, int],
num_experts: int,
topk: int,
world_dp_size: tuple[int, int],
disable_deepgemm_ue8m0,
):
"""
Tests for High-Throughput DeepEP + DeepGemm integration.
"""
import deep_gemm
m, n, k = mnk
current_platform.seed_everything(7)
@ -433,7 +456,7 @@ def test_ht_deepep_deepgemm_moe(
if topk > num_experts:
pytest.skip(f"Skipping test: topk={topk} > E={num_experts}")
block_m = deep_gemm.get_m_alignment_for_contiguous_layout()
block_m = get_mk_alignment_for_contiguous_layout()[0]
block_size = [block_m, block_m]
world_size, dp_size = world_dp_size
@ -487,9 +510,6 @@ USE_FP8_DISPATCH = [False]
@multi_gpu_test(num_gpus=2)
@requires_deep_ep
@requires_deep_gemm
@pytest.mark.skipif(
is_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM"
)
def test_ll_deepep_deepgemm_moe(
mnk: tuple[int, int, int],
num_experts: int,
@ -497,10 +517,12 @@ def test_ll_deepep_deepgemm_moe(
use_fp8_dispatch: bool,
block_size: list[int],
world_dp_size: tuple[int, int],
disable_deepgemm_ue8m0,
):
"""
Tests for Low-Latency DeepEP + DeepGemm integration.
"""
assert not is_deep_gemm_e8m0_used()
m, n, k = mnk
current_platform.seed_everything(7)

View File

@ -294,7 +294,7 @@ def torch_moe_impl(
# blockwise quant and de-quant.
assert not per_act_token_quant
a = test_tensors.rank_tokens
aq, aq_scale = per_token_group_quant_fp8(a, 128)
aq, aq_scale = per_token_group_quant_fp8(a, 128, use_ue8m0=False)
a = (
(aq.view(-1, 128).to(torch.float32) * aq_scale.view(-1, 1))
.view(a.shape)

View File

@ -45,8 +45,6 @@ MNK_FACTORS = [
]
vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
def quant_fp8_per_tensor_batches(a):
@ -79,10 +77,14 @@ class TestData:
@staticmethod
def make_moe_tensors_8bit(
m: int, k: int, n: int, e: int, reorder: bool
m: int, k: int, n: int, e: int, reorder: bool, activation: str = "silu"
) -> "TestData":
is_gated = activation != "relu2_no_mul"
hidden_states = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
w13 = torch.randn((e, 2 * n, k), device="cuda", dtype=torch.bfloat16)
w13 = torch.randn(
(e, (2 * n) if is_gated else n, k), device="cuda", dtype=torch.bfloat16
)
w2 = torch.randn((e, k, n), device="cuda", dtype=torch.bfloat16)
# Scale to fp8
@ -192,18 +194,22 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("activation", ["silu", "relu2_no_mul"])
def test_flashinfer_cutlass_moe_fp8_no_graph(
m: int,
n: int,
k: int,
e: int,
topk: int,
activation: str,
monkeypatch,
):
current_platform.seed_everything(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
with set_current_vllm_config(vllm_config):
td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=False)
td = TestData.make_moe_tensors_8bit(
m, k, n, e, reorder=False, activation=activation
)
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
topk_weights, topk_ids, _ = FusedMoE.select_experts(
@ -235,7 +241,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
topk_weights=topk_weights,
topk_ids=topk_ids,
inplace=False,
activation="silu",
activation=activation,
global_num_experts=e,
expert_map=None,
apply_router_weight_on_input=True,
@ -255,7 +261,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
td.layer,
topk_weights,
topk_ids,
activation="silu",
activation=activation,
global_num_experts=e,
expert_map=None,
apply_router_weight_on_input=True,

View File

@ -81,8 +81,6 @@ FUSED_MOE_WN16_MNK_FACTORS = [
]
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
def run_moe_test(

View File

@ -192,8 +192,6 @@ def pplx_cutlass_moe(
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
def _pplx_moe(

View File

@ -81,8 +81,6 @@ TOP_KS = [1, 2, 6]
DTYPES = [torch.float8_e4m3fn, torch.bfloat16]
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
def torch_prepare(

View File

@ -1,6 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import pytest
import torch
@ -8,27 +11,30 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
persistent_masked_m_silu_mul_quant,
)
from vllm.platforms import current_platform
from vllm.utils.math_utils import cdiv
from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm
from vllm.utils.math_utils import cdiv, round_up
fp8_dtype = torch.float8_e4m3fn
CASES = [
(1, 1, 128, fp8_dtype),
(1, 4, 128, fp8_dtype),
(2, 4, 256, fp8_dtype),
(32, 64, 256, fp8_dtype),
(17, 31, 768, fp8_dtype),
(1, 1, 128 * 1, fp8_dtype),
(1, 1, 128 * 3, fp8_dtype),
(1, 1, 128 * 4, fp8_dtype),
(8, 16, 128 * 1, fp8_dtype),
(8, 16, 128 * 2, fp8_dtype),
(8, 16, 128 * 3, fp8_dtype),
(1, 4, 128 * 1, fp8_dtype),
(2, 4, 128 * 2, fp8_dtype),
(1, 4, 128 * 3, fp8_dtype),
(8, 16, 128 * 4, fp8_dtype),
(8, 16, 128 * 5, fp8_dtype),
(8, 16, 128 * 6, fp8_dtype),
(8, 16, 128 * 7, fp8_dtype),
(8, 16, 128 * 8, fp8_dtype),
(8, 16, 128 * 9, fp8_dtype),
(8, 64, 7168, fp8_dtype),
(8, 128, 128 * 33, fp8_dtype),
(1, 4, 128 * 10, fp8_dtype),
(8, 128, 7168, fp8_dtype),
(8, 512, 7168, fp8_dtype),
(8, 1024, 7168, fp8_dtype),
(17, 31, 768, fp8_dtype),
(32, 64, 256, fp8_dtype),
(256, 8, 7168, fp8_dtype),
(256, 32, 7168, fp8_dtype),
(256, 64, 7168, fp8_dtype),
@ -38,14 +44,159 @@ CASES = [
]
def as_uint8(x) -> torch.Tensor:
return (
torch.empty(x.shape, dtype=x.dtype, device=x.device).copy_(x).view(torch.uint8)
)
def silu(x: torch.Tensor) -> torch.Tensor:
one_f32 = torch.tensor([1.0], device=x.device, dtype=torch.float32)
x_f32 = x.to(torch.float32)
act_f32 = x_f32 / (one_f32 + torch.exp(-x_f32))
assert act_f32.dtype == torch.float32
return act_f32.to(torch.bfloat16)
def do_quant(x: torch.Tensor, group_size: int, ceil_ue8m0: bool):
eps_bf16 = torch.tensor([1e-10], device=x.device, dtype=torch.bfloat16)
one_bf16 = torch.tensor([1.0], device=x.device, dtype=torch.bfloat16)
fp8_max_bf16 = torch.tensor(
[torch.finfo(fp8_dtype).max], device=x.device, dtype=torch.bfloat16
)
fp8_min_bf16 = torch.tensor(
[torch.finfo(fp8_dtype).min], device=x.device, dtype=torch.bfloat16
)
fp8_max_inv = one_bf16 / fp8_max_bf16
assert fp8_max_inv.dtype == torch.bfloat16
assert x.size(-1) % group_size == 0
num_groups = x.numel() // group_size
x_og_shape = x.shape
x = x.to(torch.bfloat16)
x = x.view((-1, group_size))
amax = x.abs().amax(dim=1).clamp(min=eps_bf16)
assert amax.dtype == torch.bfloat16
s = amax * fp8_max_inv
if ceil_ue8m0:
s = torch.exp2(
torch.ceil(torch.log2(s).to(torch.bfloat16)).to(torch.bfloat16)
).to(torch.bfloat16)
inv_s = one_bf16 / s
inv_s = inv_s.view((num_groups, 1))
xq = torch.clamp(x * inv_s, min=fp8_min_bf16.item(), max=fp8_max_bf16.item()).to(
fp8_dtype
)
xq = xq.view(x_og_shape)
xs = s.view((-1, xq.size(-1) // group_size))
return xq, xs
def silu_mul_quant(
gate: torch.Tensor, up: torch.Tensor, group_size: int, ceil_ue8m0: bool
) -> tuple[torch.Tensor, torch.Tensor]:
assert gate.size(-1) % group_size == 0
assert up.size(-1) % group_size == 0
assert gate.dtype == torch.bfloat16
assert up.dtype == torch.bfloat16
act_bf16 = silu(gate)
assert act_bf16.dtype == torch.bfloat16
# act & mul
a_m = act_bf16 * up
assert a_m.dtype == torch.bfloat16
q, s = do_quant(a_m, group_size, ceil_ue8m0)
return q, s
def pack_scales(x: torch.Tensor, tokens_per_expert: torch.Tensor) -> torch.Tensor:
"""
pack float32 scales into a int32 tensor
"""
assert x.dtype == torch.float32
E, T, G = x.size()
# Add i32_padding here so we can view it as a i32 tensor later on.
i32_padding = round_up(G, 4) - G
ref_s_i8 = torch.empty((E, T, G + i32_padding), dtype=torch.uint8, device="cuda")
for e in range(E):
nt = tokens_per_expert[e].item()
ref_s_i8[e, :nt, :G] = x[e, :nt].view(torch.int32) >> 23
ref_s_i32 = ref_s_i8.view(torch.int32)
return ref_s_i32
def ref_with_scale_fmt(
E: int,
T: int,
H: int,
group_size: int,
tokens_per_expert: torch.Tensor,
gate: torch.Tensor,
up: torch.Tensor,
scale_fmt: DeepGemmQuantScaleFMT,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
The precision types of the operations triggered by this function
match closely with the kernel implementation so we compare more
accurately.
"""
scale_dtype = (
torch.int32 if scale_fmt == DeepGemmQuantScaleFMT.UE8M0 else torch.float32
)
ceil_ue8m0 = scale_fmt in [
DeepGemmQuantScaleFMT.UE8M0,
DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
]
ref_q = torch.empty((E, T, H), dtype=fp8_dtype, device="cuda")
ref_s_f32 = torch.empty(
(E, T, cdiv(H, group_size)), dtype=torch.float32, device="cuda"
)
for e in range(E):
nt = tokens_per_expert[e].item()
if nt == 0:
continue
ref_q[e, :nt], ref_s_f32[e, :nt] = silu_mul_quant(
gate[e, :nt], up[e, :nt], group_size, ceil_ue8m0=ceil_ue8m0
)
if scale_dtype == torch.float32:
return ref_q, ref_s_f32
assert scale_dtype == torch.int32
return ref_q, pack_scales(ref_s_f32, tokens_per_expert)
def token_random(E, T, H2, tokens_per_expert):
"""
Initialize each token in a random range so we test a range of
scale values.
"""
y = torch.empty((E, T, H2), dtype=torch.bfloat16, device="cuda")
for e in range(E):
for t in range(tokens_per_expert[e].item()):
exp = random.choice(range(1, 20))
y[e, t].uniform_(-(2**exp), 2**exp)
return y
@pytest.mark.parametrize("E,T,H,fp8_type", CASES)
@torch.inference_mode()
def test_silu_mul_fp8_quant_deep_gemm(E, T, H, fp8_type):
def test_silu_mul_fp8_quant_deep_gemm(E: int, T: int, H: int, fp8_type: torch.dtype):
group_size = 128
current_platform.seed_everything(42)
# Input tensor of shape (E, T, 2*H)
y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
tokens_per_expert = torch.randint(
low=0,
high=T,
@ -54,71 +205,83 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, fp8_type):
device="cuda",
)
# Input tensor of shape (E, T, 2*H)
y = token_random(E, T, 2 * H, tokens_per_expert)
gate = y[..., :H].to(torch.bfloat16)
up = y[..., H:].to(torch.bfloat16)
scale_fmts = [
DeepGemmQuantScaleFMT.FLOAT32,
DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
DeepGemmQuantScaleFMT.UE8M0,
]
# Run the SiLU V2 kernel
# TODO (varun): use_e8m0 is set to false as the reference impl does
# not handle that case.
y_q, y_s = persistent_masked_m_silu_mul_quant(
y, tokens_per_expert, group_size=group_size, use_ue8m0=False
)
torch.cuda.synchronize()
fp8_info = torch.finfo(fp8_dtype)
fp8_max = fp8_info.max
fp8_min = fp8_info.min
eps = 1e-10
y1 = y[..., :H].float()
y2 = y[..., H:]
silu_x = y1 * torch.sigmoid(y1)
merged = silu_x * y2
for e in range(E):
nt = tokens_per_expert[e].item()
ref_s = torch.empty(
(T, cdiv(H, group_size)), dtype=torch.float32, device="cuda"
)
ref_q = torch.empty((T, H), dtype=fp8_dtype, device="cuda")
for t in range(nt):
data = merged[e, t].float()
ref_q_row = torch.empty_like(data)
# process full groups
n_full_groups = H // group_size
if n_full_groups > 0:
data_grp = data[: n_full_groups * group_size].view(
n_full_groups, group_size
)
amax = data_grp.abs().amax(dim=1).clamp(min=eps)
scale = amax / fp8_max
scaled = data[: n_full_groups * group_size] / scale.repeat_interleave(
group_size
)
ref_q_row[: n_full_groups * group_size] = scaled.clamp(
fp8_min, fp8_max
).to(fp8_dtype)
ref_s[t, :n_full_groups] = scale
# process remainder group
rem = H % group_size
if rem > 0:
data_rem = data[-rem:]
amax = data_rem.abs().amax().clamp(min=eps)
scale = amax / fp8_max
scaled = data_rem / scale
ref_q_row[-rem:] = scaled.clamp(fp8_min, fp8_max).to(fp8_dtype)
ref_s[t, -1] = scale
ref_q[t] = ref_q_row
y_se = y_s[e].float()
y_qe = y_q[e].float()
torch.testing.assert_close(
y_qe[:nt].to(torch.float32),
ref_q[:nt].to(torch.float32),
atol=2,
rtol=2e-1,
for scale_fmt in scale_fmts:
y_q, y_s = persistent_masked_m_silu_mul_quant(
y,
tokens_per_expert,
group_size=group_size,
quant_scale_fmt=scale_fmt,
)
torch.testing.assert_close(y_se[:nt], ref_s[:nt], atol=1e-4, rtol=1e-2)
ref_y_q, ref_y_s = ref_with_scale_fmt(
E, T, H, group_size, tokens_per_expert, gate, up, scale_fmt=scale_fmt
)
# deepgemm scales transform
dg_scales = None
if (
has_deep_gemm()
and current_platform.has_device_capability(100)
and scale_fmt == DeepGemmQuantScaleFMT.UE8M0
):
from deep_gemm import transform_sf_into_required_layout
_q, _s = ref_with_scale_fmt(
E,
T,
H,
group_size,
tokens_per_expert,
gate,
up,
scale_fmt=DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
)
dg_scales = transform_sf_into_required_layout(
sf=_s,
mn=_q.size(1),
k=_q.size(2),
recipe=(1, 128, 128),
num_groups=_q.size(0),
is_sfa=True,
)
expected_scale_dtype = (
torch.int32 if scale_fmt == DeepGemmQuantScaleFMT.UE8M0 else torch.float32
)
assert y_s.dtype == expected_scale_dtype
assert ref_y_s.dtype == expected_scale_dtype
for e in range(E):
nt = tokens_per_expert[e].item()
torch.testing.assert_close(
y_q[e, :nt].to(torch.float32),
ref_y_q[e, :nt].to(torch.float32),
)
if scale_fmt == DeepGemmQuantScaleFMT.UE8M0:
G = H // group_size
y_s_sliced = as_uint8(y_s[e])
ref_s_sliced = as_uint8(ref_y_s[e])
torch.testing.assert_close(y_s_sliced[:nt, :G], ref_s_sliced[:nt, :G])
if dg_scales is not None:
dg_sliced = as_uint8(dg_scales[e])
torch.testing.assert_close(y_s_sliced[:nt, :G], dg_sliced[:nt, :G])
else:
torch.testing.assert_close(
y_s[e, :nt],
ref_y_s[e, :nt],
)

View File

@ -18,8 +18,6 @@ if current_platform.get_device_capability() < (9, 0):
pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):

View File

@ -29,8 +29,6 @@ if current_platform.get_device_capability() < (9, 0):
pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
vllm_config = VllmConfig()
vllm_config.scheduler_config.max_num_seqs = 128
vllm_config.scheduler_config.max_model_len = 8192
# Test configurations
DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]

Some files were not shown because too many files have changed in this diff Show More