mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-06 07:17:04 +08:00
Merge remote-tracking branch 'origin/main' into refactor-fp8-linear
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
This commit is contained in:
commit
9d94f5d4f0
@ -59,7 +59,7 @@ while true; do
|
||||
fi
|
||||
done
|
||||
|
||||
echo "--- Pulling container"
|
||||
echo "--- Pulling container"
|
||||
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
docker pull "${image_name}"
|
||||
@ -177,13 +177,13 @@ if [[ -z "$render_gid" ]]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
||||
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
||||
if [[ $commands == *"--shard-id="* ]]; then
|
||||
# assign job count as the number of shards used
|
||||
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
|
||||
# assign job count as the number of shards used
|
||||
commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
|
||||
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
||||
# assign shard-id for each shard
|
||||
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
|
||||
commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
|
||||
echo "Shard ${GPU} commands:$commands_gpu"
|
||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
||||
docker run \
|
||||
|
||||
@ -46,6 +46,6 @@ docker run \
|
||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||
pytest -v -s v1/structured_output
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
|
||||
pytest -v -s v1/test_serial_utils.py
|
||||
'
|
||||
|
||||
@ -226,6 +226,27 @@ steps:
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
- popd
|
||||
|
||||
- label: Distributed Tests (8 GPUs) # 4min
|
||||
timeout_in_minutes: 10
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_8
|
||||
# grade: Blocking
|
||||
gpu: h100
|
||||
num_gpus: 8
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- examples/offline_inference/torchrun_dp_example.py
|
||||
- vllm/config/parallel.py
|
||||
- vllm/distributed/
|
||||
- vllm/v1/engine/llm_engine.py
|
||||
- vllm/v1/executor/uniproc_executor.py
|
||||
- vllm/v1/worker/gpu_worker.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
#- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
# test with torchrun tp=2 and dp=4 with ep
|
||||
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
||||
|
||||
- label: EPLB Algorithm Test # 5min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
@ -238,11 +259,11 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s distributed/test_eplb_algo.py
|
||||
|
||||
- label: EPLB Execution Test # 5min
|
||||
- label: EPLB Execution Test # 10min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 15
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
@ -250,6 +271,7 @@ steps:
|
||||
- tests/distributed/test_eplb_execute.py
|
||||
commands:
|
||||
- pytest -v -s distributed/test_eplb_execute.py
|
||||
- pytest -v -s distributed/test_eplb_spec_decode.py
|
||||
|
||||
- label: Metrics, Tracing Test # 12min
|
||||
timeout_in_minutes: 20
|
||||
@ -273,7 +295,7 @@ steps:
|
||||
|
||||
- label: Regression Test # 7min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||
agent_pool: mi325_1
|
||||
grade: Blocking
|
||||
source_file_dependencies:
|
||||
@ -288,7 +310,7 @@ steps:
|
||||
timeout_in_minutes: 40
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
#grade: Blocking
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/engine
|
||||
@ -337,6 +359,7 @@ steps:
|
||||
- tests/v1
|
||||
commands:
|
||||
# split the test to avoid interference
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- pytest -v -s -m 'not cpu_test' v1/core
|
||||
- pytest -v -s v1/executor
|
||||
- pytest -v -s v1/kv_offload
|
||||
@ -344,7 +367,7 @@ steps:
|
||||
- pytest -v -s v1/logits_processors
|
||||
- pytest -v -s v1/worker
|
||||
- pytest -v -s v1/spec_decode
|
||||
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_lmcache_integration.py
|
||||
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
||||
- pytest -v -s -m 'not cpu_test' v1/metrics
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
- pytest -v -s v1/test_request.py
|
||||
@ -353,6 +376,20 @@ steps:
|
||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
|
||||
# TODO: Add the "V1 Test attetion (MI300)" test group
|
||||
|
||||
- label: V1 Test attention (H100) # 10min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
- label: V1 Test others (CPU) # 5 mins
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
@ -479,10 +516,11 @@ steps:
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
- pytest -v -s compile/test_multimodal_compile.py
|
||||
- pytest -v -s compile/piecewise/
|
||||
|
||||
- label: PyTorch Fullgraph Test # 22min
|
||||
timeout_in_minutes: 35
|
||||
- label: PyTorch Fullgraph Test # 27min
|
||||
timeout_in_minutes: 40
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
@ -491,8 +529,23 @@ steps:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_full_graph.py
|
||||
- pytest -v -s compile/test_fusions_e2e.py
|
||||
- pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||
# Limit to no custom ops to reduce running time
|
||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
|
||||
|
||||
- label: Cudagraph test
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
source_file_dependencies:
|
||||
- tests/v1/cudagraph
|
||||
- vllm/v1/cudagraph_dispatcher.py
|
||||
- vllm/config/compilation.py
|
||||
- vllm/compilation
|
||||
commands:
|
||||
- pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
|
||||
- pytest -v -s v1/cudagraph/test_cudagraph_mode.py
|
||||
|
||||
- label: Kernels Core Operation Test # 48min
|
||||
timeout_in_minutes: 75
|
||||
@ -544,6 +597,8 @@ steps:
|
||||
- tests/kernels/moe
|
||||
- vllm/model_executor/layers/fused_moe/
|
||||
- vllm/distributed/device_communicators/
|
||||
- vllm/envs.py
|
||||
- vllm/config
|
||||
commands:
|
||||
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
@ -562,10 +617,13 @@ steps:
|
||||
|
||||
- label: Model Executor Test # 23min
|
||||
timeout_in_minutes: 35
|
||||
torch_nightly: true
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/engine/arg_utils.py
|
||||
- vllm/config/model.py
|
||||
- vllm/model_executor
|
||||
- tests/model_executor
|
||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
@ -861,9 +919,10 @@ steps:
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
|
||||
timeout_in_minutes: 70
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
timeout_in_minutes: 15
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- vllm/multimodal/
|
||||
@ -934,6 +993,7 @@ steps:
|
||||
- label: Transformers Nightly Models Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/"
|
||||
optional: true
|
||||
commands:
|
||||
@ -961,11 +1021,16 @@ steps:
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||
- vllm/v1/attention/backends/mla/flashinfer_mla.py
|
||||
- vllm/platforms/cuda.py
|
||||
- vllm/attention/selector.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
# Attention
|
||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
|
||||
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
|
||||
@ -1002,12 +1067,39 @@ steps:
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s tests/compile/test_fusions_e2e.py
|
||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# Wrap with quotes to escape yaml
|
||||
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
|
||||
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
- label: Blackwell Fusion E2E Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusions_e2e.py
|
||||
- tests/compile/test_full_graph.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/test_fusions_e2e.py
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: ROCm GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
agent_pool: mi325_1
|
||||
mirror_hardwares: [amdproduction]
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
@ -1016,7 +1108,7 @@ steps:
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
- label: Blackwell Quantized MoE Test
|
||||
timeout_in_minutes: 60
|
||||
@ -1253,6 +1345,7 @@ steps:
|
||||
- label: NixlConnector PD accuracy tests (Distributed) # 30min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
@ -1267,6 +1360,9 @@ steps:
|
||||
##### A100 test #####
|
||||
|
||||
- label: Distributed Tests (A100) # optional
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
@ -1281,6 +1377,9 @@ steps:
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
- label: LM Eval Large Models # optional
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
@ -1292,8 +1391,27 @@ steps:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
##### H100 test #####
|
||||
- label: LM Eval Large Models (H100) # optional
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
|
||||
##### H200 test #####
|
||||
- label: Distributed Tests (H200) # optional
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_2
|
||||
# grade: Blocking
|
||||
gpu: h200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
@ -1305,6 +1423,7 @@ steps:
|
||||
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### B200 test #####
|
||||
- label: Distributed Tests (B200) # optional
|
||||
@ -1315,6 +1434,7 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### RL Integration Tests #####
|
||||
- label: Prime-RL Integration Test # 15min
|
||||
@ -1330,3 +1450,27 @@ steps:
|
||||
- .buildkite/scripts/run-prime-rl-test.sh
|
||||
commands:
|
||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||
|
||||
- label: DeepSeek V2-Lite Accuracy
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
|
||||
|
||||
@ -445,6 +445,7 @@ steps:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_graph_partition.py
|
||||
- pytest -v -s compile/test_config.py
|
||||
- pytest -v -s compile/test_pass_manager.py
|
||||
- pytest -v -s compile/test_fusion.py
|
||||
@ -477,10 +478,11 @@ steps:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
||||
- pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||
# Limit to no custom ops to reduce running time
|
||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
|
||||
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
|
||||
- label: Cudagraph test
|
||||
timeout_in_minutes: 20
|
||||
@ -924,7 +926,7 @@ steps:
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
|
||||
- label: Blackwell Fusion Tests # 30 min
|
||||
- label: Blackwell Fusion and Compile Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
@ -945,7 +947,9 @@ steps:
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# Wrap with quotes to escape yaml
|
||||
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
|
||||
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: Blackwell Fusion E2E Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
@ -968,8 +972,6 @@ steps:
|
||||
- nvidia-smi
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/test_fusions_e2e.py
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
@ -1265,7 +1267,8 @@ steps:
|
||||
- pytest -v -s tests/compile/test_async_tp.py
|
||||
- pytest -v -s tests/compile/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
||||
- "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
18
.github/CODEOWNERS
vendored
18
.github/CODEOWNERS
vendored
@ -3,8 +3,8 @@
|
||||
|
||||
# This lists cover the "core" components of vLLM that require careful review
|
||||
/vllm/attention @LucasWilkinson
|
||||
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
|
||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
|
||||
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
|
||||
/vllm/model_executor/layers/mamba @tdoublep
|
||||
@ -20,15 +20,15 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
|
||||
# Any change to the VllmConfig changes can have a large user-facing impact,
|
||||
# so spam a lot of people
|
||||
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
|
||||
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
|
||||
/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
|
||||
/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
|
||||
|
||||
# vLLM V1
|
||||
/vllm/v1/attention @LucasWilkinson
|
||||
/vllm/v1/attention/backends/mla @pavanimajety
|
||||
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
|
||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
|
||||
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
|
||||
/vllm/v1/sample @22quinn @houseroad @njhill
|
||||
/vllm/v1/spec_decode @benchislett @luccafong
|
||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
||||
@ -36,11 +36,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/vllm/v1/offloading @ApostaC
|
||||
|
||||
# Test ownership
|
||||
/.buildkite/lm-eval-harness @mgoin @simon-mo
|
||||
/.buildkite/lm-eval-harness @mgoin
|
||||
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
||||
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
||||
/tests/distributed/test_same_node.py @youkaichao
|
||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
|
||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @aarnphm @NickLucche
|
||||
/tests/evals @mgoin
|
||||
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
|
||||
/tests/models @DarkLight1337 @ywang96
|
||||
@ -49,7 +49,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
||||
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
|
||||
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
|
||||
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
||||
/tests/lora @jeejeelee
|
||||
/tests/models/language/generation/test_hybrid.py @tdoublep
|
||||
@ -57,7 +57,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/v1/kv_connector @ApostaC
|
||||
/tests/v1/offloading @ApostaC
|
||||
|
||||
# Transformers backend
|
||||
# Transformers modeling backend
|
||||
/vllm/model_executor/models/transformers @hmellor
|
||||
/tests/models/test_transformers.py @hmellor
|
||||
|
||||
|
||||
76
.github/workflows/macos-smoke-test.yml
vendored
Normal file
76
.github/workflows/macos-smoke-test.yml
vendored
Normal file
@ -0,0 +1,76 @@
|
||||
name: macOS Apple Silicon Smoke Test
|
||||
|
||||
on:
|
||||
workflow_dispatch: # Manual trigger
|
||||
|
||||
jobs:
|
||||
macos-m1-smoke-test:
|
||||
runs-on: macos-latest
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
enable-cache: true
|
||||
cache-dependency-glob: |
|
||||
requirements/**/*.txt
|
||||
pyproject.toml
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
uv pip install -r requirements/cpu-build.txt
|
||||
uv pip install -r requirements/cpu.txt
|
||||
|
||||
- name: Build vLLM
|
||||
run: uv pip install -v -e .
|
||||
env:
|
||||
CMAKE_BUILD_PARALLEL_LEVEL: 4
|
||||
|
||||
- name: Verify installation
|
||||
run: |
|
||||
python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
|
||||
python -c "import torch; print(f'PyTorch: {torch.__version__}')"
|
||||
|
||||
- name: Smoke test vllm serve
|
||||
timeout-minutes: 10
|
||||
run: |
|
||||
# Start server in background
|
||||
vllm serve Qwen/Qwen3-0.6B \
|
||||
--max-model-len=2048 \
|
||||
--load-format=dummy \
|
||||
--enforce-eager \
|
||||
--port 8000 &
|
||||
|
||||
SERVER_PID=$!
|
||||
|
||||
# Wait for server to start
|
||||
for i in {1..30}; do
|
||||
if curl -s http://localhost:8000/health > /dev/null; then
|
||||
echo "Server started successfully"
|
||||
break
|
||||
fi
|
||||
if [ "$i" -eq 30 ]; then
|
||||
echo "Server failed to start"
|
||||
kill "$SERVER_PID"
|
||||
exit 1
|
||||
fi
|
||||
sleep 2
|
||||
done
|
||||
|
||||
# Test health endpoint
|
||||
curl -f http://localhost:8000/health
|
||||
|
||||
# Test completion
|
||||
curl -f http://localhost:8000/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "Qwen/Qwen3-0.6B",
|
||||
"prompt": "Hello",
|
||||
"max_tokens": 5
|
||||
}'
|
||||
|
||||
# Cleanup
|
||||
kill "$SERVER_PID"
|
||||
@ -3,10 +3,9 @@ MD007:
|
||||
MD013: false
|
||||
MD024:
|
||||
siblings_only: true
|
||||
MD031:
|
||||
list_items: false
|
||||
MD033: false
|
||||
MD045: false
|
||||
MD046: false
|
||||
MD051: false
|
||||
MD052: false
|
||||
MD053: false
|
||||
MD059: false
|
||||
|
||||
@ -861,7 +861,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
# Hadacore kernels
|
||||
cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
|
||||
cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
|
||||
if(HADACORE_ARCHS)
|
||||
set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
|
||||
380
benchmarks/benchmark_batch_invariance.py
Executable file
380
benchmarks/benchmark_batch_invariance.py
Executable file
@ -0,0 +1,380 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Benchmark to measure the performance overhead of VLLM_BATCH_INVARIANT mode.
|
||||
|
||||
This benchmark runs the same workload twice:
|
||||
1. With VLLM_BATCH_INVARIANT=0 (baseline)
|
||||
2. With VLLM_BATCH_INVARIANT=1 (batch invariant mode)
|
||||
|
||||
And reports the timing and throughput metrics for comparison.
|
||||
|
||||
Environment variables:
|
||||
VLLM_BENCH_MODEL: Model to benchmark (default: "Qwen/Qwen3-1.7B")
|
||||
VLLM_BENCH_TP_SIZE: Tensor parallel size (default: 1, use 8 for deepseek)
|
||||
VLLM_BENCH_BATCH_SIZE: Max batch size (default: 128)
|
||||
VLLM_BENCH_NUM_TRIALS: Number of trials to run (default: 5)
|
||||
VLLM_BENCH_MIN_PROMPT: Min prompt length in words (default: 1024)
|
||||
VLLM_BENCH_MAX_PROMPT: Max prompt length in words (default: 2048)
|
||||
VLLM_BENCH_MAX_TOKENS: Max tokens to generate (default: 128)
|
||||
VLLM_BENCH_TEMPERATURE: Temperature for sampling (default: 0.0)
|
||||
VLLM_BENCH_GPU_MEMORY_UTILIZATION: GPU memory utilization (default: 0.4)
|
||||
VLLM_BENCH_MAX_MODEL_LEN: Max model length (default: 5120)
|
||||
VLLM_BENCH_BACKEND: Attention backend (default: FLASH_ATTN)
|
||||
|
||||
Example usage:
|
||||
# Benchmark qwen3 (default)
|
||||
python benchmarks/benchmark_batch_invariance.py
|
||||
|
||||
# Benchmark deepseek with 8 GPUs
|
||||
VLLM_BENCH_MODEL="deepseek-ai/DeepSeek-V3" VLLM_BENCH_TP_SIZE=8 \\
|
||||
python benchmarks/benchmark_batch_invariance.py
|
||||
|
||||
# Quick test with fewer trials
|
||||
VLLM_BENCH_NUM_TRIALS=2 VLLM_BENCH_BATCH_SIZE=32 \\
|
||||
python benchmarks/benchmark_batch_invariance.py
|
||||
"""
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
|
||||
"""Generate a random prompt for benchmarking."""
|
||||
prompt_templates = [
|
||||
"Question: What is the capital of France?\nAnswer: The capital of France is",
|
||||
"Q: How does photosynthesis work?\nA: Photosynthesis is the process by which",
|
||||
"User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is",
|
||||
"Once upon a time in a distant galaxy, there lived",
|
||||
"The old man walked slowly down the street, remembering",
|
||||
"In the year 2157, humanity finally discovered",
|
||||
"To implement a binary search tree in Python, first we need to",
|
||||
"The algorithm works by iterating through the array and",
|
||||
"Here's how to optimize database queries using indexing:",
|
||||
"The Renaissance was a period in European history that",
|
||||
"Climate change is caused by several factors including",
|
||||
"The human brain contains approximately 86 billion neurons which",
|
||||
"I've been thinking about getting a new laptop because",
|
||||
"Yesterday I went to the store and bought",
|
||||
"My favorite thing about summer is definitely",
|
||||
]
|
||||
|
||||
base_prompt = random.choice(prompt_templates)
|
||||
|
||||
if max_words < min_words:
|
||||
max_words = min_words
|
||||
target_words = random.randint(min_words, max_words)
|
||||
|
||||
if target_words > 50:
|
||||
padding_text = (
|
||||
" This is an interesting topic that deserves more explanation. "
|
||||
* (target_words // 50)
|
||||
)
|
||||
base_prompt = base_prompt + padding_text
|
||||
|
||||
return base_prompt
|
||||
|
||||
|
||||
def run_benchmark_with_batch_invariant(
    model: str,
    tp_size: int,
    max_batch_size: int,
    num_trials: int,
    min_prompt: int,
    max_prompt: int,
    max_tokens: int,
    temperature: float,
    gpu_mem_util: float,
    max_model_len: int,
    backend: str,
    batch_invariant: bool,
    seed: int = 12345,
) -> dict:
    """
    Run the benchmark with the specified configuration.

    Builds a vLLM engine, runs one warmup generation on a fixed "needle"
    prompt, then times ``num_trials`` batches of randomly sized, randomly
    padded prompts.  The needle prompt is inserted at a random position in
    every batch so its output can be spot-checked across batch compositions.

    Args:
        model: Model name or path passed to ``LLM``.
        tp_size: Tensor-parallel degree.
        max_batch_size: Upper bound for ``max_num_seqs``; per-trial batch
            size is sampled from ``[max_batch_size // 2, max_batch_size]``.
        num_trials: Number of timed generation batches.
        min_prompt: Lower bound (words) passed to ``_random_prompt``.
        max_prompt: Upper bound (words) passed to ``_random_prompt``.
        max_tokens: Tokens to generate per request.
        temperature: Sampling temperature.
        gpu_mem_util: Fraction of GPU memory the engine may use.
        max_model_len: Engine context-length limit.
        backend: Value written to the ``VLLM_ATTENTION_BACKEND`` env var.
        batch_invariant: Whether ``VLLM_BATCH_INVARIANT`` is set to "1".
        seed: Seed for the prompt-sampling RNG (``random``).

    Returns:
        Dict with timing and throughput metrics: ``init_time``,
        ``avg_time``/``min_time``/``max_time``, ``total_tokens``,
        ``total_prompts``, ``throughput``, ``prompts_per_sec``,
        ``trial_times``.
    """
    # Seed Python's RNG so batch sizes / prompt selection are reproducible.
    random.seed(seed)

    # Set environment variables BEFORE the engine is constructed —
    # presumably they are read once at engine initialization (TODO confirm).
    os.environ["VLLM_ATTENTION_BACKEND"] = backend
    if batch_invariant:
        os.environ["VLLM_BATCH_INVARIANT"] = "1"
    else:
        os.environ["VLLM_BATCH_INVARIANT"] = "0"

    print(f"\n{'=' * 80}")
    print(f"BENCHMARK: VLLM_BATCH_INVARIANT={int(batch_invariant)}")
    print(f" Model: {model}")
    print(f" TP Size: {tp_size}")
    print(f" Backend: {backend}")
    print(f" Max Batch Size: {max_batch_size}")
    print(f" Trials: {num_trials}")
    print(f" Max Tokens: {max_tokens}")
    print(f"{'=' * 80}\n")

    sampling = SamplingParams(
        temperature=temperature,
        top_p=0.95,
        max_tokens=max_tokens,
        # Fixed sampling seed so generations are reproducible across runs.
        seed=20240919,
    )

    # Short fixed prompt whose output should not depend on which batch it
    # lands in — the "needle" used for the invariance spot-check.
    needle_prompt = "There once was a "

    llm = None
    try:
        # Create LLM engine (timed separately from the generation trials).
        start_init = time.perf_counter()
        llm = LLM(
            model=model,
            max_num_seqs=max_batch_size,
            gpu_memory_utilization=gpu_mem_util,
            max_model_len=max_model_len,
            dtype="bfloat16",
            tensor_parallel_size=tp_size,
            enable_prefix_caching=False,
        )
        init_time = time.perf_counter() - start_init
        print(f"Engine initialization time: {init_time:.2f}s\n")

        # Generate baseline (also serves as warmup before timing starts).
        print("Generating baseline (warmup)...")
        baseline_out = llm.generate([needle_prompt], sampling)
        assert len(baseline_out) == 1
        baseline_text = baseline_out[0].outputs[0].text
        print(f"Baseline output: '{baseline_text[:50]}...'\n")

        # Run trials and measure timing
        trial_times: list[float] = []
        total_tokens = 0
        total_prompts = 0

        for trial in range(num_trials):
            # Create a batch of random prompts with the needle prompt
            # inserted at a random position.
            prompts: list[str] = []
            batch_size = random.randint(max_batch_size // 2, max_batch_size)
            needle_pos = random.randint(0, batch_size - 1)
            for i in range(batch_size):
                if i == needle_pos:
                    prompts.append(needle_prompt)
                else:
                    prompts.append(_random_prompt(min_prompt, max_prompt))

            # Measure wall-clock time for this trial's full batch.
            start_time = time.perf_counter()
            outputs = llm.generate(prompts, sampling)
            trial_time = time.perf_counter() - start_time

            trial_times.append(trial_time)
            total_prompts += len(prompts)

            # Count generated tokens (first completion of each request only).
            for output in outputs:
                if output.outputs:
                    total_tokens += len(output.outputs[0].token_ids)

            print(
                f"Trial {trial + 1}/{num_trials}: "
                f"batch_size={batch_size}, "
                f"time={trial_time:.2f}s"
            )

            # Verify needle output still matches — outputs are assumed to be
            # returned in the same order as the submitted prompts.
            needle_output = outputs[needle_pos]
            assert needle_output.prompt == needle_prompt

        # Compute statistics over the timed trials.
        avg_time = sum(trial_times) / len(trial_times)
        min_time = min(trial_times)
        max_time = max(trial_times)
        throughput = total_tokens / sum(trial_times)
        prompts_per_sec = total_prompts / sum(trial_times)

        print(f"\n{'=' * 80}")
        print("RESULTS:")
        print(f" Average time per trial: {avg_time:.2f}s")
        print(f" Min time: {min_time:.2f}s")
        print(f" Max time: {max_time:.2f}s")
        print(f" Total tokens generated: {total_tokens}")
        print(f" Total prompts processed: {total_prompts}")
        print(f" Throughput: {throughput:.2f} tokens/s")
        print(f" Prompts/s: {prompts_per_sec:.2f}")
        print(f"{'=' * 80}\n")

        return {
            "init_time": init_time,
            "avg_time": avg_time,
            "min_time": min_time,
            "max_time": max_time,
            "total_tokens": total_tokens,
            "total_prompts": total_prompts,
            "throughput": throughput,
            "prompts_per_sec": prompts_per_sec,
            "trial_times": trial_times,
        }

    finally:
        # Cleanup: best-effort engine shutdown; errors during teardown are
        # deliberately swallowed so they don't mask a real benchmark failure.
        if llm is not None:
            with contextlib.suppress(Exception):
                llm.shutdown()
|
||||
|
||||
|
||||
def main():
    """Entry point: run the benchmark once without and once with batch
    invariance, then print a side-by-side comparison.

    Configuration is read entirely from ``VLLM_BENCH_*`` environment
    variables (every knob has a default).

    Returns:
        Process exit code: 0 on success, 1 if the platform is unsupported.
    """

    def _pct_change(new: float, old: float) -> float:
        # Relative change of `new` vs `old`, in percent.
        return (new - old) / old * 100

    # Check platform support: CUDA with compute capability >= 9.0 (Hopper).
    if not (current_platform.is_cuda() and current_platform.has_device_capability(90)):
        print("ERROR: Requires CUDA and >= Hopper (SM90)")
        print(f"Current platform: {current_platform.device_type}")
        if current_platform.is_cuda():
            print(f"Device capability: {current_platform.get_device_capability()}")
        return 1

    # Read configuration from environment
    model = os.getenv("VLLM_BENCH_MODEL", "Qwen/Qwen3-1.7B")
    tp_size = int(os.getenv("VLLM_BENCH_TP_SIZE", "1"))
    max_batch_size = int(os.getenv("VLLM_BENCH_BATCH_SIZE", "128"))
    num_trials = int(os.getenv("VLLM_BENCH_NUM_TRIALS", "5"))
    min_prompt = int(os.getenv("VLLM_BENCH_MIN_PROMPT", "1024"))
    max_prompt = int(os.getenv("VLLM_BENCH_MAX_PROMPT", "2048"))
    max_tokens = int(os.getenv("VLLM_BENCH_MAX_TOKENS", "128"))
    temperature = float(os.getenv("VLLM_BENCH_TEMPERATURE", "0.0"))
    gpu_mem_util = float(os.getenv("VLLM_BENCH_GPU_MEMORY_UTILIZATION", "0.4"))
    max_model_len = int(os.getenv("VLLM_BENCH_MAX_MODEL_LEN", "5120"))
    backend = os.getenv("VLLM_BENCH_BACKEND", "FLASH_ATTN")

    print("\n" + "=" * 80)
    print("VLLM BATCH INVARIANCE BENCHMARK")
    print("=" * 80)
    print("\nConfiguration:")
    print(f" Model: {model}")
    print(f" Tensor Parallel Size: {tp_size}")
    print(f" Attention Backend: {backend}")
    print(f" Max Batch Size: {max_batch_size}")
    print(f" Number of Trials: {num_trials}")
    print(f" Prompt Length Range: {min_prompt}-{max_prompt} words")
    print(f" Max Tokens to Generate: {max_tokens}")
    print(f" Temperature: {temperature}")
    print(f" GPU Memory Utilization: {gpu_mem_util}")
    print(f" Max Model Length: {max_model_len}")
    print("=" * 80)

    # Both phases share every parameter except `batch_invariant`.
    bench_kwargs = dict(
        model=model,
        tp_size=tp_size,
        max_batch_size=max_batch_size,
        num_trials=num_trials,
        min_prompt=min_prompt,
        max_prompt=max_prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        gpu_mem_util=gpu_mem_util,
        max_model_len=max_model_len,
        backend=backend,
    )

    # Run benchmark WITHOUT batch invariance (baseline)
    print("\n" + "=" * 80)
    print("PHASE 1: Running WITHOUT batch invariance (baseline)")
    print("=" * 80)
    baseline_results = run_benchmark_with_batch_invariant(
        batch_invariant=False, **bench_kwargs
    )

    # Run benchmark WITH batch invariance
    print("\n" + "=" * 80)
    print("PHASE 2: Running WITH batch invariance")
    print("=" * 80)
    batch_inv_results = run_benchmark_with_batch_invariant(
        batch_invariant=True, **bench_kwargs
    )

    # Compare results
    print("\n" + "=" * 80)
    print("COMPARISON: Batch Invariance vs Baseline")
    print("=" * 80)

    init_overhead_pct = _pct_change(
        batch_inv_results["init_time"], baseline_results["init_time"]
    )
    time_overhead_pct = _pct_change(
        batch_inv_results["avg_time"], baseline_results["avg_time"]
    )
    throughput_change_pct = _pct_change(
        batch_inv_results["throughput"], baseline_results["throughput"]
    )

    print("\nInitialization Time:")
    print(f" Baseline: {baseline_results['init_time']:.2f}s")
    print(f" Batch Invariant: {batch_inv_results['init_time']:.2f}s")
    print(f" Overhead: {init_overhead_pct:+.2f}%")

    print("\nAverage Trial Time:")
    print(f" Baseline: {baseline_results['avg_time']:.2f}s")
    print(f" Batch Invariant: {batch_inv_results['avg_time']:.2f}s")
    print(f" Overhead: {time_overhead_pct:+.2f}%")

    print("\nThroughput (tokens/s):")
    print(f" Baseline: {baseline_results['throughput']:.2f}")
    print(f" Batch Invariant: {batch_inv_results['throughput']:.2f}")
    print(f" Change: {throughput_change_pct:+.2f}%")

    print("\nPrompts/s:")
    print(f" Baseline: {baseline_results['prompts_per_sec']:.2f}")
    print(f" Batch Invariant: {batch_inv_results['prompts_per_sec']:.2f}")

    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    if time_overhead_pct > 0:
        print(
            f"Batch invariance mode adds approximately {time_overhead_pct:.1f}% "
            "overhead"
        )
    else:
        print(
            f"Batch invariance mode is approximately {-time_overhead_pct:.1f}% "
            "faster (unexpected!)"
        )

    if abs(throughput_change_pct) < 1.0:
        print("Throughput difference is negligible (< 1%)")
    elif throughput_change_pct < 0:
        print(
            f"Throughput decreased by {-throughput_change_pct:.1f}% "
            "with batch invariance"
        )
    else:
        print(
            f"Throughput increased by {throughput_change_pct:.1f}% "
            "with batch invariance (unexpected!)"
        )

    print("=" * 80 + "\n")

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Use SystemExit directly: the bare exit() helper is injected by the
    # `site` module and is not guaranteed to exist in all run modes
    # (e.g. `python -S`); raising SystemExit is the reliable equivalent.
    raise SystemExit(main())
|
||||
@ -69,7 +69,7 @@ def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:
|
||||
|
||||
# Remove the special tokens.
|
||||
return random.choices(
|
||||
[v for k, v in vocab.items() if k not in all_special_ids],
|
||||
[v for v in vocab.values() if v not in all_special_ids],
|
||||
k=length,
|
||||
)
|
||||
|
||||
|
||||
@ -561,8 +561,11 @@ async def client_main(
|
||||
f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}" # noqa: E501
|
||||
)
|
||||
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
# Set unique seed per client (each client runs in its own process)
|
||||
# Add 1 to ensure no client uses the same seed as the main process
|
||||
client_seed = args.seed + client_id + 1
|
||||
random.seed(client_seed)
|
||||
np.random.seed(client_seed)
|
||||
|
||||
# Active conversations
|
||||
active_convs: ConversationsMap = {}
|
||||
@ -1490,6 +1493,7 @@ async def main() -> None:
|
||||
f"Invalid --warmup-percentage={args.warmup_percentage}"
|
||||
) from None
|
||||
|
||||
# Set global seeds for main process
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
|
||||
|
||||
@ -242,7 +242,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
||||
SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
|
||||
SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
|
||||
GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
|
||||
GIT_TAG v52.2.0
|
||||
GIT_TAG v52.6.0
|
||||
GIT_SHALLOW TRUE
|
||||
GIT_PROGRESS TRUE
|
||||
)
|
||||
@ -310,7 +310,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
||||
FetchContent_Declare(
|
||||
oneDNN
|
||||
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
||||
GIT_TAG v3.9
|
||||
GIT_TAG v3.10
|
||||
GIT_PROGRESS TRUE
|
||||
GIT_SHALLOW TRUE
|
||||
)
|
||||
|
||||
@ -38,7 +38,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn
|
||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||
GIT_TAG 8e1b01d56210dc72030a2d0d41c2d8d266ba6309
|
||||
GIT_TAG 58e0626a692f09241182582659e3bf8f16472659
|
||||
GIT_PROGRESS TRUE
|
||||
# Don't share the vllm-flash-attn build between build types
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
|
||||
@ -5,6 +5,10 @@
|
||||
#include <type_traits>
|
||||
#include <cstddef>
|
||||
|
||||
#if defined(__APPLE__)
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
|
||||
#include "cpu_types.hpp"
|
||||
#include "scratchpad_manager.h"
|
||||
#include "cpu_attn_macros.h"
|
||||
@ -741,9 +745,21 @@ class AttentionScheduler {
|
||||
|
||||
static int64_t get_available_l2_size() {
|
||||
static int64_t size = []() {
|
||||
#if defined(__APPLE__)
|
||||
// macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname.
|
||||
int64_t l2_cache_size = 0;
|
||||
size_t len = sizeof(l2_cache_size);
|
||||
if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 &&
|
||||
l2_cache_size > 0) {
|
||||
return l2_cache_size >> 1; // use 50% of L2 cache
|
||||
}
|
||||
// Fallback if sysctlbyname fails
|
||||
return 128LL * 1024 >> 1; // use 50% of 128KB
|
||||
#else
|
||||
long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
|
||||
TORCH_CHECK_NE(l2_cache_size, -1);
|
||||
return l2_cache_size >> 1; // use 50% of L2 cache
|
||||
#endif
|
||||
}();
|
||||
return size;
|
||||
}
|
||||
@ -816,15 +832,21 @@ struct VecTypeTrait<float> {
|
||||
using vec_t = vec_op::FP32Vec16;
|
||||
};
|
||||
|
||||
// ARM only supports BF16 with ARMv8.6-A extension
|
||||
#if (defined(__aarch64__) && !defined(ARM_BF16_SUPPORT))
|
||||
#else
|
||||
template <>
|
||||
struct VecTypeTrait<c10::BFloat16> {
|
||||
using vec_t = vec_op::BF16Vec16;
|
||||
};
|
||||
#endif
|
||||
|
||||
#if !defined(__powerpc__)
|
||||
template <>
|
||||
struct VecTypeTrait<c10::Half> {
|
||||
using vec_t = vec_op::FP16Vec16;
|
||||
};
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
void print_logits(const char* name, T* ptr, int32_t row, int32_t col,
|
||||
@ -1586,9 +1608,17 @@ class AttentionMainLoop {
|
||||
|
||||
if (use_sink) {
|
||||
alignas(64) float s_aux_fp32[16];
|
||||
#if defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
|
||||
// ARM without native BF16 support: manual conversion
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
s_aux_fp32[i] = static_cast<float>(curr_s_aux[i]);
|
||||
}
|
||||
#else
|
||||
// All other platforms have BF16Vec16 available
|
||||
vec_op::BF16Vec16 vec_bf16(curr_s_aux);
|
||||
vec_op::FP32Vec16 vec_fp32(vec_bf16);
|
||||
vec_fp32.save(s_aux_fp32);
|
||||
#endif
|
||||
|
||||
float* __restrict__ curr_sum_buffer = sum_buffer;
|
||||
float* __restrict__ curr_max_buffer = max_buffer;
|
||||
|
||||
@ -100,6 +100,9 @@ void cpu_attention_with_kv_cache(
|
||||
const torch::Tensor& scheduler_metadata,
|
||||
const std::optional<torch::Tensor>& s_aux);
|
||||
|
||||
// Note: just for avoiding importing errors
|
||||
void placeholder_op() { TORCH_CHECK(false, "Unimplemented"); }
|
||||
|
||||
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
// vLLM custom ops
|
||||
|
||||
@ -275,6 +278,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
"sliding_window_left, SymInt sliding_window_right, Tensor block_table, "
|
||||
"float softcap, Tensor sheduler_metadata, Tensor? s_aux) -> ()",
|
||||
&cpu_attention_with_kv_cache);
|
||||
|
||||
// placeholders
|
||||
ops.def("static_scaled_fp8_quant() -> ()", placeholder_op);
|
||||
ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op);
|
||||
ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op);
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
|
||||
|
||||
@ -37,6 +37,16 @@
|
||||
|
||||
#ifdef USE_ROCM
|
||||
#define FINAL_MASK 0xffffffffffffffffULL
|
||||
|
||||
#if defined(HIP_VERSION) && HIP_VERSION < 70000000
|
||||
// On ROCm versions before 7.0, __syncwarp isn't defined. The below
|
||||
// implementation is copy/pasted from the implementation in ROCm 7.0
|
||||
__device__ inline void __syncwarp() {
|
||||
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
|
||||
__builtin_amdgcn_wave_barrier();
|
||||
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
#define FINAL_MASK 0xffffffff
|
||||
#endif
|
||||
|
||||
@ -279,17 +279,17 @@ __device__ __forceinline__ void token_bounds(int32_t n_tokens,
|
||||
}
|
||||
|
||||
template <int BLOCK_COUNT, int SMEM_SIZE_BYTES_Y, typename fp8_type,
|
||||
int THREADS, typename Idx_t, bool USE_UE8M0, int GROUP_SIZE = 128,
|
||||
int NUM_STAGES = 3>
|
||||
typename scale_t, int THREADS, typename Idx_t, bool CEIL_UE8M0,
|
||||
int GROUP_SIZE = 128, int NUM_STAGES = 3>
|
||||
__global__ void silu_mul_fp8_quant_deep_gemm_kernel(
|
||||
const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q,
|
||||
float* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert,
|
||||
scale_t* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert,
|
||||
// sizes
|
||||
Idx_t E, Idx_t T, Idx_t H,
|
||||
// strides (in elements)
|
||||
Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e,
|
||||
Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t,
|
||||
Idx_t stride_ys_g, Idx_t stride_counts_e) {
|
||||
Idx_t stride_ys_g, Idx_t stride_ys_p, Idx_t stride_counts_e) {
|
||||
#ifndef USE_ROCM
|
||||
static constexpr int NUM_WARPS = THREADS / WARP_SIZE;
|
||||
|
||||
@ -466,9 +466,22 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
|
||||
|
||||
__nv_fp8x4_e4m3* y_q_base_ptr =
|
||||
reinterpret_cast<__nv_fp8x4_e4m3*>(_y_q) + lane_id;
|
||||
auto y_scale_base_ptr = _y_s + warp_position_scales * stride_ys_g;
|
||||
|
||||
Idx_t scale_group_offset = 0;
|
||||
if constexpr (std::is_same<scale_t, uint8_t>::value) {
|
||||
// packed int32_t format
|
||||
int pack_id = warp_position_scales / 4;
|
||||
int scale_in_pack = warp_position_scales % 4;
|
||||
scale_group_offset = pack_id * stride_ys_p + scale_in_pack * stride_ys_g;
|
||||
} else {
|
||||
scale_group_offset = warp_position_scales * stride_ys_g;
|
||||
}
|
||||
|
||||
scale_t* const y_scale_base_ptr = _y_s + scale_group_offset;
|
||||
|
||||
for (auto j = tokens_lower; j < tokens_upper; j++) {
|
||||
int current_group_id = warp_position_scales; // Running count of which
|
||||
// group is being processed
|
||||
const Idx_t base_ys = expert_id * stride_ys_e;
|
||||
auto y_s_ptr = y_scale_base_ptr + base_ys + token_offset * stride_ys_t;
|
||||
__nv_fp8x4_e4m3* y_q_ptr =
|
||||
@ -509,7 +522,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
|
||||
|
||||
__nv_bfloat16 y_s = __hmul(warp_max(_y_max2.x), fp8_inv);
|
||||
|
||||
if constexpr (USE_UE8M0) {
|
||||
if constexpr (CEIL_UE8M0) {
|
||||
y_s = hexp2(hceil(hlog2(y_s)));
|
||||
}
|
||||
|
||||
@ -527,8 +540,24 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
|
||||
y_q_ptr += WARP_SIZE * stride_yq_h;
|
||||
|
||||
if (!lane_id) {
|
||||
*y_s_ptr = y_s;
|
||||
y_s_ptr += stride_ys_g;
|
||||
// Store scales.
|
||||
if constexpr (std::is_same<scale_t, uint8_t>::value) {
|
||||
// Packed UE8MO format. Remove Mantissa.
|
||||
*y_s_ptr = reinterpret_cast<int16_t&>(y_s) >> 7;
|
||||
|
||||
bool const jump_pack = (current_group_id + 1) % 4 == 0;
|
||||
// Minus 3 because we need to get to the first group in the
|
||||
// next pack.
|
||||
y_s_ptr += jump_pack ? (stride_ys_p - 3) : stride_ys_g;
|
||||
|
||||
} else {
|
||||
// float32 format
|
||||
static_assert(std::is_same<scale_t, float>::value);
|
||||
*y_s_ptr = y_s;
|
||||
y_s_ptr += stride_ys_g;
|
||||
}
|
||||
|
||||
current_group_id += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -573,7 +602,7 @@ void persistent_masked_m_silu_mul_quant(
|
||||
const at::Tensor& tokens_per_expert, // (E)
|
||||
at::Tensor& y_q, // (E, T, H) [OUT]
|
||||
at::Tensor& y_s, // (E, T, H//group_size) [OUT]
|
||||
bool use_ue8m0) {
|
||||
bool cast_scale_ue8m0) {
|
||||
#ifndef USE_ROCM
|
||||
|
||||
// This kernel currently only supports H % 128 == 0 and assumes a
|
||||
@ -583,9 +612,12 @@ void persistent_masked_m_silu_mul_quant(
|
||||
TORCH_CHECK(input.dtype() == torch::kBFloat16);
|
||||
TORCH_CHECK(y_q.dtype() == torch::kFloat8_e4m3fn ||
|
||||
y_q.dtype() == torch::kFloat8_e4m3fnuz);
|
||||
TORCH_CHECK(y_s.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(input.size(-1) % (GROUP_SIZE * 2) == 0);
|
||||
|
||||
bool const is_packed_ue8m0 =
|
||||
(y_s.dtype() == torch::kInt32 && cast_scale_ue8m0);
|
||||
TORCH_CHECK(y_s.dtype() == torch::kFloat32 || is_packed_ue8m0);
|
||||
|
||||
using Idx_t = int64_t;
|
||||
|
||||
Idx_t E = input.size(0);
|
||||
@ -597,15 +629,18 @@ void persistent_masked_m_silu_mul_quant(
|
||||
Idx_t stride_yq_e = y_q.stride(0);
|
||||
Idx_t stride_yq_t = y_q.stride(1);
|
||||
Idx_t stride_yq_h = y_q.stride(2);
|
||||
Idx_t stride_ys_e = y_s.stride(0);
|
||||
Idx_t stride_ys_t = y_s.stride(1);
|
||||
Idx_t stride_ys_g = y_s.stride(2);
|
||||
|
||||
Idx_t stride_counts_e = tokens_per_expert.stride(0);
|
||||
|
||||
int const NUM_GROUPS = H / GROUP_SIZE;
|
||||
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
#define KERNEL(BLOCK_COUNT, USE_UE8M0, THREAD_COUNT, STAGES) \
|
||||
// TODO: Get this from cuda_arch ?
|
||||
static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32;
|
||||
|
||||
#define KERNEL(BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T, STRIDE_YS_G, \
|
||||
STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, STAGES) \
|
||||
static constexpr int NUM_WARPS = THREAD_COUNT / WARP_SIZE; \
|
||||
int sms = SILU_V2_BLOCK_COUNT; \
|
||||
static constexpr int max_shared_mem_bytes = \
|
||||
@ -615,43 +650,86 @@ void persistent_masked_m_silu_mul_quant(
|
||||
VLLM_DISPATCH_FP8_TYPES( \
|
||||
y_q.scalar_type(), "silu_mul_fp8_quant_deep_gemm_kernel", [&] { \
|
||||
vllm::silu_mul_fp8_quant_deep_gemm_kernel< \
|
||||
BLOCK_COUNT, max_shared_mem_bytes, fp8_t, THREAD_COUNT, Idx_t, \
|
||||
USE_UE8M0, GROUP_SIZE, STAGES> \
|
||||
BLOCK_COUNT, max_shared_mem_bytes, fp8_t, scale_t, THREAD_COUNT, \
|
||||
Idx_t, CEIL_UE8M0, GROUP_SIZE, STAGES> \
|
||||
<<<grid, block, max_shared_mem_bytes + (E + 1) * 16, stream>>>( \
|
||||
reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \
|
||||
(fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(), \
|
||||
(fp8_t*)y_q.data_ptr(), \
|
||||
reinterpret_cast<scale_t*>(y_s.data_ptr()), \
|
||||
reinterpret_cast<int32_t*>(tokens_per_expert.data_ptr()), E, \
|
||||
T, H, stride_i_e, stride_i_t, stride_i_h, stride_yq_e, \
|
||||
stride_yq_t, stride_yq_h, stride_ys_e, stride_ys_t, \
|
||||
stride_ys_g, stride_counts_e); \
|
||||
stride_yq_t, stride_yq_h, STRIDE_YS_E, STRIDE_YS_T, \
|
||||
STRIDE_YS_G, STRIDE_YS_P, stride_counts_e); \
|
||||
});
|
||||
|
||||
static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32;
|
||||
#define LAUNCH_ON_H(scale_t, STRIDE_YS_E, STRIDE_YS_T, STRIDE_YS_G, \
|
||||
STRIDE_YS_P, CEIL_UE8M0) \
|
||||
if (H >= 4096 && (NUM_GROUPS % 8) == 0) { \
|
||||
/* 8 warp config */ \
|
||||
static constexpr int NUM_STAGES = 4; \
|
||||
static constexpr int THREAD_COUNT = 256; \
|
||||
KERNEL(SILU_V2_BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T, \
|
||||
STRIDE_YS_G, STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, NUM_STAGES); \
|
||||
} else { \
|
||||
/* 1 warp config */ \
|
||||
static constexpr int THREAD_COUNT = 32; \
|
||||
KERNEL(SILU_V2_BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T, \
|
||||
STRIDE_YS_G, STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, 2); \
|
||||
}
|
||||
|
||||
int const NUM_GROUPS = H / GROUP_SIZE;
|
||||
if (!use_ue8m0) {
|
||||
if (H >= 4096 && (NUM_GROUPS % 8 == 0)) {
|
||||
/* 8 warps config */
|
||||
static constexpr int NUM_STAGES = 4;
|
||||
static constexpr int THREAD_COUNT = 256;
|
||||
KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, NUM_STAGES);
|
||||
} else {
|
||||
/* 1 warp config */
|
||||
static constexpr int THREAD_COUNT = 32;
|
||||
KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, 2);
|
||||
}
|
||||
} else {
|
||||
if (H >= 4096 && (NUM_GROUPS % 8 == 0)) {
|
||||
/* 8 warps config */
|
||||
static constexpr int NUM_STAGES = 4;
|
||||
static constexpr int THREAD_COUNT = 256;
|
||||
KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, NUM_STAGES);
|
||||
} else {
|
||||
/* 1 warp config */
|
||||
static constexpr int THREAD_COUNT = 32;
|
||||
KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, 2);
|
||||
}
|
||||
Idx_t stride_ys_e = y_s.stride(0);
|
||||
Idx_t stride_ys_t = y_s.stride(1);
|
||||
Idx_t stride_ys_g = y_s.stride(2);
|
||||
Idx_t stride_ys_p = 0;
|
||||
if (!cast_scale_ue8m0) {
|
||||
TORCH_CHECK(!is_packed_ue8m0);
|
||||
LAUNCH_ON_H(float, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p,
|
||||
false);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!is_packed_ue8m0) {
|
||||
// UE8M0 but not packed
|
||||
LAUNCH_ON_H(float, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p,
|
||||
true);
|
||||
return;
|
||||
}
|
||||
|
||||
TORCH_CHECK(cast_scale_ue8m0 && is_packed_ue8m0);
|
||||
TORCH_CHECK(y_s.dtype() == torch::kInt32);
|
||||
|
||||
// Int32 packed ue8m0 scales tensor.
|
||||
// Let E, T, G be the number to experts, number of tokens and number of groups
|
||||
// respectively. Let, E = 2, T = 4, G = 6, in this case the int32 scales
|
||||
// tensor are of shape [1, 4, 2] and stride [8, 1, 4]. The scales are expected
|
||||
// to be arranged as follows,
|
||||
// [[T0G0-T0G1-T0G2-T0G3, T0G4-T0G5-X-X,],
|
||||
// [T1G0-T1G1-T1G2-T1G3, T1G4-T1G5-X-X,]
|
||||
// [T2G0-T2G1-T2G2-T2G3, T2G4-T2G5-X-X,]
|
||||
// [T3G0-T3G1-T3G2-T3G3, T3G4-T3G5-X-X,]]
|
||||
// where, TxGy is the scale ue8m0 scale value of Token x, Group y.
|
||||
//
|
||||
// In memory (in bytes) the scale values are arranged as,
|
||||
// [T0G0, T0G1, T0G2, T0G3, T1G0, T1G2, T1G3, T1G4, T2G0, T2G1, T2G3, T2G4,
|
||||
// T3G0, T3G1, T3G2, T3G3, T0G4, T0G5, X, X, T1G4, T1G5, X, X, T2G4, T2G5,
|
||||
// X, X, T3G4, T3G5, X, X]
|
||||
//
|
||||
// An Int32 tensor of size [1, 4, 2] and stride [8, 1, 4] can be represented
|
||||
// as an uint8 tensor of shape [1, 2, 4, 4] and stride [32, 16, 4, 1]. In
|
||||
// english, ignoring the Experts dimension, the original int32 tensor is
|
||||
// simply treated as two packed [4, 4] uint8 tensor (or two [4, 1] int32
|
||||
// tensor). The following strides setting reflects this change. Caveat: This
|
||||
// means that the G dimension is no longer contiguous. i.e. Note that to move
|
||||
// from G3 to G4, we need to jump along the packing dimension. The kernel
|
||||
// handles this case.
|
||||
|
||||
stride_ys_e *= sizeof(int32_t);
|
||||
stride_ys_p = T * sizeof(int32_t); // Packing dimension
|
||||
stride_ys_t = sizeof(int32_t);
|
||||
stride_ys_g = 1;
|
||||
|
||||
LAUNCH_ON_H(uint8_t, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p,
|
||||
true);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -247,22 +247,6 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
|
||||
return out;
|
||||
}
|
||||
|
||||
torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
|
||||
c10::SymInt size_k, c10::SymInt size_n,
|
||||
int64_t num_bits) {
|
||||
int const pack_factor = 32 / num_bits;
|
||||
auto options = torch::TensorOptions()
|
||||
.dtype(b_q_weight.dtype())
|
||||
.device(b_q_weight.device());
|
||||
return torch::empty_symint(
|
||||
{size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
|
||||
options);
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||
m.impl("awq_marlin_repack", &awq_marlin_repack);
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) {
|
||||
m.impl("awq_marlin_repack", &awq_marlin_repack_meta);
|
||||
}
|
||||
|
||||
@ -321,22 +321,6 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
|
||||
return out;
|
||||
}
|
||||
|
||||
torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
|
||||
torch::Tensor& perm, c10::SymInt size_k,
|
||||
c10::SymInt size_n, int64_t num_bits) {
|
||||
int const pack_factor = 32 / num_bits;
|
||||
auto options = torch::TensorOptions()
|
||||
.dtype(b_q_weight.dtype())
|
||||
.device(b_q_weight.device());
|
||||
return torch::empty_symint(
|
||||
{size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
|
||||
options);
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||
m.impl("gptq_marlin_repack", &gptq_marlin_repack);
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) {
|
||||
m.impl("gptq_marlin_repack", &gptq_marlin_repack_meta);
|
||||
}
|
||||
|
||||
@ -802,7 +802,7 @@ torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace) {
|
||||
});
|
||||
|
||||
if (numel % 256 != 0) {
|
||||
out = out.index({torch::indexing::Slice(0, numel / had_size)});
|
||||
out = out.narrow(0, 0, numel / had_size);
|
||||
}
|
||||
|
||||
if (inplace && out.data_ptr() != x.data_ptr()) {
|
||||
|
||||
@ -116,6 +116,26 @@ struct sm90_fp8_config_default {
|
||||
ClusterShape, KernelSchedule, EpilogueSchedule>>;
|
||||
};
|
||||
|
||||
template <typename InType, typename OutType, bool EnableBias>
|
||||
struct sm90_fp8_config_M8192_K6144 {
|
||||
// M >= 8192, K >= 6144
|
||||
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||
using KernelSchedule =
|
||||
cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum;
|
||||
using EpilogueSchedule =
|
||||
typename cutlass::epilogue::TmaWarpSpecializedCooperative;
|
||||
using TileShape = Shape<_256, _128, _128>;
|
||||
using ClusterShape = Shape<_2, _1, _1>;
|
||||
|
||||
using Cutlass3xGemm = conditional_t<
|
||||
EnableBias,
|
||||
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueBias,
|
||||
TileShape, ClusterShape, KernelSchedule,
|
||||
EpilogueSchedule>,
|
||||
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
|
||||
ClusterShape, KernelSchedule, EpilogueSchedule>>;
|
||||
};
|
||||
|
||||
template <typename InType, typename OutType, bool EnableBias>
|
||||
struct sm90_fp8_config_M128 {
|
||||
// M in (64, 128]
|
||||
@ -273,6 +293,9 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
|
||||
using Cutlass3xGemmDefault =
|
||||
typename sm90_fp8_config_default<InType, OutType,
|
||||
EnableBias>::Cutlass3xGemm;
|
||||
using Cutlass3xGemmM8192_K6144 =
|
||||
typename sm90_fp8_config_M8192_K6144<InType, OutType,
|
||||
EnableBias>::Cutlass3xGemm;
|
||||
using Cutlass3xGemmM128 =
|
||||
typename sm90_fp8_config_M128<InType, OutType, EnableBias>::Cutlass3xGemm;
|
||||
|
||||
@ -291,6 +314,7 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
|
||||
|
||||
uint32_t const m = a.size(0);
|
||||
uint32_t const n = b.size(1);
|
||||
uint32_t const k = a.size(1);
|
||||
|
||||
if (m <= 16) {
|
||||
// m in [1, 16]
|
||||
@ -312,6 +336,9 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
|
||||
// m in (64, 128]
|
||||
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM128>(
|
||||
out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
|
||||
} else if (m >= 8192 && k >= 6144) {
|
||||
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM8192_K6144>(
|
||||
out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
|
||||
} else {
|
||||
// m in (128, inf)
|
||||
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmDefault>(
|
||||
|
||||
@ -15,6 +15,17 @@ RUN apt-get update -q -y && apt-get install -q -y \
|
||||
# Remove sccache
|
||||
RUN python3 -m pip install --upgrade pip
|
||||
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
|
||||
|
||||
# Install UV
|
||||
RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
ARG COMMON_WORKDIR
|
||||
WORKDIR ${COMMON_WORKDIR}
|
||||
|
||||
@ -59,13 +70,15 @@ FROM base AS test
|
||||
|
||||
RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install vLLM
|
||||
# Install vLLM using uv (inherited from base stage)
|
||||
# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version
|
||||
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
cd /install \
|
||||
&& pip install -U -r requirements/rocm.txt \
|
||||
&& pip install -U -r requirements/rocm-test.txt \
|
||||
&& uv pip install --system -r requirements/rocm.txt \
|
||||
&& uv pip install --system -r requirements/rocm-test.txt \
|
||||
&& pip uninstall -y vllm \
|
||||
&& pip install *.whl
|
||||
&& uv pip install --system *.whl
|
||||
|
||||
WORKDIR /vllm-workspace
|
||||
ARG COMMON_WORKDIR
|
||||
@ -89,14 +102,17 @@ RUN case "$(which python3)" in \
|
||||
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
|
||||
*) ;; esac
|
||||
|
||||
RUN python3 -m pip install --upgrade huggingface-hub[cli]
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system --upgrade huggingface-hub[cli]
|
||||
|
||||
# Install vLLM
|
||||
# Install vLLM using uv (inherited from base stage)
|
||||
# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version
|
||||
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
cd /install \
|
||||
&& pip install -U -r requirements/rocm.txt \
|
||||
&& uv pip install --system -r requirements/rocm.txt \
|
||||
&& pip uninstall -y vllm \
|
||||
&& pip install *.whl
|
||||
&& uv pip install --system *.whl
|
||||
|
||||
ARG COMMON_WORKDIR
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
|
||||
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete
|
||||
ARG TRITON_BRANCH="57c693b6"
|
||||
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
|
||||
ARG PYTORCH_BRANCH="1c57644d"
|
||||
@ -7,7 +7,7 @@ ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
|
||||
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
||||
ARG FA_BRANCH="0e60e394"
|
||||
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
|
||||
ARG AITER_BRANCH="9716b1b8"
|
||||
ARG AITER_BRANCH="59bd8ff2"
|
||||
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
||||
|
||||
FROM ${BASE_IMAGE} AS base
|
||||
@ -19,6 +19,9 @@ ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx11
|
||||
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
||||
ENV AITER_ROCM_ARCH=gfx942;gfx950
|
||||
|
||||
# Required for RCCL in ROCm7.1
|
||||
ENV HSA_NO_SCRATCH_RECLAIM=1
|
||||
|
||||
ARG PYTHON_VERSION=3.12
|
||||
|
||||
RUN mkdir -p /app
|
||||
|
||||
@ -14,6 +14,7 @@ RUN apt clean && apt-get update -y && \
|
||||
libxext6 \
|
||||
libgl1 \
|
||||
lsb-release \
|
||||
libaio-dev \
|
||||
numactl \
|
||||
wget \
|
||||
vim \
|
||||
@ -68,8 +69,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
|
||||
# install nixl from source code
|
||||
ENV NIXL_VERSION=0.7.0
|
||||
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
|
||||
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/"
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip uninstall oneccl oneccl-devel -y
|
||||
|
||||
@ -46,7 +46,10 @@ nav:
|
||||
- contributing/model/multimodal.md
|
||||
- contributing/model/transcription.md
|
||||
- CI: contributing/ci
|
||||
- Design Documents: design
|
||||
- Design Documents:
|
||||
- Plugins:
|
||||
- design/*plugin*.md
|
||||
- design/*
|
||||
- API Reference:
|
||||
- api/README.md
|
||||
- api/vllm
|
||||
|
||||
@ -30,8 +30,8 @@ Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at
|
||||
Where to get started with vLLM depends on the type of user. If you are looking to:
|
||||
|
||||
- Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md)
|
||||
- Build applications with vLLM, we recommend starting with the [User Guide](./usage)
|
||||
- Build vLLM, we recommend starting with [Developer Guide](./contributing)
|
||||
- Build applications with vLLM, we recommend starting with the [User Guide](./usage/README.md)
|
||||
- Build vLLM, we recommend starting with [Developer Guide](./contributing/README.md)
|
||||
|
||||
For information about the development of vLLM, see:
|
||||
|
||||
|
||||
@ -4,6 +4,6 @@
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/bench_latency.md"
|
||||
--8<-- "docs/argparse/bench_latency.inc.md"
|
||||
|
||||
@ -4,6 +4,6 @@
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/bench_serve.md"
|
||||
--8<-- "docs/argparse/bench_serve.inc.md"
|
||||
|
||||
@ -4,6 +4,6 @@
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/bench_sweep_plot.md"
|
||||
--8<-- "docs/argparse/bench_sweep_plot.inc.md"
|
||||
|
||||
@ -4,6 +4,6 @@
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/bench_sweep_serve.md"
|
||||
--8<-- "docs/argparse/bench_sweep_serve.inc.md"
|
||||
|
||||
@ -4,6 +4,6 @@
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/bench_sweep_serve_sla.md"
|
||||
--8<-- "docs/argparse/bench_sweep_serve_sla.inc.md"
|
||||
|
||||
@ -4,6 +4,6 @@
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/bench_throughput.md"
|
||||
--8<-- "docs/argparse/bench_throughput.inc.md"
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
# vllm chat
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/chat.md"
|
||||
--8<-- "docs/argparse/chat.inc.md"
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
# vllm complete
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/complete.md"
|
||||
--8<-- "docs/argparse/complete.inc.md"
|
||||
|
||||
@ -4,6 +4,6 @@
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/run-batch.md"
|
||||
--8<-- "docs/argparse/run-batch.inc.md"
|
||||
|
||||
@ -4,6 +4,6 @@
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/serve.md"
|
||||
--8<-- "docs/argparse/serve.inc.md"
|
||||
|
||||
@ -5,7 +5,7 @@ The `vllm serve` command is used to launch the OpenAI-compatible server.
|
||||
## CLI Arguments
|
||||
|
||||
The `vllm serve` command is used to launch the OpenAI-compatible server.
|
||||
To see the available options, take a look at the [CLI Reference](../cli/README.md#options)!
|
||||
To see the available options, take a look at the [CLI Reference](../cli/README.md)!
|
||||
|
||||
## Configuration file
|
||||
|
||||
|
||||
@ -10,8 +10,6 @@ vLLM provides comprehensive benchmarking tools for performance testing and evalu
|
||||
- **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations
|
||||
- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development
|
||||
|
||||
[Benchmark CLI]: #benchmark-cli
|
||||
|
||||
## Benchmark CLI
|
||||
|
||||
This section guides you through running benchmark tests with the extensive
|
||||
@ -985,7 +983,7 @@ each document has close to 512 tokens.
|
||||
|
||||
Please note that the `/v1/rerank` is also supported by embedding models. So if you're running
|
||||
with an embedding model, also set `--no_reranker`. Because in this case the query is
|
||||
treated as a individual prompt by the server, here we send `random_batch_size - 1` documents
|
||||
treated as an individual prompt by the server, here we send `random_batch_size - 1` documents
|
||||
to account for the extra prompt which is the query. The token accounting to report the
|
||||
throughput numbers correctly is also adjusted.
|
||||
|
||||
|
||||
@ -95,7 +95,7 @@ when manually triggering a build on Buildkite. This branch accomplishes two thin
|
||||
to warm it up so that future builds are faster.
|
||||
|
||||
<p align="center" width="100%">
|
||||
<img width="60%" src="https://github.com/user-attachments/assets/a8ff0fcd-76e0-4e91-b72f-014e3fdb6b94">
|
||||
<img width="60%" alt="Buildkite new build popup" src="https://github.com/user-attachments/assets/a8ff0fcd-76e0-4e91-b72f-014e3fdb6b94">
|
||||
</p>
|
||||
|
||||
## Update dependencies
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# Summary
|
||||
|
||||
!!! important
|
||||
Many decoder language models can now be automatically loaded using the [Transformers backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first!
|
||||
Many decoder language models can now be automatically loaded using the [Transformers modeling backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first!
|
||||
|
||||
vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance.
|
||||
|
||||
|
||||
@ -249,7 +249,7 @@ No extra registration is required beyond having your model class available via t
|
||||
## Examples in-tree
|
||||
|
||||
- Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
|
||||
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py)
|
||||
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py). Make sure to have installed `mistral-common[audio]`.
|
||||
- Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
|
||||
|
||||
## Test with the API
|
||||
|
||||
@ -29,8 +29,8 @@ pip install vllm
|
||||
- API Path: `/chat/completions`
|
||||
- Model: `qwen/Qwen1.5-0.5B-Chat`
|
||||
|
||||

|
||||

|
||||
|
||||
1. Go to `Just chat`, and start to chat:
|
||||
|
||||

|
||||

|
||||
|
||||
@ -46,12 +46,12 @@ And install [Docker](https://docs.docker.com/engine/install/) and [Docker Compos
|
||||
- **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat`
|
||||
- **Completion Mode**: `Completion`
|
||||
|
||||

|
||||

|
||||
|
||||
1. To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type:
|
||||
|
||||

|
||||

|
||||
|
||||
1. Click the chatbot you just created to open the chat interface and start interacting with the model:
|
||||
|
||||

|
||||

|
||||
|
||||
@ -156,7 +156,7 @@ In this guide, we demonstrate manual deployment using the [`rednote-hilab/dots.o
|
||||
|
||||
## Advanced Deployment Details
|
||||
|
||||
With the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLM’s optimized inference without additional backend modifications.
|
||||
With the [Transformers modeling backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLM’s optimized inference without additional backend modifications.
|
||||
|
||||
Hugging Face Inference Endpoints provides a fully managed environment for serving models via vLLM. You can deploy models without configuring servers, installing dependencies, or managing clusters. Endpoints also support deployment across multiple cloud providers (AWS, Azure, GCP) without the need for separate accounts.
|
||||
|
||||
@ -167,4 +167,4 @@ The platform integrates seamlessly with the Hugging Face Hub, allowing you to de
|
||||
- Explore the [Inference Endpoints](https://endpoints.huggingface.co/catalog) model catalog
|
||||
- Read the Inference Endpoints [documentation](https://huggingface.co/docs/inference-endpoints/en/index)
|
||||
- Learn about [Inference Endpoints engines](https://huggingface.co/docs/inference-endpoints/en/engines/vllm)
|
||||
- Understand the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html)
|
||||
- Understand the [Transformers modeling backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html)
|
||||
|
||||
@ -128,7 +128,7 @@ A [CUDAGraphWrapper][vllm.compilation.cuda_graph.CUDAGraphWrapper] instance wrap
|
||||
3. Otherwise, i.e., the runtime_mode matches the mode of the wrapper, the wrapper will perform CUDA Graphs capture (if key does not exist, create
|
||||
a new entry and cache it) or replay (if key exists in the cache).
|
||||
|
||||
The above steps are based on the assumption that the CUDA Graphs wrapper would directly trust what’s in the forward context (controlled by the dispatcher). This lets us simplify and cenralize the logic, reducing the complexity as well as the risk of mismatched state between the wrappers and the dispatcher. It also allows reusing the wrapper class for both `FULL` and `PIECEWISE` runtime modes. See the implementation [here](https://github.com/vllm-project/vllm/blob/f751e50b7a2aae3110d83ed0d88202fc91b3e78a/vllm/compilation/cuda_graph.py#L106).
|
||||
The above steps are based on the assumption that the CUDA Graphs wrapper would directly trust what’s in the forward context (controlled by the dispatcher). This lets us simplify and centralize the logic, reducing the complexity as well as the risk of mismatched state between the wrappers and the dispatcher. It also allows reusing the wrapper class for both `FULL` and `PIECEWISE` runtime modes. See the implementation [here](https://github.com/vllm-project/vllm/blob/f751e50b7a2aae3110d83ed0d88202fc91b3e78a/vllm/compilation/cuda_graph.py#L106).
|
||||
|
||||
#### Nested Wrapper design
|
||||
|
||||
|
||||
@ -19,9 +19,9 @@ The input activation format completely depends on the All2All Dispatch being use
|
||||
|
||||
The FusedMoE operation is generally made of multiple operations, in both the Contiguous and Batched variants, as described in the diagrams below
|
||||
|
||||

|
||||

|
||||
|
||||

|
||||

|
||||
|
||||
!!! note
|
||||
The main difference, in terms of operations, between the Batched and Non-Batched cases is the Permute / Unpermute operations. All other operations remain.
|
||||
@ -57,7 +57,7 @@ The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExperts
|
||||
The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare`, `prepare_no_receive` and `finalize` functions.
|
||||
The `prepare` function is responsible for input activation Quantization and All2All Dispatch. If implemented, The `prepare_no_receive` is like `prepare` except it does not wait to receive results from other workers. Instead it returns a "receiver" callback that must be invoked to wait for the final results of worker. It is not required that this method is supported by all `FusedMoEPrepareAndFinalize` classes, but if it is available, it can be used to interleave work with the initial all to all communication, e.g. interleaving shared experts with fused experts. The `finalize` function is responsible for invoking the All2All Combine. Additionally the `finalize` function may or may not do the TopK weight application and reduction (Please refer to the TopKWeightAndReduce section)
|
||||
|
||||

|
||||

|
||||
|
||||
### FusedMoEPermuteExpertsUnpermute
|
||||
|
||||
@ -88,7 +88,7 @@ The core FusedMoE implementation performs a series of operations. It would be in
|
||||
It is sometimes efficient to perform TopK weight application and Reduction inside the `FusedMoEPermuteExpertsUnpermute::apply()`. Find an example [here](https://github.com/vllm-project/vllm/pull/20228). We have a `TopKWeightAndReduce` abstract class to facilitate such implementations. Please refer to the TopKWeightAndReduce section.
|
||||
`FusedMoEPermuteExpertsUnpermute::finalize_weight_and_reduce_impl()` returns the `TopKWeightAndReduce` object that the implementation wants the `FusedMoEPrepareAndFinalize::finalize()` to use.
|
||||
|
||||

|
||||

|
||||
|
||||
### FusedMoEModularKernel
|
||||
|
||||
|
||||
220
docs/design/lora_resolver_plugins.md
Normal file
220
docs/design/lora_resolver_plugins.md
Normal file
@ -0,0 +1,220 @@
|
||||
# LoRA Resolver Plugins
|
||||
|
||||
This directory contains vLLM's LoRA resolver plugins built on the `LoRAResolver` framework.
|
||||
They automatically discover and load LoRA adapters from a specified local storage path, eliminating the need for manual configuration or server restarts.
|
||||
|
||||
## Overview
|
||||
|
||||
LoRA Resolver Plugins provide a flexible way to dynamically load LoRA adapters at runtime. When vLLM
|
||||
receives a request for a LoRA adapter that hasn't been loaded yet, the resolver plugins will attempt
|
||||
to locate and load the adapter from their configured storage locations. This enables:
|
||||
|
||||
- **Dynamic LoRA Loading**: Load adapters on-demand without server restarts
|
||||
- **Multiple Storage Backends**: Support for filesystem, S3, and custom backends. The built-in `lora_filesystem_resolver` requires a local storage path, but custom resolvers can be implemented to fetch from any source.
|
||||
- **Automatic Discovery**: Seamless integration with existing LoRA workflows
|
||||
- **Scalable Deployment**: Centralized adapter management across multiple vLLM instances
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before using LoRA Resolver Plugins, ensure the following environment variables are configured:
|
||||
|
||||
### Required Environment Variables
|
||||
|
||||
1. **`VLLM_ALLOW_RUNTIME_LORA_UPDATING`**: Must be set to `true` or `1` to enable dynamic LoRA loading
|
||||
```bash
|
||||
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=true
|
||||
```
|
||||
|
||||
2. **`VLLM_PLUGINS`**: Must include the desired resolver plugins (comma-separated list)
|
||||
```bash
|
||||
export VLLM_PLUGINS=lora_filesystem_resolver
|
||||
```
|
||||
|
||||
3. **`VLLM_LORA_RESOLVER_CACHE_DIR`**: Must be set to a valid directory path for filesystem resolver
|
||||
```bash
|
||||
export VLLM_LORA_RESOLVER_CACHE_DIR=/path/to/lora/adapters
|
||||
```
|
||||
|
||||
### Optional Environment Variables
|
||||
|
||||
- **`VLLM_PLUGINS`**: If not set, all available plugins will be loaded. If set to empty string, no plugins will be loaded.
|
||||
|
||||
## Available Resolvers
|
||||
|
||||
### lora_filesystem_resolver
|
||||
|
||||
The filesystem resolver is installed with vLLM by default and enables loading LoRA adapters from a local directory structure.
|
||||
|
||||
#### Setup Steps
|
||||
|
||||
1. **Create the LoRA adapter storage directory**:
|
||||
```bash
|
||||
mkdir -p /path/to/lora/adapters
|
||||
```
|
||||
|
||||
2. **Set environment variables**:
|
||||
```bash
|
||||
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=true
|
||||
export VLLM_PLUGINS=lora_filesystem_resolver
|
||||
export VLLM_LORA_RESOLVER_CACHE_DIR=/path/to/lora/adapters
|
||||
```
|
||||
|
||||
3. **Start vLLM server**:
|
||||
Your base model can be `meta-llama/Llama-2-7b-hf`. Please make sure you set up the Hugging Face token in your env var `export HF_TOKEN=xxx235`.
|
||||
```bash
|
||||
python -m vllm.entrypoints.openai.api_server \
|
||||
--model your-base-model \
|
||||
--enable-lora
|
||||
```
|
||||
|
||||
#### Directory Structure Requirements
|
||||
|
||||
The filesystem resolver expects LoRA adapters to be organized in the following structure:
|
||||
|
||||
```text
|
||||
/path/to/lora/adapters/
|
||||
├── adapter1/
|
||||
│ ├── adapter_config.json
|
||||
│ ├── adapter_model.bin
|
||||
│ └── tokenizer files (if applicable)
|
||||
├── adapter2/
|
||||
│ ├── adapter_config.json
|
||||
│ ├── adapter_model.bin
|
||||
│ └── tokenizer files (if applicable)
|
||||
└── ...
|
||||
```
|
||||
|
||||
Each adapter directory must contain:
|
||||
|
||||
- **`adapter_config.json`**: Required configuration file with the following structure:
|
||||
```json
|
||||
{
|
||||
"peft_type": "LORA",
|
||||
"base_model_name_or_path": "your-base-model-name",
|
||||
"r": 16,
|
||||
"lora_alpha": 32,
|
||||
"target_modules": ["q_proj", "v_proj"],
|
||||
"bias": "none",
|
||||
"modules_to_save": null,
|
||||
"use_rslora": false,
|
||||
"use_dora": false
|
||||
}
|
||||
```
|
||||
|
||||
- **`adapter_model.bin`**: The LoRA adapter weights file
|
||||
|
||||
#### Usage Example
|
||||
|
||||
1. **Prepare your LoRA adapter**:
|
||||
```bash
|
||||
# Assuming you have a LoRA adapter in /tmp/my_lora_adapter
|
||||
cp -r /tmp/my_lora_adapter /path/to/lora/adapters/my_sql_adapter
|
||||
```
|
||||
|
||||
2. **Verify the directory structure**:
|
||||
```bash
|
||||
ls -la /path/to/lora/adapters/my_sql_adapter/
|
||||
# Should show: adapter_config.json, adapter_model.bin, etc.
|
||||
```
|
||||
|
||||
3. **Make a request using the adapter**:
|
||||
```bash
|
||||
curl http://localhost:8000/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "my_sql_adapter",
|
||||
"prompt": "Generate a SQL query for:",
|
||||
"max_tokens": 50,
|
||||
"temperature": 0.1
|
||||
}'
|
||||
```
|
||||
|
||||
#### How It Works
|
||||
|
||||
1. When vLLM receives a request for a LoRA adapter named `my_sql_adapter`
|
||||
2. The filesystem resolver checks if `/path/to/lora/adapters/my_sql_adapter/` exists
|
||||
3. If found, it validates the `adapter_config.json` file
|
||||
4. If the configuration matches the base model and is valid, the adapter is loaded
|
||||
5. The request is processed normally with the newly loaded adapter
|
||||
6. The adapter remains available for future requests
|
||||
|
||||
## Advanced Configuration
|
||||
|
||||
### Multiple Resolvers
|
||||
|
||||
You can configure multiple resolver plugins to load adapters from different sources:
|
||||
|
||||
'lora_s3_resolver' is an example of a custom resolver you would need to implement
|
||||
|
||||
```bash
|
||||
export VLLM_PLUGINS=lora_filesystem_resolver,lora_s3_resolver
|
||||
```
|
||||
|
||||
All listed resolvers are enabled; at request time, vLLM tries them in order until one succeeds.
|
||||
|
||||
### Custom Resolver Implementation
|
||||
|
||||
To implement your own resolver plugin:
|
||||
|
||||
1. **Create a new resolver class**:
|
||||
```python
|
||||
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
class CustomResolver(LoRAResolver):
|
||||
async def resolve_lora(self, base_model_name: str, lora_name: str) -> Optional[LoRARequest]:
|
||||
# Your custom resolution logic here
|
||||
pass
|
||||
```
|
||||
|
||||
2. **Register the resolver**:
|
||||
```python
|
||||
def register_custom_resolver():
|
||||
resolver = CustomResolver()
|
||||
LoRAResolverRegistry.register_resolver("Custom Resolver", resolver)
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **"VLLM_LORA_RESOLVER_CACHE_DIR must be set to a valid directory"**
|
||||
- Ensure the directory exists and is accessible
|
||||
- Check file permissions on the directory
|
||||
|
||||
2. **"LoRA adapter not found"**
|
||||
- Verify the adapter directory name matches the requested model name
|
||||
- Check that `adapter_config.json` exists and is valid JSON
|
||||
- Ensure `adapter_model.bin` exists in the directory
|
||||
|
||||
3. **"Invalid adapter configuration"**
|
||||
- Verify `peft_type` is set to "LORA"
|
||||
- Check that `base_model_name_or_path` matches your base model
|
||||
- Ensure `target_modules` is properly configured
|
||||
|
||||
4. **"LoRA rank exceeds maximum"**
|
||||
- Check that `r` value in `adapter_config.json` doesn't exceed `max_lora_rank` setting
|
||||
|
||||
### Debugging Tips
|
||||
|
||||
1. **Enable debug logging**:
|
||||
```bash
|
||||
export VLLM_LOGGING_LEVEL=DEBUG
|
||||
```
|
||||
|
||||
2. **Verify environment variables**:
|
||||
```bash
|
||||
echo $VLLM_ALLOW_RUNTIME_LORA_UPDATING
|
||||
echo $VLLM_PLUGINS
|
||||
echo $VLLM_LORA_RESOLVER_CACHE_DIR
|
||||
```
|
||||
|
||||
3. **Test adapter configuration**:
|
||||
```bash
|
||||
python -c "
|
||||
import json
|
||||
with open('/path/to/lora/adapters/my_adapter/adapter_config.json') as f:
|
||||
config = json.load(f)
|
||||
print('Config valid:', config)
|
||||
"
|
||||
```
|
||||
@ -68,7 +68,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
|
||||
|
||||
## Fused MoE Experts Kernels
|
||||
|
||||
The are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adatpers so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
|
||||
The are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
|
||||
|
||||
Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`.
|
||||
|
||||
|
||||
@ -5,7 +5,7 @@ You can use vLLM *custom arguments* to pass in arguments which are not part of t
|
||||
Custom arguments can be useful if, for example, you want to use a [custom logits processor](./custom_logitsprocs.md) without modifying the vLLM source code.
|
||||
|
||||
!!! note
|
||||
Make sure your custom logits processor have implemented `validate_params` for custom arguments. Otherwise invalid custom arguments can cause unexpected behaviour.
|
||||
Make sure your custom logits processor have implemented `validate_params` for custom arguments. Otherwise, invalid custom arguments can cause unexpected behaviour.
|
||||
|
||||
## Offline Custom Arguments
|
||||
|
||||
|
||||
@ -71,7 +71,7 @@ Logits processor `update_state()` implementations should assume the following mo
|
||||
|
||||
* **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous
|
||||
|
||||
* **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
|
||||
* **Shrink the batch:** a side effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
|
||||
|
||||
5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch
|
||||
|
||||
@ -286,7 +286,7 @@ Once you have created a custom subclass (like `WrappedPerReqLogitsProcessor`) wh
|
||||
|
||||
## Ways to Load Your Custom Logits Processor in vLLM
|
||||
|
||||
Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits logits processors cannot be loaded on-demand for individual requests.
|
||||
Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits processors cannot be loaded on-demand for individual requests.
|
||||
|
||||
This section details different ways of making your logits processor visible to vLLM and triggering vLLM to load your logits processor.
|
||||
|
||||
@ -438,7 +438,7 @@ The examples below show how a user would pass a custom argument (`target_token`)
|
||||
|
||||
## Best Practices for Writing Custom Logits Processors
|
||||
|
||||
Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus it is important to implement these methods efficiently.
|
||||
Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus, it is important to implement these methods efficiently.
|
||||
|
||||
* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity
|
||||
* For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()`
|
||||
@ -465,4 +465,4 @@ Once vLLM loads a logits processor during initialization, then vLLM will invoke
|
||||
|
||||
* **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class handles this by default
|
||||
|
||||
* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method
|
||||
* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However, the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method
|
||||
|
||||
118
docs/features/interleaved_thinking.md
Normal file
118
docs/features/interleaved_thinking.md
Normal file
@ -0,0 +1,118 @@
|
||||
# Interleaved Thinking
|
||||
|
||||
## Introduction
|
||||
|
||||
Interleaved thinking allows models to reason between tool calls, enabling more sophisticated decision-making after receiving tool results. This feature helps models chain multiple tool calls with reasoning steps in between and make nuanced decisions based on intermediate results.
|
||||
|
||||
Important: Interleaved thinking increases token usage and response latency. Consider your budget and performance requirements when enabling this feature.
|
||||
|
||||
## How Interleaved Thinking Works
|
||||
|
||||
With interleaved thinking, the model can:
|
||||
|
||||
- Reason about the results of a tool call before deciding what to do next
|
||||
- Chain multiple tool calls with reasoning steps in between
|
||||
- Make more nuanced decisions based on intermediate results
|
||||
- Provide transparent reasoning for its tool selection process
|
||||
|
||||
## Supported Models
|
||||
|
||||
vLLM currently supports the following interleaved thinking models:
|
||||
|
||||
| Model Series | Reasoning Parser Name |
|
||||
|--------------|-----------------------|
|
||||
| moonshotai/Kimi-K2-Thinking | kimi_k2 |
|
||||
| MiniMaxAI/MiniMax-M2 | minimax_m2 |
|
||||
|
||||
## Example Usage
|
||||
|
||||
To use interleaved thinking with tool calls, specify a model that supports this feature and enable tool calls in your chat completion request. Here's an example:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
"""
|
||||
vllm serve MiniMaxAI/MiniMax-M2 \
|
||||
--tensor-parallel-size 4 \
|
||||
--tool-call-parser minimax_m2 \
|
||||
--reasoning-parser minimax_m2 \
|
||||
--enable-auto-tool-choice
|
||||
"""
|
||||
import json
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
|
||||
|
||||
|
||||
def get_current_weather(location: str, unit: str):
|
||||
"""Get the current weather in a given location"""
|
||||
if unit == "celsius":
|
||||
return f"The current temperature in {location} is 22°C."
|
||||
else:
|
||||
return f"The current temperature in {location} is 72°F."
|
||||
|
||||
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"type": "string",
|
||||
"description": "City and state, e.g., 'San Francisco, CA'",
|
||||
},
|
||||
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
|
||||
},
|
||||
"required": ["location", "unit"],
|
||||
},
|
||||
},
|
||||
}
|
||||
]
|
||||
messages = [{"role": "user", "content": "What's the weather in Fahrenheit like in San Francisco?"}]
|
||||
response = client.chat.completions.create(
|
||||
model=client.models.list().data[0].id,
|
||||
messages=messages,
|
||||
tools=tools,
|
||||
tool_choice="auto",
|
||||
)
|
||||
|
||||
tool_call = response.choices[0].message.tool_calls[0].function
|
||||
|
||||
messages.append(
|
||||
{
|
||||
"role": "assistant",
|
||||
"tool_calls": response.choices[0].message.tool_calls,
|
||||
"reasoning": response.choices[0].message.reasoning, # append reasoning
|
||||
}
|
||||
)
|
||||
|
||||
# Simulate tool execution
|
||||
available_tools = {"get_weather": get_current_weather}
|
||||
|
||||
completion_tool_calls = response.choices[0].message.tool_calls
|
||||
for call in completion_tool_calls:
|
||||
tool_to_call = available_tools[call.function.name]
|
||||
args = json.loads(call.function.arguments)
|
||||
result = tool_to_call(**args)
|
||||
messages.append(
|
||||
{
|
||||
"role": "tool",
|
||||
"content": result,
|
||||
"tool_call_id": call.id,
|
||||
"name": call.function.name,
|
||||
}
|
||||
)
|
||||
response_2 = client.chat.completions.create(
|
||||
model=client.models.list().data[0].id,
|
||||
messages=messages,
|
||||
tools=tools,
|
||||
tool_choice="auto",
|
||||
)
|
||||
print(response_2.choices[0].message.content)
|
||||
```
|
||||
This example demonstrates how to set up interleaved thinking with tool calls using a weather retrieval function. The model reasons about the tool results before generating the final response.
|
||||
@ -298,7 +298,7 @@ There are two steps to generate and deploy a mixed precision model quantized wit
|
||||
|
||||
Firstly, the layerwise mixed-precision configuration for a given LLM model is searched and then quantized using AMD Quark. We will provide a detailed tutorial with Quark APIs later.
|
||||
|
||||
As examples, we provide some ready-to-use quantized mixed precision model to show the usage in vLLM and the accuracy benifits. They are:
|
||||
As examples, we provide some ready-to-use quantized mixed precision models to show the usage in vLLM and the accuracy benefits. They are:
|
||||
|
||||
- amd/Llama-2-70b-chat-hf-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8
|
||||
- amd/Mixtral-8x7B-Instruct-v0.1-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8
|
||||
|
||||
@ -28,10 +28,15 @@ After installation of XCode and the Command Line Tools, which include Apple Clan
|
||||
```bash
|
||||
git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
uv pip install -r requirements/cpu.txt
|
||||
uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
|
||||
uv pip install -e .
|
||||
```
|
||||
|
||||
!!! tip
|
||||
The `--index-strategy unsafe-best-match` flag is needed to resolve dependencies across multiple package indexes (PyTorch CPU index and PyPI). Without this flag, you may encounter `typing-extensions` version conflicts.
|
||||
|
||||
The term "unsafe" refers to the package resolution strategy, not security. By default, `uv` only searches the first index where a package is found to prevent dependency confusion attacks. This flag allows `uv` to search all configured indexes to find the best compatible versions. Since both PyTorch and PyPI are trusted package sources, using this strategy is safe and appropriate for vLLM installation.
|
||||
|
||||
!!! note
|
||||
On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which is currently the only supported device.
|
||||
|
||||
|
||||
@ -104,7 +104,7 @@ Currently, there are no pre-built CPU wheels.
|
||||
|
||||
### Which `dtype` should be used?
|
||||
|
||||
- Currently vLLM CPU uses model default settings as `dtype`. However, due to unstable float16 support in torch CPU, it is recommended to explicitly set `dtype=bfloat16` if there are any performance or accuracy problem.
|
||||
- Currently, vLLM CPU uses model default settings as `dtype`. However, due to unstable float16 support in torch CPU, it is recommended to explicitly set `dtype=bfloat16` if there are any performance or accuracy problems.
|
||||
|
||||
### How to launch a vLLM service on CPU?
|
||||
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
vLLM has experimental support for s390x architecture on IBM Z platform. For now, users must build from source to natively run on IBM Z platform.
|
||||
|
||||
Currently the CPU implementation for s390x architecture supports FP32 datatype only.
|
||||
Currently, the CPU implementation for s390x architecture supports FP32 datatype only.
|
||||
|
||||
!!! warning
|
||||
There are no pre-built wheels or images for this device, so you must build vLLM from source.
|
||||
|
||||
@ -83,7 +83,7 @@ uv pip install dist/*.whl
|
||||
!!! example "Troubleshooting"
|
||||
- **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`.
|
||||
- **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed.
|
||||
- `AMD` requies at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU.
|
||||
- `AMD` requires at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU.
|
||||
- If you receive an error such as: `Could not find a version that satisfies the requirement torch==X.Y.Z+cpu+cpu`, consider updating [pyproject.toml](https://github.com/vllm-project/vllm/blob/main/pyproject.toml) to help pip resolve the dependency.
|
||||
```toml title="pyproject.toml"
|
||||
[build-system]
|
||||
|
||||
@ -1,12 +1,15 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import importlib
|
||||
import importlib.metadata
|
||||
import importlib.util
|
||||
import logging
|
||||
import sys
|
||||
import traceback
|
||||
from argparse import SUPPRESS, HelpFormatter
|
||||
from argparse import SUPPRESS, Action, HelpFormatter
|
||||
from collections.abc import Iterable
|
||||
from importlib.machinery import ModuleSpec
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
from typing import TYPE_CHECKING, Literal
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from pydantic_core import core_schema
|
||||
@ -19,6 +22,11 @@ ARGPARSE_DOC_DIR = ROOT_DIR / "docs/argparse"
|
||||
sys.path.insert(0, str(ROOT_DIR))
|
||||
|
||||
|
||||
def mock_if_no_torch(mock_module: str, mock: MagicMock):
|
||||
if not importlib.util.find_spec("torch"):
|
||||
sys.modules[mock_module] = mock
|
||||
|
||||
|
||||
# Mock custom op code
|
||||
class MockCustomOp:
|
||||
@staticmethod
|
||||
@ -29,18 +37,21 @@ class MockCustomOp:
|
||||
return decorator
|
||||
|
||||
|
||||
noop = lambda *a, **k: None
|
||||
sys.modules["vllm._C"] = MagicMock()
|
||||
sys.modules["vllm.model_executor.custom_op"] = MagicMock(CustomOp=MockCustomOp)
|
||||
sys.modules["vllm.utils.torch_utils"] = MagicMock(direct_register_custom_op=noop)
|
||||
mock_if_no_torch("vllm._C", MagicMock())
|
||||
mock_if_no_torch("vllm.model_executor.custom_op", MagicMock(CustomOp=MockCustomOp))
|
||||
mock_if_no_torch(
|
||||
"vllm.utils.torch_utils", MagicMock(direct_register_custom_op=lambda *a, **k: None)
|
||||
)
|
||||
|
||||
|
||||
# Mock any version checks by reading from compiled CI requirements
|
||||
with open(ROOT_DIR / "requirements/test.txt") as f:
|
||||
VERSIONS = dict(line.strip().split("==") for line in f if "==" in line)
|
||||
importlib.metadata.version = lambda name: VERSIONS.get(name) or "0.0.0"
|
||||
|
||||
|
||||
# Make torch.nn.Parameter safe to inherit from
|
||||
sys.modules["torch.nn"] = MagicMock(Parameter=object)
|
||||
mock_if_no_torch("torch.nn", MagicMock(Parameter=object))
|
||||
|
||||
|
||||
class PydanticMagicMock(MagicMock):
|
||||
@ -49,31 +60,34 @@ class PydanticMagicMock(MagicMock):
|
||||
def __init__(self, *args, **kwargs):
|
||||
name = kwargs.pop("name", None)
|
||||
super().__init__(*args, **kwargs)
|
||||
self.__spec__ = importlib.machinery.ModuleSpec(name, None)
|
||||
self.__spec__ = ModuleSpec(name, None)
|
||||
|
||||
def __get_pydantic_core_schema__(self, source_type, handler):
|
||||
return core_schema.any_schema()
|
||||
|
||||
|
||||
def auto_mock(module, attr, max_mocks=100):
|
||||
def auto_mock(module_name: str, attr: str, max_mocks: int = 100):
|
||||
"""Function that automatically mocks missing modules during imports."""
|
||||
logger.info("Importing %s from %s", attr, module)
|
||||
logger.info("Importing %s from %s", attr, module_name)
|
||||
|
||||
for _ in range(max_mocks):
|
||||
try:
|
||||
module = importlib.import_module(module_name)
|
||||
|
||||
# First treat attr as an attr, then as a submodule
|
||||
return getattr(
|
||||
importlib.import_module(module),
|
||||
attr,
|
||||
importlib.import_module(f"{module}.{attr}"),
|
||||
)
|
||||
if hasattr(module, attr):
|
||||
return getattr(module, attr)
|
||||
|
||||
return importlib.import_module(f"{module_name}.{attr}")
|
||||
except ModuleNotFoundError as e:
|
||||
assert e.name is not None
|
||||
logger.info("Mocking %s for argparse doc generation", e.name)
|
||||
sys.modules[e.name] = PydanticMagicMock(name=e.name)
|
||||
except Exception as e:
|
||||
logger.warning("Failed to import %s.%s: %s", module, attr, e)
|
||||
except Exception:
|
||||
logger.exception("Failed to import %s.%s: %s", module_name, attr)
|
||||
|
||||
raise ImportError(
|
||||
f"Failed to import {module}.{attr} after mocking {max_mocks} imports"
|
||||
f"Failed to import {module_name}.{attr} after mocking {max_mocks} imports"
|
||||
)
|
||||
|
||||
|
||||
@ -91,21 +105,26 @@ ChatCommand = auto_mock("vllm.entrypoints.cli.openai", "ChatCommand")
|
||||
CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand")
|
||||
openai_cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
|
||||
openai_run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
|
||||
FlexibleArgumentParser = auto_mock(
|
||||
"vllm.utils.argparse_utils", "FlexibleArgumentParser"
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
else:
|
||||
FlexibleArgumentParser = auto_mock(
|
||||
"vllm.utils.argparse_utils", "FlexibleArgumentParser"
|
||||
)
|
||||
|
||||
|
||||
class MarkdownFormatter(HelpFormatter):
|
||||
"""Custom formatter that generates markdown for argument groups."""
|
||||
|
||||
def __init__(self, prog, starting_heading_level=3):
|
||||
super().__init__(prog, max_help_position=float("inf"), width=float("inf"))
|
||||
def __init__(self, prog: str, starting_heading_level: int = 3):
|
||||
super().__init__(prog, max_help_position=sys.maxsize, width=sys.maxsize)
|
||||
|
||||
self._section_heading_prefix = "#" * starting_heading_level
|
||||
self._argument_heading_prefix = "#" * (starting_heading_level + 1)
|
||||
self._markdown_output = []
|
||||
|
||||
def start_section(self, heading):
|
||||
def start_section(self, heading: str):
|
||||
if heading not in {"positional arguments", "options"}:
|
||||
heading_md = f"\n{self._section_heading_prefix} {heading}\n\n"
|
||||
self._markdown_output.append(heading_md)
|
||||
@ -113,14 +132,14 @@ class MarkdownFormatter(HelpFormatter):
|
||||
def end_section(self):
|
||||
pass
|
||||
|
||||
def add_text(self, text):
|
||||
def add_text(self, text: str):
|
||||
if text:
|
||||
self._markdown_output.append(f"{text.strip()}\n\n")
|
||||
|
||||
def add_usage(self, usage, actions, groups, prefix=None):
|
||||
pass
|
||||
|
||||
def add_arguments(self, actions):
|
||||
def add_arguments(self, actions: Iterable[Action]):
|
||||
for action in actions:
|
||||
if len(action.option_strings) == 0 or "--help" in action.option_strings:
|
||||
continue
|
||||
@ -169,7 +188,7 @@ def create_parser(add_cli_args, **kwargs) -> FlexibleArgumentParser:
|
||||
# Auto-mock runtime imports
|
||||
if tb_list := traceback.extract_tb(e.__traceback__):
|
||||
path = Path(tb_list[-1].filename).relative_to(ROOT_DIR)
|
||||
auto_mock(module=".".join(path.parent.parts), attr=path.stem)
|
||||
auto_mock(module_name=".".join(path.parent.parts), attr=path.stem)
|
||||
return create_parser(add_cli_args, **kwargs)
|
||||
else:
|
||||
raise e
|
||||
@ -209,7 +228,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
|
||||
# Generate documentation for each parser
|
||||
for stem, parser in parsers.items():
|
||||
doc_path = ARGPARSE_DOC_DIR / f"{stem}.md"
|
||||
doc_path = ARGPARSE_DOC_DIR / f"{stem}.inc.md"
|
||||
# Specify encoding for building on Windows
|
||||
with open(doc_path, "w", encoding="utf-8") as f:
|
||||
f.write(super(type(parser), parser).format_help())
|
||||
|
||||
@ -15,9 +15,9 @@ These models are what we list in [supported text models](#list-of-text-only-lang
|
||||
|
||||
### Transformers
|
||||
|
||||
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
|
||||
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers modeling backend".
|
||||
|
||||
Currently, the Transformers backend works for the following:
|
||||
Currently, the Transformers modeling backend works for the following:
|
||||
|
||||
- Modalities: embedding models, language models and vision-language models*
|
||||
- Architectures: encoder-only, decoder-only, mixture-of-experts
|
||||
@ -25,7 +25,7 @@ Currently, the Transformers backend works for the following:
|
||||
|
||||
_*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
|
||||
|
||||
If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM:
|
||||
If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers modeling backend, it will be compatible with the following features of vLLM:
|
||||
|
||||
- All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
|
||||
- Any combination of the following vLLM parallelisation schemes:
|
||||
@ -44,7 +44,7 @@ llm.apply_model(lambda model: print(type(model)))
|
||||
|
||||
If the printed type starts with `Transformers...` then it's using the Transformers model implementation!
|
||||
|
||||
If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md).
|
||||
If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers modeling backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md).
|
||||
|
||||
!!! note
|
||||
For vision-language models, if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance.
|
||||
@ -53,12 +53,12 @@ If a model has a vLLM implementation but you would prefer to use the Transformer
|
||||
|
||||
If a model is neither supported natively by vLLM nor Transformers, it can still be used in vLLM!
|
||||
|
||||
For a model to be compatible with the Transformers backend for vLLM it must:
|
||||
For a model to be compatible with the Transformers modeling backend for vLLM it must:
|
||||
|
||||
- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)):
|
||||
- The model directory must have the correct structure (e.g. `config.json` is present).
|
||||
- `config.json` must contain `auto_map.AutoModel`.
|
||||
- be a Transformers backend for vLLM compatible model (see [Writing custom models](#writing-custom-models)):
|
||||
- be a Transformers modeling backend for vLLM compatible model (see [Writing custom models](#writing-custom-models)):
|
||||
- Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`).
|
||||
|
||||
If the compatible model is:
|
||||
@ -66,13 +66,13 @@ If the compatible model is:
|
||||
- on the Hugging Face Model Hub, simply set `trust_remote_code=True` for [offline-inference](../serving/offline_inference.md) or `--trust-remote-code` for the [openai-compatible-server](../serving/openai_compatible_server.md).
|
||||
- in a local directory, simply pass directory path to `model=<MODEL_DIR>` for [offline-inference](../serving/offline_inference.md) or `vllm serve <MODEL_DIR>` for the [openai-compatible-server](../serving/openai_compatible_server.md).
|
||||
|
||||
This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
|
||||
This means that, with the Transformers modeling backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
|
||||
|
||||
#### Writing custom models
|
||||
|
||||
This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)).
|
||||
This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers modeling backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)).
|
||||
|
||||
To make your model compatible with the Transformers backend, it needs:
|
||||
To make your model compatible with the Transformers modeling backend, it needs:
|
||||
|
||||
1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`.
|
||||
- If your model is encoder-only:
|
||||
@ -134,7 +134,7 @@ Here is what happens in the background when this model is loaded:
|
||||
|
||||
1. The config is loaded.
|
||||
2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
|
||||
3. `MyModel` is loaded into one of the Transformers backend classes in [vllm/model_executor/models/transformers](../../vllm/model_executor/models/transformers) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
|
||||
3. `MyModel` is loaded into one of the Transformers modeling backend classes in [vllm/model_executor/models/transformers](../../vllm/model_executor/models/transformers) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
|
||||
|
||||
That's it!
|
||||
|
||||
@ -182,7 +182,7 @@ To determine whether a given model is natively supported, you can check the `con
|
||||
If the `"architectures"` field contains a model architecture listed below, then it should be natively supported.
|
||||
|
||||
Models do not _need_ to be natively supported to be used in vLLM.
|
||||
The [Transformers backend](#transformers) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
|
||||
The [Transformers modeling backend](#transformers) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
|
||||
|
||||
!!! tip
|
||||
The easiest way to check if your model is really supported at runtime is to run the program below:
|
||||
@ -351,6 +351,7 @@ th {
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|
|
||||
| `AfmoeForCausalLM` | Afmoe | TBA | ✅︎ | ✅︎ |
|
||||
| `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ |
|
||||
| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ |
|
||||
| `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ |
|
||||
@ -451,7 +452,7 @@ th {
|
||||
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | |
|
||||
| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ |
|
||||
|
||||
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
|
||||
Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|
|
||||
@ -669,7 +670,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR`, etc. | | ✅︎ |
|
||||
| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ |
|
||||
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ |
|
||||
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
|
||||
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>E+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
|
||||
| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
|
||||
| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ |
|
||||
| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ |
|
||||
@ -684,7 +685,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ |
|
||||
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ |
|
||||
| `LightOnOCRForConditionalGeneration` | LightOnOCR-1B | T + I<sup>+</sup> | `lightonai/LightOnOCR-1B`, etc | ✅︎ | ✅︎ |
|
||||
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ |
|
||||
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ |
|
||||
| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ |
|
||||
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ |
|
||||
@ -720,7 +721,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ |
|
||||
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ |
|
||||
|
||||
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
|
||||
Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|
|
||||
@ -785,6 +786,9 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
|
||||
| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
|
||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ |
|
||||
|
||||
!!! note
|
||||
`VoxtralForConditionalGeneration` requires `mistral-common[audio]` to be installed.
|
||||
|
||||
### Pooling Models
|
||||
|
||||
See [this page](./pooling_models.md) for more information on how to use pooling models.
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# Using vLLM
|
||||
|
||||
First, vLLM must be [installed](../getting_started/installation/) for your chosen device in either a Python or Docker environment.
|
||||
First, vLLM must be [installed](../getting_started/installation/README.md) for your chosen device in either a Python or Docker environment.
|
||||
|
||||
Then, vLLM supports the following usage patterns:
|
||||
|
||||
|
||||
@ -43,6 +43,7 @@ class ModelRequestData(NamedTuple):
|
||||
|
||||
|
||||
# Voxtral
|
||||
# Make sure to install mistral-common[audio].
|
||||
def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
|
||||
from mistral_common.audio import Audio
|
||||
from mistral_common.protocol.instruct.chunk import (
|
||||
|
||||
@ -1536,7 +1536,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
|
||||
mm_processor_kwargs={
|
||||
"min_pixels": 28 * 28,
|
||||
"max_pixels": 1280 * 28 * 28,
|
||||
"fps": [1],
|
||||
"fps": 1,
|
||||
},
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
@ -161,6 +161,7 @@ def main():
|
||||
{
|
||||
"role": "assistant",
|
||||
"tool_calls": chat_completion.choices[0].message.tool_calls,
|
||||
"reasoning": chat_completion.choices[0].message.reasoning,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
49
examples/online_serving/token_generation_client.py
Normal file
49
examples/online_serving/token_generation_client.py
Normal file
@ -0,0 +1,49 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import httpx
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
GEN_ENDPOINT = "http://localhost:8000/inference/v1/generate"
|
||||
DUMMY_API_KEY = "empty"
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
|
||||
transport = httpx.HTTPTransport()
|
||||
headers = {"Authorization": f"Bearer {DUMMY_API_KEY}"}
|
||||
client = httpx.Client(
|
||||
transport=transport,
|
||||
base_url=GEN_ENDPOINT,
|
||||
timeout=600,
|
||||
headers=headers,
|
||||
)
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "How many countries are in the EU?"},
|
||||
]
|
||||
|
||||
|
||||
def main(client):
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||
token_ids = tokenizer.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
enable_thinking=False,
|
||||
)
|
||||
payload = {
|
||||
"model": MODEL_NAME,
|
||||
"token_ids": token_ids,
|
||||
"sampling_params": {"max_tokens": 24, "temperature": 0.2, "detokenize": False},
|
||||
"stream": False,
|
||||
}
|
||||
resp = client.post(GEN_ENDPOINT, json=payload)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
print(data)
|
||||
print("-" * 50)
|
||||
print("Token generation results:")
|
||||
res = tokenizer.decode(data["choices"][0]["token_ids"])
|
||||
print(res)
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main(client)
|
||||
@ -31,7 +31,7 @@ partial-json-parser # used for parsing partial JSON outputs
|
||||
pyzmq >= 25.0.0
|
||||
msgspec
|
||||
gguf >= 0.13.0
|
||||
mistral_common[image,audio] >= 1.8.5
|
||||
mistral_common[image] >= 1.8.5
|
||||
opencv-python-headless >= 4.11.0 # required for video IO
|
||||
pyyaml
|
||||
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
|
||||
|
||||
@ -8,7 +8,7 @@ packaging>=24.2
|
||||
setuptools>=77.0.3,<81.0.0
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
|
||||
torch==2.8.0; platform_system == "Darwin"
|
||||
torch==2.9.0; platform_system == "Darwin"
|
||||
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
|
||||
|
||||
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
|
||||
|
||||
@ -9,6 +9,6 @@ torch==2.9.0
|
||||
torchaudio==2.9.0
|
||||
# These must be updated alongside torch
|
||||
torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
||||
xformers==0.0.33; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9
|
||||
xformers==0.0.33.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9
|
||||
# FlashInfer should be updated together with the Dockerfile
|
||||
flashinfer-python==0.5.2
|
||||
|
||||
@ -10,3 +10,7 @@ mkdocs-minify-plugin
|
||||
regex
|
||||
ruff
|
||||
pydantic
|
||||
|
||||
# For generating argparse docs.
|
||||
# Adding requirements here should only be used as a last resort.
|
||||
msgspec # Need for multiple inheritance involving msgspec.Struct
|
||||
@ -13,5 +13,5 @@ setuptools>=77.0.3,<80.0.0
|
||||
setuptools-scm>=8
|
||||
wheel
|
||||
jinja2>=3.1.6
|
||||
amdsmi==6.2.4
|
||||
amdsmi==6.4.3
|
||||
timm>=1.0.17
|
||||
|
||||
4
setup.py
4
setup.py
@ -545,7 +545,9 @@ def get_vllm_version() -> str:
|
||||
# Allow overriding the version. This is useful to build platform-specific
|
||||
# wheels (e.g. CPU, TPU) without modifying the source.
|
||||
if env_version := os.getenv("VLLM_VERSION_OVERRIDE"):
|
||||
return env_version
|
||||
print(f"Overriding VLLM version with {env_version} from VLLM_VERSION_OVERRIDE")
|
||||
os.environ["SETUPTOOLS_SCM_PRETEND_VERSION"] = env_version
|
||||
return get_version(write_to="vllm/_version.py")
|
||||
|
||||
version = get_version(write_to="vllm/_version.py")
|
||||
sep = "+" if "+" not in version else "." # dev versions might contain +
|
||||
|
||||
@ -22,6 +22,8 @@ from vllm.config import (
|
||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
from ...utils import create_new_process_for_each_test
|
||||
|
||||
# This import automatically registers `torch.ops.silly.attention`
|
||||
from .. import silly_attention # noqa: F401
|
||||
|
||||
@ -193,7 +195,14 @@ def run_model(
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_inductor_graph_partition", [False, True])
|
||||
def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
|
||||
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
|
||||
@create_new_process_for_each_test("spawn")
|
||||
def test_multi_graph_piecewise_compile(
|
||||
use_inductor_graph_partition: bool, use_bytecode_hook: bool, monkeypatch
|
||||
):
|
||||
# Set the environment variable for this test
|
||||
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
|
||||
|
||||
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
||||
|
||||
|
||||
@ -21,6 +21,8 @@ from vllm.config import (
|
||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
from ...utils import create_new_process_for_each_test
|
||||
|
||||
# This import automatically registers `torch.ops.silly.attention`
|
||||
from ..silly_attention import get_global_counter, reset_global_counter
|
||||
|
||||
@ -124,6 +126,7 @@ def _run_simple_model(
|
||||
|
||||
@pytest.mark.parametrize("use_inductor", [True, False])
|
||||
@torch.inference_mode()
|
||||
@create_new_process_for_each_test("spawn")
|
||||
def test_simple_piecewise_compile(use_inductor):
|
||||
_run_simple_model(
|
||||
splitting_ops=["silly::attention"],
|
||||
|
||||
@ -29,6 +29,8 @@ from vllm.config import (
|
||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
from ...utils import create_new_process_for_each_test
|
||||
|
||||
# This import automatically registers `torch.ops.silly.attention`
|
||||
from .. import silly_attention # noqa: F401
|
||||
|
||||
@ -334,6 +336,7 @@ def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor:
|
||||
("inductor", True), # Inductor, Inductor partition
|
||||
],
|
||||
)
|
||||
@create_new_process_for_each_test("spawn")
|
||||
def test_toy_llama(
|
||||
backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path
|
||||
):
|
||||
@ -513,4 +516,8 @@ def benchmark():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
benchmark()
|
||||
# Protect against subprocess reimport when using spawn_new_process_for_each_test
|
||||
import os
|
||||
|
||||
if os.environ.get("RUNNING_IN_SUBPROCESS") != "1":
|
||||
benchmark()
|
||||
|
||||
@ -15,6 +15,9 @@ from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import _is_torch_equal_or_newer
|
||||
|
||||
# This import automatically registers `torch.ops.silly.attention`
|
||||
from . import silly_attention # noqa: F401
|
||||
|
||||
|
||||
def test_version():
|
||||
# Test the version comparison logic using the private function
|
||||
@ -257,15 +260,6 @@ def test_should_split():
|
||||
splitting_ops = ["aten::add.Tensor"]
|
||||
assert not should_split(node, splitting_ops)
|
||||
|
||||
@torch.library.custom_op(
|
||||
"silly::attention",
|
||||
mutates_args=["out"],
|
||||
)
|
||||
def attention(
|
||||
q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor
|
||||
) -> None:
|
||||
out.copy_(q + k + v)
|
||||
|
||||
q, k, v, out = [torch.randn(1)] * 4
|
||||
|
||||
# supports custom ops as OpOverloadPacket
|
||||
|
||||
@ -20,13 +20,22 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
from ..utils import flat_product, multi_gpu_test
|
||||
|
||||
is_blackwell = lambda: current_platform.is_device_capability(100)
|
||||
"""Are we running on Blackwell, a lot of tests depend on it"""
|
||||
|
||||
|
||||
class Matches(NamedTuple):
|
||||
attention_fusion: int = 0
|
||||
allreduce_fusion: int = 0
|
||||
sequence_parallel: int = 0
|
||||
async_tp: int = 0
|
||||
|
||||
|
||||
class ModelBackendTestCase(NamedTuple):
|
||||
model_name: str
|
||||
model_kwargs: dict[str, Any]
|
||||
backend: AttentionBackendEnum
|
||||
attention_fusions: int
|
||||
allreduce_fusions: int | None = None
|
||||
matches: Matches
|
||||
|
||||
|
||||
MODELS_FP8: list[ModelBackendTestCase] = []
|
||||
@ -38,17 +47,33 @@ if current_platform.is_cuda():
|
||||
ModelBackendTestCase(
|
||||
# Use smaller model for L40s in CI
|
||||
model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
|
||||
model_kwargs=dict(max_model_len=1024),
|
||||
backend=AttentionBackendEnum.TRITON_ATTN,
|
||||
attention_fusions=32,
|
||||
allreduce_fusions=65,
|
||||
# TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell
|
||||
# so FI attention+fp8_quant is at least tested once
|
||||
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
|
||||
backend=AttentionBackendEnum.FLASHINFER
|
||||
if is_blackwell()
|
||||
else AttentionBackendEnum.TRITON_ATTN,
|
||||
matches=Matches(
|
||||
attention_fusion=32,
|
||||
allreduce_fusion=65,
|
||||
sequence_parallel=65,
|
||||
async_tp=128,
|
||||
),
|
||||
),
|
||||
ModelBackendTestCase(
|
||||
model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
|
||||
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
|
||||
backend=AttentionBackendEnum.FLASHINFER,
|
||||
attention_fusions=48,
|
||||
allreduce_fusions=96,
|
||||
# TODO FlashInfer attn broken on Hopper with kvcache=fp8:
|
||||
# https://github.com/vllm-project/vllm/issues/28568
|
||||
# TODO FlashInfer attn broken on Blackwell for llama4:
|
||||
# https://github.com/vllm-project/vllm/issues/28604
|
||||
backend=AttentionBackendEnum.TRITON_ATTN,
|
||||
matches=Matches(
|
||||
attention_fusion=48,
|
||||
allreduce_fusion=96,
|
||||
sequence_parallel=96,
|
||||
async_tp=95, # mlp is moe, no fusion there
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
@ -57,8 +82,12 @@ if current_platform.is_cuda():
|
||||
model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
|
||||
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
|
||||
backend=AttentionBackendEnum.FLASHINFER,
|
||||
attention_fusions=32,
|
||||
allreduce_fusions=65,
|
||||
matches=Matches(
|
||||
attention_fusion=32,
|
||||
allreduce_fusion=65,
|
||||
sequence_parallel=65,
|
||||
async_tp=128,
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
@ -68,15 +97,23 @@ if current_platform.is_cuda():
|
||||
model_name="meta-llama/Llama-3.1-8B-Instruct",
|
||||
model_kwargs=dict(max_model_len=1024),
|
||||
backend=AttentionBackendEnum.TRITON_ATTN,
|
||||
attention_fusions=0,
|
||||
allreduce_fusions=65,
|
||||
matches=Matches(
|
||||
attention_fusion=0,
|
||||
allreduce_fusion=65,
|
||||
sequence_parallel=65,
|
||||
async_tp=128,
|
||||
),
|
||||
),
|
||||
ModelBackendTestCase(
|
||||
model_name="Qwen/Qwen3-30B-A3B",
|
||||
model_kwargs=dict(max_model_len=1024),
|
||||
backend=AttentionBackendEnum.TRITON_ATTN,
|
||||
attention_fusions=0,
|
||||
allreduce_fusions=97,
|
||||
matches=Matches(
|
||||
attention_fusion=0,
|
||||
allreduce_fusion=97,
|
||||
sequence_parallel=97,
|
||||
async_tp=96, # MLP is MoE, half the fusions of dense
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
@ -86,19 +123,19 @@ elif current_platform.is_rocm():
|
||||
model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
|
||||
model_kwargs=dict(max_model_len=1024),
|
||||
backend=AttentionBackendEnum.TRITON_ATTN,
|
||||
attention_fusions=32,
|
||||
matches=Matches(attention_fusion=32),
|
||||
),
|
||||
ModelBackendTestCase(
|
||||
model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
|
||||
model_kwargs=dict(max_model_len=1024),
|
||||
backend=AttentionBackendEnum.ROCM_ATTN,
|
||||
attention_fusions=32,
|
||||
matches=Matches(attention_fusion=32),
|
||||
),
|
||||
ModelBackendTestCase(
|
||||
model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
|
||||
model_kwargs=dict(max_model_len=1024),
|
||||
backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
|
||||
attention_fusions=32,
|
||||
matches=Matches(attention_fusion=32),
|
||||
),
|
||||
]
|
||||
|
||||
@ -106,8 +143,7 @@ CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, model_kwargs, backend, "
|
||||
"attention_fusions, allreduce_fusions, custom_ops",
|
||||
"model_name, model_kwargs, backend, matches, custom_ops",
|
||||
# Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8
|
||||
list(flat_product(MODELS_FP8, CUSTOM_OPS_FP8))
|
||||
# quant_fp4 only has the custom impl
|
||||
@ -118,15 +154,14 @@ def test_attn_quant(
|
||||
model_name: str,
|
||||
model_kwargs: dict[str, Any],
|
||||
backend: AttentionBackendEnum,
|
||||
attention_fusions: int,
|
||||
allreduce_fusions: int,
|
||||
matches: Matches,
|
||||
custom_ops: str,
|
||||
inductor_graph_partition: bool,
|
||||
caplog_mp_spawn,
|
||||
monkeypatch,
|
||||
):
|
||||
if backend == AttentionBackendEnum.FLASHINFER and (
|
||||
not current_platform.is_device_capability((10, 0)) or not has_flashinfer()
|
||||
not is_blackwell() or not has_flashinfer()
|
||||
):
|
||||
pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
|
||||
if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
@ -169,12 +204,12 @@ def test_attn_quant(
|
||||
with caplog_mp_spawn(logging.DEBUG) as log_holder:
|
||||
run_model(compilation_config, model_name, **model_kwargs)
|
||||
|
||||
matches = re.findall(
|
||||
log_matches = re.findall(
|
||||
r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
|
||||
log_holder.text,
|
||||
)
|
||||
assert len(matches) == 1, log_holder.text
|
||||
assert int(matches[0]) == attention_fusions
|
||||
assert len(log_matches) == 1, log_holder.text
|
||||
assert int(log_matches[0]) == matches.attention_fusion
|
||||
|
||||
|
||||
CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"]
|
||||
@ -187,8 +222,7 @@ def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, model_kwargs, backend, "
|
||||
"attention_fusions, allreduce_fusions, custom_ops",
|
||||
"model_name, model_kwargs, backend, matches, custom_ops",
|
||||
# Toggle RMSNorm and QuantFP8 for FP8 models
|
||||
list(
|
||||
flat_product(
|
||||
@ -209,8 +243,7 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
|
||||
model_name: str,
|
||||
model_kwargs: dict,
|
||||
backend: AttentionBackendEnum,
|
||||
attention_fusions: int,
|
||||
allreduce_fusions: int,
|
||||
matches: Matches,
|
||||
custom_ops: str,
|
||||
inductor_graph_partition: bool,
|
||||
caplog_mp_spawn,
|
||||
@ -219,6 +252,13 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
|
||||
if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("Inductor graph partition requires torch>=2.9")
|
||||
|
||||
if "fp4" in model_name.lower() and not is_blackwell():
|
||||
pytest.skip("NVFP4 quant requires Blackwell")
|
||||
|
||||
if backend == AttentionBackendEnum.FLASHINFER and not is_blackwell():
|
||||
# FlashInfer attn fusion requires Blackwell
|
||||
matches = matches._replace(attention_fusion=0)
|
||||
|
||||
custom_ops_list = custom_ops.split(",") if custom_ops else []
|
||||
|
||||
if inductor_graph_partition:
|
||||
@ -258,23 +298,135 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
|
||||
run_model(
|
||||
compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
|
||||
)
|
||||
matches = re.findall(
|
||||
log_matches = re.findall(
|
||||
r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
|
||||
log_holder.text,
|
||||
)
|
||||
assert len(matches) == 2, log_holder.text
|
||||
assert len(log_matches) == 2, log_holder.text
|
||||
|
||||
assert int(matches[0]) == attention_fusions
|
||||
assert int(matches[1]) == attention_fusions
|
||||
assert int(log_matches[0]) == matches.attention_fusion
|
||||
assert int(log_matches[1]) == matches.attention_fusion
|
||||
|
||||
matches = re.findall(
|
||||
log_matches = re.findall(
|
||||
r"collective_fusion.py:\d+] Replaced (\d+) patterns",
|
||||
log_holder.text,
|
||||
)
|
||||
assert len(matches) == 2, log_holder.text
|
||||
assert len(log_matches) == 2, log_holder.text
|
||||
|
||||
assert int(matches[0]) == allreduce_fusions
|
||||
assert int(matches[1]) == allreduce_fusions
|
||||
assert int(log_matches[0]) == matches.allreduce_fusion
|
||||
assert int(log_matches[1]) == matches.allreduce_fusion
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, model_kwargs, backend, matches, custom_ops",
|
||||
# Toggle RMSNorm and QuantFP8 for FP8 models
|
||||
list(
|
||||
flat_product(
|
||||
MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)
|
||||
)
|
||||
)
|
||||
# Toggle RMSNorm for FP4 models and unquant models
|
||||
+ list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),
|
||||
)
|
||||
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda(),
|
||||
reason="sequence parallel only tested on CUDA",
|
||||
)
|
||||
def test_tp2_attn_quant_async_tp(
|
||||
model_name: str,
|
||||
model_kwargs: dict,
|
||||
backend: AttentionBackendEnum,
|
||||
matches: Matches,
|
||||
custom_ops: str,
|
||||
inductor_graph_partition: bool,
|
||||
caplog_mp_spawn,
|
||||
monkeypatch,
|
||||
):
|
||||
if is_blackwell():
|
||||
# TODO: https://github.com/vllm-project/vllm/issues/27893
|
||||
pytest.skip("Blackwell is not supported for AsyncTP pass")
|
||||
|
||||
if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("Inductor graph partition requires torch>=2.9")
|
||||
|
||||
if "fp4" in model_name.lower() and not is_blackwell():
|
||||
pytest.skip("NVFP4 quant requires Blackwell")
|
||||
|
||||
if backend == AttentionBackendEnum.FLASHINFER:
|
||||
if not has_flashinfer():
|
||||
pytest.skip("FlashInfer backend requires flashinfer installed")
|
||||
if not is_blackwell():
|
||||
# FlashInfer attn fusion requires Blackwell
|
||||
matches = matches._replace(attention_fusion=0)
|
||||
|
||||
custom_ops_list = custom_ops.split(",") if custom_ops else []
|
||||
|
||||
if inductor_graph_partition:
|
||||
mode = CUDAGraphMode.FULL_AND_PIECEWISE
|
||||
splitting_ops: list[str] | None = None
|
||||
else:
|
||||
mode = CUDAGraphMode.FULL_DECODE_ONLY
|
||||
splitting_ops = []
|
||||
|
||||
# Disable, compile cache to make sure custom passes run.
|
||||
# Otherwise, we can't verify fusion happened through the logs.
|
||||
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
|
||||
|
||||
# To capture subprocess logs, we need to know whether spawn or fork is used.
|
||||
# Force spawn as it is more general.
|
||||
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
|
||||
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
|
||||
|
||||
compilation_config = CompilationConfig(
|
||||
# Testing properties
|
||||
use_inductor_graph_partition=inductor_graph_partition,
|
||||
cudagraph_mode=mode,
|
||||
custom_ops=custom_ops_list,
|
||||
splitting_ops=splitting_ops,
|
||||
# Common
|
||||
level=CompilationMode.VLLM_COMPILE,
|
||||
pass_config=PassConfig(
|
||||
enable_attn_fusion=True,
|
||||
enable_noop=True,
|
||||
enable_sequence_parallelism=True,
|
||||
enable_async_tp=True,
|
||||
),
|
||||
# Inductor caches custom passes by default as well via uuid
|
||||
inductor_compile_config={"force_disable_caches": True},
|
||||
)
|
||||
|
||||
with caplog_mp_spawn(logging.DEBUG) as log_holder:
|
||||
run_model(
|
||||
compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
|
||||
)
|
||||
log_matches = re.findall(
|
||||
r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
|
||||
log_holder.text,
|
||||
)
|
||||
assert len(log_matches) == 2, log_holder.text
|
||||
|
||||
assert int(log_matches[0]) == matches.attention_fusion
|
||||
assert int(log_matches[1]) == matches.attention_fusion
|
||||
|
||||
log_matches = re.findall(
|
||||
r"sequence_parallelism.py:\d+] Replaced (\d+) patterns",
|
||||
log_holder.text,
|
||||
)
|
||||
assert len(log_matches) == 2, log_holder.text
|
||||
|
||||
assert int(log_matches[0]) == matches.sequence_parallel
|
||||
assert int(log_matches[1]) == matches.sequence_parallel
|
||||
|
||||
log_matches = re.findall(
|
||||
r"collective_fusion.py:\d+] Replaced (\d+) patterns",
|
||||
log_holder.text,
|
||||
)
|
||||
assert len(log_matches) == 2, log_holder.text
|
||||
|
||||
assert int(log_matches[0]) == matches.async_tp
|
||||
assert int(log_matches[1]) == matches.async_tp
|
||||
|
||||
|
||||
def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
|
||||
|
||||
124
tests/compile/test_graph_partition.py
Normal file
124
tests/compile/test_graph_partition.py
Normal file
@ -0,0 +1,124 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import operator
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from torch.fx.experimental.proxy_tensor import make_fx
|
||||
|
||||
from vllm.compilation.backends import split_graph
|
||||
|
||||
|
||||
def test_getitem_moved_to_producer_subgraph():
|
||||
"""
|
||||
Test that getitem operations are moved to the same subgraph as their input,
|
||||
preventing tuple inputs to submodules.
|
||||
"""
|
||||
|
||||
def model_fn(x: torch.Tensor) -> torch.Tensor:
|
||||
# torch.split returns a tuple, creating real getitem operations
|
||||
# Should become first submodule that produces tuple
|
||||
chunks = torch.split(x, x.shape[0] // 2, dim=0)
|
||||
|
||||
# Following ops should become second submodule that consumes tuple
|
||||
result_0 = torch.relu(chunks[0])
|
||||
result_1 = torch.relu(chunks[1])
|
||||
return torch.cat([result_0, result_1], dim=0)
|
||||
|
||||
x = torch.randn(4, 3)
|
||||
gm = make_fx(model_fn)(x)
|
||||
|
||||
has_getitem = any(
|
||||
node.op == "call_function" and node.target == operator.getitem
|
||||
for node in gm.graph.nodes
|
||||
)
|
||||
assert has_getitem, "Test setup failed: graph should contain getitem operations"
|
||||
|
||||
# Split on tuple producer aten::split
|
||||
split_ops = ["aten::split.Tensor"]
|
||||
split_gm, split_items = split_graph(gm, split_ops)
|
||||
assert len(split_items) == 2, "Graph should be split into 2 submodules"
|
||||
|
||||
for split_item in split_items:
|
||||
submodule = split_item.graph
|
||||
|
||||
getitem_on_placeholder = []
|
||||
for node in submodule.graph.nodes:
|
||||
if (
|
||||
node.op == "call_function"
|
||||
and node.target == operator.getitem
|
||||
and node.args[0].op == "placeholder"
|
||||
):
|
||||
getitem_on_placeholder.append(node)
|
||||
|
||||
assert len(getitem_on_placeholder) == 0, (
|
||||
f"Submodule {split_item.submod_name} has getitem operations on "
|
||||
f"placeholder nodes: {[n.name for n in getitem_on_placeholder]}. "
|
||||
"This means tuple inputs were not properly eliminated."
|
||||
)
|
||||
|
||||
new_x = torch.randn(4, 3)
|
||||
output_original = gm(new_x)
|
||||
output_split = split_gm(new_x)
|
||||
|
||||
assert torch.allclose(output_original, output_split), "Output mismatch"
|
||||
|
||||
|
||||
def test_no_tuple_inputs_with_multiple_consumers():
|
||||
"""
|
||||
Test that when a tuple is consumed by multiple split operations,
|
||||
getitem operations are properly moved to avoid tuple inputs.
|
||||
"""
|
||||
|
||||
def model_fn(x: torch.Tensor) -> torch.Tensor:
|
||||
# torch.split returns a tuple, creating real getitem operations
|
||||
# Should become first submodule that produces tuple
|
||||
chunks = torch.split(x, x.shape[0] // 2, dim=0)
|
||||
|
||||
# These should become second submodule consuming tuple
|
||||
result_1 = torch.relu(chunks[0])
|
||||
result_2 = torch.relu(chunks[1])
|
||||
|
||||
# Artificial graph splitting point to create another
|
||||
# independent submodule that consumes tuple later
|
||||
# This would become the third submodule
|
||||
result_1 = torch.sigmoid(result_1)
|
||||
|
||||
# Fourth submodule that consumes tuple
|
||||
result = torch.cat([chunks[0], chunks[1], result_1, result_2])
|
||||
return result
|
||||
|
||||
x = torch.randn(4, 3)
|
||||
gm = make_fx(model_fn)(x)
|
||||
|
||||
has_getitem = any(
|
||||
node.op == "call_function" and node.target == operator.getitem
|
||||
for node in gm.graph.nodes
|
||||
)
|
||||
assert has_getitem, "Test setup failed: graph should contain getitem operations"
|
||||
|
||||
split_ops = ["aten::split.Tensor", "aten::sigmoid"]
|
||||
split_gm, split_items = split_graph(gm, split_ops)
|
||||
assert len(split_items) == 4, "Graph should be split into 4 submodules"
|
||||
|
||||
for split_item in split_items:
|
||||
submodule = split_item.graph
|
||||
|
||||
for node in submodule.graph.nodes:
|
||||
if (
|
||||
node.op == "call_function"
|
||||
and node.target == operator.getitem
|
||||
and node.args[0].op == "placeholder"
|
||||
):
|
||||
pytest.fail(
|
||||
f"Submodule {split_item.submod_name} has getitem on "
|
||||
f"placeholder {node.args[0].name}, indicating it receives "
|
||||
"a tuple input"
|
||||
)
|
||||
|
||||
new_x = torch.randn(4, 3)
|
||||
output_original = gm(new_x)
|
||||
output_split = split_gm(new_x)
|
||||
|
||||
assert torch.allclose(output_original, output_split), "Output mismatch after split"
|
||||
@ -10,8 +10,8 @@ from vllm.platforms import current_platform
|
||||
|
||||
def test_compile():
|
||||
vllm_config = VllmConfig()
|
||||
# Default configuration compiles mm encoder
|
||||
assert vllm_config.compilation_config.compile_mm_encoder
|
||||
# Default configuration does not compile mm encoder
|
||||
assert not vllm_config.compilation_config.compile_mm_encoder
|
||||
|
||||
|
||||
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
|
||||
@ -39,7 +39,10 @@ def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch):
|
||||
"Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
max_model_len=2048,
|
||||
gpu_memory_utilization=0.8,
|
||||
compilation_config={"mode": CompilationMode.VLLM_COMPILE},
|
||||
compilation_config={
|
||||
"mode": CompilationMode.VLLM_COMPILE,
|
||||
"compile_mm_encoder": True,
|
||||
},
|
||||
) as _,
|
||||
):
|
||||
pass
|
||||
|
||||
@ -5,15 +5,15 @@ import pytest
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
|
||||
from vllm.compilation.fusion import RMSNormQuantFusionPass
|
||||
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
|
||||
from vllm.compilation.fx_utils import find_auto_fn
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||
from vllm.compilation.sequence_parallelism import SequenceParallelismPass
|
||||
from vllm.compilation.vllm_inductor_pass import VllmInductorPass
|
||||
from vllm.config import (
|
||||
CompilationConfig,
|
||||
CUDAGraphMode,
|
||||
DeviceConfig,
|
||||
ModelConfig,
|
||||
PassConfig,
|
||||
@ -30,6 +30,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
kFp8StaticTensorSym,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.system_utils import update_environment_variables
|
||||
|
||||
@ -45,169 +46,157 @@ prompts = [
|
||||
]
|
||||
|
||||
|
||||
class TestModel(torch.nn.Module):
|
||||
def __init__(self, hidden_size=16, intermediate_size=32):
|
||||
class TestAllReduceRMSNormModel(torch.nn.Module):
|
||||
def __init__(self, hidden_size=16, eps=1e-6):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.gate_proj = torch.nn.Parameter(
|
||||
torch.empty((intermediate_size, hidden_size))
|
||||
)
|
||||
self.norm = RMSNorm(intermediate_size, 1e-05)
|
||||
# Initialize weights
|
||||
torch.nn.init.normal_(self.gate_proj, std=0.02)
|
||||
self.eps = eps
|
||||
self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
|
||||
self.w = [torch.rand(hidden_size, hidden_size) for _ in range(3)]
|
||||
|
||||
def forward(self, hidden_states, residual):
|
||||
"""
|
||||
Forward pass implementing the operations in the FX graph
|
||||
def forward(self, x):
|
||||
z = torch.relu(x)
|
||||
x = resid = tensor_model_parallel_all_reduce(z)
|
||||
y = self.norm[0](x)
|
||||
|
||||
Args:
|
||||
hidden_states: Input tensor
|
||||
residual: Residual tensor from previous layer
|
||||
z2 = torch.mm(y, self.w[0])
|
||||
x2 = tensor_model_parallel_all_reduce(z2)
|
||||
|
||||
Returns:
|
||||
Tuple containing the output tensor
|
||||
"""
|
||||
# Reshape input
|
||||
view = hidden_states.reshape(-1, self.hidden_size)
|
||||
y2, resid = self.norm[1](x2, resid)
|
||||
|
||||
# matrix multiplication
|
||||
permute = self.gate_proj.permute(1, 0)
|
||||
mm = torch.mm(view, permute)
|
||||
z3 = torch.mm(y2, self.w[1])
|
||||
x3 = tensor_model_parallel_all_reduce(z3)
|
||||
|
||||
# Tensor parallel all-reduce
|
||||
all_reduce = tensor_model_parallel_all_reduce(mm)
|
||||
y3, resid = self.norm[2](x3, resid)
|
||||
|
||||
# layer normalization
|
||||
norm_output, residual_output = self.norm(all_reduce, residual)
|
||||
z4 = torch.mm(y3, self.w[2])
|
||||
x4 = tensor_model_parallel_all_reduce(z4)
|
||||
|
||||
return norm_output, residual_output
|
||||
y4, resid = self.norm[3](x4, resid)
|
||||
return y4
|
||||
|
||||
def ops_in_model_before(self):
|
||||
return [torch.ops.vllm.all_reduce.default]
|
||||
|
||||
def ops_in_model_after(self):
|
||||
return [
|
||||
torch.ops.vllm.reduce_scatter.default,
|
||||
torch.ops.vllm.all_gather.default,
|
||||
torch.ops.vllm.reduce_scatter.default,
|
||||
]
|
||||
|
||||
def ops_in_model(self):
|
||||
return [torch.ops._C.fused_add_rms_norm.default]
|
||||
if RMSNorm.enabled():
|
||||
return [
|
||||
torch.ops._C.rms_norm.default,
|
||||
torch.ops._C.fused_add_rms_norm.default,
|
||||
]
|
||||
else:
|
||||
return []
|
||||
|
||||
|
||||
class TestQuantModel(torch.nn.Module):
|
||||
class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
|
||||
quant_key = kFp8StaticTensorSym
|
||||
|
||||
def __init__(self, hidden_size=16, intermediate_size=32):
|
||||
def __init__(self, hidden_size=16, eps=1e-6):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.vllm_config = get_current_vllm_config()
|
||||
self.gate_proj = torch.nn.Parameter(
|
||||
torch.empty((intermediate_size, hidden_size)), requires_grad=False
|
||||
)
|
||||
self.norm = RMSNorm(intermediate_size, 1e-05)
|
||||
# Initialize weights
|
||||
torch.nn.init.normal_(self.gate_proj, std=0.02)
|
||||
self.hidden_size = hidden_size
|
||||
self.eps = eps
|
||||
self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
|
||||
self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
|
||||
self.w = [
|
||||
torch.rand(hidden_size, hidden_size)
|
||||
.to(dtype=current_platform.fp8_dtype())
|
||||
.t()
|
||||
for _ in range(3)
|
||||
]
|
||||
self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
|
||||
|
||||
self.scale = torch.rand(1, dtype=torch.float32)
|
||||
# Create a weight that is compatible with torch._scaled_mm,
|
||||
# which expects a column-major layout.
|
||||
self.w = torch.rand(hidden_size, intermediate_size).to(dtype=FP8_DTYPE).t()
|
||||
self.wscale = torch.rand(1, dtype=torch.float32)
|
||||
self.fp8_linear = TestFP8Layer(
|
||||
self.quant_key, self.quant_key, self.w, self.wscale, self.scale
|
||||
)
|
||||
|
||||
def forward(self, hidden_states, residual):
|
||||
"""
|
||||
Forward pass implementing the operations in the FX graph
|
||||
|
||||
Args:
|
||||
hidden_states: Input tensor
|
||||
residual: Residual tensor from previous layer
|
||||
|
||||
Returns:
|
||||
Tuple containing the output tensor
|
||||
"""
|
||||
# Reshape input
|
||||
view = hidden_states.reshape(-1, self.hidden_size)
|
||||
|
||||
# matrix multiplication
|
||||
permute = self.gate_proj.permute(1, 0)
|
||||
mm = torch.mm(view, permute)
|
||||
|
||||
# Tensor parallel all-reduce
|
||||
all_reduce = tensor_model_parallel_all_reduce(mm)
|
||||
|
||||
# layer normalization
|
||||
norm_output, residual_output = self.norm(all_reduce, residual)
|
||||
# scaled_mm with static input quantization
|
||||
fp8_linear_result = self.fp8_linear(norm_output)
|
||||
|
||||
return fp8_linear_result, residual_output
|
||||
|
||||
def ops_in_model_before(self):
|
||||
ops_to_remove = [torch.ops.vllm.all_reduce.default] # Always removed by SP
|
||||
# The following are only removed if fusion happens
|
||||
if (
|
||||
self.vllm_config
|
||||
and self.vllm_config.compilation_config.pass_config.enable_fusion
|
||||
):
|
||||
ops_to_remove.extend(
|
||||
[
|
||||
torch.ops._C.fused_add_rms_norm.default,
|
||||
torch.ops._C.static_scaled_fp8_quant.default,
|
||||
]
|
||||
self.fp8_linear_layers = [
|
||||
TestFP8Layer(
|
||||
self.quant_key, self.quant_key, self.w[i], self.wscale[i], self.scale[i]
|
||||
)
|
||||
return ops_to_remove
|
||||
for i in range(3)
|
||||
]
|
||||
|
||||
|
||||
def forward(self, hidden_states):
|
||||
# avoid having graph input be an arg to a pattern directly
|
||||
z = torch.relu(hidden_states)
|
||||
x = resid = tensor_model_parallel_all_reduce(z)
|
||||
y = self.norm[0](x)
|
||||
|
||||
z2 = self.fp8_linear_layers[0](y)
|
||||
|
||||
x2 = tensor_model_parallel_all_reduce(z2)
|
||||
y2, resid = self.norm[1](x2, resid)
|
||||
|
||||
z3 = self.fp8_linear_layers[1](y2)
|
||||
|
||||
x3 = tensor_model_parallel_all_reduce(z3)
|
||||
y3, resid = self.norm[2](x3, resid) # use resid here
|
||||
|
||||
z4 = self.fp8_linear_layers[2](y3)
|
||||
x4 = tensor_model_parallel_all_reduce(z4)
|
||||
y4, resid = self.norm[3](x4, resid) # use resid here
|
||||
return y4
|
||||
|
||||
def ops_in_model_after(self):
|
||||
ops_to_add = [
|
||||
torch.ops.vllm.reduce_scatter.default,
|
||||
return [
|
||||
torch.ops.vllm.all_gather.default,
|
||||
torch.ops.vllm.reduce_scatter.default,
|
||||
]
|
||||
|
||||
def ops_in_model_before(self):
|
||||
return [
|
||||
torch.ops.vllm.all_reduce.default,
|
||||
]
|
||||
# The following is only added if fusion happens
|
||||
if (
|
||||
self.vllm_config
|
||||
and self.vllm_config.compilation_config.pass_config.enable_fusion
|
||||
):
|
||||
ops_to_add.append(torch.ops._C.fused_add_rms_norm_static_fp8_quant.default)
|
||||
return ops_to_add
|
||||
|
||||
def ops_in_model(self):
|
||||
if (
|
||||
self.vllm_config
|
||||
and self.vllm_config.compilation_config.pass_config.enable_fusion
|
||||
):
|
||||
# If fusion happens, the fused op is the one
|
||||
# we check for (de)functionalization
|
||||
if self.vllm_config.compilation_config.pass_config.enable_fusion:
|
||||
return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default]
|
||||
else:
|
||||
# If no fusion, the original ops are checked
|
||||
elif RMSNorm.enabled():
|
||||
return [
|
||||
torch.ops._C.fused_add_rms_norm.default,
|
||||
# TODO functionalization pass does not handle this yet
|
||||
# torch.ops._C.static_scaled_fp8_quant.default,
|
||||
]
|
||||
elif any(
|
||||
layer.is_quant_fp8_enabled() for layer in self.fp8_linear_layers
|
||||
):
|
||||
return [
|
||||
torch.ops._C.static_scaled_fp8_quant.default,
|
||||
]
|
||||
else:
|
||||
return []
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("test_model_cls", [TestModel, TestQuantModel])
|
||||
@pytest.mark.parametrize(
|
||||
"test_model_cls, custom_ops",
|
||||
[
|
||||
(TestAllReduceRMSNormModel, "+rms_norm"),
|
||||
(TestAllReduceRMSNormModel, "-rms_norm"),
|
||||
(TestAllReduceRMSNormStaticQuantFP8Model, "+rms_norm,+quant_fp8"),
|
||||
(TestAllReduceRMSNormStaticQuantFP8Model, "+rms_norm,-quant_fp8"),
|
||||
(TestAllReduceRMSNormStaticQuantFP8Model, "-rms_norm,+quant_fp8"),
|
||||
(TestAllReduceRMSNormStaticQuantFP8Model, "-rms_norm,-quant_fp8"),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("batch_size", [8])
|
||||
@pytest.mark.parametrize("seq_len", [16])
|
||||
@pytest.mark.parametrize("hidden_size", [16])
|
||||
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("enable_fusion", [True, False])
|
||||
@pytest.mark.parametrize("dynamic", [False, True])
|
||||
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
|
||||
def test_sequence_parallelism_pass(
|
||||
test_model_cls: type[torch.nn.Module],
|
||||
custom_ops: str,
|
||||
batch_size: int,
|
||||
seq_len: int,
|
||||
hidden_size: int,
|
||||
dtype: torch.dtype,
|
||||
enable_fusion: bool,
|
||||
dynamic: bool,
|
||||
):
|
||||
num_processes = 2
|
||||
|
||||
@ -219,11 +208,13 @@ def test_sequence_parallelism_pass(
|
||||
args=(
|
||||
num_processes,
|
||||
test_model_cls,
|
||||
custom_ops,
|
||||
batch_size,
|
||||
seq_len,
|
||||
hidden_size,
|
||||
dtype,
|
||||
enable_fusion,
|
||||
dynamic,
|
||||
),
|
||||
nprocs=nprocs,
|
||||
)
|
||||
@ -235,11 +226,13 @@ def sequence_parallelism_pass_on_test_model(
|
||||
local_rank: int,
|
||||
world_size: int,
|
||||
test_model_cls: type[torch.nn.Module],
|
||||
custom_ops: str,
|
||||
batch_size: int,
|
||||
seq_len: int,
|
||||
hidden_size: int,
|
||||
dtype: torch.dtype,
|
||||
enable_fusion: bool,
|
||||
dynamic: bool,
|
||||
):
|
||||
current_platform.seed_everything(0)
|
||||
|
||||
@ -263,12 +256,16 @@ def sequence_parallelism_pass_on_test_model(
|
||||
initialize_model_parallel(tensor_model_parallel_size=world_size)
|
||||
|
||||
# configure vllm config for SequenceParallelismPass
|
||||
custom_ops_list = custom_ops.split(",") if custom_ops else []
|
||||
compilation_config = CompilationConfig(
|
||||
splitting_ops=[], # avoid automatic rms_norm enablement
|
||||
cudagraph_mode=CUDAGraphMode.NONE, # avoid piecewise warnings
|
||||
custom_ops=custom_ops_list,
|
||||
pass_config=PassConfig(
|
||||
enable_sequence_parallelism=True,
|
||||
enable_fusion=enable_fusion,
|
||||
enable_noop=True,
|
||||
)
|
||||
),
|
||||
) # NoOp needed for fusion
|
||||
device_config = DeviceConfig(device=torch.device("cuda"))
|
||||
|
||||
@ -288,7 +285,6 @@ def sequence_parallelism_pass_on_test_model(
|
||||
with set_current_vllm_config(vllm_config):
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
|
||||
func_pass = FixFunctionalizationPass(vllm_config)
|
||||
cleanup_pass = PostCleanupPass(vllm_config)
|
||||
assert (
|
||||
sequence_parallelism_pass.compilation_config.splitting_ops
|
||||
@ -309,38 +305,29 @@ def sequence_parallelism_pass_on_test_model(
|
||||
|
||||
passes_for_backend.append(cleanup_pass)
|
||||
|
||||
backend_no_func = TestBackend(*passes_for_backend)
|
||||
backend_func = TestBackend(*passes_for_backend, func_pass)
|
||||
backend = TestBackend(*passes_for_backend)
|
||||
|
||||
model = test_model_cls(hidden_size, hidden_size * 2)
|
||||
model = test_model_cls(hidden_size)
|
||||
|
||||
hidden_states = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
|
||||
residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
|
||||
|
||||
compiled_model_no_func = torch.compile(model, backend=backend_no_func)
|
||||
compiled_model_no_func(hidden_states, residual)
|
||||
compiled_model_func = torch.compile(model, backend=backend_func)
|
||||
compiled_model_func(hidden_states, residual)
|
||||
if dynamic:
|
||||
torch._dynamo.mark_dynamic(hidden_states, 0)
|
||||
|
||||
assert sequence_parallelism_pass.matched_count == 1
|
||||
compiled_model = torch.compile(model, backend=backend)
|
||||
compiled_model(hidden_states)
|
||||
|
||||
assert sequence_parallelism_pass.matched_count == 4
|
||||
|
||||
# In pre-nodes, all reduce should be there,
|
||||
# reduce scatter and all gather should not
|
||||
backend_no_func.check_before_ops(model.ops_in_model_before())
|
||||
for op in model.ops_in_model_before():
|
||||
assert backend.op_count(op, before=True) == 4
|
||||
|
||||
# In post-nodes, reduce scatter and all gather should be there,
|
||||
# all reduce should not
|
||||
backend_no_func.check_after_ops(model.ops_in_model_after())
|
||||
for op in model.ops_in_model_after():
|
||||
assert backend.op_count(op, before=False) == 4
|
||||
|
||||
# check if the functionalization pass is applied
|
||||
for op in model.ops_in_model():
|
||||
find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
|
||||
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None
|
||||
|
||||
# make sure the ops were all de-functionalized
|
||||
found = dict()
|
||||
for node in backend_func.graph_post_pass.nodes:
|
||||
for op in model.ops_in_model():
|
||||
if is_func(node, op):
|
||||
found[op] = True
|
||||
assert all(found[op] for op in model.ops_in_model())
|
||||
find_auto_fn(backend.graph_post_pass.nodes, op)
|
||||
|
||||
@ -2,59 +2,134 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
|
||||
from vllm.config import CompilationMode
|
||||
from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper
|
||||
from vllm.config import (
|
||||
CompilationConfig,
|
||||
CompilationMode,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
|
||||
|
||||
class MyMod(torch.nn.Module):
|
||||
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
|
||||
if cache is not None:
|
||||
return x + cache
|
||||
return x * 2
|
||||
if x.size()[0] >= 4:
|
||||
return x * 2
|
||||
else:
|
||||
return x * 100
|
||||
|
||||
|
||||
class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
|
||||
class MyWrapper(TorchCompileWithNoGuardsWrapper):
|
||||
def __init__(self, model):
|
||||
self.model = model
|
||||
compiled_callable = torch.compile(self.forward, backend="eager")
|
||||
super().__init__(
|
||||
compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE
|
||||
super().__init__()
|
||||
|
||||
def forward(self, x: torch.Tensor): # type: ignore[override]
|
||||
# this is the function to be compiled
|
||||
return self.model(x)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
|
||||
def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch):
|
||||
"""Test basic functionality of TorchCompileWithNoGuardsWrapper."""
|
||||
# Set the environment variable for this test
|
||||
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
|
||||
|
||||
# Create a proper vLLM config instead of mocking
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.compilation_config = CompilationConfig()
|
||||
vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
|
||||
vllm_config.compilation_config.backend = "inductor"
|
||||
|
||||
# Test DYNAMO_TRACE_ONCE
|
||||
with set_current_vllm_config(vllm_config):
|
||||
torch._dynamo.reset()
|
||||
mod = MyMod()
|
||||
wrapper = MyWrapper(mod)
|
||||
|
||||
# First call should trigger compilation
|
||||
x = torch.tensor([1, 2, 3, 4])
|
||||
torch._dynamo.mark_dynamic(x, 0)
|
||||
|
||||
result1 = wrapper(x)
|
||||
expected1 = torch.tensor([2, 4, 6, 8])
|
||||
assert torch.allclose(result1, expected1), (
|
||||
f"Expected {expected1}, got {result1}"
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
|
||||
# this is the function to be compiled
|
||||
return self.model(x, cache)
|
||||
# Second call should use compiled code
|
||||
x2 = torch.tensor([1, 2, 3])
|
||||
result2 = wrapper(x2)
|
||||
expected2 = torch.tensor([2, 4, 6])
|
||||
assert torch.allclose(result2, expected2), (
|
||||
f"Expected {expected2}, got {result2}"
|
||||
)
|
||||
|
||||
def __call__(self, x: torch.Tensor, cache: torch.Tensor | None = None):
|
||||
# let torch.compile compile twice
|
||||
if len(self.compiled_codes) == 2:
|
||||
dispatch_id = 0 if cache is None else 1
|
||||
with self.dispatch_to_code(dispatch_id):
|
||||
return self.forward(x, cache)
|
||||
else:
|
||||
return self.compiled_callable(x, cache)
|
||||
# without the wrapper result would be different.
|
||||
result3 = mod(x2)
|
||||
expected3 = torch.tensor([100, 200, 300])
|
||||
|
||||
assert torch.allclose(result3, expected3), (
|
||||
f"Expected {result3}, got {expected3}"
|
||||
)
|
||||
|
||||
def test_torch_compile_wrapper():
|
||||
mod = MyMod()
|
||||
wrappers = []
|
||||
for i in range(3):
|
||||
torch._dynamo.reset()
|
||||
# with STOCK_TORCH_COMPILE we do not remove guards.
|
||||
vllm_config.compilation_config.mode = CompilationMode.STOCK_TORCH_COMPILE
|
||||
torch._dynamo.reset()
|
||||
with set_current_vllm_config(vllm_config):
|
||||
mod = MyMod()
|
||||
wrapper = MyWrapper(mod)
|
||||
wrappers.append(wrapper)
|
||||
x = torch.tensor([1])
|
||||
wrapper(x, None) # profile run, compile
|
||||
# create a cache tensor
|
||||
cache = torch.tensor([2])
|
||||
wrapper(x, cache) # warm up with cache, recompile
|
||||
|
||||
# for new input, dispatch to the compiled code directly
|
||||
new_x = torch.tensor([3])
|
||||
assert wrapper(new_x, None).item() == 6 # dispatch to the first compiled code
|
||||
assert wrapper(new_x, cache).item() == 5 # dispatch to the second compiled code
|
||||
# First call should trigger compilation
|
||||
x = torch.tensor([1, 2, 3, 4])
|
||||
torch._dynamo.mark_dynamic(x, 0)
|
||||
|
||||
for wrapper in wrappers:
|
||||
# make sure they have independent compiled codes
|
||||
assert len(wrapper.compiled_codes) == 2
|
||||
result1 = wrapper(x)
|
||||
expected1 = torch.tensor([2, 4, 6, 8])
|
||||
assert torch.allclose(result1, expected1), (
|
||||
f"Expected {expected1}, got {result1}"
|
||||
)
|
||||
|
||||
# Second call should triger another compilation
|
||||
x2 = torch.tensor([1, 2, 3])
|
||||
result2 = wrapper(x2)
|
||||
expected2 = torch.tensor([100, 200, 300])
|
||||
assert torch.allclose(result2, expected2), (
|
||||
f"Expected {expected2}, got {result2}"
|
||||
)
|
||||
|
||||
# NO_COMPILATION level not supported.
|
||||
vllm_config.compilation_config.mode = None
|
||||
torch._dynamo.reset()
|
||||
with set_current_vllm_config(vllm_config):
|
||||
torch._dynamo.reset()
|
||||
mod = MyMod()
|
||||
|
||||
try:
|
||||
wrapper = MyWrapper(mod)
|
||||
except Exception:
|
||||
return
|
||||
raise AssertionError("expected an exception to be raised")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Run with both parameter values
|
||||
|
||||
class MockMonkeypatch:
|
||||
def setenv(self, name, value):
|
||||
os.environ[name] = value
|
||||
|
||||
mp = MockMonkeypatch()
|
||||
|
||||
print("Testing with VLLM_USE_BYTECODE_HOOK=False")
|
||||
test_torch_compile_wrapper(False, mp)
|
||||
|
||||
print("Testing with VLLM_USE_BYTECODE_HOOK=True")
|
||||
test_torch_compile_wrapper(True, mp)
|
||||
|
||||
print("All tests passed!")
|
||||
|
||||
@ -1384,3 +1384,16 @@ def image_urls(request, local_asset_server) -> list[str]:
|
||||
"""Indirect fixture: takes a list of names, returns list of full URLs."""
|
||||
names: list[str] = request.param
|
||||
return [local_asset_server.url_for(name) for name in names]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def disable_deepgemm_ue8m0(monkeypatch):
|
||||
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
|
||||
|
||||
with monkeypatch.context() as monkeypatch_ctx:
|
||||
monkeypatch_ctx.setenv("VLLM_USE_DEEP_GEMM_E8M0", "0")
|
||||
is_deep_gemm_e8m0_used.cache_clear()
|
||||
yield
|
||||
# Clear cache so the next time it is used it is processed with the
|
||||
# default VLLM_USE_DEEP_GEMM_E8M0 setting.
|
||||
is_deep_gemm_e8m0_used.cache_clear()
|
||||
|
||||
@ -39,6 +39,7 @@ class ParallelSetup(NamedTuple):
|
||||
class CPTestOptions(NamedTuple):
|
||||
multi_node_only: bool
|
||||
load_format: str | None = None
|
||||
attn_backend: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -58,6 +59,7 @@ class CPTestSettings:
|
||||
multi_node_only: bool = False,
|
||||
runner: RunnerOption = "auto",
|
||||
load_format: str | None = None,
|
||||
attn_backend: str | None = None,
|
||||
):
|
||||
parallel_setups = []
|
||||
for eager_mode_val in [False]:
|
||||
@ -79,7 +81,9 @@ class CPTestSettings:
|
||||
distributed_backends=["mp"],
|
||||
runner=runner,
|
||||
test_options=CPTestOptions(
|
||||
multi_node_only=multi_node_only, load_format=load_format
|
||||
multi_node_only=multi_node_only,
|
||||
load_format=load_format,
|
||||
attn_backend=attn_backend,
|
||||
),
|
||||
)
|
||||
|
||||
@ -117,7 +121,7 @@ def _compare_cp_with_tp(
|
||||
chunked_prefill,
|
||||
) = parallel_setup
|
||||
|
||||
multi_node_only, load_format = test_options
|
||||
multi_node_only, load_format, attn_backend = test_options
|
||||
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
@ -177,6 +181,13 @@ def _compare_cp_with_tp(
|
||||
if hf_overrides:
|
||||
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
|
||||
|
||||
if not attn_backend:
|
||||
cp_env = tp_env = {}
|
||||
else:
|
||||
cp_env = tp_env = {
|
||||
"VLLM_ATTENTION_BACKEND": attn_backend,
|
||||
}
|
||||
|
||||
cp_args = [
|
||||
*common_args,
|
||||
"--tensor-parallel-size",
|
||||
@ -205,6 +216,8 @@ def _compare_cp_with_tp(
|
||||
model_id,
|
||||
cp_args,
|
||||
tp_args,
|
||||
cp_env,
|
||||
tp_env,
|
||||
method=method,
|
||||
max_wait_seconds=720,
|
||||
)
|
||||
|
||||
437
tests/distributed/test_multiproc_executor.py
Normal file
437
tests/distributed/test_multiproc_executor.py
Normal file
@ -0,0 +1,437 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Integration tests for MultiprocExecutor at the executor level.
|
||||
This test directly tests the executor without going through the LLM interface,
|
||||
focusing on executor initialization, RPC calls, and distributed execution.
|
||||
"""
|
||||
|
||||
import multiprocessing
|
||||
import os
|
||||
|
||||
from tests.utils import multi_gpu_test
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.utils import get_open_port
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.executor.multiproc_executor import MultiprocExecutor
|
||||
|
||||
MODEL = "facebook/opt-125m"
|
||||
|
||||
|
||||
def create_vllm_config(
|
||||
tensor_parallel_size: int = 1,
|
||||
pipeline_parallel_size: int = 1,
|
||||
max_model_len: int = 256,
|
||||
gpu_memory_utilization: float = 0.3,
|
||||
distributed_executor_backend: str = "mp",
|
||||
nnodes: int = 1,
|
||||
node_rank: int = 0,
|
||||
master_port: int = 0,
|
||||
) -> VllmConfig:
|
||||
"""Create a VllmConfig for testing using EngineArgs."""
|
||||
engine_args = EngineArgs(
|
||||
model=MODEL,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
pipeline_parallel_size=pipeline_parallel_size,
|
||||
max_model_len=max_model_len,
|
||||
gpu_memory_utilization=gpu_memory_utilization,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True,
|
||||
)
|
||||
vllm_config = engine_args.create_engine_config()
|
||||
|
||||
# Override distributed node settings if needed
|
||||
if nnodes > 1 or node_rank > 0:
|
||||
vllm_config.parallel_config.nnodes = nnodes
|
||||
vllm_config.parallel_config.node_rank = node_rank
|
||||
vllm_config.parallel_config.master_port = master_port
|
||||
if nnodes > 1:
|
||||
vllm_config.parallel_config.disable_custom_all_reduce = True
|
||||
|
||||
return vllm_config
|
||||
|
||||
|
||||
def create_test_scheduler_output(num_requests: int = 1) -> SchedulerOutput:
|
||||
"""Create a minimal SchedulerOutput for testing."""
|
||||
# This is a simplified version - in practice you'd need proper
|
||||
# SchedulerOutput construction based on the actual vLLM v1 API
|
||||
return SchedulerOutput(
|
||||
scheduled_new_reqs=[],
|
||||
scheduled_resumed_reqs=[],
|
||||
scheduled_running_reqs=[],
|
||||
num_scheduled_tokens={},
|
||||
total_num_scheduled_tokens=0,
|
||||
)
|
||||
|
||||
|
||||
def test_multiproc_executor_initialization():
|
||||
"""Test that MultiprocExecutor can be initialized with proper config."""
|
||||
vllm_config = create_vllm_config(
|
||||
tensor_parallel_size=1,
|
||||
pipeline_parallel_size=1,
|
||||
)
|
||||
|
||||
# Create executor - this should initialize workers
|
||||
executor = MultiprocExecutor(vllm_config=vllm_config)
|
||||
|
||||
# Verify executor properties
|
||||
assert executor.world_size == 1, "World size should be 1 for single GPU"
|
||||
assert executor.local_world_size == 1, "Local world size should be 1"
|
||||
assert hasattr(executor, "workers"), "Executor should have workers"
|
||||
assert len(executor.workers) == 1, "Should have 1 worker for single GPU"
|
||||
|
||||
# Clean up
|
||||
executor.shutdown()
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_multiproc_executor_initialization_tensor_parallel():
|
||||
"""Test MultiprocExecutor initialization with tensor parallelism."""
|
||||
vllm_config = create_vllm_config(
|
||||
tensor_parallel_size=2,
|
||||
pipeline_parallel_size=1,
|
||||
)
|
||||
|
||||
# Create executor
|
||||
executor = MultiprocExecutor(vllm_config=vllm_config)
|
||||
|
||||
# Verify executor properties
|
||||
assert executor.world_size == 2, "World size should be 2 for TP=2"
|
||||
assert executor.local_world_size == 2, "Local world size should be 2"
|
||||
assert len(executor.workers) == 2, "Should have 2 workers for TP=2"
|
||||
|
||||
# Verify output rank calculation
|
||||
output_rank = executor._get_output_rank()
|
||||
assert output_rank == 0, "Output rank should be 0 for TP=2, PP=1"
|
||||
|
||||
# Clean up
|
||||
executor.shutdown()
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_multiproc_executor_collective_rpc():
|
||||
"""Test collective RPC calls to all workers."""
|
||||
vllm_config = create_vllm_config(
|
||||
tensor_parallel_size=2,
|
||||
pipeline_parallel_size=1,
|
||||
)
|
||||
|
||||
# Create executor
|
||||
executor = MultiprocExecutor(vllm_config=vllm_config)
|
||||
|
||||
try:
|
||||
# Test check_health RPC - should work without errors
|
||||
executor.check_health()
|
||||
|
||||
# Test that RPC works correctly
|
||||
# Note: We're just testing that the RPC mechanism works,
|
||||
# not testing actual model execution here
|
||||
assert not executor.is_failed, "Executor should not be in failed state"
|
||||
|
||||
finally:
|
||||
# Clean up
|
||||
executor.shutdown()
|
||||
|
||||
|
||||
def test_multiproc_executor_failure_callback():
|
||||
"""Test failure callback registration and invocation."""
|
||||
vllm_config = create_vllm_config(
|
||||
tensor_parallel_size=1,
|
||||
pipeline_parallel_size=1,
|
||||
)
|
||||
|
||||
executor = MultiprocExecutor(vllm_config=vllm_config)
|
||||
|
||||
try:
|
||||
# Test callback registration
|
||||
callback_invoked = []
|
||||
|
||||
def test_callback():
|
||||
callback_invoked.append(True)
|
||||
|
||||
# Register callback
|
||||
executor.register_failure_callback(test_callback)
|
||||
|
||||
# Callback should not be invoked yet
|
||||
assert len(callback_invoked) == 0, "Callback should not be invoked immediately"
|
||||
|
||||
# Simulate failure
|
||||
executor.is_failed = True
|
||||
|
||||
# Register another callback - should be invoked immediately
|
||||
executor.register_failure_callback(test_callback)
|
||||
assert len(callback_invoked) == 1, (
|
||||
"Callback should be invoked when executor is failed"
|
||||
)
|
||||
|
||||
finally:
|
||||
# Clean up
|
||||
executor.shutdown()
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_multiproc_executor_worker_monitor():
|
||||
"""Test that worker monitor is set up correctly."""
|
||||
vllm_config = create_vllm_config(
|
||||
tensor_parallel_size=2,
|
||||
pipeline_parallel_size=1,
|
||||
)
|
||||
|
||||
executor = MultiprocExecutor(vllm_config=vllm_config)
|
||||
|
||||
try:
|
||||
# Verify all worker processes are alive
|
||||
for worker in executor.workers:
|
||||
assert worker.proc.is_alive(), f"Worker rank {worker.rank} should be alive"
|
||||
|
||||
# Verify executor is not in failed state
|
||||
assert not executor.is_failed, "Executor should not be in failed state"
|
||||
|
||||
finally:
|
||||
# Clean up
|
||||
executor.shutdown()
|
||||
|
||||
# After shutdown, workers should be terminated
|
||||
import time
|
||||
|
||||
time.sleep(0.5) # Give processes time to terminate
|
||||
for worker in executor.workers:
|
||||
assert not worker.proc.is_alive(), (
|
||||
f"Worker rank {worker.rank} should terminate after shutdown"
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_multiproc_executor_get_response_message_queues():
|
||||
"""Test message queue retrieval for different ranks."""
|
||||
vllm_config = create_vllm_config(
|
||||
tensor_parallel_size=2,
|
||||
pipeline_parallel_size=1,
|
||||
)
|
||||
|
||||
executor = MultiprocExecutor(vllm_config=vllm_config)
|
||||
|
||||
try:
|
||||
# Get all message queues
|
||||
all_queues = executor.get_response_mqs()
|
||||
assert len(all_queues) == 2, "Should have 2 message queues for 2 workers"
|
||||
|
||||
# Get message queue for specific rank
|
||||
rank0_queue = executor.get_response_mqs(unique_reply_rank=0)
|
||||
assert len(rank0_queue) == 1, "Should have 1 message queue for rank 0"
|
||||
|
||||
rank1_queue = executor.get_response_mqs(unique_reply_rank=1)
|
||||
assert len(rank1_queue) == 1, "Should have 1 message queue for rank 1"
|
||||
|
||||
finally:
|
||||
# Clean up
|
||||
executor.shutdown()
|
||||
|
||||
|
||||
def test_multiproc_executor_shutdown_cleanup():
|
||||
"""Test that shutdown properly cleans up resources."""
|
||||
vllm_config = create_vllm_config(
|
||||
tensor_parallel_size=1,
|
||||
pipeline_parallel_size=1,
|
||||
)
|
||||
|
||||
executor = MultiprocExecutor(vllm_config=vllm_config)
|
||||
|
||||
# Verify executor is set up
|
||||
assert hasattr(executor, "workers"), "Executor should have workers"
|
||||
assert len(executor.workers) > 0, "Should have at least one worker"
|
||||
|
||||
# Shutdown
|
||||
executor.shutdown()
|
||||
|
||||
# Verify cleanup
|
||||
import time
|
||||
|
||||
time.sleep(0.5) # Give processes time to terminate
|
||||
|
||||
for worker in executor.workers:
|
||||
assert not worker.proc.is_alive(), "Worker processes should be terminated"
|
||||
|
||||
# Verify shutdown event is set
|
||||
assert executor.shutdown_event.is_set(), "Shutdown event should be set"
|
||||
|
||||
# Multiple shutdowns should be safe (idempotent)
|
||||
executor.shutdown()
|
||||
executor.shutdown()
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
def test_multiproc_executor_pipeline_parallel():
|
||||
"""Test MultiprocExecutor with pipeline parallelism."""
|
||||
vllm_config = create_vllm_config(
|
||||
tensor_parallel_size=2,
|
||||
pipeline_parallel_size=2,
|
||||
)
|
||||
|
||||
executor = MultiprocExecutor(vllm_config=vllm_config)
|
||||
|
||||
try:
|
||||
# Verify executor properties
|
||||
assert executor.world_size == 4, "World size should be 4 for TP=2, PP=2"
|
||||
assert len(executor.workers) == 4, "Should have 4 workers"
|
||||
|
||||
# Verify output rank calculation
|
||||
# For TP=2, PP=2: output should be from the last PP stage (ranks 2-3)
|
||||
# Specifically rank 2 (first rank of last PP stage)
|
||||
output_rank = executor._get_output_rank()
|
||||
assert output_rank == 2, "Output rank should be 2 (first rank of last PP stage)"
|
||||
|
||||
# Verify max_concurrent_batches for pipeline parallel
|
||||
assert executor.max_concurrent_batches == 2, (
|
||||
"Max concurrent batches should equal PP size"
|
||||
)
|
||||
|
||||
finally:
|
||||
# Clean up
|
||||
executor.shutdown()
|
||||
|
||||
|
||||
def test_multiproc_executor_properties():
|
||||
"""Test various executor properties and configurations."""
|
||||
vllm_config = create_vllm_config(
|
||||
tensor_parallel_size=1,
|
||||
pipeline_parallel_size=1,
|
||||
)
|
||||
|
||||
executor = MultiprocExecutor(vllm_config=vllm_config)
|
||||
|
||||
try:
|
||||
# Test supports_pp property
|
||||
assert MultiprocExecutor.supports_pp is True, (
|
||||
"MultiprocExecutor should support pipeline parallelism"
|
||||
)
|
||||
|
||||
# Test world_size calculation
|
||||
assert executor.world_size == (
|
||||
executor.parallel_config.tensor_parallel_size
|
||||
* executor.parallel_config.pipeline_parallel_size
|
||||
), "World size should equal TP * PP"
|
||||
|
||||
# Test local_world_size calculation
|
||||
assert executor.local_world_size == (
|
||||
executor.parallel_config.world_size // executor.parallel_config.nnodes
|
||||
), "Local world size should be world_size / nnodes"
|
||||
|
||||
finally:
|
||||
# Clean up
|
||||
executor.shutdown()
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
def test_multiproc_executor_multi_node():
|
||||
"""
|
||||
Test MultiprocExecutor with multi-node configuration.
|
||||
This simulates 2 nodes with TP=4:
|
||||
- Node 0 (rank 0): Uses GPUs 0,1 (CUDA_VISIBLE_DEVICES=0,1) with TP=2
|
||||
- Node 1 (rank 1): Uses GPUs 2,3 (CUDA_VISIBLE_DEVICES=2,3) with TP=2
|
||||
Total world_size = 4, nnodes = 2
|
||||
"""
|
||||
port = get_open_port()
|
||||
# symm_mem does not work for simulating multi instance in single node
|
||||
os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
|
||||
|
||||
def run_node(node_rank: int, result_queue: multiprocessing.Queue, port: int):
|
||||
"""Run a single node's executor."""
|
||||
executor = None
|
||||
try:
|
||||
# Set CUDA_VISIBLE_DEVICES for this node
|
||||
if node_rank == 0:
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
|
||||
else:
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
|
||||
|
||||
# Create config for this node
|
||||
vllm_config = create_vllm_config(
|
||||
tensor_parallel_size=4, # Total TP across all nodes
|
||||
pipeline_parallel_size=1,
|
||||
nnodes=2, # 2 nodes
|
||||
node_rank=node_rank,
|
||||
master_port=port, # same port
|
||||
)
|
||||
|
||||
# Create executor for this node
|
||||
executor = MultiprocExecutor(vllm_config=vllm_config)
|
||||
|
||||
# Verify node-specific properties
|
||||
assert executor.world_size == 4, (
|
||||
f"World size should be 4 on node {node_rank}"
|
||||
)
|
||||
assert executor.local_world_size == 2, (
|
||||
f"Local world size should be 2 on node {node_rank}"
|
||||
)
|
||||
assert len(executor.workers) == 2, (
|
||||
f"Should have 2 local workers on node {node_rank}"
|
||||
)
|
||||
|
||||
# Verify worker ranks are correct for this node
|
||||
expected_ranks = [node_rank * 2, node_rank * 2 + 1]
|
||||
actual_ranks = sorted([w.rank for w in executor.workers])
|
||||
assert actual_ranks == expected_ranks, (
|
||||
f"Node {node_rank} should have workers "
|
||||
f"with ranks {expected_ranks}, got {actual_ranks}"
|
||||
)
|
||||
# Verify all workers are alive
|
||||
for worker in executor.workers:
|
||||
assert worker.proc.is_alive(), (
|
||||
f"Worker rank {worker.rank} should be alive on node {node_rank}"
|
||||
)
|
||||
# executor.gen
|
||||
# Put success result in queue BEFORE shutdown to avoid hanging
|
||||
result_queue.put({"node": node_rank, "success": True})
|
||||
import time
|
||||
|
||||
time.sleep(2)
|
||||
executor.shutdown()
|
||||
except Exception as e:
|
||||
# Put failure result in queue
|
||||
result_queue.put({"node": node_rank, "success": False, "error": str(e)})
|
||||
raise e
|
||||
finally:
|
||||
if executor is not None:
|
||||
executor.shutdown()
|
||||
|
||||
# Create a queue to collect results from both processes
|
||||
result_queue: multiprocessing.Queue[dict[str, int | bool]] = multiprocessing.Queue()
|
||||
|
||||
# Start both node processes
|
||||
processes = []
|
||||
for node_rank in range(2):
|
||||
p = multiprocessing.Process(
|
||||
target=run_node,
|
||||
args=(node_rank, result_queue, port),
|
||||
name=f"Node{node_rank}",
|
||||
)
|
||||
p.start()
|
||||
processes.append(p)
|
||||
|
||||
# Wait for both processes to complete
|
||||
all_completed = True
|
||||
for p in processes:
|
||||
p.join(timeout=60)
|
||||
if p.is_alive():
|
||||
p.terminate()
|
||||
p.join(timeout=20)
|
||||
if p.is_alive():
|
||||
p.kill()
|
||||
p.join()
|
||||
all_completed = False
|
||||
|
||||
# Check results from both nodes
|
||||
results: list[dict[str, int | bool]] = []
|
||||
while len(results) < 2:
|
||||
try:
|
||||
result = result_queue.get(timeout=1)
|
||||
results.append(result)
|
||||
except Exception:
|
||||
pass
|
||||
assert all_completed, "Not all processes completed successfully"
|
||||
assert len(results) == 2, f"Expected 2 results, got {len(results)}"
|
||||
assert results[0]["success"], f"Node 0 failed: {results[0]}"
|
||||
assert results[1]["success"], f"Node 1 failed: {results[1]}"
|
||||
@ -18,6 +18,7 @@ import pytest
|
||||
from vllm.config.compilation import CompilationMode
|
||||
from vllm.config.model import RunnerOption
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
from ..models.registry import HF_EXAMPLE_MODELS
|
||||
@ -161,6 +162,7 @@ def _compare_sp(
|
||||
test_options: SPTestOptions,
|
||||
num_gpus_available: int,
|
||||
use_inductor_graph_partition: bool,
|
||||
enable_async_tp: bool,
|
||||
*,
|
||||
method: Literal["generate", "encode"],
|
||||
is_multimodal: bool,
|
||||
@ -244,10 +246,10 @@ def _compare_sp(
|
||||
|
||||
compilation_config = {
|
||||
"mode": CompilationMode.VLLM_COMPILE,
|
||||
"custom_ops": ["+rms_norm"],
|
||||
"compile_sizes": [4, 8],
|
||||
"pass_config": {
|
||||
"enable_sequence_parallelism": True,
|
||||
"enable_async_tp": enable_async_tp,
|
||||
"enable_fusion": enable_fusion,
|
||||
"enable_noop": True,
|
||||
},
|
||||
@ -307,6 +309,7 @@ SP_TEST_MODELS = [
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
|
||||
@pytest.mark.parametrize("enable_async_tp", [False]) # TODO: enable async TP
|
||||
@create_new_process_for_each_test()
|
||||
def test_tp_sp_generation(
|
||||
model_id: str,
|
||||
@ -316,10 +319,19 @@ def test_tp_sp_generation(
|
||||
test_options: SPTestOptions,
|
||||
num_gpus_available,
|
||||
use_inductor_graph_partition: bool,
|
||||
enable_async_tp: bool,
|
||||
):
|
||||
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
||||
|
||||
# Skip FP8 SP-only test on sm89 (compute capability 8.9)
|
||||
if (
|
||||
"fp8" in model_id.lower()
|
||||
and current_platform.get_device_capability() < (9, 0)
|
||||
and (not enable_async_tp)
|
||||
):
|
||||
pytest.skip("FP8 reduction support begins with sm90 capable devices.")
|
||||
|
||||
_compare_sp(
|
||||
model_id,
|
||||
parallel_setup,
|
||||
@ -328,6 +340,7 @@ def test_tp_sp_generation(
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
use_inductor_graph_partition,
|
||||
enable_async_tp=enable_async_tp,
|
||||
method="generate",
|
||||
is_multimodal=False,
|
||||
)
|
||||
|
||||
262
tests/entrypoints/openai/test_serving_tokens.py
Normal file
262
tests/entrypoints/openai/test_serving_tokens.py
Normal file
@ -0,0 +1,262 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.v1.engine.detokenizer import check_stop_strings
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
GEN_ENDPOINT = "/inference/v1/generate"
|
||||
|
||||
|
||||
def get_vocab_size(model_name):
|
||||
config = ModelConfig(
|
||||
model=model_name,
|
||||
seed=0,
|
||||
dtype="bfloat16",
|
||||
)
|
||||
return config.get_vocab_size()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def tokenizer():
|
||||
return AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def messages():
|
||||
return [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "How many countries are in the EU?"},
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server(request):
|
||||
args = [
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"1024",
|
||||
"--enforce-eager",
|
||||
]
|
||||
|
||||
extra_args = getattr(request, "param", None)
|
||||
if extra_args is not None:
|
||||
args = args + (
|
||||
list(extra_args)
|
||||
if isinstance(extra_args, (list, tuple))
|
||||
else [str(extra_args)]
|
||||
)
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server: RemoteOpenAIServer):
|
||||
transport = httpx.AsyncHTTPTransport(uds=server.uds) if server.uds else None
|
||||
headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
|
||||
async with httpx.AsyncClient(
|
||||
transport=transport,
|
||||
base_url=server.url_root,
|
||||
timeout=600,
|
||||
headers=headers,
|
||||
) as c:
|
||||
yield c
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_generate_endpoint(client):
|
||||
payload = {
|
||||
"model": MODEL_NAME,
|
||||
"token_ids": [1, 2, 3],
|
||||
"sampling_params": {"max_tokens": 5},
|
||||
"stream": False,
|
||||
}
|
||||
resp = await client.post(GEN_ENDPOINT, json=payload)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
assert "choices" in data
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_same_response_as_chat_completions(client, tokenizer, messages):
|
||||
token_ids = tokenizer.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
enable_thinking=False, # default with Qwen3
|
||||
)
|
||||
for ignore_eos in [True, False]:
|
||||
payload = {
|
||||
"model": MODEL_NAME,
|
||||
"token_ids": token_ids,
|
||||
"sampling_params": {
|
||||
"max_tokens": 24,
|
||||
"temperature": 0.0,
|
||||
# NOTE coordinator will set this to skip detokenization
|
||||
"detokenize": False,
|
||||
"ignore_eos": ignore_eos,
|
||||
},
|
||||
"stream": False,
|
||||
}
|
||||
generate_resp = await client.post(GEN_ENDPOINT, json=payload)
|
||||
generate_data = generate_resp.json()
|
||||
generate_res = tokenizer.decode(
|
||||
generate_data["choices"][0]["token_ids"], skip_special_tokens=True
|
||||
)
|
||||
|
||||
payload = {
|
||||
"model": MODEL_NAME,
|
||||
"messages": messages,
|
||||
"max_tokens": 24,
|
||||
"temperature": 0.0,
|
||||
"stream": False,
|
||||
"ignore_eos": ignore_eos,
|
||||
"chat_template_kwargs": dict(enable_thinking=False),
|
||||
}
|
||||
completions_resp = await client.post("/v1/chat/completions", json=payload)
|
||||
completions_data = completions_resp.json()
|
||||
completions_res = completions_data["choices"][0]["message"]["content"]
|
||||
|
||||
assert generate_res == completions_res
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_stop_string_workflow(client, tokenizer, messages):
|
||||
token_ids = tokenizer.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
enable_thinking=False, # default with Qwen3
|
||||
)
|
||||
payload = {
|
||||
"model": MODEL_NAME,
|
||||
"token_ids": token_ids,
|
||||
"sampling_params": {
|
||||
"max_tokens": 24,
|
||||
"temperature": 0.0,
|
||||
"detokenize": False,
|
||||
# stop strings are only supported when detokenize is True.
|
||||
"stop": ["27 member"],
|
||||
},
|
||||
# TODO stream test is much more interesting
|
||||
"stream": False,
|
||||
}
|
||||
with pytest.raises(httpx.HTTPStatusError):
|
||||
generate_resp = await client.post(GEN_ENDPOINT, json=payload)
|
||||
generate_resp.raise_for_status()
|
||||
|
||||
payload["sampling_params"]["stop"] = None
|
||||
generate_resp = await client.post(
|
||||
GEN_ENDPOINT, json=payload, headers={"X-Request-Id": "42"}
|
||||
)
|
||||
generate_data = generate_resp.json()
|
||||
generate_res = tokenizer.decode(
|
||||
generate_data["choices"][0]["token_ids"], skip_special_tokens=True
|
||||
)
|
||||
|
||||
# NOTE This is under the responsibility of the coordinator
|
||||
# stop_checker = StopChecker(
|
||||
# max_model_len=1024, get_tokenizer_for_seq=lambda _: tokenizer
|
||||
# )
|
||||
stop_str, truncate_to = check_stop_strings(
|
||||
generate_res, len(generate_res), ["27 member"], False
|
||||
)
|
||||
assert stop_str == "27 member"
|
||||
# abort request that hit stop string (requires tokens-only mode)
|
||||
# res = await client.post("/abort_requests", json={"request_ids": ["generate-tokens-42"]}) # noqa: E501
|
||||
# res.raise_for_status()
|
||||
generate_res = generate_res[:truncate_to]
|
||||
|
||||
# Get stop_str response from chat completions
|
||||
payload = {
|
||||
"model": MODEL_NAME,
|
||||
"messages": messages,
|
||||
"max_tokens": 24,
|
||||
"temperature": 0.0,
|
||||
"stream": False,
|
||||
"stop": ["27 member"],
|
||||
"chat_template_kwargs": dict(enable_thinking=False),
|
||||
}
|
||||
completions_resp = await client.post("/v1/chat/completions", json=payload)
|
||||
completions_data = completions_resp.json()
|
||||
completions_res = completions_data["choices"][0]["message"]["content"]
|
||||
assert generate_res == completions_res
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"server",
|
||||
[
|
||||
[
|
||||
"--enable-lora",
|
||||
"--lora-modules",
|
||||
"Alice=charent/self_cognition_Alice",
|
||||
"Bob=charent/self_cognition_Bob",
|
||||
"--max-lora-rank",
|
||||
"64",
|
||||
"--max-cpu-loras",
|
||||
"2",
|
||||
]
|
||||
],
|
||||
indirect=True,
|
||||
)
|
||||
async def test_generate_with_lora_adapter(client, tokenizer, messages):
|
||||
# Verify adapters are listed
|
||||
models_resp = await client.get("/v1/models")
|
||||
models_resp.raise_for_status()
|
||||
models = {m["id"] for m in models_resp.json().get("data", [])}
|
||||
assert {"Alice", "Bob"}.issubset(models)
|
||||
|
||||
# Generate using a LoRA adapter by specifying its name as the model
|
||||
payload = {
|
||||
"model": "Alice",
|
||||
"token_ids": [1, 2, 3],
|
||||
"sampling_params": {"max_tokens": 5},
|
||||
"stream": False,
|
||||
}
|
||||
resp = await client.post(GEN_ENDPOINT, json=payload)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
assert "choices" in data
|
||||
|
||||
token_ids = tokenizer.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
enable_thinking=False, # default with Qwen3
|
||||
)
|
||||
payload = {
|
||||
"model": "Alice",
|
||||
"token_ids": token_ids,
|
||||
"sampling_params": {
|
||||
"max_tokens": 24,
|
||||
"temperature": 0.0,
|
||||
"detokenize": False,
|
||||
},
|
||||
"stream": False,
|
||||
}
|
||||
generate_resp = await client.post(GEN_ENDPOINT, json=payload)
|
||||
generate_data = generate_resp.json()
|
||||
generate_res = tokenizer.decode(
|
||||
generate_data["choices"][0]["token_ids"], skip_special_tokens=True
|
||||
)
|
||||
|
||||
payload = {
|
||||
"model": "Alice",
|
||||
"messages": messages,
|
||||
"max_tokens": 24,
|
||||
"temperature": 0.0,
|
||||
"stream": False,
|
||||
"chat_template_kwargs": dict(enable_thinking=False),
|
||||
}
|
||||
completions_resp = await client.post("/v1/chat/completions", json=payload)
|
||||
completions_data = completions_resp.json()
|
||||
completions_res = completions_data["choices"][0]["message"]["content"]
|
||||
|
||||
assert generate_res == completions_res
|
||||
@ -46,6 +46,16 @@ def test_single_input_classification(server: RemoteOpenAIServer, model_name: str
|
||||
assert hasattr(output.data[0], "probs")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
def test_add_special_tokens_false(server: RemoteOpenAIServer, model_name: str):
|
||||
response = requests.post(
|
||||
server.url_for("classify"),
|
||||
json={"model": model_name, "input": "hello", "add_special_tokens": False},
|
||||
)
|
||||
response.raise_for_status()
|
||||
ClassificationResponse.model_validate(response.json())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
def test_multiple_inputs_classification(server: RemoteOpenAIServer, model_name: str):
|
||||
input_texts = [
|
||||
|
||||
@ -0,0 +1,95 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.openai.protocol import ClassificationResponse
|
||||
|
||||
VLM_MODEL_NAME = "muziyongshixin/Qwen2.5-VL-7B-for-VideoCls"
|
||||
MAXIMUM_VIDEOS = 1
|
||||
TEST_VIDEO_URL = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
|
||||
|
||||
HF_OVERRIDES = {
|
||||
"text_config": {
|
||||
"architectures": ["Qwen2_5_VLForSequenceClassification"],
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server_vlm_classify():
|
||||
args = [
|
||||
"--runner",
|
||||
"pooling",
|
||||
"--max-model-len",
|
||||
"5000",
|
||||
"--enforce-eager",
|
||||
"--limit-mm-per-prompt",
|
||||
json.dumps({"video": MAXIMUM_VIDEOS}),
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(
|
||||
VLM_MODEL_NAME, args, override_hf_configs=HF_OVERRIDES
|
||||
) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME])
|
||||
def test_classify_accepts_chat_text_only(
|
||||
server_vlm_classify: RemoteOpenAIServer, model_name: str
|
||||
) -> None:
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Please classify this text request."},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
response = requests.post(
|
||||
server_vlm_classify.url_for("classify"),
|
||||
json={"model": model_name, "messages": messages},
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
output = ClassificationResponse.model_validate(response.json())
|
||||
|
||||
assert output.object == "list"
|
||||
assert output.model == model_name
|
||||
assert len(output.data) == 1
|
||||
assert len(output.data[0].probs) == 2
|
||||
assert output.usage.prompt_tokens == 22
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME])
|
||||
def test_classify_accepts_chat_video_url(
|
||||
server_vlm_classify: RemoteOpenAIServer, model_name: str
|
||||
) -> None:
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Please classify this video."},
|
||||
{"type": "video_url", "video_url": {"url": TEST_VIDEO_URL}},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
response = requests.post(
|
||||
server_vlm_classify.url_for("classify"),
|
||||
json={"model": model_name, "messages": messages},
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
output = ClassificationResponse.model_validate(response.json())
|
||||
|
||||
assert output.object == "list"
|
||||
assert output.model == model_name
|
||||
assert len(output.data) == 1
|
||||
assert len(output.data[0].probs) == 2
|
||||
assert output.usage.prompt_tokens == 4807
|
||||
@ -83,8 +83,12 @@ async def call_vllm_api(
|
||||
stop: list[str] | None = None,
|
||||
url: str | None = None,
|
||||
seed: int | None = None,
|
||||
) -> str:
|
||||
"""Call vLLM's OpenAI-compatible completions endpoint."""
|
||||
) -> tuple[str, int]:
|
||||
"""Call vLLM's OpenAI-compatible completions endpoint.
|
||||
|
||||
Returns:
|
||||
Tuple of (response_text, completion_tokens)
|
||||
"""
|
||||
data = {
|
||||
"prompt": prompt,
|
||||
"temperature": temperature,
|
||||
@ -98,10 +102,12 @@ async def call_vllm_api(
|
||||
async with session.post(f"{url}/v1/completions", json=data) as response:
|
||||
response.raise_for_status()
|
||||
result = await response.json()
|
||||
return result["choices"][0]["text"]
|
||||
text = result["choices"][0]["text"]
|
||||
completion_tokens = result.get("usage", {}).get("completion_tokens", 0)
|
||||
return text, completion_tokens
|
||||
except Exception as e:
|
||||
print(f"Error calling vLLM API: {e}")
|
||||
return ""
|
||||
return "", 0
|
||||
|
||||
|
||||
def evaluate_gsm8k(
|
||||
@ -146,10 +152,11 @@ def evaluate_gsm8k(
|
||||
# Run evaluation
|
||||
async def run_async_evaluation():
|
||||
states: list[str] = [""] * num_questions
|
||||
output_tokens: list[int] = [0] * num_questions
|
||||
|
||||
async def get_answer(session: aiohttp.ClientSession, i: int) -> str:
|
||||
async def get_answer(session: aiohttp.ClientSession, i: int) -> tuple[str, int]:
|
||||
prompt = few_shot_examples + questions[i]
|
||||
answer = await call_vllm_api(
|
||||
answer, tokens = await call_vllm_api(
|
||||
session=session,
|
||||
prompt=prompt,
|
||||
temperature=temperature,
|
||||
@ -159,7 +166,8 @@ def evaluate_gsm8k(
|
||||
seed=seed,
|
||||
)
|
||||
states[i] = answer
|
||||
return answer
|
||||
output_tokens[i] = tokens
|
||||
return answer, tokens
|
||||
|
||||
async with aiohttp.ClientSession(
|
||||
timeout=aiohttp.ClientTimeout(total=600)
|
||||
@ -167,24 +175,28 @@ def evaluate_gsm8k(
|
||||
tasks = [get_answer(session, i) for i in range(num_questions)]
|
||||
await tqdm.gather(*tasks, desc="Evaluating")
|
||||
|
||||
return states
|
||||
return states, output_tokens
|
||||
|
||||
print(f"Running GSM8K evaluation: {num_questions} questions, {num_shots}-shot")
|
||||
|
||||
tic = time.perf_counter()
|
||||
states = asyncio.run(run_async_evaluation())
|
||||
states, output_tokens = asyncio.run(run_async_evaluation())
|
||||
latency = time.perf_counter() - tic
|
||||
|
||||
# Compute metrics
|
||||
preds = [get_answer_value(state) for state in states]
|
||||
accuracy = np.mean(np.array(preds) == np.array(labels))
|
||||
invalid_rate = np.mean(np.array(preds) == INVALID)
|
||||
total_output_tokens = sum(output_tokens)
|
||||
tokens_per_second = total_output_tokens / latency if latency > 0 else 0.0
|
||||
|
||||
result = {
|
||||
"accuracy": accuracy,
|
||||
"invalid_rate": invalid_rate,
|
||||
"latency": latency,
|
||||
"questions_per_second": num_questions / latency,
|
||||
"total_output_tokens": total_output_tokens,
|
||||
"tokens_per_second": tokens_per_second,
|
||||
"num_questions": num_questions,
|
||||
"num_shots": num_shots,
|
||||
"max_tokens": max_tokens,
|
||||
@ -236,6 +248,8 @@ def main() -> None:
|
||||
print(f"Invalid responses: {result['invalid_rate']:.3f}")
|
||||
print(f"Total latency: {result['latency']:.3f} s")
|
||||
print(f"Questions per second: {result['questions_per_second']:.3f}")
|
||||
print(f"Total output tokens: {result['total_output_tokens']}")
|
||||
print(f"Output tokens per second: {result['tokens_per_second']:.3f}")
|
||||
|
||||
# Optional file saving
|
||||
if args.save_results:
|
||||
|
||||
@ -170,6 +170,7 @@ def test_cascade(
|
||||
logits_soft_cap=soft_cap if soft_cap is not None else 0,
|
||||
block_table=block_tables,
|
||||
common_prefix_len=common_prefix_len,
|
||||
max_num_splits=0, # no max
|
||||
fa_version=fa_version,
|
||||
)
|
||||
|
||||
|
||||
@ -40,8 +40,6 @@ NUM_EXPERTS = [8, 64]
|
||||
TOP_KS = [1, 2, 6]
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@ -33,8 +33,6 @@ if current_platform.get_device_capability() < (9, 0):
|
||||
pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
# Test configurations
|
||||
DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]
|
||||
|
||||
@ -18,8 +18,6 @@ if current_platform.get_device_capability() < (7, 0):
|
||||
pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True)
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
DTYPES = [torch.bfloat16]
|
||||
|
||||
|
||||
@ -42,8 +42,6 @@ MNK_FACTORS = [
|
||||
]
|
||||
|
||||
vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
|
||||
@ -7,6 +7,7 @@ fp8 block-quantized case.
|
||||
"""
|
||||
|
||||
import dataclasses
|
||||
from contextlib import contextmanager
|
||||
|
||||
import pytest
|
||||
import torch.distributed
|
||||
@ -14,6 +15,7 @@ from torch.distributed import ProcessGroup
|
||||
from typing_extensions import ParamSpec
|
||||
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.forward_context import set_forward_context
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
FusedMoEQuantConfig,
|
||||
fp8_w8a8_moe_quant_config,
|
||||
@ -21,7 +23,11 @@ from vllm.model_executor.layers.fused_moe.config import (
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
|
||||
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used, is_deep_gemm_supported
|
||||
from vllm.utils.deep_gemm import (
|
||||
get_mk_alignment_for_contiguous_layout,
|
||||
is_deep_gemm_e8m0_used,
|
||||
is_deep_gemm_supported,
|
||||
)
|
||||
from vllm.utils.import_utils import has_deep_ep, has_deep_gemm
|
||||
|
||||
from ...utils import multi_gpu_test
|
||||
@ -57,6 +63,23 @@ requires_deep_gemm = pytest.mark.skipif(
|
||||
P = ParamSpec("P")
|
||||
|
||||
|
||||
@contextmanager
|
||||
def with_dp_metadata(M: int, world_size: int):
|
||||
num_tokens_across_dp = torch.tensor([M] * world_size, device="cpu", dtype=torch.int)
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.parallel_config.data_parallel_size = world_size
|
||||
vllm_config.parallel_config.enable_expert_parallel = True
|
||||
|
||||
with set_forward_context(
|
||||
None,
|
||||
vllm_config,
|
||||
num_tokens=M,
|
||||
num_tokens_across_dp=num_tokens_across_dp,
|
||||
):
|
||||
yield
|
||||
|
||||
|
||||
def next_power_of_2(x):
|
||||
import math
|
||||
|
||||
@ -281,18 +304,21 @@ def deepep_deepgemm_moe_impl(
|
||||
quant_config=quant_config,
|
||||
)
|
||||
|
||||
out = mk.forward(
|
||||
hidden_states=test_tensors.rank_tokens,
|
||||
w1=w1,
|
||||
w2=w2,
|
||||
topk_weights=test_tensors.topk_weights,
|
||||
topk_ids=test_tensors.topk,
|
||||
inplace=False,
|
||||
activation="silu",
|
||||
global_num_experts=num_experts,
|
||||
expert_map=build_expert_map(),
|
||||
apply_router_weight_on_input=False,
|
||||
)
|
||||
with with_dp_metadata(
|
||||
M=test_tensors.rank_tokens.size(0), world_size=pgi.world_size
|
||||
):
|
||||
out = mk.forward(
|
||||
hidden_states=test_tensors.rank_tokens,
|
||||
w1=w1,
|
||||
w2=w2,
|
||||
topk_weights=test_tensors.topk_weights,
|
||||
topk_ids=test_tensors.topk,
|
||||
inplace=False,
|
||||
activation="silu",
|
||||
global_num_experts=num_experts,
|
||||
expert_map=build_expert_map(),
|
||||
apply_router_weight_on_input=False,
|
||||
)
|
||||
return out
|
||||
|
||||
|
||||
@ -413,19 +439,16 @@ NUM_EXPERTS = [32]
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@requires_deep_ep
|
||||
@requires_deep_gemm
|
||||
@pytest.mark.skipif(
|
||||
is_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM"
|
||||
)
|
||||
def test_ht_deepep_deepgemm_moe(
|
||||
mnk: tuple[int, int, int],
|
||||
num_experts: int,
|
||||
topk: int,
|
||||
world_dp_size: tuple[int, int],
|
||||
disable_deepgemm_ue8m0,
|
||||
):
|
||||
"""
|
||||
Tests for High-Throughput DeepEP + DeepGemm integration.
|
||||
"""
|
||||
import deep_gemm
|
||||
|
||||
m, n, k = mnk
|
||||
current_platform.seed_everything(7)
|
||||
@ -433,7 +456,7 @@ def test_ht_deepep_deepgemm_moe(
|
||||
if topk > num_experts:
|
||||
pytest.skip(f"Skipping test: topk={topk} > E={num_experts}")
|
||||
|
||||
block_m = deep_gemm.get_m_alignment_for_contiguous_layout()
|
||||
block_m = get_mk_alignment_for_contiguous_layout()[0]
|
||||
block_size = [block_m, block_m]
|
||||
|
||||
world_size, dp_size = world_dp_size
|
||||
@ -487,9 +510,6 @@ USE_FP8_DISPATCH = [False]
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@requires_deep_ep
|
||||
@requires_deep_gemm
|
||||
@pytest.mark.skipif(
|
||||
is_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM"
|
||||
)
|
||||
def test_ll_deepep_deepgemm_moe(
|
||||
mnk: tuple[int, int, int],
|
||||
num_experts: int,
|
||||
@ -497,10 +517,12 @@ def test_ll_deepep_deepgemm_moe(
|
||||
use_fp8_dispatch: bool,
|
||||
block_size: list[int],
|
||||
world_dp_size: tuple[int, int],
|
||||
disable_deepgemm_ue8m0,
|
||||
):
|
||||
"""
|
||||
Tests for Low-Latency DeepEP + DeepGemm integration.
|
||||
"""
|
||||
assert not is_deep_gemm_e8m0_used()
|
||||
|
||||
m, n, k = mnk
|
||||
current_platform.seed_everything(7)
|
||||
|
||||
@ -294,7 +294,7 @@ def torch_moe_impl(
|
||||
# blockwise quant and de-quant.
|
||||
assert not per_act_token_quant
|
||||
a = test_tensors.rank_tokens
|
||||
aq, aq_scale = per_token_group_quant_fp8(a, 128)
|
||||
aq, aq_scale = per_token_group_quant_fp8(a, 128, use_ue8m0=False)
|
||||
a = (
|
||||
(aq.view(-1, 128).to(torch.float32) * aq_scale.view(-1, 1))
|
||||
.view(a.shape)
|
||||
|
||||
@ -45,8 +45,6 @@ MNK_FACTORS = [
|
||||
]
|
||||
|
||||
vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
|
||||
def quant_fp8_per_tensor_batches(a):
|
||||
@ -79,10 +77,14 @@ class TestData:
|
||||
|
||||
@staticmethod
|
||||
def make_moe_tensors_8bit(
|
||||
m: int, k: int, n: int, e: int, reorder: bool
|
||||
m: int, k: int, n: int, e: int, reorder: bool, activation: str = "silu"
|
||||
) -> "TestData":
|
||||
is_gated = activation != "relu2_no_mul"
|
||||
|
||||
hidden_states = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
|
||||
w13 = torch.randn((e, 2 * n, k), device="cuda", dtype=torch.bfloat16)
|
||||
w13 = torch.randn(
|
||||
(e, (2 * n) if is_gated else n, k), device="cuda", dtype=torch.bfloat16
|
||||
)
|
||||
w2 = torch.randn((e, k, n), device="cuda", dtype=torch.bfloat16)
|
||||
|
||||
# Scale to fp8
|
||||
@ -192,18 +194,22 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
|
||||
@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("activation", ["silu", "relu2_no_mul"])
|
||||
def test_flashinfer_cutlass_moe_fp8_no_graph(
|
||||
m: int,
|
||||
n: int,
|
||||
k: int,
|
||||
e: int,
|
||||
topk: int,
|
||||
activation: str,
|
||||
monkeypatch,
|
||||
):
|
||||
current_platform.seed_everything(7)
|
||||
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
|
||||
with set_current_vllm_config(vllm_config):
|
||||
td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=False)
|
||||
td = TestData.make_moe_tensors_8bit(
|
||||
m, k, n, e, reorder=False, activation=activation
|
||||
)
|
||||
|
||||
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
@ -235,7 +241,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
|
||||
topk_weights=topk_weights,
|
||||
topk_ids=topk_ids,
|
||||
inplace=False,
|
||||
activation="silu",
|
||||
activation=activation,
|
||||
global_num_experts=e,
|
||||
expert_map=None,
|
||||
apply_router_weight_on_input=True,
|
||||
@ -255,7 +261,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
|
||||
td.layer,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
activation="silu",
|
||||
activation=activation,
|
||||
global_num_experts=e,
|
||||
expert_map=None,
|
||||
apply_router_weight_on_input=True,
|
||||
|
||||
@ -81,8 +81,6 @@ FUSED_MOE_WN16_MNK_FACTORS = [
|
||||
]
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
|
||||
def run_moe_test(
|
||||
|
||||
@ -192,8 +192,6 @@ def pplx_cutlass_moe(
|
||||
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
|
||||
def _pplx_moe(
|
||||
|
||||
@ -81,8 +81,6 @@ TOP_KS = [1, 2, 6]
|
||||
DTYPES = [torch.float8_e4m3fn, torch.bfloat16]
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
|
||||
def torch_prepare(
|
||||
|
||||
@ -1,6 +1,9 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
@ -8,27 +11,30 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
|
||||
persistent_masked_m_silu_mul_quant,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.math_utils import cdiv
|
||||
from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm
|
||||
from vllm.utils.math_utils import cdiv, round_up
|
||||
|
||||
fp8_dtype = torch.float8_e4m3fn
|
||||
|
||||
CASES = [
|
||||
(1, 1, 128, fp8_dtype),
|
||||
(1, 4, 128, fp8_dtype),
|
||||
(2, 4, 256, fp8_dtype),
|
||||
(32, 64, 256, fp8_dtype),
|
||||
(17, 31, 768, fp8_dtype),
|
||||
(1, 1, 128 * 1, fp8_dtype),
|
||||
(1, 1, 128 * 3, fp8_dtype),
|
||||
(1, 1, 128 * 4, fp8_dtype),
|
||||
(8, 16, 128 * 1, fp8_dtype),
|
||||
(8, 16, 128 * 2, fp8_dtype),
|
||||
(8, 16, 128 * 3, fp8_dtype),
|
||||
(1, 4, 128 * 1, fp8_dtype),
|
||||
(2, 4, 128 * 2, fp8_dtype),
|
||||
(1, 4, 128 * 3, fp8_dtype),
|
||||
(8, 16, 128 * 4, fp8_dtype),
|
||||
(8, 16, 128 * 5, fp8_dtype),
|
||||
(8, 16, 128 * 6, fp8_dtype),
|
||||
(8, 16, 128 * 7, fp8_dtype),
|
||||
(8, 16, 128 * 8, fp8_dtype),
|
||||
(8, 16, 128 * 9, fp8_dtype),
|
||||
(8, 64, 7168, fp8_dtype),
|
||||
(8, 128, 128 * 33, fp8_dtype),
|
||||
(1, 4, 128 * 10, fp8_dtype),
|
||||
(8, 128, 7168, fp8_dtype),
|
||||
(8, 512, 7168, fp8_dtype),
|
||||
(8, 1024, 7168, fp8_dtype),
|
||||
(17, 31, 768, fp8_dtype),
|
||||
(32, 64, 256, fp8_dtype),
|
||||
(256, 8, 7168, fp8_dtype),
|
||||
(256, 32, 7168, fp8_dtype),
|
||||
(256, 64, 7168, fp8_dtype),
|
||||
@ -38,14 +44,159 @@ CASES = [
|
||||
]
|
||||
|
||||
|
||||
def as_uint8(x) -> torch.Tensor:
|
||||
return (
|
||||
torch.empty(x.shape, dtype=x.dtype, device=x.device).copy_(x).view(torch.uint8)
|
||||
)
|
||||
|
||||
|
||||
def silu(x: torch.Tensor) -> torch.Tensor:
|
||||
one_f32 = torch.tensor([1.0], device=x.device, dtype=torch.float32)
|
||||
x_f32 = x.to(torch.float32)
|
||||
act_f32 = x_f32 / (one_f32 + torch.exp(-x_f32))
|
||||
assert act_f32.dtype == torch.float32
|
||||
return act_f32.to(torch.bfloat16)
|
||||
|
||||
|
||||
def do_quant(x: torch.Tensor, group_size: int, ceil_ue8m0: bool):
|
||||
eps_bf16 = torch.tensor([1e-10], device=x.device, dtype=torch.bfloat16)
|
||||
one_bf16 = torch.tensor([1.0], device=x.device, dtype=torch.bfloat16)
|
||||
fp8_max_bf16 = torch.tensor(
|
||||
[torch.finfo(fp8_dtype).max], device=x.device, dtype=torch.bfloat16
|
||||
)
|
||||
fp8_min_bf16 = torch.tensor(
|
||||
[torch.finfo(fp8_dtype).min], device=x.device, dtype=torch.bfloat16
|
||||
)
|
||||
fp8_max_inv = one_bf16 / fp8_max_bf16
|
||||
assert fp8_max_inv.dtype == torch.bfloat16
|
||||
|
||||
assert x.size(-1) % group_size == 0
|
||||
num_groups = x.numel() // group_size
|
||||
x_og_shape = x.shape
|
||||
|
||||
x = x.to(torch.bfloat16)
|
||||
x = x.view((-1, group_size))
|
||||
amax = x.abs().amax(dim=1).clamp(min=eps_bf16)
|
||||
assert amax.dtype == torch.bfloat16
|
||||
s = amax * fp8_max_inv
|
||||
|
||||
if ceil_ue8m0:
|
||||
s = torch.exp2(
|
||||
torch.ceil(torch.log2(s).to(torch.bfloat16)).to(torch.bfloat16)
|
||||
).to(torch.bfloat16)
|
||||
|
||||
inv_s = one_bf16 / s
|
||||
inv_s = inv_s.view((num_groups, 1))
|
||||
xq = torch.clamp(x * inv_s, min=fp8_min_bf16.item(), max=fp8_max_bf16.item()).to(
|
||||
fp8_dtype
|
||||
)
|
||||
|
||||
xq = xq.view(x_og_shape)
|
||||
xs = s.view((-1, xq.size(-1) // group_size))
|
||||
return xq, xs
|
||||
|
||||
|
||||
def silu_mul_quant(
|
||||
gate: torch.Tensor, up: torch.Tensor, group_size: int, ceil_ue8m0: bool
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
assert gate.size(-1) % group_size == 0
|
||||
assert up.size(-1) % group_size == 0
|
||||
|
||||
assert gate.dtype == torch.bfloat16
|
||||
assert up.dtype == torch.bfloat16
|
||||
|
||||
act_bf16 = silu(gate)
|
||||
assert act_bf16.dtype == torch.bfloat16
|
||||
|
||||
# act & mul
|
||||
a_m = act_bf16 * up
|
||||
assert a_m.dtype == torch.bfloat16
|
||||
|
||||
q, s = do_quant(a_m, group_size, ceil_ue8m0)
|
||||
return q, s
|
||||
|
||||
|
||||
def pack_scales(x: torch.Tensor, tokens_per_expert: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
pack float32 scales into a int32 tensor
|
||||
"""
|
||||
assert x.dtype == torch.float32
|
||||
E, T, G = x.size()
|
||||
|
||||
# Add i32_padding here so we can view it as a i32 tensor later on.
|
||||
i32_padding = round_up(G, 4) - G
|
||||
ref_s_i8 = torch.empty((E, T, G + i32_padding), dtype=torch.uint8, device="cuda")
|
||||
for e in range(E):
|
||||
nt = tokens_per_expert[e].item()
|
||||
ref_s_i8[e, :nt, :G] = x[e, :nt].view(torch.int32) >> 23
|
||||
|
||||
ref_s_i32 = ref_s_i8.view(torch.int32)
|
||||
|
||||
return ref_s_i32
|
||||
|
||||
|
||||
def ref_with_scale_fmt(
|
||||
E: int,
|
||||
T: int,
|
||||
H: int,
|
||||
group_size: int,
|
||||
tokens_per_expert: torch.Tensor,
|
||||
gate: torch.Tensor,
|
||||
up: torch.Tensor,
|
||||
scale_fmt: DeepGemmQuantScaleFMT,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
"""
|
||||
The precision types of the operations triggered by this function
|
||||
match closely with the kernel implementation so we compare more
|
||||
accurately.
|
||||
"""
|
||||
scale_dtype = (
|
||||
torch.int32 if scale_fmt == DeepGemmQuantScaleFMT.UE8M0 else torch.float32
|
||||
)
|
||||
ceil_ue8m0 = scale_fmt in [
|
||||
DeepGemmQuantScaleFMT.UE8M0,
|
||||
DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
|
||||
]
|
||||
|
||||
ref_q = torch.empty((E, T, H), dtype=fp8_dtype, device="cuda")
|
||||
ref_s_f32 = torch.empty(
|
||||
(E, T, cdiv(H, group_size)), dtype=torch.float32, device="cuda"
|
||||
)
|
||||
|
||||
for e in range(E):
|
||||
nt = tokens_per_expert[e].item()
|
||||
if nt == 0:
|
||||
continue
|
||||
ref_q[e, :nt], ref_s_f32[e, :nt] = silu_mul_quant(
|
||||
gate[e, :nt], up[e, :nt], group_size, ceil_ue8m0=ceil_ue8m0
|
||||
)
|
||||
|
||||
if scale_dtype == torch.float32:
|
||||
return ref_q, ref_s_f32
|
||||
|
||||
assert scale_dtype == torch.int32
|
||||
return ref_q, pack_scales(ref_s_f32, tokens_per_expert)
|
||||
|
||||
|
||||
def token_random(E, T, H2, tokens_per_expert):
|
||||
"""
|
||||
Initialize each token in a random range so we test a range of
|
||||
scale values.
|
||||
"""
|
||||
y = torch.empty((E, T, H2), dtype=torch.bfloat16, device="cuda")
|
||||
for e in range(E):
|
||||
for t in range(tokens_per_expert[e].item()):
|
||||
exp = random.choice(range(1, 20))
|
||||
y[e, t].uniform_(-(2**exp), 2**exp)
|
||||
return y
|
||||
|
||||
|
||||
@pytest.mark.parametrize("E,T,H,fp8_type", CASES)
|
||||
@torch.inference_mode()
|
||||
def test_silu_mul_fp8_quant_deep_gemm(E, T, H, fp8_type):
|
||||
def test_silu_mul_fp8_quant_deep_gemm(E: int, T: int, H: int, fp8_type: torch.dtype):
|
||||
group_size = 128
|
||||
current_platform.seed_everything(42)
|
||||
|
||||
# Input tensor of shape (E, T, 2*H)
|
||||
y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
|
||||
tokens_per_expert = torch.randint(
|
||||
low=0,
|
||||
high=T,
|
||||
@ -54,71 +205,83 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, fp8_type):
|
||||
device="cuda",
|
||||
)
|
||||
|
||||
# Input tensor of shape (E, T, 2*H)
|
||||
y = token_random(E, T, 2 * H, tokens_per_expert)
|
||||
|
||||
gate = y[..., :H].to(torch.bfloat16)
|
||||
up = y[..., H:].to(torch.bfloat16)
|
||||
|
||||
scale_fmts = [
|
||||
DeepGemmQuantScaleFMT.FLOAT32,
|
||||
DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
|
||||
DeepGemmQuantScaleFMT.UE8M0,
|
||||
]
|
||||
|
||||
# Run the SiLU V2 kernel
|
||||
# TODO (varun): use_e8m0 is set to false as the reference impl does
|
||||
# not handle that case.
|
||||
y_q, y_s = persistent_masked_m_silu_mul_quant(
|
||||
y, tokens_per_expert, group_size=group_size, use_ue8m0=False
|
||||
)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
fp8_info = torch.finfo(fp8_dtype)
|
||||
fp8_max = fp8_info.max
|
||||
fp8_min = fp8_info.min
|
||||
eps = 1e-10
|
||||
|
||||
y1 = y[..., :H].float()
|
||||
y2 = y[..., H:]
|
||||
silu_x = y1 * torch.sigmoid(y1)
|
||||
merged = silu_x * y2
|
||||
|
||||
for e in range(E):
|
||||
nt = tokens_per_expert[e].item()
|
||||
ref_s = torch.empty(
|
||||
(T, cdiv(H, group_size)), dtype=torch.float32, device="cuda"
|
||||
)
|
||||
ref_q = torch.empty((T, H), dtype=fp8_dtype, device="cuda")
|
||||
|
||||
for t in range(nt):
|
||||
data = merged[e, t].float()
|
||||
ref_q_row = torch.empty_like(data)
|
||||
|
||||
# process full groups
|
||||
n_full_groups = H // group_size
|
||||
if n_full_groups > 0:
|
||||
data_grp = data[: n_full_groups * group_size].view(
|
||||
n_full_groups, group_size
|
||||
)
|
||||
amax = data_grp.abs().amax(dim=1).clamp(min=eps)
|
||||
scale = amax / fp8_max
|
||||
scaled = data[: n_full_groups * group_size] / scale.repeat_interleave(
|
||||
group_size
|
||||
)
|
||||
ref_q_row[: n_full_groups * group_size] = scaled.clamp(
|
||||
fp8_min, fp8_max
|
||||
).to(fp8_dtype)
|
||||
ref_s[t, :n_full_groups] = scale
|
||||
|
||||
# process remainder group
|
||||
rem = H % group_size
|
||||
if rem > 0:
|
||||
data_rem = data[-rem:]
|
||||
amax = data_rem.abs().amax().clamp(min=eps)
|
||||
scale = amax / fp8_max
|
||||
scaled = data_rem / scale
|
||||
ref_q_row[-rem:] = scaled.clamp(fp8_min, fp8_max).to(fp8_dtype)
|
||||
ref_s[t, -1] = scale
|
||||
|
||||
ref_q[t] = ref_q_row
|
||||
|
||||
y_se = y_s[e].float()
|
||||
y_qe = y_q[e].float()
|
||||
|
||||
torch.testing.assert_close(
|
||||
y_qe[:nt].to(torch.float32),
|
||||
ref_q[:nt].to(torch.float32),
|
||||
atol=2,
|
||||
rtol=2e-1,
|
||||
for scale_fmt in scale_fmts:
|
||||
y_q, y_s = persistent_masked_m_silu_mul_quant(
|
||||
y,
|
||||
tokens_per_expert,
|
||||
group_size=group_size,
|
||||
quant_scale_fmt=scale_fmt,
|
||||
)
|
||||
|
||||
torch.testing.assert_close(y_se[:nt], ref_s[:nt], atol=1e-4, rtol=1e-2)
|
||||
ref_y_q, ref_y_s = ref_with_scale_fmt(
|
||||
E, T, H, group_size, tokens_per_expert, gate, up, scale_fmt=scale_fmt
|
||||
)
|
||||
|
||||
# deepgemm scales transform
|
||||
dg_scales = None
|
||||
if (
|
||||
has_deep_gemm()
|
||||
and current_platform.has_device_capability(100)
|
||||
and scale_fmt == DeepGemmQuantScaleFMT.UE8M0
|
||||
):
|
||||
from deep_gemm import transform_sf_into_required_layout
|
||||
|
||||
_q, _s = ref_with_scale_fmt(
|
||||
E,
|
||||
T,
|
||||
H,
|
||||
group_size,
|
||||
tokens_per_expert,
|
||||
gate,
|
||||
up,
|
||||
scale_fmt=DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
|
||||
)
|
||||
dg_scales = transform_sf_into_required_layout(
|
||||
sf=_s,
|
||||
mn=_q.size(1),
|
||||
k=_q.size(2),
|
||||
recipe=(1, 128, 128),
|
||||
num_groups=_q.size(0),
|
||||
is_sfa=True,
|
||||
)
|
||||
|
||||
expected_scale_dtype = (
|
||||
torch.int32 if scale_fmt == DeepGemmQuantScaleFMT.UE8M0 else torch.float32
|
||||
)
|
||||
assert y_s.dtype == expected_scale_dtype
|
||||
assert ref_y_s.dtype == expected_scale_dtype
|
||||
|
||||
for e in range(E):
|
||||
nt = tokens_per_expert[e].item()
|
||||
|
||||
torch.testing.assert_close(
|
||||
y_q[e, :nt].to(torch.float32),
|
||||
ref_y_q[e, :nt].to(torch.float32),
|
||||
)
|
||||
|
||||
if scale_fmt == DeepGemmQuantScaleFMT.UE8M0:
|
||||
G = H // group_size
|
||||
y_s_sliced = as_uint8(y_s[e])
|
||||
ref_s_sliced = as_uint8(ref_y_s[e])
|
||||
torch.testing.assert_close(y_s_sliced[:nt, :G], ref_s_sliced[:nt, :G])
|
||||
if dg_scales is not None:
|
||||
dg_sliced = as_uint8(dg_scales[e])
|
||||
torch.testing.assert_close(y_s_sliced[:nt, :G], dg_sliced[:nt, :G])
|
||||
else:
|
||||
torch.testing.assert_close(
|
||||
y_s[e, :nt],
|
||||
ref_y_s[e, :nt],
|
||||
)
|
||||
|
||||
@ -18,8 +18,6 @@ if current_platform.get_device_capability() < (9, 0):
|
||||
pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
|
||||
def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):
|
||||
|
||||
@ -29,8 +29,6 @@ if current_platform.get_device_capability() < (9, 0):
|
||||
pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
# Test configurations
|
||||
DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user