Merge branch 'main' into imarkov/eplb_optimizations

This commit is contained in:
ilmarkov 2025-12-09 21:27:37 +00:00
commit 60f744d7ce
73 changed files with 1160 additions and 443 deletions

View File

@ -398,7 +398,8 @@ steps:
timeout_in_minutes: 25 timeout_in_minutes: 25
gpu: h100 gpu: h100
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/v1/attention
- vllm/model_executor/layers
- tests/v1/determinism/ - tests/v1/determinism/
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
@ -440,23 +441,29 @@ steps:
working_dir: "/vllm-workspace/examples" working_dir: "/vllm-workspace/examples"
source_file_dependencies: source_file_dependencies:
- vllm/entrypoints - vllm/entrypoints
- vllm/multimodal
- examples/ - examples/
commands: commands:
- pip install tensorizer # for tensorizer test - pip install tensorizer # for tensorizer test
# for basic
- python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/chat.py - python3 offline_inference/basic/classify.py
- python3 offline_inference/prefix_caching.py - python3 offline_inference/basic/embed.py
- python3 offline_inference/llm_engine_example.py - python3 offline_inference/basic/score.py
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0 - python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py # for pooling models
- python3 offline_inference/basic/embed.py - python3 pooling/pooling/vision_language_pooling.py --seed 0
- python3 offline_inference/basic/score.py # for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@ -718,6 +725,18 @@ steps:
- uv pip install --system conch-triton-kernels - uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
autorun_on_main: true
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
- label: OpenAI API correctness # 10min - label: OpenAI API correctness # 10min
timeout_in_minutes: 15 timeout_in_minutes: 15
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
@ -727,7 +746,7 @@ steps:
- csrc/ - csrc/
- vllm/entrypoints/openai/ - vllm/entrypoints/openai/
- vllm/model_executor/models/whisper.py - vllm/model_executor/models/whisper.py
commands: # LMEval commands: # LMEval+Transcription WER check
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
- pytest -s entrypoints/openai/correctness/ - pytest -s entrypoints/openai/correctness/
@ -963,6 +982,19 @@ steps:
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
- vllm/inputs/
- vllm/v1/core/
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models Test (Extended) 1 # 60min - label: Multi-Modal Models Test (Extended) 1 # 60min
timeout_in_minutes: 120 timeout_in_minutes: 120
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
@ -1098,7 +1130,6 @@ steps:
- vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py - vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py - vllm/model_executor/layers/quantization/input_quant_fp8.py
- vllm/model_executor/layers/fused_moe/layer.py
- tests/compile/test_fusion_attn.py - tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py - tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py - tests/compile/distributed/test_fusion_all_reduce.py
@ -1132,12 +1163,25 @@ steps:
- vllm/model_executor/layers/activation.py - vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py - vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py - tests/compile/distributed/test_fusions_e2e.py
- tests/compile/fullgraph/test_full_graph.py
commands: commands:
- nvidia-smi - nvidia-smi
# Run all e2e fusion tests # Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Blackwell Quantized MoE Test - label: Blackwell Quantized MoE Test
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
@ -1155,6 +1199,16 @@ steps:
commands: commands:
- pytest -s -v tests/quantization/test_blackwell_moe.py - pytest -s -v tests/quantization/test_blackwell_moe.py
- label: Blackwell LM Eval Small Models
timeout_in_minutes: 120
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
##### 1 GPU test ##### ##### 1 GPU test #####
##### multi gpus test ##### ##### multi gpus test #####
@ -1397,6 +1451,39 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py - pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional
gpu: a100
optional: true
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
##### H100 test #####
- label: LM Eval Large Models (H100) # optional
gpu: h100
optional: true
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
##### H200 test ##### ##### H200 test #####
- label: Distributed Tests (H200) # optional - label: Distributed Tests (H200) # optional
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
@ -1440,29 +1527,6 @@ steps:
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
- label: Blackwell LM Eval Small Models
timeout_in_minutes: 120
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
timeout_in_minutes: 70
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
- vllm/inputs/
- vllm/v1/core/
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: LM Eval Large Models (4 Card) - label: LM Eval Large Models (4 Card)
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4 agent_pool: mi325_4
@ -1478,21 +1542,6 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: LM Eval Large Models (H100) # optional
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
- label: ROCm LM Eval Large Models (8 Card) - label: ROCm LM Eval Large Models (8 Card)
mirror_hardwares: [amdproduction] mirror_hardwares: [amdproduction]
agent_pool: mi325_8 agent_pool: mi325_8
@ -1517,6 +1566,20 @@ steps:
- uv pip install --system 'gpt-oss[eval]==0.0.5' - uv pip install --system 'gpt-oss[eval]==0.0.5'
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
mirror_hardwares: [amdexperimental]
agent_pool: mi325_2
# grade: Blocking
timeout_in_minutes: 30
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Accuracy - label: DeepSeek V2-Lite Accuracy
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4 agent_pool: mi325_4
@ -1550,17 +1613,26 @@ steps:
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
##### RL Integration Tests ##### - label: DeepSeek V2-Lite Async EPLB Accuracy
- label: Prime-RL Integration Test # 15min timeout_in_minutes: 60
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
agent_pool: mi325_2 agent_pool: mi325_4
# grade: Blocking # grade: Blocking
timeout_in_minutes: 30 gpu: h100
optional: true optional: true
num_gpus: 2 num_gpus: 4
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands: commands:
- bash .buildkite/scripts/run-prime-rl-test.sh - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040

View File

@ -468,7 +468,9 @@ steps:
# tests covered elsewhere. # tests covered elsewhere.
# Use `find` to launch multiple instances of pytest so that # Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965 # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" # However, find does not normally propagate error codes, so we combine it with xargs
# (using -0 for proper path handling)
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Smoke Test # 15min - label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30 timeout_in_minutes: 30
@ -482,7 +484,9 @@ steps:
# as it is a heavy test that is covered in other steps. # as it is a heavy test that is covered in other steps.
# Use `find` to launch multiple instances of pytest so that # Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965 # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" # However, find does not normally propagate error codes, so we combine it with xargs
# (using -0 for proper path handling)
- "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Test # 27min - label: PyTorch Fullgraph Test # 27min
timeout_in_minutes: 40 timeout_in_minutes: 40

View File

@ -13,7 +13,7 @@ jobs:
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
- name: Set up Python - name: Set up Python
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0

View File

@ -12,7 +12,7 @@ jobs:
timeout-minutes: 30 timeout-minutes: 30
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v6.0.1
- uses: astral-sh/setup-uv@v7 - uses: astral-sh/setup-uv@v7
with: with:

View File

@ -16,7 +16,7 @@ jobs:
pre-commit: pre-commit:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
- uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with: with:
python-version: "3.12" python-version: "3.12"

View File

@ -15,7 +15,7 @@ jobs:
actions: write actions: write
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0 - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
with: with:
# Increasing this value ensures that changes to this workflow # Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months # propagate to all issues and PRs in days rather than months

View File

@ -96,8 +96,9 @@ start_server() {
# This correctly passes each element as a separate argument. # This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled # Start server with profiling enabled
VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \ local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & VLLM_SERVER_DEV_MODE=1 \
vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else else
# Start server without profiling # Start server without profiling
VLLM_SERVER_DEV_MODE=1 \ VLLM_SERVER_DEV_MODE=1 \

View File

@ -963,8 +963,7 @@ def create_argument_parser():
parser.add_argument( parser.add_argument(
"--profile", "--profile",
action="store_true", action="store_true",
help="Use Torch Profiler. The endpoint must be launched with " help="Use vLLM Profiling. --profiler-config must be provided on the server.",
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
) )
parser.add_argument( parser.add_argument(
"--result-dir", "--result-dir",

View File

@ -15,6 +15,7 @@ API documentation for vLLM's configuration classes.
- [vllm.config.MultiModalConfig][] - [vllm.config.MultiModalConfig][]
- [vllm.config.PoolerConfig][] - [vllm.config.PoolerConfig][]
- [vllm.config.StructuredOutputsConfig][] - [vllm.config.StructuredOutputsConfig][]
- [vllm.config.ProfilerConfig][]
- [vllm.config.ObservabilityConfig][] - [vllm.config.ObservabilityConfig][]
- [vllm.config.KVTransferConfig][] - [vllm.config.KVTransferConfig][]
- [vllm.config.CompilationConfig][] - [vllm.config.CompilationConfig][]

View File

@ -5,16 +5,15 @@
## Profile with PyTorch Profiler ## Profile with PyTorch Profiler
We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`. Additionally, you can control the profiling content by specifying the following environment variables: We support tracing vLLM workers using the `torch.profiler` module. You can enable the torch profiler by setting `--profiler-config`
when launching the server, and setting the entries `profiler` to `'torch'` and `torch_profiler_dir` to the directory where you want to save the traces. Additionally, you can control the profiling content by specifying the following additional arguments in the config:
- `VLLM_TORCH_PROFILER_RECORD_SHAPES=1` to enable recording Tensor Shapes, off by default - `torch_profiler_record_shapes` to enable recording Tensor Shapes, off by default
- `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default - `torch_profiler_with_memory` to record memory, off by default
- `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default - `torch_profiler_with_stack` to enable recording stack information, on by default
- `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default - `torch_profiler_with_flops` to enable recording FLOPs, off by default
- `VLLM_TORCH_PROFILER_USE_GZIP=0` to disable gzip-compressing profiling files, on by default - `torch_profiler_use_gzip` to control gzip-compressing profiling files, on by default
- `VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0` to disable dumping and printing the aggregated CUDA self time table, on by default - `torch_profiler_dump_cuda_time_total` to control dumping and printing the aggregated CUDA self time table, on by default
The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag. When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag.
@ -40,8 +39,7 @@ Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline
#### OpenAI Server #### OpenAI Server
```bash ```bash
VLLM_TORCH_PROFILER_DIR=./vllm_profile \ vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}'
vllm serve meta-llama/Llama-3.1-8B-Instruct
``` ```
vllm bench command: vllm bench command:
@ -104,13 +102,12 @@ To profile the server, you will want to prepend your `vllm serve` command with `
```bash ```bash
# server # server
VLLM_TORCH_CUDA_PROFILE=1 \
nsys profile \ nsys profile \
--trace-fork-before-exec=true \ --trace-fork-before-exec=true \
--cuda-graph-trace=node \ --cuda-graph-trace=node \
--capture-range=cudaProfilerApi \ --capture-range=cudaProfilerApi \
--capture-range-end repeat \ --capture-range-end repeat \
vllm serve meta-llama/Llama-3.1-8B-Instruct vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config.profiler cuda
# client # client
vllm bench serve \ vllm bench serve \

View File

@ -299,6 +299,9 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner
def is_reasoning_end(self, input_ids: list[int]) -> bool: def is_reasoning_end(self, input_ids: list[int]) -> bool:
return self.end_token_id in input_ids return self.end_token_id in input_ids
def is_reasoning_end_streaming(self, input_ids: list[int], delta_ids: list[int]) -> bool:
return self.end_token_id in delta_token_ids
... ...
``` ```

View File

@ -1,14 +1,10 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import time import time
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
# enable torch profiler, can also be set on cmd line
os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"
# Sample prompts. # Sample prompts.
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
@ -22,7 +18,14 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
def main(): def main():
# Create an LLM. # Create an LLM.
llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1) llm = LLM(
model="facebook/opt-125m",
tensor_parallel_size=1,
profiler_config={
"profiler": "torch",
"torch_profiler_dir": "./vllm_profile",
},
)
llm.start_profile() llm.start_profile()

View File

@ -17,7 +17,6 @@ def test_compile():
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked @pytest.mark.forked
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda") @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
@pytest.mark.xfail
def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch): def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch):
"""Test that Qwen2.5-VL vision submodules are compiled. """Test that Qwen2.5-VL vision submodules are compiled.

View File

@ -80,6 +80,8 @@ def test_compile_ranges(use_fresh_inductor_cache):
vllm_config = VllmConfig( vllm_config = VllmConfig(
scheduler_config=SchedulerConfig( scheduler_config=SchedulerConfig(
max_num_batched_tokens=8192, max_num_batched_tokens=8192,
max_model_len=8192,
is_encoder_decoder=False,
), ),
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE, mode=CompilationMode.VLLM_COMPILE,
@ -112,6 +114,8 @@ def test_compile_config_get_compile_ranges():
VllmConfig( VllmConfig(
scheduler_config=SchedulerConfig( scheduler_config=SchedulerConfig(
max_num_batched_tokens=8192, max_num_batched_tokens=8192,
max_model_len=8192,
is_encoder_decoder=False,
), ),
compilation_config=compilation_config, compilation_config=compilation_config,
) )
@ -134,6 +138,8 @@ def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache):
) )
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
max_num_batched_tokens=8192, max_num_batched_tokens=8192,
max_model_len=8192,
is_encoder_decoder=False,
) )
torch.set_default_device("cuda") torch.set_default_device("cuda")

View File

@ -5,9 +5,14 @@ import copy
import pytest import pytest
import torch import torch
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass from vllm.compilation.inductor_pass import (
CallableInductorPass,
InductorPass,
pass_context,
)
from vllm.compilation.pass_manager import PostGradPassManager from vllm.compilation.pass_manager import PostGradPassManager
from vllm.config import ModelConfig, VllmConfig from vllm.config import ModelConfig, VllmConfig
from vllm.config.utils import Range
# dummy custom pass that doesn't inherit # dummy custom pass that doesn't inherit
@ -42,35 +47,37 @@ class ProperPass(InductorPass):
], ],
) )
def test_pass_manager_uuid(callable): def test_pass_manager_uuid(callable):
# Some passes need dtype to be set # Set the pass context as PassManager uuid uses it
config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16)) with pass_context(Range(start=1, end=8)):
# Some passes need dtype to be set
config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
pass_manager = PostGradPassManager() pass_manager = PostGradPassManager()
pass_manager.configure(config) pass_manager.configure(config)
# Check that UUID is different if the same pass is added 2x # Check that UUID is different if the same pass is added 2x
pass_manager.add(callable) pass_manager.add(callable)
uuid1 = pass_manager.uuid() uuid1 = pass_manager.uuid()
pass_manager.add(callable) pass_manager.add(callable)
uuid2 = pass_manager.uuid() uuid2 = pass_manager.uuid()
assert uuid1 != uuid2 assert uuid1 != uuid2
# UUID should be the same as the original one, # UUID should be the same as the original one,
# as we constructed in the same way. # as we constructed in the same way.
pass_manager2 = PostGradPassManager() pass_manager2 = PostGradPassManager()
pass_manager2.configure(config) pass_manager2.configure(config)
pass_manager2.add(callable) pass_manager2.add(callable)
assert uuid1 == pass_manager2.uuid() assert uuid1 == pass_manager2.uuid()
# UUID should be different due to config change # UUID should be different due to config change
config2 = copy.deepcopy(config) config2 = copy.deepcopy(config)
config2.compilation_config.pass_config.fuse_norm_quant = ( config2.compilation_config.pass_config.fuse_norm_quant = (
not config2.compilation_config.pass_config.fuse_norm_quant not config2.compilation_config.pass_config.fuse_norm_quant
) )
config2.compilation_config.pass_config.fuse_act_quant = ( config2.compilation_config.pass_config.fuse_act_quant = (
not config2.compilation_config.pass_config.fuse_act_quant not config2.compilation_config.pass_config.fuse_act_quant
) )
pass_manager3 = PostGradPassManager() pass_manager3 = PostGradPassManager()
pass_manager3.configure(config2) pass_manager3.configure(config2)
pass_manager3.add(callable) pass_manager3.add(callable)
assert uuid1 != pass_manager3.uuid() assert uuid1 != pass_manager3.uuid()

View File

@ -26,7 +26,14 @@ def clear_cache():
_cached_get_attn_backend.cache_clear() _cached_get_attn_backend.cache_clear()
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"]) devices = ["cpu"]
if current_platform.is_cuda():
devices.append("cuda")
if current_platform.is_rocm():
devices.append("hip")
@pytest.mark.parametrize("device", devices)
def test_mha_attn_platform(device: str): def test_mha_attn_platform(device: str):
""" """
Test the attention selector between different platform and device. Test the attention selector between different platform and device.
@ -46,7 +53,7 @@ def test_mha_attn_platform(device: str):
patch("vllm.model_executor.models.vision.current_platform", RocmPlatform()), patch("vllm.model_executor.models.vision.current_platform", RocmPlatform()),
): ):
attn = MultiHeadAttention(16, 64, scale=1) attn = MultiHeadAttention(16, 64, scale=1)
assert attn.attn_backend == AttentionBackendEnum.TORCH_SDPA assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN
else: else:
# Test CUDA with head_size=64 (divisible by 32) # Test CUDA with head_size=64 (divisible by 32)
# - should use vLLM's FlashAttention # - should use vLLM's FlashAttention

View File

@ -103,7 +103,7 @@ def ref_dynamic_per_tensor_fp8_quant(
.clamp(fp8_traits_min, fp8_traits_max) .clamp(fp8_traits_min, fp8_traits_max)
.to(FP8_DTYPE) .to(FP8_DTYPE)
) )
return ref_out, ref_scale.view((1, 1)) return ref_out, ref_scale.view(1)
def native_w8a8_block_matmul( def native_w8a8_block_matmul(

View File

@ -132,6 +132,41 @@ class TestBaseThinkingReasoningParserMethods:
is False is False
) )
def test_is_reasoning_end_streaming(self, test_tokenizer):
"""Test the is_reasoning_end_streaming method."""
parser = TestThinkingReasoningParser(test_tokenizer)
end_token_id = parser.end_token_id
start_token_id = parser.start_token_id
assert (
parser.is_reasoning_end_streaming([1, 2, end_token_id], [end_token_id])
is True
)
assert parser.is_reasoning_end_streaming([1, 2, 3, 4], [4]) is False
assert parser.is_reasoning_end_streaming([], []) is False
assert (
parser.is_reasoning_end_streaming(
[1, start_token_id, 2, end_token_id], [end_token_id]
)
is True
)
assert (
parser.is_reasoning_end_streaming([1, start_token_id, 2, 3], [3]) is False
)
assert (
parser.is_reasoning_end_streaming(
[1, start_token_id, 2, end_token_id, 2, start_token_id, 2],
[2],
)
is False
)
assert (
parser.is_reasoning_end_streaming(
[1, start_token_id, 2, end_token_id, 2, 2], [2]
)
is False
)
def test_extract_content_ids(self, test_tokenizer): def test_extract_content_ids(self, test_tokenizer):
"""Test the extract_content_ids method.""" """Test the extract_content_ids method."""
parser = TestThinkingReasoningParser(test_tokenizer) parser = TestThinkingReasoningParser(test_tokenizer)

View File

@ -40,6 +40,7 @@ def test_identity_reasoning_parser_basic(tokenizer):
input_tokens = tokenizer.tokenize(input_text) input_tokens = tokenizer.tokenize(input_text)
input_ids = tokenizer.convert_tokens_to_ids(input_tokens) input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
assert parser.is_reasoning_end(input_ids) is True assert parser.is_reasoning_end(input_ids) is True
assert parser.is_reasoning_end_streaming(input_ids, input_ids) is True
# Test extract_content_ids returns all input_ids # Test extract_content_ids returns all input_ids
assert parser.extract_content_ids(input_ids) == input_ids assert parser.extract_content_ids(input_ids) == input_ids

View File

@ -615,6 +615,7 @@ def test_extract_tool_calls_streaming(
"single_tool_weather", "single_tool_weather",
"multiple_tool_calls", "multiple_tool_calls",
"content_before_tool", "content_before_tool",
"complex",
], ],
argnames=["model_output", "expected_tool_calls", "expected_content"], argnames=["model_output", "expected_tool_calls", "expected_content"],
argvalues=[ argvalues=[
@ -673,6 +674,21 @@ def test_extract_tool_calls_streaming(
], ],
"bla", "bla",
), ),
(
# Complex
"""[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501
[
ToolCall(
function=FunctionCall(
name="bash",
arguments=json.dumps(
{"command": "print(\"hello world!\")\nre.compile(r'{}')"}
),
)
)
],
"",
),
], ],
) )
def test_extract_tool_calls_streaming_one_chunk( def test_extract_tool_calls_streaming_one_chunk(

View File

@ -161,10 +161,10 @@ class TestCudagraphDispatcher:
assert rt_mode == CUDAGraphMode.NONE assert rt_mode == CUDAGraphMode.NONE
assert key == BatchDescriptor(num_tokens=15) assert key == BatchDescriptor(num_tokens=15)
# 4. Cascade attention should have a fall back mode # 4. disable_full should have a fall back mode (e.g., cascade attention)
desc_full_exact = BatchDescriptor(num_tokens=8, uniform=False) desc_full_exact = BatchDescriptor(num_tokens=8, uniform=False)
rt_mode, key = dispatcher.dispatch( rt_mode, key = dispatcher.dispatch(
num_tokens=8, uniform_decode=False, has_lora=False, use_cascade_attn=True num_tokens=8, uniform_decode=False, has_lora=False, disable_full=True
) )
if "PIECEWISE" in cudagraph_mode_str: # string contains check if "PIECEWISE" in cudagraph_mode_str: # string contains check
assert rt_mode == CUDAGraphMode.PIECEWISE assert rt_mode == CUDAGraphMode.PIECEWISE

View File

@ -10,6 +10,7 @@ from utils import (
BACKENDS, BACKENDS,
_extract_step_logprobs, _extract_step_logprobs,
_random_prompt, _random_prompt,
is_device_capability_below_90,
resolve_model_name, resolve_model_name,
skip_unsupported, skip_unsupported,
) )
@ -17,6 +18,8 @@ from utils import (
import vllm.model_executor.layers.batch_invariant as batch_invariant import vllm.model_executor.layers.batch_invariant as batch_invariant
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90()
@skip_unsupported @skip_unsupported
@pytest.mark.timeout(1000) @pytest.mark.timeout(1000)
@ -190,6 +193,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
max_model_len=8192, max_model_len=8192,
dtype="bfloat16", # not everything is supported dtype="bfloat16", # not everything is supported
gpu_memory_utilization=0.9, gpu_memory_utilization=0.9,
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
) )
# Use more realistic prompts for better token generation # Use more realistic prompts for better token generation
@ -393,6 +397,8 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
gpu_memory_utilization=0.9, gpu_memory_utilization=0.9,
max_model_len=2048, max_model_len=2048,
dtype="bfloat16", dtype="bfloat16",
enable_prefix_caching=False,
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
) )
prompt = "the capital of france is" prompt = "the capital of france is"
@ -459,6 +465,7 @@ def test_logprobs_without_batch_invariance_should_fail(
max_num_seqs=32, max_num_seqs=32,
max_model_len=8192, max_model_len=8192,
dtype="bfloat16", dtype="bfloat16",
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
) )
# build ragged prompts to change shapes significantly across BS=1 vs BS=N # build ragged prompts to change shapes significantly across BS=1 vs BS=N
@ -682,6 +689,7 @@ def test_decode_logprobs_match_prefill_logprobs(
max_num_seqs=32, max_num_seqs=32,
max_model_len=8192, max_model_len=8192,
dtype="bfloat16", dtype="bfloat16",
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
) )
# Use a few test prompts # Use a few test prompts
@ -925,6 +933,8 @@ def LLM_with_max_seqs(
max_model_len=max_model_len, max_model_len=max_model_len,
dtype="bfloat16", dtype="bfloat16",
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
enable_prefix_caching=False,
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
# Enable for MOE models # Enable for MOE models
# enable_expert_parallel=True, # enable_expert_parallel=True,
) )

View File

@ -11,8 +11,10 @@ from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer from vllm.utils.flashinfer import has_flashinfer
skip_unsupported = pytest.mark.skipif( skip_unsupported = pytest.mark.skipif(
not (current_platform.is_cuda() and current_platform.has_device_capability(90)), not (current_platform.is_cuda() and current_platform.has_device_capability(80)),
reason="Requires CUDA and >= Hopper (SM90)", # Supports testing on Ampere and Ada Lovelace devices.
# Note: For devices with SM < 90, batch invariance does not support CUDA Graphs.
reason="Requires CUDA and >= Ampere (SM80)",
) )
BACKENDS: list[str] = [ BACKENDS: list[str] = [
@ -97,3 +99,7 @@ def _extract_step_logprobs(request_output):
return t, inner.token_ids return t, inner.token_ids
return None, None return None, None
def is_device_capability_below_90() -> bool:
return not current_platform.has_device_capability(90)

View File

@ -124,6 +124,8 @@ def run_tests(
with monkeypatch.context() as m: with monkeypatch.context() as m:
# avoid precision errors # avoid precision errors
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
# lock matmul precision to full FP32
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
# m.setenv("VLLM_BATCH_INVARIANT", "1") # m.setenv("VLLM_BATCH_INVARIANT", "1")
outputs: list[tuple[str, list, list]] = [] outputs: list[tuple[str, list, list]] = []
for n, ( for n, (

View File

@ -70,6 +70,7 @@ class TestReasoningStructuredOutput:
request.use_structured_output = True request.use_structured_output = True
request.prompt_token_ids = [1, 2, 3, 4, 5] request.prompt_token_ids = [1, 2, 3, 4, 5]
request.all_token_ids = [1, 2, 3, 4, 5, 6, 7, 8] request.all_token_ids = [1, 2, 3, 4, 5, 6, 7, 8]
request.num_computed_tokens = 5
return request return request
def test_should_fill_bitmask_with_enable_in_reasoning( def test_should_fill_bitmask_with_enable_in_reasoning(

View File

@ -2,8 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest import pytest
import vllm.envs as envs from vllm.config import ProfilerConfig
from vllm.profiler.gpu_profiler import WorkerProfiler from vllm.profiler.wrapper import WorkerProfiler
class ConcreteWorkerProfiler(WorkerProfiler): class ConcreteWorkerProfiler(WorkerProfiler):
@ -11,11 +11,11 @@ class ConcreteWorkerProfiler(WorkerProfiler):
A basic implementation of a worker profiler for testing purposes. A basic implementation of a worker profiler for testing purposes.
""" """
def __init__(self): def __init__(self, profiler_config: ProfilerConfig):
self.start_call_count = 0 self.start_call_count = 0
self.stop_call_count = 0 self.stop_call_count = 0
self.should_fail_start = False self.should_fail_start = False
super().__init__() super().__init__(profiler_config)
def _start(self) -> None: def _start(self) -> None:
if self.should_fail_start: if self.should_fail_start:
@ -26,17 +26,19 @@ class ConcreteWorkerProfiler(WorkerProfiler):
self.stop_call_count += 1 self.stop_call_count += 1
@pytest.fixture(autouse=True) @pytest.fixture
def reset_mocks(): def default_profiler_config():
"""Fixture to reset mocks and env variables before each test.""" return ProfilerConfig(
envs.VLLM_PROFILER_DELAY_ITERS = 0 profiler="torch",
envs.VLLM_PROFILER_MAX_ITERS = 0 torch_profiler_dir="/tmp/mock",
delay_iterations=0,
max_iterations=0,
)
def test_immediate_start_stop(): def test_immediate_start_stop(default_profiler_config):
"""Test standard start without delay.""" """Test standard start without delay."""
profiler = ConcreteWorkerProfiler() profiler = ConcreteWorkerProfiler(default_profiler_config)
profiler.start() profiler.start()
assert profiler._running is True assert profiler._running is True
assert profiler._active is True assert profiler._active is True
@ -48,10 +50,10 @@ def test_immediate_start_stop():
assert profiler.stop_call_count == 1 assert profiler.stop_call_count == 1
def test_delayed_start(): def test_delayed_start(default_profiler_config):
"""Test that profiler waits for N steps before actually starting.""" """Test that profiler waits for N steps before actually starting."""
envs.VLLM_PROFILER_DELAY_ITERS = 2 default_profiler_config.delay_iterations = 2
profiler = ConcreteWorkerProfiler() profiler = ConcreteWorkerProfiler(default_profiler_config)
# User requests start # User requests start
profiler.start() profiler.start()
@ -71,10 +73,10 @@ def test_delayed_start():
assert profiler.start_call_count == 1 assert profiler.start_call_count == 1
def test_max_iterations(): def test_max_iterations(default_profiler_config):
"""Test that profiler stops automatically after max iterations.""" """Test that profiler stops automatically after max iterations."""
envs.VLLM_PROFILER_MAX_ITERS = 2 default_profiler_config.max_iterations = 2
profiler = ConcreteWorkerProfiler() profiler = ConcreteWorkerProfiler(default_profiler_config)
profiler.start() profiler.start()
assert profiler._running is True assert profiler._running is True
@ -95,12 +97,11 @@ def test_max_iterations():
assert profiler.stop_call_count == 1 assert profiler.stop_call_count == 1
def test_delayed_start_and_max_iters(): def test_delayed_start_and_max_iters(default_profiler_config):
"""Test combined delayed start and max iterations.""" """Test combined delayed start and max iterations."""
envs.VLLM_PROFILER_DELAY_ITERS = 2 default_profiler_config.delay_iterations = 2
envs.VLLM_PROFILER_MAX_ITERS = 2 default_profiler_config.max_iterations = 2
profiler = ConcreteWorkerProfiler() profiler = ConcreteWorkerProfiler(default_profiler_config)
profiler.start() profiler.start()
# Step 1 # Step 1
@ -127,9 +128,9 @@ def test_delayed_start_and_max_iters():
assert profiler.stop_call_count == 1 assert profiler.stop_call_count == 1
def test_idempotency(): def test_idempotency(default_profiler_config):
"""Test that calling start/stop multiple times doesn't break logic.""" """Test that calling start/stop multiple times doesn't break logic."""
profiler = ConcreteWorkerProfiler() profiler = ConcreteWorkerProfiler(default_profiler_config)
# Double Start # Double Start
profiler.start() profiler.start()
@ -142,10 +143,10 @@ def test_idempotency():
assert profiler.stop_call_count == 1 # Should only stop once assert profiler.stop_call_count == 1 # Should only stop once
def test_step_inactive(): def test_step_inactive(default_profiler_config):
"""Test that stepping while inactive does nothing.""" """Test that stepping while inactive does nothing."""
envs.VLLM_PROFILER_DELAY_ITERS = 2 default_profiler_config.delay_iterations = 2
profiler = ConcreteWorkerProfiler() profiler = ConcreteWorkerProfiler(default_profiler_config)
# Not started yet # Not started yet
profiler.step() profiler.step()
@ -155,9 +156,9 @@ def test_step_inactive():
assert profiler.start_call_count == 0 assert profiler.start_call_count == 0
def test_start_failure(): def test_start_failure(default_profiler_config):
"""Test behavior when the underlying _start method raises exception.""" """Test behavior when the underlying _start method raises exception."""
profiler = ConcreteWorkerProfiler() profiler = ConcreteWorkerProfiler(default_profiler_config)
profiler.should_fail_start = True profiler.should_fail_start = True
profiler.start() profiler.start()
@ -168,9 +169,9 @@ def test_start_failure():
assert profiler.start_call_count == 0 # Logic failed inside start assert profiler.start_call_count == 0 # Logic failed inside start
def test_shutdown(): def test_shutdown(default_profiler_config):
"""Test that shutdown calls stop only if running.""" """Test that shutdown calls stop only if running."""
profiler = ConcreteWorkerProfiler() profiler = ConcreteWorkerProfiler(default_profiler_config)
# Case 1: Not running # Case 1: Not running
profiler.shutdown() profiler.shutdown()
@ -182,10 +183,10 @@ def test_shutdown():
assert profiler.stop_call_count == 1 assert profiler.stop_call_count == 1
def test_mixed_delay_and_stop(): def test_mixed_delay_and_stop(default_profiler_config):
"""Test manual stop during the delay period.""" """Test manual stop during the delay period."""
envs.VLLM_PROFILER_DELAY_ITERS = 5 default_profiler_config.delay_iterations = 5
profiler = ConcreteWorkerProfiler() profiler = ConcreteWorkerProfiler(default_profiler_config)
profiler.start() profiler.start()
profiler.step() profiler.step()

View File

@ -9,6 +9,8 @@ import vllm.envs as envs
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer
_FP8_DTYPE = current_platform.fp8_dtype()
def is_aiter_found() -> bool: def is_aiter_found() -> bool:
from importlib.util import find_spec from importlib.util import find_spec
@ -467,6 +469,59 @@ def _rocm_aiter_rmsnorm2d_fwd_with_add_fake(
return torch.empty_like(x), torch.empty_like(residual) return torch.empty_like(x), torch.empty_like(residual)
def _rocm_aiter_per_tensor_quant_impl(
x: torch.Tensor,
quant_dtype: torch.dtype,
scale: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
from aiter.ops.quant import per_tensor_quant_hip
return per_tensor_quant_hip(x, scale, quant_dtype)
def _rocm_aiter_per_tensor_quant_fake(
x: torch.Tensor,
quant_dtype: torch.dtype,
scale: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
return torch.empty_like(x, dtype=quant_dtype), torch.empty(
1, dtype=torch.float32, device=x.device
)
def _rocm_aiter_per_token_quant_impl(
x: torch.Tensor, quant_dtype: torch.dtype, scale: torch.Tensor | None = None
) -> tuple[torch.Tensor, torch.Tensor]:
from aiter.ops.quant import dynamic_per_token_scaled_quant
assert quant_dtype in [torch.int8, _FP8_DTYPE]
out_shape = x.shape
out = torch.empty(x.shape, dtype=_FP8_DTYPE, device=x.device)
if scale is None:
scale = torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device)
dynamic_per_token_scaled_quant(
out,
x,
scale,
scale_ub=None,
shuffle_scale=False,
num_rows=None,
num_rows_factor=1,
)
return out, scale
def _rocm_aiter_per_token_quant_fake(
x: torch.Tensor, quant_dtype: torch.dtype, scale: torch.Tensor | None = None
) -> tuple[torch.Tensor, torch.Tensor]:
out_shape = x.shape
return (
torch.empty(x.shape, dtype=_FP8_DTYPE, device=x.device),
torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device),
)
# Global flag to ensure ops are registered only once # Global flag to ensure ops are registered only once
_OPS_REGISTERED = False _OPS_REGISTERED = False
@ -665,6 +720,22 @@ class rocm_aiter_ops:
dispatch_key=current_platform.dispatch_key, dispatch_key=current_platform.dispatch_key,
) )
direct_register_custom_op(
op_name="rocm_aiter_per_tensor_quant",
op_func=_rocm_aiter_per_tensor_quant_impl,
mutates_args=[],
fake_impl=_rocm_aiter_per_tensor_quant_fake,
dispatch_key=current_platform.dispatch_key,
)
direct_register_custom_op(
op_name="rocm_aiter_per_token_quant",
op_func=_rocm_aiter_per_token_quant_impl,
mutates_args=["scale"],
fake_impl=_rocm_aiter_per_token_quant_fake,
dispatch_key=current_platform.dispatch_key,
)
_OPS_REGISTERED = True _OPS_REGISTERED = True
@staticmethod @staticmethod
@ -859,6 +930,22 @@ class rocm_aiter_ops:
kv_scale=kv_scale, kv_scale=kv_scale,
) )
@staticmethod
def per_tensor_quant(
x: torch.Tensor,
quant_dtype: torch.dtype,
scale: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
return torch.ops.vllm.rocm_aiter_per_tensor_quant(x, quant_dtype, scale)
@staticmethod
def per_token_quant(
x: torch.Tensor,
quant_dtype: torch.dtype,
scale: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
return torch.ops.vllm.rocm_aiter_per_token_quant(x, quant_dtype, scale)
@staticmethod @staticmethod
def triton_fp4_gemm_dynamic_qaunt( def triton_fp4_gemm_dynamic_qaunt(
x: torch.Tensor, x: torch.Tensor,

View File

@ -1726,7 +1726,7 @@ def scaled_fp8_quant(
output, input, scale, scale_ub output, input, scale, scale_ub
) )
else: else:
scale = torch.empty((1, 1), device=input.device, dtype=torch.float32) scale = torch.empty(1, device=input.device, dtype=torch.float32)
torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
else: else:
assert scale.numel() == 1, f"{scale.shape}" assert scale.numel() == 1, f"{scale.shape}"

View File

@ -89,7 +89,10 @@ def maybe_get_vit_flash_attn_backend(
if attn_backend == AttentionBackendEnum.ROCM_AITER_FA: if attn_backend == AttentionBackendEnum.ROCM_AITER_FA:
from aiter import flash_attn_varlen_func from aiter import flash_attn_varlen_func
else: else:
from vllm.attention.utils.fa_utils import flash_attn_varlen_func try:
from vllm.attention.utils.fa_utils import flash_attn_varlen_func
except ImportError:
flash_attn_varlen_func = None
else: else:
flash_attn_varlen_func = None flash_attn_varlen_func = None

View File

@ -12,7 +12,6 @@ from typing import Any
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
import vllm.envs as envs
from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType from vllm.inputs import PromptType
@ -79,12 +78,11 @@ def add_cli_args(parser: argparse.ArgumentParser):
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
raise OSError(
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
"Please set it to a valid path to use torch profiler."
)
engine_args = EngineArgs.from_cli_args(args) engine_args = EngineArgs.from_cli_args(args)
if args.profile and not engine_args.profiler_config.profiler == "torch":
raise ValueError(
"The torch profiler is not enabled. Please provide profiler_config."
)
# Lazy import to avoid importing LLM when the bench command is not selected. # Lazy import to avoid importing LLM when the bench command is not selected.
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
@ -144,7 +142,7 @@ def main(args: argparse.Namespace):
run_to_completion(profile_dir=None) run_to_completion(profile_dir=None)
if args.profile: if args.profile:
profile_dir = envs.VLLM_TORCH_PROFILER_DIR profile_dir = engine_args.profiler_config.torch_profiler_dir
print(f"Profiling (results will be saved to '{profile_dir}')...") print(f"Profiling (results will be saved to '{profile_dir}')...")
run_to_completion(profile_dir=profile_dir) run_to_completion(profile_dir=profile_dir)
return return

View File

@ -1097,8 +1097,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
parser.add_argument( parser.add_argument(
"--profile", "--profile",
action="store_true", action="store_true",
help="Use Torch Profiler. The endpoint must be launched with " help="Use vLLM Profiling. --profiler-config must be provided on the server.",
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
) )
parser.add_argument( parser.add_argument(
"--save-result", "--save-result",

View File

@ -655,8 +655,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
"--profile", "--profile",
action="store_true", action="store_true",
default=False, default=False,
help="Use Torch Profiler. The env variable " help="Use vLLM Profiling. --profiler-config must be provided on the server.",
"VLLM_TORCH_PROFILER_DIR must be set to enable profiler.",
) )
# prefix repetition dataset # prefix repetition dataset

View File

@ -1,6 +1,8 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import functools import functools
import hashlib import hashlib
import inspect import inspect
@ -8,15 +10,17 @@ import json
import types import types
from collections.abc import Callable from collections.abc import Callable
from contextlib import contextmanager from contextlib import contextmanager
from typing import Any from typing import TYPE_CHECKING, Any
import torch import torch
from torch import fx from torch import fx
from torch._subclasses.fake_tensor import FakeTensorMode, unset_fake_temporarily from torch._subclasses.fake_tensor import FakeTensorMode, unset_fake_temporarily
from vllm.config.utils import Range
from vllm.utils.torch_utils import is_torch_equal_or_newer from vllm.utils.torch_utils import is_torch_equal_or_newer
if TYPE_CHECKING:
from vllm.config.utils import Range
if is_torch_equal_or_newer("2.6"): if is_torch_equal_or_newer("2.6"):
from torch._inductor.custom_graph_pass import CustomGraphPass from torch._inductor.custom_graph_pass import CustomGraphPass
else: else:

View File

@ -53,8 +53,27 @@ class PiecewiseBackend:
self.is_last_graph = piecewise_compile_index == total_piecewise_compiles - 1 self.is_last_graph = piecewise_compile_index == total_piecewise_compiles - 1
self.is_full_graph = total_piecewise_compiles == 1 self.is_full_graph = total_piecewise_compiles == 1
# TODO: we need to generalize encoder compilation to other models
self.is_encoder_compilation = vllm_backend.prefix in [
"Qwen2_5_VisionPatchEmbed",
"Qwen2_5_VisionPatchMerger",
"Qwen2_5_VisionBlock",
]
self.compile_ranges = self.compilation_config.get_compile_ranges() self.compile_ranges = self.compilation_config.get_compile_ranges()
if self.is_encoder_compilation:
# For encoder compilation we use the max int32 value
# to set the upper bound of the compile ranges
max_int32 = 2**31 - 1
last_compile_range = self.compile_ranges[-1]
assert (
last_compile_range.end
== vllm_config.scheduler_config.max_num_batched_tokens
)
self.compile_ranges[-1] = Range(
start=last_compile_range.start, end=max_int32
)
log_string = f"PiecewiseBackend: compile_ranges: {self.compile_ranges}" log_string = f"PiecewiseBackend: compile_ranges: {self.compile_ranges}"
logger.debug_once(log_string) logger.debug_once(log_string)

View File

@ -24,6 +24,7 @@ from vllm.config.multimodal import MultiModalConfig
from vllm.config.observability import ObservabilityConfig from vllm.config.observability import ObservabilityConfig
from vllm.config.parallel import EPLBConfig, ParallelConfig from vllm.config.parallel import EPLBConfig, ParallelConfig
from vllm.config.pooler import PoolerConfig from vllm.config.pooler import PoolerConfig
from vllm.config.profiler import ProfilerConfig
from vllm.config.scheduler import SchedulerConfig from vllm.config.scheduler import SchedulerConfig
from vllm.config.speculative import SpeculativeConfig from vllm.config.speculative import SpeculativeConfig
from vllm.config.speech_to_text import SpeechToTextConfig from vllm.config.speech_to_text import SpeechToTextConfig
@ -89,6 +90,8 @@ __all__ = [
"SpeechToTextConfig", "SpeechToTextConfig",
# From vllm.config.structured_outputs # From vllm.config.structured_outputs
"StructuredOutputsConfig", "StructuredOutputsConfig",
# From vllm.config.profiler
"ProfilerConfig",
# From vllm.config.utils # From vllm.config.utils
"ConfigType", "ConfigType",
"SupportsMetricsInfo", "SupportsMetricsInfo",

199
vllm/config/profiler.py Normal file
View File

@ -0,0 +1,199 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import Any, Literal
from pydantic import Field, model_validator
from pydantic.dataclasses import dataclass
from typing_extensions import Self
import vllm.envs as envs
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
logger = init_logger(__name__)
ProfilerKind = Literal["torch", "cuda"]
@config
@dataclass
class ProfilerConfig:
"""Dataclass which contains profiler config for the engine."""
profiler: ProfilerKind | None = None
"""Which profiler to use. Defaults to None. Options are:
- 'torch': Use PyTorch profiler.\n
- 'cuda': Use CUDA profiler."""
torch_profiler_dir: str = ""
"""Directory to save torch profiler traces. Both AsyncLLM's CPU traces and
worker's traces (CPU & GPU) will be saved under this directory. Note that
it must be an absolute path."""
torch_profiler_with_stack: bool = True
"""If `True`, enables stack tracing in the torch profiler. Enabled by default."""
torch_profiler_with_flops: bool = False
"""If `True`, enables FLOPS counting in the torch profiler. Disabled by default."""
torch_profiler_use_gzip: bool = True
"""If `True`, saves torch profiler traces in gzip format. Enabled by default"""
torch_profiler_dump_cuda_time_total: bool = True
"""If `True`, dumps total CUDA time in torch profiler traces. Enabled by default."""
torch_profiler_record_shapes: bool = False
"""If `True`, records tensor shapes in the torch profiler. Disabled by default."""
torch_profiler_with_memory: bool = False
"""If `True`, enables memory profiling in the torch profiler.
Disabled by default."""
ignore_frontend: bool = False
"""If `True`, disables the front-end profiling of AsyncLLM when using the
'torch' profiler. This is needed to reduce overhead when using delay/limit options,
since the front-end profiling does not track iterations and will capture the
entire range.
"""
delay_iterations: int = Field(default=0, ge=0)
"""Number of engine iterations to skip before starting profiling.
Defaults to 0, meaning profiling starts immediately after receiving /start_profile.
"""
max_iterations: int = Field(default=0, ge=0)
"""Maximum number of engine iterations to profile after starting profiling.
Defaults to 0, meaning no limit.
"""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
def _get_from_env_if_set(self, field_name: str, env_var_name: str) -> None:
"""Get field from env var if set, with deprecation warning."""
if envs.is_set(env_var_name):
value = getattr(envs, env_var_name)
logger.warning_once(
"Using %s environment variable is deprecated and will be removed in "
"v0.14.0 or v1.0.0, whichever is soonest. Please use "
"--profiler-config.%s command line argument or "
"ProfilerConfig(%s=...) config field instead.",
env_var_name,
field_name,
field_name,
)
return value
return None
def _set_from_env_if_set(
self,
field_name: str,
env_var_name: str,
to_bool: bool = True,
to_int: bool = False,
) -> None:
"""Set field from env var if set, with deprecation warning."""
value = self._get_from_env_if_set(field_name, env_var_name)
if value is not None:
if to_bool:
value = value == "1"
if to_int:
value = int(value)
setattr(self, field_name, value)
@model_validator(mode="after")
def _validate_profiler_config(self) -> Self:
maybe_use_cuda_profiler = self._get_from_env_if_set(
"profiler", "VLLM_TORCH_CUDA_PROFILE"
)
if maybe_use_cuda_profiler is not None:
self.profiler = "cuda" if maybe_use_cuda_profiler == "1" else None
else:
self._set_from_env_if_set(
"torch_profiler_dir", "VLLM_TORCH_PROFILER_DIR", to_bool=False
)
if self.torch_profiler_dir:
self.profiler = "torch"
self._set_from_env_if_set(
"torch_profiler_record_shapes",
"VLLM_TORCH_PROFILER_RECORD_SHAPES",
)
self._set_from_env_if_set(
"torch_profiler_with_memory",
"VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY",
)
self._set_from_env_if_set(
"torch_profiler_with_stack",
"VLLM_TORCH_PROFILER_WITH_STACK",
)
self._set_from_env_if_set(
"torch_profiler_with_flops",
"VLLM_TORCH_PROFILER_WITH_FLOPS",
)
self._set_from_env_if_set(
"ignore_frontend",
"VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM",
)
self._set_from_env_if_set(
"torch_profiler_use_gzip",
"VLLM_TORCH_PROFILER_USE_GZIP",
)
self._set_from_env_if_set(
"torch_profiler_dump_cuda_time_total",
"VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL",
)
self._set_from_env_if_set(
"delay_iterations", "VLLM_PROFILER_DELAY_ITERS", to_bool=False, to_int=True
)
self._set_from_env_if_set(
"max_iterations", "VLLM_PROFILER_MAX_ITERS", to_bool=False, to_int=True
)
has_delay_or_limit = self.delay_iterations > 0 or self.max_iterations > 0
if self.profiler == "torch" and has_delay_or_limit and not self.ignore_frontend:
logger.warning_once(
"Using 'torch' profiler with delay_iterations or max_iterations "
"while ignore_frontend is False may result in high overhead."
)
profiler_dir = self.torch_profiler_dir
if profiler_dir and self.profiler != "torch":
raise ValueError(
"torch_profiler_dir is only applicable when profiler is set to 'torch'"
)
if self.profiler == "torch" and not profiler_dir:
raise ValueError("torch_profiler_dir must be set when profiler is 'torch'")
if profiler_dir:
is_gs_path = (
profiler_dir.startswith("gs://")
and profiler_dir[5:]
and profiler_dir[5] != "/"
)
if not is_gs_path:
self.torch_profiler_dir = os.path.abspath(
os.path.expanduser(profiler_dir)
)
return self

View File

@ -39,6 +39,7 @@ from .lora import LoRAConfig
from .model import ModelConfig from .model import ModelConfig
from .observability import ObservabilityConfig from .observability import ObservabilityConfig
from .parallel import ParallelConfig from .parallel import ParallelConfig
from .profiler import ProfilerConfig
from .scheduler import SchedulerConfig from .scheduler import SchedulerConfig
from .speculative import SpeculativeConfig from .speculative import SpeculativeConfig
from .structured_outputs import StructuredOutputsConfig from .structured_outputs import StructuredOutputsConfig
@ -218,6 +219,8 @@ class VllmConfig:
You can specify the full compilation config like so: You can specify the full compilation config like so:
`{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}` `{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
""" """
profiler_config: ProfilerConfig = Field(default_factory=ProfilerConfig)
"""Profiling configuration."""
kv_transfer_config: KVTransferConfig | None = None kv_transfer_config: KVTransferConfig | None = None
"""The configurations for distributed KV cache transfer.""" """The configurations for distributed KV cache transfer."""
kv_events_config: KVEventsConfig | None = None kv_events_config: KVEventsConfig | None = None
@ -296,6 +299,8 @@ class VllmConfig:
vllm_factors.append("None") vllm_factors.append("None")
if self.structured_outputs_config: if self.structured_outputs_config:
vllm_factors.append(self.structured_outputs_config.compute_hash()) vllm_factors.append(self.structured_outputs_config.compute_hash())
if self.profiler_config:
vllm_factors.append(self.profiler_config.compute_hash())
else: else:
vllm_factors.append("None") vllm_factors.append("None")
vllm_factors.append(self.observability_config.compute_hash()) vllm_factors.append(self.observability_config.compute_hash())

View File

@ -50,6 +50,7 @@ from vllm.config import (
ObservabilityConfig, ObservabilityConfig,
ParallelConfig, ParallelConfig,
PoolerConfig, PoolerConfig,
ProfilerConfig,
SchedulerConfig, SchedulerConfig,
SpeculativeConfig, SpeculativeConfig,
StructuredOutputsConfig, StructuredOutputsConfig,
@ -536,6 +537,8 @@ class EngineArgs:
worker_cls: str = ParallelConfig.worker_cls worker_cls: str = ParallelConfig.worker_cls
worker_extension_cls: str = ParallelConfig.worker_extension_cls worker_extension_cls: str = ParallelConfig.worker_extension_cls
profiler_config: ProfilerConfig = get_field(VllmConfig, "profiler_config")
kv_transfer_config: KVTransferConfig | None = None kv_transfer_config: KVTransferConfig | None = None
kv_events_config: KVEventsConfig | None = None kv_events_config: KVEventsConfig | None = None
@ -1168,7 +1171,7 @@ class EngineArgs:
vllm_group.add_argument( vllm_group.add_argument(
"--structured-outputs-config", **vllm_kwargs["structured_outputs_config"] "--structured-outputs-config", **vllm_kwargs["structured_outputs_config"]
) )
vllm_group.add_argument("--profiler-config", **vllm_kwargs["profiler_config"])
vllm_group.add_argument( vllm_group.add_argument(
"--optimization-level", **vllm_kwargs["optimization_level"] "--optimization-level", **vllm_kwargs["optimization_level"]
) )
@ -1786,6 +1789,7 @@ class EngineArgs:
kv_transfer_config=self.kv_transfer_config, kv_transfer_config=self.kv_transfer_config,
kv_events_config=self.kv_events_config, kv_events_config=self.kv_events_config,
ec_transfer_config=self.ec_transfer_config, ec_transfer_config=self.ec_transfer_config,
profiler_config=self.profiler_config,
additional_config=self.additional_config, additional_config=self.additional_config,
optimization_level=self.optimization_level, optimization_level=self.optimization_level,
) )

View File

@ -20,6 +20,7 @@ from vllm.beam_search import (
from vllm.config import ( from vllm.config import (
CompilationConfig, CompilationConfig,
PoolerConfig, PoolerConfig,
ProfilerConfig,
StructuredOutputsConfig, StructuredOutputsConfig,
is_init_field, is_init_field,
) )
@ -211,6 +212,7 @@ class LLM:
structured_outputs_config: dict[str, Any] structured_outputs_config: dict[str, Any]
| StructuredOutputsConfig | StructuredOutputsConfig
| None = None, | None = None,
profiler_config: dict[str, Any] | ProfilerConfig | None = None,
kv_cache_memory_bytes: int | None = None, kv_cache_memory_bytes: int | None = None,
compilation_config: int | dict[str, Any] | CompilationConfig | None = None, compilation_config: int | dict[str, Any] | CompilationConfig | None = None,
logits_processors: list[str | type[LogitsProcessor]] | None = None, logits_processors: list[str | type[LogitsProcessor]] | None = None,
@ -282,6 +284,20 @@ class LLM:
else: else:
structured_outputs_instance = StructuredOutputsConfig() structured_outputs_instance = StructuredOutputsConfig()
if profiler_config is not None:
if isinstance(profiler_config, dict):
profiler_config_instance = ProfilerConfig(
**{
k: v
for k, v in profiler_config.items()
if is_init_field(ProfilerConfig, k)
}
)
else:
profiler_config_instance = profiler_config
else:
profiler_config_instance = ProfilerConfig()
# warn about single-process data parallel usage. # warn about single-process data parallel usage.
_dp_size = int(kwargs.get("data_parallel_size", 1)) _dp_size = int(kwargs.get("data_parallel_size", 1))
_distributed_executor_backend = kwargs.get("distributed_executor_backend") _distributed_executor_backend = kwargs.get("distributed_executor_backend")
@ -324,6 +340,7 @@ class LLM:
mm_processor_kwargs=mm_processor_kwargs, mm_processor_kwargs=mm_processor_kwargs,
pooler_config=pooler_config, pooler_config=pooler_config,
structured_outputs_config=structured_outputs_instance, structured_outputs_config=structured_outputs_instance,
profiler_config=profiler_config_instance,
compilation_config=compilation_config_instance, compilation_config=compilation_config_instance,
logits_processors=logits_processors, logits_processors=logits_processors,
**kwargs, **kwargs,

View File

@ -99,12 +99,7 @@ class MistralToolParser(ToolParser):
self.bot_token = "[TOOL_CALLS]" self.bot_token = "[TOOL_CALLS]"
self.bot_token_id = self.vocab.get(self.bot_token) self.bot_token_id = self.vocab.get(self.bot_token)
self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL) self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
if not _is_pre_v11_tokeniser(self.model_tokenizer): self._is_pre_v11 = _is_pre_v11_tokeniser(self.model_tokenizer)
self.fn_name_regex = re.compile(
r"([a-zA-Z0-9_-]+)(\{[\s\S]*?\}+)", re.DOTALL
)
else:
self.fn_name_regex = None
if self.bot_token_id is None: if self.bot_token_id is None:
raise RuntimeError( raise RuntimeError(
@ -148,23 +143,24 @@ class MistralToolParser(ToolParser):
tool_content = model_output.replace(self.bot_token, "").strip() tool_content = model_output.replace(self.bot_token, "").strip()
try: try:
# we first try to directly load the json as parsing very nested
# jsons is difficult
try: try:
if self.fn_name_regex: if not self._is_pre_v11:
function_call_arr = [] function_call_arr = []
for single_tool_content in model_output.split(self.bot_token): for single_tool_content in model_output.split(self.bot_token):
matches = self.fn_name_regex.findall(single_tool_content) if "{" not in single_tool_content:
continue
for match in matches: end_name = single_tool_content.find("{")
fn_name = match[0] fn_name, args = (
args = match[1] single_tool_content[:end_name],
single_tool_content[end_name:],
)
# fn_name is encoded outside serialized json dump # fn_name is encoded outside serialized json dump
# only arguments are serialized # only arguments are serialized
function_call_arr.append( function_call_arr.append(
{"name": fn_name, "arguments": json.loads(args)} {"name": fn_name, "arguments": json.loads(args)}
) )
else: else:
function_call_arr = json.loads(tool_content) function_call_arr = json.loads(tool_content)
except json.JSONDecodeError: except json.JSONDecodeError:

View File

@ -5,7 +5,7 @@
from fastapi import APIRouter, FastAPI, Request from fastapi import APIRouter, FastAPI, Request
from fastapi.responses import Response from fastapi.responses import Response
import vllm.envs as envs from vllm.config import ProfilerConfig
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.logger import init_logger from vllm.logger import init_logger
@ -35,15 +35,12 @@ async def stop_profile(raw_request: Request):
def attach_router(app: FastAPI): def attach_router(app: FastAPI):
if envs.VLLM_TORCH_PROFILER_DIR: profiler_config = getattr(app.state.args, "profiler_config", None)
assert profiler_config is None or isinstance(profiler_config, ProfilerConfig)
if profiler_config is not None and profiler_config.profiler is not None:
logger.warning_once( logger.warning_once(
"Torch Profiler is enabled in the API server. This should ONLY be " "Profiler with mode '%s' is enabled in the "
"used for local development!" "API server. This should ONLY be used for local development!",
profiler_config.profiler,
) )
elif envs.VLLM_TORCH_CUDA_PROFILE:
logger.warning_once(
"CUDA Profiler is enabled in the API server. This should ONLY be "
"used for local development!"
)
if envs.VLLM_TORCH_PROFILER_DIR or envs.VLLM_TORCH_CUDA_PROFILE:
app.include_router(router) app.include_router(router)

View File

@ -75,6 +75,7 @@ if TYPE_CHECKING:
VLLM_MM_INPUT_CACHE_GIB: int = 4 VLLM_MM_INPUT_CACHE_GIB: int = 4
VLLM_TARGET_DEVICE: str = "cuda" VLLM_TARGET_DEVICE: str = "cuda"
VLLM_MAIN_CUDA_VERSION: str = "12.9" VLLM_MAIN_CUDA_VERSION: str = "12.9"
VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest"
MAX_JOBS: str | None = None MAX_JOBS: str | None = None
NVCC_THREADS: str | None = None NVCC_THREADS: str | None = None
VLLM_USE_PRECOMPILED: bool = False VLLM_USE_PRECOMPILED: bool = False
@ -88,20 +89,23 @@ if TYPE_CHECKING:
VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds
VLLM_PLUGINS: list[str] | None = None VLLM_PLUGINS: list[str] | None = None
VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None
VLLM_TORCH_CUDA_PROFILE: bool = False # Deprecated env variables for profiling, kept for backward compatibility
# See also vllm/config/profiler.py and `--profiler-config` argument
VLLM_TORCH_CUDA_PROFILE: str | None = None
VLLM_TORCH_PROFILER_DIR: str | None = None VLLM_TORCH_PROFILER_DIR: str | None = None
VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False VLLM_TORCH_PROFILER_RECORD_SHAPES: str | None = None
VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: str | None = None
VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: bool = False VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: str | None = None
VLLM_TORCH_PROFILER_WITH_STACK: str | None = None
VLLM_TORCH_PROFILER_WITH_FLOPS: str | None = None
VLLM_TORCH_PROFILER_USE_GZIP: str | None = None
VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: str | None = None
VLLM_PROFILER_DELAY_ITERS: str | None = None
VLLM_PROFILER_MAX_ITERS: str | None = None
# End of deprecated env variables for profiling
VLLM_USE_AOT_COMPILE: bool = False VLLM_USE_AOT_COMPILE: bool = False
VLLM_USE_BYTECODE_HOOK: bool = False VLLM_USE_BYTECODE_HOOK: bool = False
VLLM_FORCE_AOT_LOAD: bool = False VLLM_FORCE_AOT_LOAD: bool = False
VLLM_TORCH_PROFILER_WITH_STACK: bool = True
VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False
VLLM_PROFILER_DELAY_ITERS: int = 0
VLLM_PROFILER_MAX_ITERS: int = 0
VLLM_TORCH_PROFILER_USE_GZIP: bool = True
VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: bool = True
VLLM_USE_TRITON_AWQ: bool = False VLLM_USE_TRITON_AWQ: bool = False
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
VLLM_SKIP_P2P_CHECK: bool = False VLLM_SKIP_P2P_CHECK: bool = False
@ -453,6 +457,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Main CUDA version of vLLM. This follows PyTorch but can be overridden. # Main CUDA version of vLLM. This follows PyTorch but can be overridden.
"VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower()
or "12.9", or "12.9",
# Controls PyTorch float32 matmul precision mode within vLLM workers.
# Valid options mirror torch.set_float32_matmul_precision
"VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices(
"VLLM_FLOAT32_MATMUL_PRECISION",
"highest",
["highest", "high", "medium"],
case_sensitive=False,
),
# Maximum number of compilation jobs to run in parallel. # Maximum number of compilation jobs to run in parallel.
# By default this is the number of CPUs # By default this is the number of CPUs
"MAX_JOBS": lambda: os.getenv("MAX_JOBS", None), "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None),
@ -842,71 +854,52 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv( "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv(
"VLLM_LORA_RESOLVER_CACHE_DIR", None "VLLM_LORA_RESOLVER_CACHE_DIR", None
), ),
# Enables torch CUDA profiling if set. # Enables torch CUDA profiling if set to 1.
# On NVIDIA GPUs, this will start/stop cudaProfilerApi when triggered. # Deprecated, see profiler_config.
"VLLM_TORCH_CUDA_PROFILE": lambda: bool( "VLLM_TORCH_CUDA_PROFILE": lambda: os.getenv("VLLM_TORCH_CUDA_PROFILE"),
os.getenv("VLLM_TORCH_CUDA_PROFILE", "0") != "0"
),
# Enables torch profiler if set. # Enables torch profiler if set.
# Both AsyncLLM's CPU traces as well as workers' # Deprecated, see profiler_config.
# traces (CPU & GPU) will be saved under this directory. "VLLM_TORCH_PROFILER_DIR": lambda: os.getenv("VLLM_TORCH_PROFILER_DIR"),
# Note that it must be an absolute path. # Enable torch profiler to record shapes if set to 1.
"VLLM_TORCH_PROFILER_DIR": lambda: ( # Deprecated, see profiler_config.
None "VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: (
if (val := os.getenv("VLLM_TORCH_PROFILER_DIR")) is None os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES")
else (
val
if val.startswith("gs://") and val[5:] and val[5] != "/"
else os.path.abspath(os.path.expanduser(val))
)
), ),
# Enable torch profiler to record shapes if set # Enable torch profiler to profile memory if set to 1.
# VLLM_TORCH_PROFILER_RECORD_SHAPES=1. If not set, torch profiler will # Deprecated, see profiler_config.
# not record shapes. "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: (
"VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: bool( os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY")
os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0") != "0"
), ),
# Enable torch profiler to profile memory if set # Enable torch profiler to profile stack if set to 1.
# VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1. If not set, torch profiler # Deprecated, see profiler_config.
# will not profile memory. "VLLM_TORCH_PROFILER_WITH_STACK": lambda: (
"VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: bool( os.getenv("VLLM_TORCH_PROFILER_WITH_STACK")
os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0") != "0"
), ),
# Enable torch profiler to profile stack if set # Enable torch profiler to profile flops if set to 1.
# VLLM_TORCH_PROFILER_WITH_STACK=1. If not set, torch profiler WILL # Deprecated, see profiler_config.
# profile stack by default. "VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: (
"VLLM_TORCH_PROFILER_WITH_STACK": lambda: bool( os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS")
os.getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0"
), ),
# Enable torch profiler to profile flops if set # Disable torch profiling of the AsyncLLMEngine process if set to 1.
# VLLM_TORCH_PROFILER_WITH_FLOPS=1. If not set, torch profiler will # Deprecated, see profiler_config.
# not profile flops. "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: (
"VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool( os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM")
os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"
),
# Disable torch profiling of the AsyncLLMEngine process.
# If set to 1, will not profile the engine process.
"VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: bool(
os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM", "0") != "0"
), ),
# Delay number of iterations before starting profiling when using # Delay number of iterations before starting profiling when using
# the torch/torch CUDA profiler. If set to 0, will start profiling immediately. # the torch/torch CUDA profiler. If set to 0, will start profiling immediately.
"VLLM_PROFILER_DELAY_ITERS": lambda: int( # Deprecated, see profiler_config.
os.getenv("VLLM_PROFILER_DELAY_ITERS", "0") "VLLM_PROFILER_DELAY_ITERS": lambda: (os.getenv("VLLM_PROFILER_DELAY_ITERS")),
),
# Maximum number of iterations to profile when using the torch/torch CUDA profiler. # Maximum number of iterations to profile when using the torch/torch CUDA profiler.
# If set to 0, will not limit the number of iterations. # If set to 0, will not limit the number of iterations.
"VLLM_PROFILER_MAX_ITERS": lambda: int(os.getenv("VLLM_PROFILER_MAX_ITERS", "0")), "VLLM_PROFILER_MAX_ITERS": lambda: os.getenv("VLLM_PROFILER_MAX_ITERS"),
# Control whether torch profiler gzip-compresses profiling files. # Control whether torch profiler gzip-compresses profiling files.
# Set VLLM_TORCH_PROFILER_USE_GZIP=0 to disable gzip (enabled by default). # Deprecated, see profiler_config.
"VLLM_TORCH_PROFILER_USE_GZIP": lambda: bool( "VLLM_TORCH_PROFILER_USE_GZIP": lambda: os.getenv("VLLM_TORCH_PROFILER_USE_GZIP"),
os.getenv("VLLM_TORCH_PROFILER_USE_GZIP", "1") != "0"
),
# Control whether torch profiler dumps the self_cuda_time_total table. # Control whether torch profiler dumps the self_cuda_time_total table.
# Set VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0 to disable dumping # Set to 0 to disable dumping the table.
# (enabled by default). # Deprecated, see profiler_config.
"VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL": lambda: bool( "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL": lambda: (
os.getenv("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL", "1") != "0" os.getenv("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL")
), ),
# If set, vLLM will use Triton implementations of AWQ. # If set, vLLM will use Triton implementations of AWQ.
"VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))), "VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),

View File

@ -292,7 +292,7 @@ def set_forward_context(
if num_tokens_across_dp is None: if num_tokens_across_dp is None:
assert ubatch_slices is None assert ubatch_slices is None
assert num_tokens is not None assert num_tokens is not None
_, num_tokens_across_dp = coordinate_batch_across_dp( _, num_tokens_across_dp, _ = coordinate_batch_across_dp(
num_tokens_unpadded=num_tokens, num_tokens_unpadded=num_tokens,
parallel_config=vllm_config.parallel_config, parallel_config=vllm_config.parallel_config,
allow_microbatching=False, allow_microbatching=False,

View File

@ -935,7 +935,11 @@ def enable_batch_invariant_mode():
# Batch invariant matmuls are no longer needed after cublas overrides # Batch invariant matmuls are no longer needed after cublas overrides
if not is_torch_equal_or_newer("2.10.0.dev"): if not is_torch_equal_or_newer("2.10.0.dev"):
if current_platform.is_device_capability(100): if (
current_platform.is_device_capability(100)
or current_platform.is_device_capability(80)
or current_platform.is_device_capability(89)
):
# For PyTorch 2.9, B200 uses GEMV for bs=1 # For PyTorch 2.9, B200 uses GEMV for bs=1
# Requires https://github.com/pytorch/pytorch/pull/166735 # Requires https://github.com/pytorch/pytorch/pull/166735
_batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA") _batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA")

View File

@ -895,6 +895,48 @@ def get_moe_configs(
return None return None
def _ensure_block_size_k_divisible(
size_k: int, block_size_k: int, group_size: int
) -> int:
"""Ensure block_size_k is a divisor of size_k and divisible by group_size.
This ensures BLOCK_SIZE_K compatibility with MoeWNA16 CUDA kernel which
requires size_k % BLOCK_SIZE_K == 0 and BLOCK_SIZE_K % group_size == 0.
Args:
size_k: The size_k dimension that must be divisible by result.
block_size_k: Preferred block size (will be adjusted if needed).
group_size: The result must be divisible by this.
Returns:
A valid BLOCK_SIZE_K that divides size_k and is divisible by group_size.
"""
# Fast path: already valid
if size_k % block_size_k == 0 and block_size_k % group_size == 0:
return block_size_k
# Find the largest value that:
# 1. Divides size_k (size_k % candidate == 0)
# 2. Is divisible by group_size (candidate % group_size == 0)
# 3. Is <= block_size_k (prefer smaller values close to block_size_k)
#
# Strategy: Search from min(block_size_k, size_k) down to group_size,
# stepping by group_size to ensure divisibility by group_size
max_search = min(block_size_k, size_k)
start = (max_search // group_size) * group_size
for candidate in range(start, group_size - 1, -group_size):
if size_k % candidate == 0:
return candidate
# Fallback: if group_size divides size_k, use it
# This should always be true with correct group_size configuration
if size_k % group_size == 0:
return group_size
# This should not happen with correct group_size, but ensure divisibility
return size_k
def get_moe_wna16_block_config( def get_moe_wna16_block_config(
config: dict[str, int], config: dict[str, int],
use_moe_wna16_cuda: bool, use_moe_wna16_cuda: bool,
@ -960,6 +1002,9 @@ def get_moe_wna16_block_config(
# at the same time. # at the same time.
block_size_n = 1024 block_size_n = 1024
# Ensure BLOCK_SIZE_K is a divisor of size_k for CUDA kernel compatibility
block_size_k = _ensure_block_size_k_divisible(size_k, block_size_k, group_size)
return {"BLOCK_SIZE_N": block_size_n, "BLOCK_SIZE_K": block_size_k} return {"BLOCK_SIZE_N": block_size_n, "BLOCK_SIZE_K": block_size_k}

View File

@ -5,6 +5,7 @@ import torch
import torch.nn.functional as F import torch.nn.functional as F
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm._aiter_ops import rocm_aiter_ops
from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.platforms import current_platform from vllm.platforms import current_platform
@ -45,10 +46,13 @@ class QuantFP8(CustomOp):
super().__init__() super().__init__()
self.static = static self.static = static
self.group_shape = group_shape self.group_shape = group_shape
self.use_per_token_if_dynamic = group_shape == GroupShape.PER_TOKEN
self.num_token_padding = num_token_padding self.num_token_padding = num_token_padding
self.column_major_scales = column_major_scales self.column_major_scales = column_major_scales
self.use_ue8m0 = use_ue8m0 self.use_ue8m0 = use_ue8m0
self.use_aiter = rocm_aiter_ops.is_linear_fp8_enaled()
self.is_group_quant = group_shape.is_per_group() self.is_group_quant = group_shape.is_per_group()
if self.is_group_quant: if self.is_group_quant:
assert not static, "Group quantization only supports dynamic mode" assert not static, "Group quantization only supports dynamic mode"
@ -92,6 +96,33 @@ class QuantFP8(CustomOp):
use_per_token_if_dynamic=self.use_per_token_if_dynamic, use_per_token_if_dynamic=self.use_per_token_if_dynamic,
) )
def forward_hip(
    self,
    x: torch.Tensor,
    scale: torch.Tensor | None = None,
    scale_ub: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """FP8 quantization on ROCm, preferring AITER kernels when compatible.

    Dispatches to the rocm_aiter_ops per-tensor / per-token quantization
    kernels when AITER FP8 linear was enabled at construction time and the
    input qualifies; otherwise falls back to the CUDA path.

    Args:
        x: Input tensor to quantize.
        scale: Optional precomputed scale, forwarded to the selected kernel.
        scale_ub: Optional scale upper bound; when provided the AITER path
            is skipped and the CUDA implementation is used instead.

    Returns:
        Tuple of (quantized tensor, scale tensor), as produced by the
        selected kernel.
    """
    # AITER is only used for non-group quantization, with AITER enabled,
    # no scale upper bound, and a contiguous input tensor.
    use_aiter_quant = (
        not self.is_group_quant
        and self.use_aiter
        and scale_ub is None
        and x.is_contiguous()
    )
    use_aiter_per_tensor_quant = (
        use_aiter_quant and self.group_shape == GroupShape.PER_TENSOR
    )
    use_aiter_per_token_quant = (
        use_aiter_quant and self.group_shape == GroupShape.PER_TOKEN
    )
    if use_aiter_per_tensor_quant:
        return rocm_aiter_ops.per_tensor_quant(x, _FP8_DTYPE, scale)
    if use_aiter_per_token_quant:
        return rocm_aiter_ops.per_token_quant(x, _FP8_DTYPE, scale)

    # Fallback to CUDA implementation
    return self.forward_cuda(x, scale, scale_ub)
def forward_native( def forward_native(
self, self,
x: torch.Tensor, x: torch.Tensor,

View File

@ -60,7 +60,7 @@ class MoeWNA16Config(QuantizationConfig):
if self.linear_quant_method == "gptq": if self.linear_quant_method == "gptq":
self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible(full_config) self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible(full_config)
elif self.linear_quant_method == "awq": elif self.linear_quant_method in ("awq", "awq_marlin"):
capability_tuple = current_platform.get_device_capability() capability_tuple = current_platform.get_device_capability()
device_capability = ( device_capability = (
-1 if capability_tuple is None else capability_tuple.to_int() -1 if capability_tuple is None else capability_tuple.to_int()
@ -107,7 +107,7 @@ class MoeWNA16Config(QuantizationConfig):
if linear_quant_method == "gptq": if linear_quant_method == "gptq":
has_zp = not cls.get_from_keys(config, ["sym"]) has_zp = not cls.get_from_keys(config, ["sym"])
modules_to_not_convert = [] modules_to_not_convert = []
elif linear_quant_method == "awq": elif linear_quant_method in ("awq", "awq_marlin"):
has_zp = cls.get_from_keys(config, ["zero_point"]) has_zp = cls.get_from_keys(config, ["zero_point"])
modules_to_not_convert = cls.get_from_keys_or( modules_to_not_convert = cls.get_from_keys_or(
config, ["modules_to_not_convert"], None config, ["modules_to_not_convert"], None
@ -184,7 +184,7 @@ class MoeWNA16Config(QuantizationConfig):
return GPTQConfig.from_config(self.full_config).get_quant_method( return GPTQConfig.from_config(self.full_config).get_quant_method(
layer, prefix layer, prefix
) )
elif self.linear_quant_method == "awq": elif self.linear_quant_method in ("awq", "awq_marlin"):
if self.use_marlin and check_marlin_supports_layer( if self.use_marlin and check_marlin_supports_layer(
layer, self.group_size layer, self.group_size
): ):
@ -468,7 +468,8 @@ class MoeWNA16Method(FusedMoEMethodBase):
shard_size = layer.intermediate_size_per_partition shard_size = layer.intermediate_size_per_partition
# convert gptq and awq weight to a standard format # convert gptq and awq weight to a standard format
if layer.quant_config.linear_quant_method == "awq": # awq_marlin uses the same weight format as awq
if layer.quant_config.linear_quant_method in ("awq", "awq_marlin"):
assert layer.quant_config.weight_bits == 4 assert layer.quant_config.weight_bits == 4
if "weight" in weight_name: if "weight" in weight_name:
loaded_weight = convert_awq_tensor(loaded_weight, "qweight") loaded_weight = convert_awq_tensor(loaded_weight, "qweight")

View File

@ -367,6 +367,8 @@ class Qwen2MoeModel(nn.Module):
self.embed_tokens = VocabParallelEmbedding( self.embed_tokens = VocabParallelEmbedding(
config.vocab_size, config.vocab_size,
config.hidden_size, config.hidden_size,
quant_config=quant_config,
prefix=f"{prefix}.embed_tokens",
) )
self.start_layer, self.end_layer, self.layers = make_layers( self.start_layer, self.end_layer, self.layers = make_layers(
config.num_hidden_layers, config.num_hidden_layers,
@ -512,6 +514,12 @@ class Qwen2MoeModel(nn.Module):
continue continue
else: else:
name = remapped_kv_scale_name name = remapped_kv_scale_name
# GGUF: make sure that shared_expert_gate is a 2D tensor.
if (
"mlp.shared_expert_gate" in name
and len(loaded_weight.shape) == 1
):
loaded_weight = loaded_weight[None, :]
param = params_dict[name] param = params_dict[name]
weight_loader = getattr( weight_loader = getattr(
param, "weight_loader", default_weight_loader param, "weight_loader", default_weight_loader

View File

@ -381,6 +381,8 @@ class RocmPlatform(Platform):
compilation_config = vllm_config.compilation_config compilation_config = vllm_config.compilation_config
parallel_config = vllm_config.parallel_config parallel_config = vllm_config.parallel_config
is_eager_execution = compilation_config == CUDAGraphMode.NONE is_eager_execution = compilation_config == CUDAGraphMode.NONE
use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled()
use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enaled()
if compilation_config.cudagraph_mode.has_full_cudagraphs(): if compilation_config.cudagraph_mode.has_full_cudagraphs():
# decode context parallel does not support full cudagraphs # decode context parallel does not support full cudagraphs
@ -400,8 +402,6 @@ class RocmPlatform(Platform):
) )
compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled()
if cache_config and cache_config.block_size is None: if cache_config and cache_config.block_size is None:
cache_config.block_size = 16 cache_config.block_size = 16
@ -415,6 +415,9 @@ class RocmPlatform(Platform):
): ):
compilation_config.custom_ops.append("+rms_norm") compilation_config.custom_ops.append("+rms_norm")
if use_aiter_fp8_linear and "-quant_fp8" not in compilation_config.custom_ops:
compilation_config.custom_ops.append("+quant_fp8")
@classmethod @classmethod
def verify_model_arch(cls, model_arch: str) -> None: def verify_model_arch(cls, model_arch: str) -> None:
if model_arch in _ROCM_UNSUPPORTED_MODELS: if model_arch in _ROCM_UNSUPPORTED_MODELS:

View File

@ -3,26 +3,27 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from contextlib import nullcontext from contextlib import nullcontext
from typing import Literal
import torch import torch
from typing_extensions import override from typing_extensions import override
import vllm.envs as envs from vllm.config import ProfilerConfig
from vllm.logger import init_logger from vllm.logger import init_logger
logger = init_logger(__name__) logger = init_logger(__name__)
class WorkerProfiler(ABC): class WorkerProfiler(ABC):
def __init__(self) -> None: def __init__(self, profiler_config: ProfilerConfig) -> None:
self._delay_iters = envs.VLLM_PROFILER_DELAY_ITERS self._delay_iters = profiler_config.delay_iterations
if self._delay_iters > 0: if self._delay_iters > 0:
logger.info_once( logger.info_once(
"GPU profiling will start " "GPU profiling will start "
f"{self._delay_iters} steps after start_profile." f"{self._delay_iters} steps after start_profile."
) )
self._max_iters = envs.VLLM_PROFILER_MAX_ITERS self._max_iters = profiler_config.max_iterations
if self._max_iters > 0: if self._max_iters > 0:
logger.info_once( logger.info_once(
"GPU profiling will stop " "GPU profiling will stop "
@ -133,12 +134,27 @@ class WorkerProfiler(ABC):
return nullcontext() return nullcontext()
TorchProfilerActivity = Literal["CPU", "CUDA", "XPU"]
TorchProfilerActivityMap = {
"CPU": torch.profiler.ProfilerActivity.CPU,
"CUDA": torch.profiler.ProfilerActivity.CUDA,
"XPU": torch.profiler.ProfilerActivity.XPU,
}
class TorchProfilerWrapper(WorkerProfiler): class TorchProfilerWrapper(WorkerProfiler):
def __init__(self, worker_name: str, local_rank: int) -> None: def __init__(
super().__init__() self,
profiler_config: ProfilerConfig,
worker_name: str,
local_rank: int,
activities: list[TorchProfilerActivity],
) -> None:
super().__init__(profiler_config)
self.local_rank = local_rank self.local_rank = local_rank
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR self.profiler_config = profiler_config
torch_profiler_trace_dir = profiler_config.torch_profiler_dir
if local_rank in (None, 0): if local_rank in (None, 0):
logger.info( logger.info(
"Torch profiling enabled. Traces will be saved to: %s", "Torch profiling enabled. Traces will be saved to: %s",
@ -147,24 +163,23 @@ class TorchProfilerWrapper(WorkerProfiler):
logger.debug( logger.debug(
"Profiler config: record_shapes=%s," "Profiler config: record_shapes=%s,"
"profile_memory=%s,with_stack=%s,with_flops=%s", "profile_memory=%s,with_stack=%s,with_flops=%s",
envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, profiler_config.torch_profiler_record_shapes,
envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, profiler_config.torch_profiler_with_memory,
envs.VLLM_TORCH_PROFILER_WITH_STACK, profiler_config.torch_profiler_with_stack,
envs.VLLM_TORCH_PROFILER_WITH_FLOPS, profiler_config.torch_profiler_with_flops,
) )
self.dump_cpu_time_total = "CPU" in activities and len(activities) == 1
self.profiler = torch.profiler.profile( self.profiler = torch.profiler.profile(
activities=[ activities=[TorchProfilerActivityMap[activity] for activity in activities],
torch.profiler.ProfilerActivity.CPU, record_shapes=profiler_config.torch_profiler_record_shapes,
torch.profiler.ProfilerActivity.CUDA, profile_memory=profiler_config.torch_profiler_with_memory,
], with_stack=profiler_config.torch_profiler_with_stack,
record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, with_flops=profiler_config.torch_profiler_with_flops,
profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
on_trace_ready=torch.profiler.tensorboard_trace_handler( on_trace_ready=torch.profiler.tensorboard_trace_handler(
torch_profiler_trace_dir, torch_profiler_trace_dir,
worker_name=worker_name, worker_name=worker_name,
use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP, use_gzip=profiler_config.torch_profiler_use_gzip,
), ),
) )
@ -176,9 +191,10 @@ class TorchProfilerWrapper(WorkerProfiler):
def _stop(self) -> None: def _stop(self) -> None:
self.profiler.stop() self.profiler.stop()
if envs.VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: profiler_config = self.profiler_config
rank = self.local_rank rank = self.local_rank
profiler_dir = envs.VLLM_TORCH_PROFILER_DIR if profiler_config.torch_profiler_dump_cuda_time_total:
profiler_dir = profiler_config.torch_profiler_dir
profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt" profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt"
sort_key = "self_cuda_time_total" sort_key = "self_cuda_time_total"
table = self.profiler.key_averages().table(sort_by=sort_key) table = self.profiler.key_averages().table(sort_by=sort_key)
@ -189,6 +205,12 @@ class TorchProfilerWrapper(WorkerProfiler):
# only print profiler results on rank 0 # only print profiler results on rank 0
if rank == 0: if rank == 0:
print(table) print(table)
if self.dump_cpu_time_total and rank == 0:
logger.info(
self.profiler.key_averages().table(
sort_by="self_cpu_time_total", row_limit=50
)
)
@override @override
def annotate_context_manager(self, name: str): def annotate_context_manager(self, name: str):
@ -196,8 +218,8 @@ class TorchProfilerWrapper(WorkerProfiler):
class CudaProfilerWrapper(WorkerProfiler): class CudaProfilerWrapper(WorkerProfiler):
def __init__(self) -> None: def __init__(self, profiler_config: ProfilerConfig) -> None:
super().__init__() super().__init__(profiler_config)
# Note: lazy import to avoid dependency issues if CUDA is not available. # Note: lazy import to avoid dependency issues if CUDA is not available.
import torch.cuda.profiler as cuda_profiler import torch.cuda.profiler as cuda_profiler

View File

@ -63,6 +63,31 @@ class ReasoningParser:
True if the reasoning content ends in the input_ids. True if the reasoning content ends in the input_ids.
""" """
def is_reasoning_end_streaming(
self, input_ids: list[int], delta_ids: list[int]
) -> bool:
"""
Check if the reasoning content ends in the input_ids on a
decode step.
It is used in structured engines like `xgrammar` to check if the
reasoning content ends in the model output during a decode step.
`input_ids` the entire model output and `delta_ids` are the last few
computed tokens of the model output (like during a decode step).
Parameters:
input_ids: list[int]
The entire model output.
delta_ids: list[int]
The last few computed tokens of the model output at the current decode step.
Returns:
bool
True if the reasoning content ends in the `delta_ids` on a
decode step.
"""
return self.is_reasoning_end(input_ids)
@abstractmethod @abstractmethod
def extract_content_ids(self, input_ids: list[int]) -> list[int]: def extract_content_ids(self, input_ids: list[int]) -> list[int]:
""" """

View File

@ -74,6 +74,12 @@ class BaseThinkingReasoningParser(ReasoningParser):
return True return True
return False return False
def is_reasoning_end_streaming(
self, input_ids: list[int], delta_ids: list[int]
) -> bool:
end_token_id = self.end_token_id
return end_token_id in delta_ids
def extract_content_ids(self, input_ids: list[int]) -> list[int]: def extract_content_ids(self, input_ids: list[int]) -> list[int]:
""" """
Extract the content after the end tokens Extract the content after the end tokens

View File

@ -35,6 +35,11 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
return self._parser.is_reasoning_end(input_ids) return self._parser.is_reasoning_end(input_ids)
def is_reasoning_end_streaming(
self, input_ids: list[int], delta_ids: list[int]
) -> bool:
return self._parser.is_reasoning_end_streaming(input_ids, delta_ids)
def extract_content_ids(self, input_ids: list[int]) -> list[int]: def extract_content_ids(self, input_ids: list[int]) -> list[int]:
return self._parser.extract_content_ids(input_ids) return self._parser.extract_content_ids(input_ids)

View File

@ -56,6 +56,11 @@ class Holo2ReasoningParser(ReasoningParser):
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
return self._parser.is_reasoning_end(input_ids) return self._parser.is_reasoning_end(input_ids)
def is_reasoning_end_streaming(
self, input_ids: list[int], delta_ids: list[int]
) -> bool:
return self._parser.is_reasoning_end_streaming(input_ids, delta_ids)
def extract_content_ids(self, input_ids: list[int]) -> list[int]: def extract_content_ids(self, input_ids: list[int]) -> list[int]:
return self._parser.extract_content_ids(input_ids) return self._parser.extract_content_ids(input_ids)

View File

@ -32,6 +32,11 @@ class IdentityReasoningParser(ReasoningParser):
# Always return True, since we never treat reasoning specially # Always return True, since we never treat reasoning specially
return True return True
def is_reasoning_end_streaming(
self, input_ids: list[int], delta_ids: list[int]
) -> bool:
return True
def extract_content_ids(self, input_ids: list[int]) -> list[int]: def extract_content_ids(self, input_ids: list[int]) -> list[int]:
# Identity: return all tokens as content # Identity: return all tokens as content
return input_ids return input_ids

View File

@ -145,7 +145,7 @@ class CudagraphDispatcher:
num_tokens: int, num_tokens: int,
uniform_decode: bool, uniform_decode: bool,
has_lora: bool, has_lora: bool,
use_cascade_attn: bool = False, disable_full: bool = False,
) -> tuple[CUDAGraphMode, BatchDescriptor]: ) -> tuple[CUDAGraphMode, BatchDescriptor]:
""" """
Given conditions(e.g.,batch descriptor and if using cascade attention), Given conditions(e.g.,batch descriptor and if using cascade attention),
@ -165,7 +165,7 @@ class CudagraphDispatcher:
) )
relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs() relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs()
if not use_cascade_attn: if not disable_full:
# check if key exists for full cudagraph # check if key exists for full cudagraph
if batch_desc in self.cudagraph_keys[CUDAGraphMode.FULL]: if batch_desc in self.cudagraph_keys[CUDAGraphMode.FULL]:
return CUDAGraphMode.FULL, batch_desc return CUDAGraphMode.FULL, batch_desc

View File

@ -166,32 +166,24 @@ class AsyncLLM(EngineClient):
pass pass
if ( if (
envs.VLLM_TORCH_PROFILER_DIR vllm_config.profiler_config.profiler == "torch"
and not envs.VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM and not vllm_config.profiler_config.ignore_frontend
): ):
profiler_dir = vllm_config.profiler_config.torch_profiler_dir
logger.info( logger.info(
"Torch profiler enabled. AsyncLLM CPU traces will be collected under %s", # noqa: E501 "Torch profiler enabled. AsyncLLM CPU traces will be collected under %s", # noqa: E501
envs.VLLM_TORCH_PROFILER_DIR, profiler_dir,
) )
if envs.VLLM_PROFILER_MAX_ITERS > 0 or envs.VLLM_PROFILER_DELAY_ITERS > 0:
logger.warning_once(
"Torch profiler received max_iters or delay_iters setting. These "
"are not compatible with the AsyncLLM profiler and will be ignored "
"for the AsyncLLM process. Engine process profiling will still "
"respect these settings. Consider setting "
"VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM=1 to disable "
"AsyncLLM profiling."
)
worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm" worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm"
self.profiler = torch.profiler.profile( self.profiler = torch.profiler.profile(
activities=[ activities=[
torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CPU,
], ],
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, with_stack=vllm_config.profiler_config.torch_profiler_with_stack,
on_trace_ready=torch.profiler.tensorboard_trace_handler( on_trace_ready=torch.profiler.tensorboard_trace_handler(
envs.VLLM_TORCH_PROFILER_DIR, profiler_dir,
worker_name=worker_name, worker_name=worker_name,
use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP, use_gzip=vllm_config.profiler_config.torch_profiler_use_gzip,
), ),
) )
else: else:

View File

@ -1258,7 +1258,7 @@ class EagleProposer:
num_tokens_padded: int, num_tokens_padded: int,
) -> tuple[int, torch.Tensor]: ) -> tuple[int, torch.Tensor]:
# TODO(Flechman): support DBO ubatching # TODO(Flechman): support DBO ubatching
should_ubatch, num_toks_across_dp = coordinate_batch_across_dp( should_ubatch, num_toks_across_dp, _ = coordinate_batch_across_dp(
num_tokens_unpadded=num_tokens_unpadded, num_tokens_unpadded=num_tokens_unpadded,
parallel_config=self.vllm_config.parallel_config, parallel_config=self.vllm_config.parallel_config,
allow_microbatching=False, allow_microbatching=False,

View File

@ -339,7 +339,9 @@ class StructuredOutputManager:
return True return True
# Check if reasoning ends in *this* step # Check if reasoning ends in *this* step
if self.reasoner.is_reasoning_end(request.all_token_ids): if self.reasoner.is_reasoning_end_streaming(
request.all_token_ids, request.all_token_ids[request.num_computed_tokens :]
):
# Reasoning just ended, so we shouldn't advance til # Reasoning just ended, so we shouldn't advance til
# next pass # next pass
structured_req.reasoning_ended = True structured_req.reasoning_ended = True

View File

@ -13,6 +13,7 @@ from vllm.logger import init_logger
from vllm.model_executor.utils import set_random_seed from vllm.model_executor.utils import set_random_seed
from vllm.platforms import CpuArchEnum, current_platform from vllm.platforms import CpuArchEnum, current_platform
from vllm.platforms.cpu import CpuPlatform, LogicalCPUInfo from vllm.platforms.cpu import CpuPlatform, LogicalCPUInfo
from vllm.profiler.wrapper import TorchProfilerWrapper
from vllm.v1.worker.cpu_model_runner import CPUModelRunner from vllm.v1.worker.cpu_model_runner import CPUModelRunner
from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
@ -38,30 +39,17 @@ class CPUWorker(Worker):
self.parallel_config.disable_custom_all_reduce = True self.parallel_config.disable_custom_all_reduce = True
# Torch profiler. Enabled and configured through env vars: # Torch profiler. Enabled and configured through profiler_config.
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
self.profiler: Any | None = None self.profiler: Any | None = None
if envs.VLLM_TORCH_PROFILER_DIR: profiler_config = vllm_config.profiler_config
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR if profiler_config.profiler == "torch":
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
logger.info( self.profiler = TorchProfilerWrapper(
"Profiling enabled. Traces will be saved to: %s", profiler_config,
torch_profiler_trace_dir, worker_name=worker_name,
local_rank=self.local_rank,
activities=["CPU"],
) )
self.profiler = torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
],
record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
on_trace_ready=torch.profiler.tensorboard_trace_handler(
torch_profiler_trace_dir, worker_name=worker_name, use_gzip=False
),
)
else:
self.profiler = None
def init_device(self): def init_device(self):
# Setup OpenMP threads affinity. # Setup OpenMP threads affinity.
@ -202,9 +190,3 @@ class CPUWorker(Worker):
self.profiler.start() self.profiler.start()
else: else:
self.profiler.stop() self.profiler.stop()
if self.local_rank == 0:
logger.info(
self.profiler.key_averages().table(
sort_by="self_cpu_time_total", row_limit=50
)
)

View File

@ -40,16 +40,18 @@ def _run_ar(
should_dp_pad: bool, should_dp_pad: bool,
orig_num_tokens_per_ubatch: int, orig_num_tokens_per_ubatch: int,
padded_num_tokens_per_ubatch: int, padded_num_tokens_per_ubatch: int,
cudagraph_mode: int,
parallel_config: ParallelConfig, parallel_config: ParallelConfig,
) -> torch.Tensor: ) -> torch.Tensor:
dp_size = parallel_config.data_parallel_size dp_size = parallel_config.data_parallel_size
dp_rank = parallel_config.data_parallel_rank dp_rank = parallel_config.data_parallel_rank
device, group = _get_device_and_group(parallel_config) device, group = _get_device_and_group(parallel_config)
tensor = torch.zeros(4, dp_size, device=device, dtype=torch.int32) tensor = torch.zeros(5, dp_size, device=device, dtype=torch.int32)
tensor[0][dp_rank] = orig_num_tokens_per_ubatch tensor[0][dp_rank] = orig_num_tokens_per_ubatch
tensor[1][dp_rank] = padded_num_tokens_per_ubatch tensor[1][dp_rank] = padded_num_tokens_per_ubatch
tensor[2][dp_rank] = 1 if should_ubatch else 0 tensor[2][dp_rank] = 1 if should_ubatch else 0
tensor[3][dp_rank] = 1 if should_dp_pad else 0 tensor[3][dp_rank] = 1 if should_dp_pad else 0
tensor[4][dp_rank] = cudagraph_mode
dist.all_reduce(tensor, group=group) dist.all_reduce(tensor, group=group)
return tensor return tensor
@ -89,13 +91,23 @@ def _post_process_dp_padding(tensor: torch.Tensor, should_dp_pad: bool) -> torch
return num_tokens_across_dp.cpu() return num_tokens_across_dp.cpu()
def _post_process_cudagraph_mode(tensor: torch.Tensor) -> int:
"""
Synchronize cudagraph_mode across DP ranks by taking the minimum.
If any rank has NONE (0), all ranks use NONE.
This ensures all ranks send consistent values (all padded or all unpadded).
"""
return int(tensor[4, :].min().item())
def _synchronize_dp_ranks( def _synchronize_dp_ranks(
num_tokens_unpadded: int, num_tokens_unpadded: int,
num_tokens_padded: int, num_tokens_padded: int,
should_attempt_ubatching: bool, should_attempt_ubatching: bool,
should_attempt_dp_padding: bool, should_attempt_dp_padding: bool,
cudagraph_mode: int,
parallel_config: ParallelConfig, parallel_config: ParallelConfig,
) -> tuple[bool, torch.Tensor | None]: ) -> tuple[bool, torch.Tensor | None, int]:
""" """
1. Decides if each DP rank is going to microbatch. Either all ranks 1. Decides if each DP rank is going to microbatch. Either all ranks
run with microbatching or none of them do. run with microbatching or none of them do.
@ -104,10 +116,13 @@ def _synchronize_dp_ranks(
When running microbatched or if should_attempt_dp_padding is True, all When running microbatched or if should_attempt_dp_padding is True, all
ranks will be padded out so that the run with the same number of tokens ranks will be padded out so that the run with the same number of tokens
3. Synchronizes cudagraph_mode across ranks by taking the minimum.
Returns: tuple[ Returns: tuple[
should_ubatch: Are all DP ranks going to microbatch should_ubatch: Are all DP ranks going to microbatch
num_tokens_after_padding: A tensor containing the total number of num_tokens_after_padding: A tensor containing the total number of
tokens per-microbatch for each DP rank including any DP padding. tokens per-microbatch for each DP rank including any DP padding.
synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks)
] ]
""" """
@ -121,6 +136,7 @@ def _synchronize_dp_ranks(
should_dp_pad=should_attempt_dp_padding, should_dp_pad=should_attempt_dp_padding,
orig_num_tokens_per_ubatch=num_tokens_unpadded, orig_num_tokens_per_ubatch=num_tokens_unpadded,
padded_num_tokens_per_ubatch=num_tokens_padded, padded_num_tokens_per_ubatch=num_tokens_padded,
cudagraph_mode=cudagraph_mode,
parallel_config=parallel_config, parallel_config=parallel_config,
) )
@ -148,7 +164,10 @@ def _synchronize_dp_ranks(
should_dp_pad, should_dp_pad,
) )
return should_ubatch, num_tokens_after_padding # Synchronize cudagraph_mode across ranks (take min)
synced_cudagraph_mode = _post_process_cudagraph_mode(tensor)
return should_ubatch, num_tokens_after_padding, synced_cudagraph_mode
def coordinate_batch_across_dp( def coordinate_batch_across_dp(
@ -159,7 +178,8 @@ def coordinate_batch_across_dp(
num_tokens_padded: int | None = None, num_tokens_padded: int | None = None,
uniform_decode: bool | None = None, uniform_decode: bool | None = None,
num_scheduled_tokens_per_request: np.ndarray | None = None, num_scheduled_tokens_per_request: np.ndarray | None = None,
) -> tuple[bool, torch.Tensor | None]: cudagraph_mode: int = 0,
) -> tuple[bool, torch.Tensor | None, int]:
""" """
Coordinates amongst all DP ranks to determine if and how the full batch Coordinates amongst all DP ranks to determine if and how the full batch
should be split into microbatches. should be split into microbatches.
@ -175,6 +195,7 @@ def coordinate_batch_across_dp(
only contains single token decodes only contains single token decodes
num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The
number of tokens per request. number of tokens per request.
cudagraph_mode: The cudagraph mode for this rank (0=NONE, 1=PIECEWISE, 2=FULL)
Returns: tuple[ Returns: tuple[
ubatch_slices: if this is set then all DP ranks have agreed to ubatch_slices: if this is set then all DP ranks have agreed to
@ -183,12 +204,13 @@ def coordinate_batch_across_dp(
tokens per-microbatch for each DP rank including padding. Will be tokens per-microbatch for each DP rank including padding. Will be
padded up to the max value across all DP ranks when allow_dp_padding padded up to the max value across all DP ranks when allow_dp_padding
is True. is True.
synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks)
] ]
""" """
if parallel_config.data_parallel_size == 1: if parallel_config.data_parallel_size == 1:
# Early exit. # Early exit.
return False, None return False, None, cudagraph_mode
# If the caller has explicitly enabled microbatching. # If the caller has explicitly enabled microbatching.
should_attempt_ubatching = False should_attempt_ubatching = False
@ -204,12 +226,15 @@ def coordinate_batch_across_dp(
if num_tokens_padded is None: if num_tokens_padded is None:
num_tokens_padded = num_tokens_unpadded num_tokens_padded = num_tokens_unpadded
(should_ubatch, num_tokens_after_padding) = _synchronize_dp_ranks( (should_ubatch, num_tokens_after_padding, synced_cudagraph_mode) = (
num_tokens_unpadded, _synchronize_dp_ranks(
num_tokens_padded, num_tokens_unpadded,
should_attempt_ubatching, num_tokens_padded,
allow_dp_padding, should_attempt_ubatching,
parallel_config, allow_dp_padding,
cudagraph_mode,
parallel_config,
)
) )
return (should_ubatch, num_tokens_after_padding) return (should_ubatch, num_tokens_after_padding, synced_cudagraph_mode)

View File

@ -2,14 +2,15 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from contextlib import contextmanager from contextlib import contextmanager
import numpy as np
import torch import torch
from vllm.v1.outputs import ( from vllm.v1.outputs import (
AsyncModelRunnerOutput, AsyncModelRunnerOutput,
LogprobsTensors, LogprobsTensors,
ModelRunnerOutput, ModelRunnerOutput,
SamplerOutput,
) )
from vllm.v1.worker.gpu.sample.output import SamplerOutput
class AsyncOutput(AsyncModelRunnerOutput): class AsyncOutput(AsyncModelRunnerOutput):
@ -34,29 +35,18 @@ class AsyncOutput(AsyncModelRunnerOutput):
with torch.cuda.stream(self.copy_stream): with torch.cuda.stream(self.copy_stream):
self.copy_stream.wait_stream(default_stream) self.copy_stream.wait_stream(default_stream)
# NOTE(woosuk): We must ensure that CPU tensors are not freed self.sampled_token_ids = async_copy_to_np(sampler_output.sampled_token_ids)
# before the device-to-host copy is fully completed. For instance,
# operations like
# self.sampled_token_np = ...to("cpu", non_blocking=True).numpy()
# are unsafe because the underlying CPU tensor can be prematurely freed and
# reused by other tensors before the asynchronous copy finishes, potentially
# causing race conditions. To prevent this, we delay freeing by holding
# references until the copy event signals completion.
# Likewise, we also need to keep the reference to the GPU tensors.
# This is done by keeping the reference to sampler_output and
# model_runner_output.
self.sampled_token_ids = sampler_output.sampled_token_ids.to(
"cpu", non_blocking=True
)
if sampler_output.logprobs_tensors is not None: if sampler_output.logprobs_tensors is not None:
self.logprobs_tensors: LogprobsTensors | None = ( self.logprobs_tensors: LogprobsTensors | None = (
sampler_output.logprobs_tensors.to_cpu_nonblocking() sampler_output.logprobs_tensors.to_cpu_nonblocking()
) )
else: else:
self.logprobs_tensors = None self.logprobs_tensors = None
self.num_sampled_tokens_cpu = num_sampled_tokens.to( if sampler_output.num_nans is not None:
"cpu", non_blocking=True self.num_nans = async_copy_to_np(sampler_output.num_nans)
) else:
self.num_nans = None
self.num_sampled_tokens_np = async_copy_to_np(num_sampled_tokens)
self.prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {} self.prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {}
if self.model_runner_output.prompt_logprobs_dict: if self.model_runner_output.prompt_logprobs_dict:
for k, v in self.model_runner_output.prompt_logprobs_dict.items(): for k, v in self.model_runner_output.prompt_logprobs_dict.items():
@ -68,7 +58,6 @@ class AsyncOutput(AsyncModelRunnerOutput):
def get_output(self) -> ModelRunnerOutput: def get_output(self) -> ModelRunnerOutput:
self.copy_event.synchronize() self.copy_event.synchronize()
num_sampled_tokens_np = self.num_sampled_tokens_cpu.numpy()
# NOTE(woosuk): The following code is to ensure compatibility with # NOTE(woosuk): The following code is to ensure compatibility with
# the existing model runner. # the existing model runner.
@ -76,10 +65,18 @@ class AsyncOutput(AsyncModelRunnerOutput):
# rather than Python lists. # rather than Python lists.
sampled_token_ids: list[list[int]] = self.sampled_token_ids.tolist() sampled_token_ids: list[list[int]] = self.sampled_token_ids.tolist()
num_reqs = len(sampled_token_ids) num_reqs = len(sampled_token_ids)
num_sampled_tokens = self.num_sampled_tokens_np.tolist()
for i in range(num_reqs): for i in range(num_reqs):
del sampled_token_ids[i][num_sampled_tokens_np[i] :] del sampled_token_ids[i][num_sampled_tokens[i] :]
self.model_runner_output.sampled_token_ids = sampled_token_ids self.model_runner_output.sampled_token_ids = sampled_token_ids
if self.num_nans is not None:
num_nans = self.num_nans.tolist()
self.model_runner_output.num_nans_in_logits = {
req_id: num_nans[i]
for i, req_id in enumerate(self.model_runner_output.req_ids)
}
if self.logprobs_tensors is not None: if self.logprobs_tensors is not None:
self.model_runner_output.logprobs = self.logprobs_tensors.tolists() self.model_runner_output.logprobs = self.logprobs_tensors.tolists()
self.model_runner_output.prompt_logprobs_dict = self.prompt_logprobs_dict self.model_runner_output.prompt_logprobs_dict = self.prompt_logprobs_dict
@ -95,3 +92,7 @@ def async_barrier(event: torch.cuda.Event | None):
finally: finally:
if event is not None: if event is not None:
event.record() event.record()
def async_copy_to_np(x: torch.Tensor) -> np.ndarray:
return x.to("cpu", non_blocking=True).numpy()

View File

View File

@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from torch._inductor.runtime.triton_helpers import libdevice
from vllm.triton_utils import tl, triton
@triton.jit
def _num_nans_kernel(
logits_ptr,
logits_stride,
num_nans_ptr,
vocab_size,
BLOCK_SIZE: tl.constexpr,
):
req_idx = tl.program_id(0)
num_nans = 0
for i in range(0, vocab_size, BLOCK_SIZE):
block = i + tl.arange(0, BLOCK_SIZE)
mask = block < vocab_size
logits = tl.load(
logits_ptr + req_idx * logits_stride + block, mask=mask, other=0
)
logits = logits.to(tl.float32)
is_nan = libdevice.isnan(logits).to(tl.int1)
num_nans += tl.sum(is_nan).to(tl.int32)
tl.store(num_nans_ptr + req_idx, num_nans)
def get_num_nans(logits: torch.Tensor) -> torch.Tensor:
num_reqs, vocab_size = logits.shape
BLOCK_SIZE = 8192
num_nans = torch.empty(num_reqs, dtype=torch.int32, device=logits.device)
_num_nans_kernel[(num_reqs,)](
logits,
logits.stride(0),
num_nans,
vocab_size,
BLOCK_SIZE=BLOCK_SIZE,
)
return num_nans

View File

@ -25,7 +25,6 @@ from vllm.v1.outputs import (
LogprobsTensors, LogprobsTensors,
ModelRunnerOutput, ModelRunnerOutput,
) )
from vllm.v1.sample.sampler import SamplerOutput
from vllm.v1.worker.gpu.async_utils import AsyncOutput, async_barrier from vllm.v1.worker.gpu.async_utils import AsyncOutput, async_barrier
from vllm.v1.worker.gpu.attn_utils import ( from vllm.v1.worker.gpu.attn_utils import (
build_attn_metadata, build_attn_metadata,
@ -53,6 +52,7 @@ from vllm.v1.worker.gpu.sample.metadata import (
SamplingMetadata, SamplingMetadata,
expand_sampling_metadata, expand_sampling_metadata,
) )
from vllm.v1.worker.gpu.sample.output import SamplerOutput
from vllm.v1.worker.gpu.sample.sampler import Sampler from vllm.v1.worker.gpu.sample.sampler import Sampler
from vllm.v1.worker.gpu.spec_decode import init_speculator from vllm.v1.worker.gpu.spec_decode import init_speculator
from vllm.v1.worker.gpu.spec_decode.rejection_sample import rejection_sample from vllm.v1.worker.gpu.spec_decode.rejection_sample import rejection_sample

View File

@ -39,9 +39,7 @@ def _min_p_kernel(
tl.store(logits_ptr + req_idx * logits_stride + block, logits, mask=mask) tl.store(logits_ptr + req_idx * logits_stride + block, logits, mask=mask)
def apply_min_p(logits: torch.Tensor, min_p: torch.Tensor | None) -> None: def apply_min_p(logits: torch.Tensor, min_p: torch.Tensor) -> None:
if min_p is None:
return
num_reqs, vocab_size = logits.shape num_reqs, vocab_size = logits.shape
BLOCK_SIZE = 1024 BLOCK_SIZE = 1024
_min_p_kernel[(num_reqs,)]( _min_p_kernel[(num_reqs,)](

View File

@ -0,0 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
import torch
from vllm.v1.outputs import LogprobsTensors
@dataclass
class SamplerOutput:
sampled_token_ids: torch.Tensor
logprobs_tensors: LogprobsTensors | None
num_nans: torch.Tensor | None

View File

@ -62,6 +62,7 @@ def _penalties_and_temperature_kernel(
mask=packed_block < tl.cdiv(vocab_size, 32), mask=packed_block < tl.cdiv(vocab_size, 32),
) )
prompt_bin_mask = (packed_mask[:, None] >> (tl.arange(0, 32)[None, :])) & 1 prompt_bin_mask = (packed_mask[:, None] >> (tl.arange(0, 32)[None, :])) & 1
prompt_bin_mask = prompt_bin_mask.to(tl.int1)
prompt_bin_mask = prompt_bin_mask.reshape(BLOCK_SIZE) prompt_bin_mask = prompt_bin_mask.reshape(BLOCK_SIZE)
# If token appears in prompt or output, apply, otherwise use 1.0 for no-op. # If token appears in prompt or output, apply, otherwise use 1.0 for no-op.

View File

@ -3,13 +3,15 @@
import torch import torch
import vllm.envs as envs
from vllm.config.model import LogprobsMode from vllm.config.model import LogprobsMode
from vllm.v1.outputs import SamplerOutput
from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
from vllm.v1.worker.gpu.metrics.logits import get_num_nans
from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
from vllm.v1.worker.gpu.sample.logprob import compute_topk_logprobs from vllm.v1.worker.gpu.sample.logprob import compute_topk_logprobs
from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
from vllm.v1.worker.gpu.sample.min_p import apply_min_p from vllm.v1.worker.gpu.sample.min_p import apply_min_p
from vllm.v1.worker.gpu.sample.output import SamplerOutput
from vllm.v1.worker.gpu.sample.penalties import apply_penalties_and_temperature from vllm.v1.worker.gpu.sample.penalties import apply_penalties_and_temperature
@ -21,12 +23,16 @@ class Sampler:
if logprobs_mode not in ["processed_logprobs", "raw_logprobs"]: if logprobs_mode not in ["processed_logprobs", "raw_logprobs"]:
raise NotImplementedError(f"Unsupported logprobs_mode: {logprobs_mode}") raise NotImplementedError(f"Unsupported logprobs_mode: {logprobs_mode}")
self.logprobs_mode = logprobs_mode self.logprobs_mode = logprobs_mode
self.compute_nans = envs.VLLM_COMPUTE_NANS_IN_LOGITS # False by default.
def __call__( def __call__(
self, self,
logits: torch.Tensor, logits: torch.Tensor,
sampling_metadata: SamplingMetadata, sampling_metadata: SamplingMetadata,
) -> SamplerOutput: ) -> SamplerOutput:
# NOTE(woosuk): We intentionally compute num_nans before sampling to make clear
# that num_nans is computed before applying penalties and temperature.
num_nans = get_num_nans(logits) if self.compute_nans else None
sampled, processed_logits = self.sample(logits, sampling_metadata) sampled, processed_logits = self.sample(logits, sampling_metadata)
if sampling_metadata.max_num_logprobs is not None: if sampling_metadata.max_num_logprobs is not None:
logits = ( logits = (
@ -49,6 +55,7 @@ class Sampler:
# token per request. # token per request.
sampled_token_ids=sampled.view(-1, 1), sampled_token_ids=sampled.view(-1, 1),
logprobs_tensors=logprobs_tensors, logprobs_tensors=logprobs_tensors,
num_nans=num_nans,
) )
return sampler_output return sampler_output
@ -63,7 +70,8 @@ class Sampler:
# Apply penalties and temperature in place. # Apply penalties and temperature in place.
apply_penalties_and_temperature(logits, sampling_metadata) apply_penalties_and_temperature(logits, sampling_metadata)
# Apply min_p in place. # Apply min_p in place.
apply_min_p(logits, sampling_metadata.min_p) if sampling_metadata.min_p is not None:
apply_min_p(logits, sampling_metadata.min_p)
# Apply top_k and/or top_p. This might return a new tensor. # Apply top_k and/or top_p. This might return a new tensor.
logits = apply_top_k_top_p( logits = apply_top_k_top_p(
logits, sampling_metadata.top_k, sampling_metadata.top_p logits, sampling_metadata.top_k, sampling_metadata.top_p

View File

@ -2788,17 +2788,19 @@ class GPUModelRunner(
) )
dispatch_cudagraph = ( dispatch_cudagraph = (
lambda num_tokens: self.cudagraph_dispatcher.dispatch( lambda num_tokens, disable_full: self.cudagraph_dispatcher.dispatch(
num_tokens=num_tokens, num_tokens=num_tokens,
has_lora=has_lora, has_lora=has_lora,
use_cascade_attn=use_cascade_attn,
uniform_decode=uniform_decode, uniform_decode=uniform_decode,
disable_full=disable_full,
) )
if not force_eager if not force_eager
else (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded)) else (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded))
) )
cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded) cudagraph_mode, batch_descriptor = dispatch_cudagraph(
num_tokens_padded, use_cascade_attn
)
num_tokens_padded = batch_descriptor.num_tokens num_tokens_padded = batch_descriptor.num_tokens
# Extra coordination when running data-parallel since we need to coordinate # Extra coordination when running data-parallel since we need to coordinate
@ -2813,23 +2815,28 @@ class GPUModelRunner(
self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
) )
should_ubatch, num_tokens_across_dp = coordinate_batch_across_dp( should_ubatch, num_tokens_across_dp, synced_cudagraph_mode = (
num_tokens_unpadded=num_tokens, coordinate_batch_across_dp(
parallel_config=self.parallel_config, num_tokens_unpadded=num_tokens,
allow_microbatching=allow_microbatching, parallel_config=self.parallel_config,
allow_dp_padding=allow_dp_padding, allow_microbatching=allow_microbatching,
num_tokens_padded=num_tokens_padded, allow_dp_padding=allow_dp_padding,
uniform_decode=uniform_decode, num_tokens_padded=num_tokens_padded,
num_scheduled_tokens_per_request=num_scheduled_tokens_np, uniform_decode=uniform_decode,
num_scheduled_tokens_per_request=num_scheduled_tokens_np,
cudagraph_mode=cudagraph_mode.value,
)
) )
# Extract DP padding if there is any # Extract DP-synced values
if num_tokens_across_dp is not None: if num_tokens_across_dp is not None:
dp_rank = self.parallel_config.data_parallel_rank dp_rank = self.parallel_config.data_parallel_rank
num_tokens_padded = int(num_tokens_across_dp[dp_rank].item()) num_tokens_padded = int(num_tokens_across_dp[dp_rank].item())
# Re-dispatch with DP padding so we have the correct batch_descriptor
# Re-dispatch with DP padding cudagraph_mode, batch_descriptor = dispatch_cudagraph(
cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded) num_tokens_padded,
disable_full=synced_cudagraph_mode <= CUDAGraphMode.PIECEWISE.value,
)
# Assert to make sure the agreed upon token count is correct otherwise # Assert to make sure the agreed upon token count is correct otherwise
# num_tokens_across_dp will no-longer be valid # num_tokens_across_dp will no-longer be valid
assert batch_descriptor.num_tokens == num_tokens_padded assert batch_descriptor.num_tokens == num_tokens_padded
@ -4161,10 +4168,19 @@ class GPUModelRunner(
if self.speculative_config and self.speculative_config.use_eagle(): if self.speculative_config and self.speculative_config.use_eagle():
assert isinstance(self.drafter, EagleProposer) assert isinstance(self.drafter, EagleProposer)
# Eagle currently only supports PIECEWISE cudagraphs.
# Therefore only use cudagraphs if the main model uses PIECEWISE
# NOTE(lucas): this is a hack, need to clean up.
use_cudagraphs = ( use_cudagraphs = (
cudagraph_runtime_mode.has_mode(CUDAGraphMode.PIECEWISE) (
and not self.speculative_config.enforce_eager is_graph_capturing
) and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
)
or (
not is_graph_capturing
and cudagraph_runtime_mode != CUDAGraphMode.NONE
)
) and not self.speculative_config.enforce_eager
# Note(gnovack) - We need to disable cudagraphs for one of the two # Note(gnovack) - We need to disable cudagraphs for one of the two
# lora cases when cudagraph_specialize_lora is enabled. This is a # lora cases when cudagraph_specialize_lora is enabled. This is a

View File

@ -38,7 +38,7 @@ from vllm.model_executor import set_random_seed
from vllm.model_executor.models.interfaces import is_mixture_of_experts from vllm.model_executor.models.interfaces import is_mixture_of_experts
from vllm.model_executor.warmup.kernel_warmup import kernel_warmup from vllm.model_executor.warmup.kernel_warmup import kernel_warmup
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.profiler.gpu_profiler import CudaProfilerWrapper, TorchProfilerWrapper from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.tasks import SupportedTask from vllm.tasks import SupportedTask
from vllm.utils.mem_constants import GiB_bytes from vllm.utils.mem_constants import GiB_bytes
@ -79,6 +79,10 @@ class Worker(WorkerBase):
is_driver_worker=is_driver_worker, is_driver_worker=is_driver_worker,
) )
# configure float32 matmul precision according to vLLM env.
precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
torch.set_float32_matmul_precision(precision)
if self.model_config.trust_remote_code: if self.model_config.trust_remote_code:
# note: lazy import to avoid importing torch before initializing # note: lazy import to avoid importing torch before initializing
from vllm.utils.import_utils import init_cached_hf_modules from vllm.utils.import_utils import init_cached_hf_modules
@ -88,17 +92,19 @@ class Worker(WorkerBase):
# Buffers saved before sleep # Buffers saved before sleep
self._sleep_saved_buffers: dict[str, torch.Tensor] = {} self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
# Torch/CUDA profiler. Enabled and configured through env vars: # Torch/CUDA profiler. Enabled and configured through profiler_config.
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
# VLLM_TORCH_CUDA_PROFILE=1
self.profiler: Any | None = None self.profiler: Any | None = None
if envs.VLLM_TORCH_PROFILER_DIR: profiler_config = vllm_config.profiler_config
if profiler_config.profiler == "torch":
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
self.profiler = TorchProfilerWrapper( self.profiler = TorchProfilerWrapper(
worker_name=worker_name, local_rank=self.local_rank profiler_config,
worker_name=worker_name,
local_rank=self.local_rank,
activities=["CPU", "CUDA"],
) )
elif envs.VLLM_TORCH_CUDA_PROFILE: elif profiler_config.profiler == "cuda":
self.profiler = CudaProfilerWrapper() self.profiler = CudaProfilerWrapper(profiler_config)
else: else:
self.profiler = None self.profiler = None

View File

@ -98,10 +98,10 @@ class TPUWorker:
# MP runtime is initialized. # MP runtime is initialized.
self.profiler = None self.profiler = None
self.profile_dir = None self.profile_dir = None
if envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1: if vllm_config.profiler_config.profiler == "torch" and self.rank < 1:
# For TPU, we can only have 1 active profiler session for 1 profiler # For TPU, we can only have 1 active profiler session for 1 profiler
# server. So we only profile on rank0. # server. So we only profile on rank0.
self.profile_dir = envs.VLLM_TORCH_PROFILER_DIR self.profile_dir = vllm_config.profiler_config.torch_profiler_dir
logger.info( logger.info(
"Profiling enabled. Traces will be saved to: %s", self.profile_dir "Profiling enabled. Traces will be saved to: %s", self.profile_dir
) )

View File

@ -6,12 +6,12 @@ from typing import Any
import torch import torch
import torch.distributed import torch.distributed
import vllm.envs as envs
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.distributed import get_world_group from vllm.distributed import get_world_group
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor import set_random_seed from vllm.model_executor import set_random_seed
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.profiler.wrapper import TorchProfilerWrapper
from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
from vllm.v1.worker.xpu_model_runner import XPUModelRunner from vllm.v1.worker.xpu_model_runner import XPUModelRunner
@ -36,41 +36,17 @@ class XPUWorker(Worker):
assert device_config.device_type == "xpu" assert device_config.device_type == "xpu"
assert current_platform.is_xpu() assert current_platform.is_xpu()
# Torch profiler. Enabled and configured through env vars: # Torch profiler. Enabled and configured through profiler_config.
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
self.profiler: Any | None = None self.profiler: Any | None = None
if envs.VLLM_TORCH_PROFILER_DIR: profiler_config = vllm_config.profiler_config
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR if profiler_config.profiler == "torch":
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
logger.info( self.profiler = TorchProfilerWrapper(
"Profiling enabled. Traces will be saved to: %s", profiler_config,
torch_profiler_trace_dir, worker_name=worker_name,
local_rank=self.local_rank,
activities=["CPU", "XPU"],
) )
logger.debug(
"Profiler config: record_shapes=%s,"
"profile_memory=%s,with_stack=%s,with_flops=%s",
envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
envs.VLLM_TORCH_PROFILER_WITH_STACK,
envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
)
self.profiler = torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.XPU,
],
record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
on_trace_ready=torch.profiler.tensorboard_trace_handler(
torch_profiler_trace_dir,
worker_name=worker_name,
use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
),
)
else:
self.profiler = None
# we provide this function due to `torch.xpu.mem_get_info()` doesn't # we provide this function due to `torch.xpu.mem_get_info()` doesn't
# return correct free_gpu_memory on intel client GPU. We need to # return correct free_gpu_memory on intel client GPU. We need to