mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-09 18:19:12 +08:00
Merge branch 'main' into imarkov/eplb_optimizations
This commit is contained in:
commit
60f744d7ce
@ -398,7 +398,8 @@ steps:
|
|||||||
timeout_in_minutes: 25
|
timeout_in_minutes: 25
|
||||||
gpu: h100
|
gpu: h100
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/v1/attention
|
||||||
|
- vllm/model_executor/layers
|
||||||
- tests/v1/determinism/
|
- tests/v1/determinism/
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
@ -440,23 +441,29 @@ steps:
|
|||||||
working_dir: "/vllm-workspace/examples"
|
working_dir: "/vllm-workspace/examples"
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/entrypoints
|
- vllm/entrypoints
|
||||||
|
- vllm/multimodal
|
||||||
- examples/
|
- examples/
|
||||||
commands:
|
commands:
|
||||||
- pip install tensorizer # for tensorizer test
|
- pip install tensorizer # for tensorizer test
|
||||||
|
# for basic
|
||||||
|
- python3 offline_inference/basic/chat.py
|
||||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||||
- python3 offline_inference/basic/chat.py
|
- python3 offline_inference/basic/classify.py
|
||||||
- python3 offline_inference/prefix_caching.py
|
- python3 offline_inference/basic/embed.py
|
||||||
- python3 offline_inference/llm_engine_example.py
|
- python3 offline_inference/basic/score.py
|
||||||
|
# for multi-modal models
|
||||||
- python3 offline_inference/audio_language.py --seed 0
|
- python3 offline_inference/audio_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language_pooling.py --seed 0
|
- python3 offline_inference/vision_language_pooling.py --seed 0
|
||||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
|
||||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||||
- python3 offline_inference/basic/classify.py
|
# for pooling models
|
||||||
- python3 offline_inference/basic/embed.py
|
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||||
- python3 offline_inference/basic/score.py
|
# for features demo
|
||||||
|
- python3 offline_inference/prefix_caching.py
|
||||||
|
- python3 offline_inference/llm_engine_example.py
|
||||||
|
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||||
@ -718,6 +725,18 @@ steps:
|
|||||||
- uv pip install --system conch-triton-kernels
|
- uv pip install --system conch-triton-kernels
|
||||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||||
|
|
||||||
|
- label: LM Eval Small Models # 53min
|
||||||
|
timeout_in_minutes: 75
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
agent_pool: mi325_1
|
||||||
|
# grade: Blocking
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/
|
||||||
|
- vllm/model_executor/layers/quantization
|
||||||
|
autorun_on_main: true
|
||||||
|
commands:
|
||||||
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||||
|
|
||||||
- label: OpenAI API correctness # 10min
|
- label: OpenAI API correctness # 10min
|
||||||
timeout_in_minutes: 15
|
timeout_in_minutes: 15
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
@ -727,7 +746,7 @@ steps:
|
|||||||
- csrc/
|
- csrc/
|
||||||
- vllm/entrypoints/openai/
|
- vllm/entrypoints/openai/
|
||||||
- vllm/model_executor/models/whisper.py
|
- vllm/model_executor/models/whisper.py
|
||||||
commands: # LMEval
|
commands: # LMEval+Transcription WER check
|
||||||
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
|
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
|
||||||
- pytest -s entrypoints/openai/correctness/
|
- pytest -s entrypoints/openai/correctness/
|
||||||
|
|
||||||
@ -963,6 +982,19 @@ steps:
|
|||||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||||
|
|
||||||
|
- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
|
||||||
|
timeout_in_minutes: 180
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
agent_pool: mi325_1
|
||||||
|
# grade: Blocking
|
||||||
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/multimodal/
|
||||||
|
- vllm/inputs/
|
||||||
|
- vllm/v1/core/
|
||||||
|
commands:
|
||||||
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||||
|
|
||||||
- label: Multi-Modal Models Test (Extended) 1 # 60min
|
- label: Multi-Modal Models Test (Extended) 1 # 60min
|
||||||
timeout_in_minutes: 120
|
timeout_in_minutes: 120
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@ -1098,7 +1130,6 @@ steps:
|
|||||||
- vllm/model_executor/layers/layernorm.py
|
- vllm/model_executor/layers/layernorm.py
|
||||||
- vllm/model_executor/layers/activation.py
|
- vllm/model_executor/layers/activation.py
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
- vllm/model_executor/layers/fused_moe/layer.py
|
|
||||||
- tests/compile/test_fusion_attn.py
|
- tests/compile/test_fusion_attn.py
|
||||||
- tests/compile/test_silu_mul_quant_fusion.py
|
- tests/compile/test_silu_mul_quant_fusion.py
|
||||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
@ -1132,12 +1163,25 @@ steps:
|
|||||||
- vllm/model_executor/layers/activation.py
|
- vllm/model_executor/layers/activation.py
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
- tests/compile/distributed/test_fusions_e2e.py
|
- tests/compile/distributed/test_fusions_e2e.py
|
||||||
- tests/compile/fullgraph/test_full_graph.py
|
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
# Run all e2e fusion tests
|
# Run all e2e fusion tests
|
||||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||||
|
|
||||||
|
- label: Blackwell GPT-OSS Eval
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
gpu: b200
|
||||||
|
optional: true # run on nightlies
|
||||||
|
source_file_dependencies:
|
||||||
|
- tests/evals/gpt_oss
|
||||||
|
- vllm/model_executor/models/gpt_oss.py
|
||||||
|
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||||
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
|
commands:
|
||||||
|
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||||
|
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||||
|
|
||||||
- label: Blackwell Quantized MoE Test
|
- label: Blackwell Quantized MoE Test
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
@ -1155,6 +1199,16 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
||||||
|
|
||||||
|
- label: Blackwell LM Eval Small Models
|
||||||
|
timeout_in_minutes: 120
|
||||||
|
gpu: b200
|
||||||
|
optional: true # run on nightlies
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/
|
||||||
|
- vllm/model_executor/layers/quantization
|
||||||
|
commands:
|
||||||
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
||||||
|
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
|
|
||||||
@ -1397,6 +1451,39 @@ steps:
|
|||||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||||
- pytest -v -s -x lora/test_mixtral.py
|
- pytest -v -s -x lora/test_mixtral.py
|
||||||
|
|
||||||
|
|
||||||
|
- label: LM Eval Large Models # optional
|
||||||
|
gpu: a100
|
||||||
|
optional: true
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
agent_pool: mi325_4
|
||||||
|
# grade: Blocking
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/
|
||||||
|
- vllm/model_executor/layers/quantization
|
||||||
|
commands:
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||||
|
|
||||||
|
##### H100 test #####
|
||||||
|
- label: LM Eval Large Models (H100) # optional
|
||||||
|
gpu: h100
|
||||||
|
optional: true
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
agent_pool: mi325_4
|
||||||
|
# grade: Blocking
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/
|
||||||
|
- vllm/model_executor/layers/quantization
|
||||||
|
commands:
|
||||||
|
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||||
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||||
|
|
||||||
|
|
||||||
##### H200 test #####
|
##### H200 test #####
|
||||||
- label: Distributed Tests (H200) # optional
|
- label: Distributed Tests (H200) # optional
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@ -1440,29 +1527,6 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||||
|
|
||||||
- label: Blackwell LM Eval Small Models
|
|
||||||
timeout_in_minutes: 120
|
|
||||||
gpu: b200
|
|
||||||
optional: true # run on nightlies
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/
|
|
||||||
- vllm/model_executor/layers/quantization
|
|
||||||
commands:
|
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
|
||||||
|
|
||||||
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
|
|
||||||
timeout_in_minutes: 70
|
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
agent_pool: mi325_1
|
|
||||||
# grade: Blocking
|
|
||||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/multimodal/
|
|
||||||
- vllm/inputs/
|
|
||||||
- vllm/v1/core/
|
|
||||||
commands:
|
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
|
||||||
|
|
||||||
- label: LM Eval Large Models (4 Card)
|
- label: LM Eval Large Models (4 Card)
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
agent_pool: mi325_4
|
agent_pool: mi325_4
|
||||||
@ -1478,21 +1542,6 @@ steps:
|
|||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||||
|
|
||||||
- label: LM Eval Large Models (H100) # optional
|
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
|
||||||
agent_pool: mi325_4
|
|
||||||
# grade: Blocking
|
|
||||||
gpu: h100
|
|
||||||
optional: true
|
|
||||||
num_gpus: 4
|
|
||||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/
|
|
||||||
- vllm/model_executor/layers/quantization
|
|
||||||
commands:
|
|
||||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
|
||||||
|
|
||||||
- label: ROCm LM Eval Large Models (8 Card)
|
- label: ROCm LM Eval Large Models (8 Card)
|
||||||
mirror_hardwares: [amdproduction]
|
mirror_hardwares: [amdproduction]
|
||||||
agent_pool: mi325_8
|
agent_pool: mi325_8
|
||||||
@ -1517,6 +1566,20 @@ steps:
|
|||||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||||
|
|
||||||
|
##### RL Integration Tests #####
|
||||||
|
- label: Prime-RL Integration Test # 15min
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
agent_pool: mi325_2
|
||||||
|
# grade: Blocking
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
optional: true
|
||||||
|
num_gpus: 2
|
||||||
|
working_dir: "/vllm-workspace"
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- .buildkite/scripts/run-prime-rl-test.sh
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||||
- label: DeepSeek V2-Lite Accuracy
|
- label: DeepSeek V2-Lite Accuracy
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
agent_pool: mi325_4
|
agent_pool: mi325_4
|
||||||
@ -1550,17 +1613,26 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||||
|
|
||||||
##### RL Integration Tests #####
|
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||||
- label: Prime-RL Integration Test # 15min
|
timeout_in_minutes: 60
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
agent_pool: mi325_2
|
agent_pool: mi325_4
|
||||||
# grade: Blocking
|
# grade: Blocking
|
||||||
timeout_in_minutes: 30
|
gpu: h100
|
||||||
optional: true
|
optional: true
|
||||||
num_gpus: 2
|
num_gpus: 4
|
||||||
working_dir: "/vllm-workspace"
|
working_dir: "/vllm-workspace"
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- .buildkite/scripts/run-prime-rl-test.sh
|
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||||
|
|
||||||
|
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
agent_pool: mi325_4
|
||||||
|
# grade: Blocking
|
||||||
|
gpu: h100
|
||||||
|
optional: true
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace"
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||||
|
|||||||
@ -468,7 +468,9 @@ steps:
|
|||||||
# tests covered elsewhere.
|
# tests covered elsewhere.
|
||||||
# Use `find` to launch multiple instances of pytest so that
|
# Use `find` to launch multiple instances of pytest so that
|
||||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
|
# However, find does not normally propagate error codes, so we combine it with xargs
|
||||||
|
# (using -0 for proper path handling)
|
||||||
|
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
@ -482,7 +484,9 @@ steps:
|
|||||||
# as it is a heavy test that is covered in other steps.
|
# as it is a heavy test that is covered in other steps.
|
||||||
# Use `find` to launch multiple instances of pytest so that
|
# Use `find` to launch multiple instances of pytest so that
|
||||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
|
# However, find does not normally propagate error codes, so we combine it with xargs
|
||||||
|
# (using -0 for proper path handling)
|
||||||
|
- "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Test # 27min
|
- label: PyTorch Fullgraph Test # 27min
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 40
|
||||||
|
|||||||
2
.github/workflows/cleanup_pr_body.yml
vendored
2
.github/workflows/cleanup_pr_body.yml
vendored
@ -13,7 +13,7 @@ jobs:
|
|||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
|
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
|
||||||
|
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
||||||
|
|||||||
2
.github/workflows/macos-smoke-test.yml
vendored
2
.github/workflows/macos-smoke-test.yml
vendored
@ -12,7 +12,7 @@ jobs:
|
|||||||
timeout-minutes: 30
|
timeout-minutes: 30
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v6
|
- uses: actions/checkout@v6.0.1
|
||||||
|
|
||||||
- uses: astral-sh/setup-uv@v7
|
- uses: astral-sh/setup-uv@v7
|
||||||
with:
|
with:
|
||||||
|
|||||||
2
.github/workflows/pre-commit.yml
vendored
2
.github/workflows/pre-commit.yml
vendored
@ -16,7 +16,7 @@ jobs:
|
|||||||
pre-commit:
|
pre-commit:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
|
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
|
||||||
- uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
- uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
||||||
with:
|
with:
|
||||||
python-version: "3.12"
|
python-version: "3.12"
|
||||||
|
|||||||
2
.github/workflows/stale.yml
vendored
2
.github/workflows/stale.yml
vendored
@ -15,7 +15,7 @@ jobs:
|
|||||||
actions: write
|
actions: write
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
|
- uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
|
||||||
with:
|
with:
|
||||||
# Increasing this value ensures that changes to this workflow
|
# Increasing this value ensures that changes to this workflow
|
||||||
# propagate to all issues and PRs in days rather than months
|
# propagate to all issues and PRs in days rather than months
|
||||||
|
|||||||
@ -96,8 +96,9 @@ start_server() {
|
|||||||
# This correctly passes each element as a separate argument.
|
# This correctly passes each element as a separate argument.
|
||||||
if [[ -n "$profile_dir" ]]; then
|
if [[ -n "$profile_dir" ]]; then
|
||||||
# Start server with profiling enabled
|
# Start server with profiling enabled
|
||||||
VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
|
local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
|
||||||
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
VLLM_SERVER_DEV_MODE=1 \
|
||||||
|
vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
||||||
else
|
else
|
||||||
# Start server without profiling
|
# Start server without profiling
|
||||||
VLLM_SERVER_DEV_MODE=1 \
|
VLLM_SERVER_DEV_MODE=1 \
|
||||||
|
|||||||
@ -963,8 +963,7 @@ def create_argument_parser():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--profile",
|
"--profile",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Use Torch Profiler. The endpoint must be launched with "
|
help="Use vLLM Profiling. --profiler-config must be provided on the server.",
|
||||||
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
|
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--result-dir",
|
"--result-dir",
|
||||||
|
|||||||
@ -15,6 +15,7 @@ API documentation for vLLM's configuration classes.
|
|||||||
- [vllm.config.MultiModalConfig][]
|
- [vllm.config.MultiModalConfig][]
|
||||||
- [vllm.config.PoolerConfig][]
|
- [vllm.config.PoolerConfig][]
|
||||||
- [vllm.config.StructuredOutputsConfig][]
|
- [vllm.config.StructuredOutputsConfig][]
|
||||||
|
- [vllm.config.ProfilerConfig][]
|
||||||
- [vllm.config.ObservabilityConfig][]
|
- [vllm.config.ObservabilityConfig][]
|
||||||
- [vllm.config.KVTransferConfig][]
|
- [vllm.config.KVTransferConfig][]
|
||||||
- [vllm.config.CompilationConfig][]
|
- [vllm.config.CompilationConfig][]
|
||||||
|
|||||||
@ -5,16 +5,15 @@
|
|||||||
|
|
||||||
## Profile with PyTorch Profiler
|
## Profile with PyTorch Profiler
|
||||||
|
|
||||||
We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`. Additionally, you can control the profiling content by specifying the following environment variables:
|
We support tracing vLLM workers using the `torch.profiler` module. You can enable the torch profiler by setting `--profiler-config`
|
||||||
|
when launching the server, and setting the entries `profiler` to `'torch'` and `torch_profiler_dir` to the directory where you want to save the traces. Additionally, you can control the profiling content by specifying the following additional arguments in the config:
|
||||||
|
|
||||||
- `VLLM_TORCH_PROFILER_RECORD_SHAPES=1` to enable recording Tensor Shapes, off by default
|
- `torch_profiler_record_shapes` to enable recording Tensor Shapes, off by default
|
||||||
- `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default
|
- `torch_profiler_with_memory` to record memory, off by default
|
||||||
- `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default
|
- `torch_profiler_with_stack` to enable recording stack information, on by default
|
||||||
- `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default
|
- `torch_profiler_with_flops` to enable recording FLOPs, off by default
|
||||||
- `VLLM_TORCH_PROFILER_USE_GZIP=0` to disable gzip-compressing profiling files, on by default
|
- `torch_profiler_use_gzip` to control gzip-compressing profiling files, on by default
|
||||||
- `VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0` to disable dumping and printing the aggregated CUDA self time table, on by default
|
- `torch_profiler_dump_cuda_time_total` to control dumping and printing the aggregated CUDA self time table, on by default
|
||||||
|
|
||||||
The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
|
|
||||||
|
|
||||||
When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag.
|
When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag.
|
||||||
|
|
||||||
@ -40,8 +39,7 @@ Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline
|
|||||||
#### OpenAI Server
|
#### OpenAI Server
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
VLLM_TORCH_PROFILER_DIR=./vllm_profile \
|
vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}'
|
||||||
vllm serve meta-llama/Llama-3.1-8B-Instruct
|
|
||||||
```
|
```
|
||||||
|
|
||||||
vllm bench command:
|
vllm bench command:
|
||||||
@ -104,13 +102,12 @@ To profile the server, you will want to prepend your `vllm serve` command with `
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# server
|
# server
|
||||||
VLLM_TORCH_CUDA_PROFILE=1 \
|
|
||||||
nsys profile \
|
nsys profile \
|
||||||
--trace-fork-before-exec=true \
|
--trace-fork-before-exec=true \
|
||||||
--cuda-graph-trace=node \
|
--cuda-graph-trace=node \
|
||||||
--capture-range=cudaProfilerApi \
|
--capture-range=cudaProfilerApi \
|
||||||
--capture-range-end repeat \
|
--capture-range-end repeat \
|
||||||
vllm serve meta-llama/Llama-3.1-8B-Instruct
|
vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config.profiler cuda
|
||||||
|
|
||||||
# client
|
# client
|
||||||
vllm bench serve \
|
vllm bench serve \
|
||||||
|
|||||||
@ -299,6 +299,9 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner
|
|||||||
|
|
||||||
def is_reasoning_end(self, input_ids: list[int]) -> bool:
|
def is_reasoning_end(self, input_ids: list[int]) -> bool:
|
||||||
return self.end_token_id in input_ids
|
return self.end_token_id in input_ids
|
||||||
|
|
||||||
|
def is_reasoning_end_streaming(self, input_ids: list[int], delta_ids: list[int]) -> bool:
|
||||||
|
return self.end_token_id in delta_ids
|
||||||
...
|
...
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@ -1,14 +1,10 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import os
|
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
# enable torch profiler, can also be set on cmd line
|
|
||||||
os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"
|
|
||||||
|
|
||||||
# Sample prompts.
|
# Sample prompts.
|
||||||
prompts = [
|
prompts = [
|
||||||
"Hello, my name is",
|
"Hello, my name is",
|
||||||
@ -22,7 +18,14 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
# Create an LLM.
|
# Create an LLM.
|
||||||
llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
|
llm = LLM(
|
||||||
|
model="facebook/opt-125m",
|
||||||
|
tensor_parallel_size=1,
|
||||||
|
profiler_config={
|
||||||
|
"profiler": "torch",
|
||||||
|
"torch_profiler_dir": "./vllm_profile",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
llm.start_profile()
|
llm.start_profile()
|
||||||
|
|
||||||
|
|||||||
@ -17,7 +17,6 @@ def test_compile():
|
|||||||
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
|
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
|
||||||
@pytest.mark.forked
|
@pytest.mark.forked
|
||||||
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
|
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch):
|
def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch):
|
||||||
"""Test that Qwen2.5-VL vision submodules are compiled.
|
"""Test that Qwen2.5-VL vision submodules are compiled.
|
||||||
|
|
||||||
|
|||||||
@ -80,6 +80,8 @@ def test_compile_ranges(use_fresh_inductor_cache):
|
|||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
scheduler_config=SchedulerConfig(
|
scheduler_config=SchedulerConfig(
|
||||||
max_num_batched_tokens=8192,
|
max_num_batched_tokens=8192,
|
||||||
|
max_model_len=8192,
|
||||||
|
is_encoder_decoder=False,
|
||||||
),
|
),
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
mode=CompilationMode.VLLM_COMPILE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
@ -112,6 +114,8 @@ def test_compile_config_get_compile_ranges():
|
|||||||
VllmConfig(
|
VllmConfig(
|
||||||
scheduler_config=SchedulerConfig(
|
scheduler_config=SchedulerConfig(
|
||||||
max_num_batched_tokens=8192,
|
max_num_batched_tokens=8192,
|
||||||
|
max_model_len=8192,
|
||||||
|
is_encoder_decoder=False,
|
||||||
),
|
),
|
||||||
compilation_config=compilation_config,
|
compilation_config=compilation_config,
|
||||||
)
|
)
|
||||||
@ -134,6 +138,8 @@ def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache):
|
|||||||
)
|
)
|
||||||
scheduler_config = SchedulerConfig(
|
scheduler_config = SchedulerConfig(
|
||||||
max_num_batched_tokens=8192,
|
max_num_batched_tokens=8192,
|
||||||
|
max_model_len=8192,
|
||||||
|
is_encoder_decoder=False,
|
||||||
)
|
)
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
|
|
||||||
|
|||||||
@ -5,9 +5,14 @@ import copy
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
|
from vllm.compilation.inductor_pass import (
|
||||||
|
CallableInductorPass,
|
||||||
|
InductorPass,
|
||||||
|
pass_context,
|
||||||
|
)
|
||||||
from vllm.compilation.pass_manager import PostGradPassManager
|
from vllm.compilation.pass_manager import PostGradPassManager
|
||||||
from vllm.config import ModelConfig, VllmConfig
|
from vllm.config import ModelConfig, VllmConfig
|
||||||
|
from vllm.config.utils import Range
|
||||||
|
|
||||||
|
|
||||||
# dummy custom pass that doesn't inherit
|
# dummy custom pass that doesn't inherit
|
||||||
@ -42,35 +47,37 @@ class ProperPass(InductorPass):
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_pass_manager_uuid(callable):
|
def test_pass_manager_uuid(callable):
|
||||||
# Some passes need dtype to be set
|
# Set the pass context as PassManager uuid uses it
|
||||||
config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
|
with pass_context(Range(start=1, end=8)):
|
||||||
|
# Some passes need dtype to be set
|
||||||
|
config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
|
||||||
|
|
||||||
pass_manager = PostGradPassManager()
|
pass_manager = PostGradPassManager()
|
||||||
pass_manager.configure(config)
|
pass_manager.configure(config)
|
||||||
|
|
||||||
# Check that UUID is different if the same pass is added 2x
|
# Check that UUID is different if the same pass is added 2x
|
||||||
pass_manager.add(callable)
|
pass_manager.add(callable)
|
||||||
uuid1 = pass_manager.uuid()
|
uuid1 = pass_manager.uuid()
|
||||||
pass_manager.add(callable)
|
pass_manager.add(callable)
|
||||||
uuid2 = pass_manager.uuid()
|
uuid2 = pass_manager.uuid()
|
||||||
assert uuid1 != uuid2
|
assert uuid1 != uuid2
|
||||||
|
|
||||||
# UUID should be the same as the original one,
|
# UUID should be the same as the original one,
|
||||||
# as we constructed in the same way.
|
# as we constructed in the same way.
|
||||||
pass_manager2 = PostGradPassManager()
|
pass_manager2 = PostGradPassManager()
|
||||||
pass_manager2.configure(config)
|
pass_manager2.configure(config)
|
||||||
pass_manager2.add(callable)
|
pass_manager2.add(callable)
|
||||||
assert uuid1 == pass_manager2.uuid()
|
assert uuid1 == pass_manager2.uuid()
|
||||||
|
|
||||||
# UUID should be different due to config change
|
# UUID should be different due to config change
|
||||||
config2 = copy.deepcopy(config)
|
config2 = copy.deepcopy(config)
|
||||||
config2.compilation_config.pass_config.fuse_norm_quant = (
|
config2.compilation_config.pass_config.fuse_norm_quant = (
|
||||||
not config2.compilation_config.pass_config.fuse_norm_quant
|
not config2.compilation_config.pass_config.fuse_norm_quant
|
||||||
)
|
)
|
||||||
config2.compilation_config.pass_config.fuse_act_quant = (
|
config2.compilation_config.pass_config.fuse_act_quant = (
|
||||||
not config2.compilation_config.pass_config.fuse_act_quant
|
not config2.compilation_config.pass_config.fuse_act_quant
|
||||||
)
|
)
|
||||||
pass_manager3 = PostGradPassManager()
|
pass_manager3 = PostGradPassManager()
|
||||||
pass_manager3.configure(config2)
|
pass_manager3.configure(config2)
|
||||||
pass_manager3.add(callable)
|
pass_manager3.add(callable)
|
||||||
assert uuid1 != pass_manager3.uuid()
|
assert uuid1 != pass_manager3.uuid()
|
||||||
|
|||||||
@ -26,7 +26,14 @@ def clear_cache():
|
|||||||
_cached_get_attn_backend.cache_clear()
|
_cached_get_attn_backend.cache_clear()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
|
devices = ["cpu"]
|
||||||
|
if current_platform.is_cuda():
|
||||||
|
devices.append("cuda")
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
devices.append("hip")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("device", devices)
|
||||||
def test_mha_attn_platform(device: str):
|
def test_mha_attn_platform(device: str):
|
||||||
"""
|
"""
|
||||||
Test the attention selector between different platform and device.
|
Test the attention selector between different platform and device.
|
||||||
@ -46,7 +53,7 @@ def test_mha_attn_platform(device: str):
|
|||||||
patch("vllm.model_executor.models.vision.current_platform", RocmPlatform()),
|
patch("vllm.model_executor.models.vision.current_platform", RocmPlatform()),
|
||||||
):
|
):
|
||||||
attn = MultiHeadAttention(16, 64, scale=1)
|
attn = MultiHeadAttention(16, 64, scale=1)
|
||||||
assert attn.attn_backend == AttentionBackendEnum.TORCH_SDPA
|
assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN
|
||||||
else:
|
else:
|
||||||
# Test CUDA with head_size=64 (divisible by 32)
|
# Test CUDA with head_size=64 (divisible by 32)
|
||||||
# - should use vLLM's FlashAttention
|
# - should use vLLM's FlashAttention
|
||||||
|
|||||||
@ -103,7 +103,7 @@ def ref_dynamic_per_tensor_fp8_quant(
|
|||||||
.clamp(fp8_traits_min, fp8_traits_max)
|
.clamp(fp8_traits_min, fp8_traits_max)
|
||||||
.to(FP8_DTYPE)
|
.to(FP8_DTYPE)
|
||||||
)
|
)
|
||||||
return ref_out, ref_scale.view((1, 1))
|
return ref_out, ref_scale.view(1)
|
||||||
|
|
||||||
|
|
||||||
def native_w8a8_block_matmul(
|
def native_w8a8_block_matmul(
|
||||||
|
|||||||
@ -132,6 +132,41 @@ class TestBaseThinkingReasoningParserMethods:
|
|||||||
is False
|
is False
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_is_reasoning_end_streaming(self, test_tokenizer):
|
||||||
|
"""Test the is_reasoning_end_streaming method."""
|
||||||
|
parser = TestThinkingReasoningParser(test_tokenizer)
|
||||||
|
end_token_id = parser.end_token_id
|
||||||
|
start_token_id = parser.start_token_id
|
||||||
|
|
||||||
|
assert (
|
||||||
|
parser.is_reasoning_end_streaming([1, 2, end_token_id], [end_token_id])
|
||||||
|
is True
|
||||||
|
)
|
||||||
|
assert parser.is_reasoning_end_streaming([1, 2, 3, 4], [4]) is False
|
||||||
|
assert parser.is_reasoning_end_streaming([], []) is False
|
||||||
|
assert (
|
||||||
|
parser.is_reasoning_end_streaming(
|
||||||
|
[1, start_token_id, 2, end_token_id], [end_token_id]
|
||||||
|
)
|
||||||
|
is True
|
||||||
|
)
|
||||||
|
assert (
|
||||||
|
parser.is_reasoning_end_streaming([1, start_token_id, 2, 3], [3]) is False
|
||||||
|
)
|
||||||
|
assert (
|
||||||
|
parser.is_reasoning_end_streaming(
|
||||||
|
[1, start_token_id, 2, end_token_id, 2, start_token_id, 2],
|
||||||
|
[2],
|
||||||
|
)
|
||||||
|
is False
|
||||||
|
)
|
||||||
|
assert (
|
||||||
|
parser.is_reasoning_end_streaming(
|
||||||
|
[1, start_token_id, 2, end_token_id, 2, 2], [2]
|
||||||
|
)
|
||||||
|
is False
|
||||||
|
)
|
||||||
|
|
||||||
def test_extract_content_ids(self, test_tokenizer):
|
def test_extract_content_ids(self, test_tokenizer):
|
||||||
"""Test the extract_content_ids method."""
|
"""Test the extract_content_ids method."""
|
||||||
parser = TestThinkingReasoningParser(test_tokenizer)
|
parser = TestThinkingReasoningParser(test_tokenizer)
|
||||||
|
|||||||
@ -40,6 +40,7 @@ def test_identity_reasoning_parser_basic(tokenizer):
|
|||||||
input_tokens = tokenizer.tokenize(input_text)
|
input_tokens = tokenizer.tokenize(input_text)
|
||||||
input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
|
input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
|
||||||
assert parser.is_reasoning_end(input_ids) is True
|
assert parser.is_reasoning_end(input_ids) is True
|
||||||
|
assert parser.is_reasoning_end_streaming(input_ids, input_ids) is True
|
||||||
|
|
||||||
# Test extract_content_ids returns all input_ids
|
# Test extract_content_ids returns all input_ids
|
||||||
assert parser.extract_content_ids(input_ids) == input_ids
|
assert parser.extract_content_ids(input_ids) == input_ids
|
||||||
|
|||||||
@ -615,6 +615,7 @@ def test_extract_tool_calls_streaming(
|
|||||||
"single_tool_weather",
|
"single_tool_weather",
|
||||||
"multiple_tool_calls",
|
"multiple_tool_calls",
|
||||||
"content_before_tool",
|
"content_before_tool",
|
||||||
|
"complex",
|
||||||
],
|
],
|
||||||
argnames=["model_output", "expected_tool_calls", "expected_content"],
|
argnames=["model_output", "expected_tool_calls", "expected_content"],
|
||||||
argvalues=[
|
argvalues=[
|
||||||
@ -673,6 +674,21 @@ def test_extract_tool_calls_streaming(
|
|||||||
],
|
],
|
||||||
"bla",
|
"bla",
|
||||||
),
|
),
|
||||||
|
(
|
||||||
|
# Complex
|
||||||
|
"""[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501
|
||||||
|
[
|
||||||
|
ToolCall(
|
||||||
|
function=FunctionCall(
|
||||||
|
name="bash",
|
||||||
|
arguments=json.dumps(
|
||||||
|
{"command": "print(\"hello world!\")\nre.compile(r'{}')"}
|
||||||
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
],
|
||||||
|
"",
|
||||||
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_extract_tool_calls_streaming_one_chunk(
|
def test_extract_tool_calls_streaming_one_chunk(
|
||||||
|
|||||||
@ -161,10 +161,10 @@ class TestCudagraphDispatcher:
|
|||||||
assert rt_mode == CUDAGraphMode.NONE
|
assert rt_mode == CUDAGraphMode.NONE
|
||||||
assert key == BatchDescriptor(num_tokens=15)
|
assert key == BatchDescriptor(num_tokens=15)
|
||||||
|
|
||||||
# 4. Cascade attention should have a fall back mode
|
# 4. disable_full should have a fall back mode (e.g., cascade attention)
|
||||||
desc_full_exact = BatchDescriptor(num_tokens=8, uniform=False)
|
desc_full_exact = BatchDescriptor(num_tokens=8, uniform=False)
|
||||||
rt_mode, key = dispatcher.dispatch(
|
rt_mode, key = dispatcher.dispatch(
|
||||||
num_tokens=8, uniform_decode=False, has_lora=False, use_cascade_attn=True
|
num_tokens=8, uniform_decode=False, has_lora=False, disable_full=True
|
||||||
)
|
)
|
||||||
if "PIECEWISE" in cudagraph_mode_str: # string contains check
|
if "PIECEWISE" in cudagraph_mode_str: # string contains check
|
||||||
assert rt_mode == CUDAGraphMode.PIECEWISE
|
assert rt_mode == CUDAGraphMode.PIECEWISE
|
||||||
|
|||||||
@ -10,6 +10,7 @@ from utils import (
|
|||||||
BACKENDS,
|
BACKENDS,
|
||||||
_extract_step_logprobs,
|
_extract_step_logprobs,
|
||||||
_random_prompt,
|
_random_prompt,
|
||||||
|
is_device_capability_below_90,
|
||||||
resolve_model_name,
|
resolve_model_name,
|
||||||
skip_unsupported,
|
skip_unsupported,
|
||||||
)
|
)
|
||||||
@ -17,6 +18,8 @@ from utils import (
|
|||||||
import vllm.model_executor.layers.batch_invariant as batch_invariant
|
import vllm.model_executor.layers.batch_invariant as batch_invariant
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
|
IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90()
|
||||||
|
|
||||||
|
|
||||||
@skip_unsupported
|
@skip_unsupported
|
||||||
@pytest.mark.timeout(1000)
|
@pytest.mark.timeout(1000)
|
||||||
@ -190,6 +193,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
|
|||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
dtype="bfloat16", # not everything is supported
|
dtype="bfloat16", # not everything is supported
|
||||||
gpu_memory_utilization=0.9,
|
gpu_memory_utilization=0.9,
|
||||||
|
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Use more realistic prompts for better token generation
|
# Use more realistic prompts for better token generation
|
||||||
@ -393,6 +397,8 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
|
|||||||
gpu_memory_utilization=0.9,
|
gpu_memory_utilization=0.9,
|
||||||
max_model_len=2048,
|
max_model_len=2048,
|
||||||
dtype="bfloat16",
|
dtype="bfloat16",
|
||||||
|
enable_prefix_caching=False,
|
||||||
|
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
|
||||||
)
|
)
|
||||||
|
|
||||||
prompt = "the capital of france is"
|
prompt = "the capital of france is"
|
||||||
@ -459,6 +465,7 @@ def test_logprobs_without_batch_invariance_should_fail(
|
|||||||
max_num_seqs=32,
|
max_num_seqs=32,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
dtype="bfloat16",
|
dtype="bfloat16",
|
||||||
|
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
|
||||||
)
|
)
|
||||||
|
|
||||||
# build ragged prompts to change shapes significantly across BS=1 vs BS=N
|
# build ragged prompts to change shapes significantly across BS=1 vs BS=N
|
||||||
@ -682,6 +689,7 @@ def test_decode_logprobs_match_prefill_logprobs(
|
|||||||
max_num_seqs=32,
|
max_num_seqs=32,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
dtype="bfloat16",
|
dtype="bfloat16",
|
||||||
|
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Use a few test prompts
|
# Use a few test prompts
|
||||||
@ -925,6 +933,8 @@ def LLM_with_max_seqs(
|
|||||||
max_model_len=max_model_len,
|
max_model_len=max_model_len,
|
||||||
dtype="bfloat16",
|
dtype="bfloat16",
|
||||||
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
|
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
|
||||||
|
enable_prefix_caching=False,
|
||||||
|
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
|
||||||
# Enable for MOE models
|
# Enable for MOE models
|
||||||
# enable_expert_parallel=True,
|
# enable_expert_parallel=True,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -11,8 +11,10 @@ from vllm.platforms import current_platform
|
|||||||
from vllm.utils.flashinfer import has_flashinfer
|
from vllm.utils.flashinfer import has_flashinfer
|
||||||
|
|
||||||
skip_unsupported = pytest.mark.skipif(
|
skip_unsupported = pytest.mark.skipif(
|
||||||
not (current_platform.is_cuda() and current_platform.has_device_capability(90)),
|
not (current_platform.is_cuda() and current_platform.has_device_capability(80)),
|
||||||
reason="Requires CUDA and >= Hopper (SM90)",
|
# Supports testing on Ampere and Ada Lovelace devices.
|
||||||
|
# Note: For devices with SM < 90, batch invariance does not support CUDA Graphs.
|
||||||
|
reason="Requires CUDA and >= Ampere (SM80)",
|
||||||
)
|
)
|
||||||
|
|
||||||
BACKENDS: list[str] = [
|
BACKENDS: list[str] = [
|
||||||
@ -97,3 +99,7 @@ def _extract_step_logprobs(request_output):
|
|||||||
return t, inner.token_ids
|
return t, inner.token_ids
|
||||||
|
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
|
|
||||||
|
def is_device_capability_below_90() -> bool:
|
||||||
|
return not current_platform.has_device_capability(90)
|
||||||
|
|||||||
@ -124,6 +124,8 @@ def run_tests(
|
|||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
# avoid precision errors
|
# avoid precision errors
|
||||||
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
|
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
|
||||||
|
# lock matmul precision to full FP32
|
||||||
|
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
|
||||||
# m.setenv("VLLM_BATCH_INVARIANT", "1")
|
# m.setenv("VLLM_BATCH_INVARIANT", "1")
|
||||||
outputs: list[tuple[str, list, list]] = []
|
outputs: list[tuple[str, list, list]] = []
|
||||||
for n, (
|
for n, (
|
||||||
|
|||||||
@ -70,6 +70,7 @@ class TestReasoningStructuredOutput:
|
|||||||
request.use_structured_output = True
|
request.use_structured_output = True
|
||||||
request.prompt_token_ids = [1, 2, 3, 4, 5]
|
request.prompt_token_ids = [1, 2, 3, 4, 5]
|
||||||
request.all_token_ids = [1, 2, 3, 4, 5, 6, 7, 8]
|
request.all_token_ids = [1, 2, 3, 4, 5, 6, 7, 8]
|
||||||
|
request.num_computed_tokens = 5
|
||||||
return request
|
return request
|
||||||
|
|
||||||
def test_should_fill_bitmask_with_enable_in_reasoning(
|
def test_should_fill_bitmask_with_enable_in_reasoning(
|
||||||
|
|||||||
@ -2,8 +2,8 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import vllm.envs as envs
|
from vllm.config import ProfilerConfig
|
||||||
from vllm.profiler.gpu_profiler import WorkerProfiler
|
from vllm.profiler.wrapper import WorkerProfiler
|
||||||
|
|
||||||
|
|
||||||
class ConcreteWorkerProfiler(WorkerProfiler):
|
class ConcreteWorkerProfiler(WorkerProfiler):
|
||||||
@ -11,11 +11,11 @@ class ConcreteWorkerProfiler(WorkerProfiler):
|
|||||||
A basic implementation of a worker profiler for testing purposes.
|
A basic implementation of a worker profiler for testing purposes.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, profiler_config: ProfilerConfig):
|
||||||
self.start_call_count = 0
|
self.start_call_count = 0
|
||||||
self.stop_call_count = 0
|
self.stop_call_count = 0
|
||||||
self.should_fail_start = False
|
self.should_fail_start = False
|
||||||
super().__init__()
|
super().__init__(profiler_config)
|
||||||
|
|
||||||
def _start(self) -> None:
|
def _start(self) -> None:
|
||||||
if self.should_fail_start:
|
if self.should_fail_start:
|
||||||
@ -26,17 +26,19 @@ class ConcreteWorkerProfiler(WorkerProfiler):
|
|||||||
self.stop_call_count += 1
|
self.stop_call_count += 1
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture
|
||||||
def reset_mocks():
|
def default_profiler_config():
|
||||||
"""Fixture to reset mocks and env variables before each test."""
|
return ProfilerConfig(
|
||||||
envs.VLLM_PROFILER_DELAY_ITERS = 0
|
profiler="torch",
|
||||||
envs.VLLM_PROFILER_MAX_ITERS = 0
|
torch_profiler_dir="/tmp/mock",
|
||||||
|
delay_iterations=0,
|
||||||
|
max_iterations=0,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_immediate_start_stop():
|
def test_immediate_start_stop(default_profiler_config):
|
||||||
"""Test standard start without delay."""
|
"""Test standard start without delay."""
|
||||||
profiler = ConcreteWorkerProfiler()
|
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||||
|
|
||||||
profiler.start()
|
profiler.start()
|
||||||
assert profiler._running is True
|
assert profiler._running is True
|
||||||
assert profiler._active is True
|
assert profiler._active is True
|
||||||
@ -48,10 +50,10 @@ def test_immediate_start_stop():
|
|||||||
assert profiler.stop_call_count == 1
|
assert profiler.stop_call_count == 1
|
||||||
|
|
||||||
|
|
||||||
def test_delayed_start():
|
def test_delayed_start(default_profiler_config):
|
||||||
"""Test that profiler waits for N steps before actually starting."""
|
"""Test that profiler waits for N steps before actually starting."""
|
||||||
envs.VLLM_PROFILER_DELAY_ITERS = 2
|
default_profiler_config.delay_iterations = 2
|
||||||
profiler = ConcreteWorkerProfiler()
|
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||||
|
|
||||||
# User requests start
|
# User requests start
|
||||||
profiler.start()
|
profiler.start()
|
||||||
@ -71,10 +73,10 @@ def test_delayed_start():
|
|||||||
assert profiler.start_call_count == 1
|
assert profiler.start_call_count == 1
|
||||||
|
|
||||||
|
|
||||||
def test_max_iterations():
|
def test_max_iterations(default_profiler_config):
|
||||||
"""Test that profiler stops automatically after max iterations."""
|
"""Test that profiler stops automatically after max iterations."""
|
||||||
envs.VLLM_PROFILER_MAX_ITERS = 2
|
default_profiler_config.max_iterations = 2
|
||||||
profiler = ConcreteWorkerProfiler()
|
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||||
|
|
||||||
profiler.start()
|
profiler.start()
|
||||||
assert profiler._running is True
|
assert profiler._running is True
|
||||||
@ -95,12 +97,11 @@ def test_max_iterations():
|
|||||||
assert profiler.stop_call_count == 1
|
assert profiler.stop_call_count == 1
|
||||||
|
|
||||||
|
|
||||||
def test_delayed_start_and_max_iters():
|
def test_delayed_start_and_max_iters(default_profiler_config):
|
||||||
"""Test combined delayed start and max iterations."""
|
"""Test combined delayed start and max iterations."""
|
||||||
envs.VLLM_PROFILER_DELAY_ITERS = 2
|
default_profiler_config.delay_iterations = 2
|
||||||
envs.VLLM_PROFILER_MAX_ITERS = 2
|
default_profiler_config.max_iterations = 2
|
||||||
profiler = ConcreteWorkerProfiler()
|
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||||
|
|
||||||
profiler.start()
|
profiler.start()
|
||||||
|
|
||||||
# Step 1
|
# Step 1
|
||||||
@ -127,9 +128,9 @@ def test_delayed_start_and_max_iters():
|
|||||||
assert profiler.stop_call_count == 1
|
assert profiler.stop_call_count == 1
|
||||||
|
|
||||||
|
|
||||||
def test_idempotency():
|
def test_idempotency(default_profiler_config):
|
||||||
"""Test that calling start/stop multiple times doesn't break logic."""
|
"""Test that calling start/stop multiple times doesn't break logic."""
|
||||||
profiler = ConcreteWorkerProfiler()
|
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||||
|
|
||||||
# Double Start
|
# Double Start
|
||||||
profiler.start()
|
profiler.start()
|
||||||
@ -142,10 +143,10 @@ def test_idempotency():
|
|||||||
assert profiler.stop_call_count == 1 # Should only stop once
|
assert profiler.stop_call_count == 1 # Should only stop once
|
||||||
|
|
||||||
|
|
||||||
def test_step_inactive():
|
def test_step_inactive(default_profiler_config):
|
||||||
"""Test that stepping while inactive does nothing."""
|
"""Test that stepping while inactive does nothing."""
|
||||||
envs.VLLM_PROFILER_DELAY_ITERS = 2
|
default_profiler_config.delay_iterations = 2
|
||||||
profiler = ConcreteWorkerProfiler()
|
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||||
|
|
||||||
# Not started yet
|
# Not started yet
|
||||||
profiler.step()
|
profiler.step()
|
||||||
@ -155,9 +156,9 @@ def test_step_inactive():
|
|||||||
assert profiler.start_call_count == 0
|
assert profiler.start_call_count == 0
|
||||||
|
|
||||||
|
|
||||||
def test_start_failure():
|
def test_start_failure(default_profiler_config):
|
||||||
"""Test behavior when the underlying _start method raises exception."""
|
"""Test behavior when the underlying _start method raises exception."""
|
||||||
profiler = ConcreteWorkerProfiler()
|
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||||
profiler.should_fail_start = True
|
profiler.should_fail_start = True
|
||||||
|
|
||||||
profiler.start()
|
profiler.start()
|
||||||
@ -168,9 +169,9 @@ def test_start_failure():
|
|||||||
assert profiler.start_call_count == 0 # Logic failed inside start
|
assert profiler.start_call_count == 0 # Logic failed inside start
|
||||||
|
|
||||||
|
|
||||||
def test_shutdown():
|
def test_shutdown(default_profiler_config):
|
||||||
"""Test that shutdown calls stop only if running."""
|
"""Test that shutdown calls stop only if running."""
|
||||||
profiler = ConcreteWorkerProfiler()
|
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||||
|
|
||||||
# Case 1: Not running
|
# Case 1: Not running
|
||||||
profiler.shutdown()
|
profiler.shutdown()
|
||||||
@ -182,10 +183,10 @@ def test_shutdown():
|
|||||||
assert profiler.stop_call_count == 1
|
assert profiler.stop_call_count == 1
|
||||||
|
|
||||||
|
|
||||||
def test_mixed_delay_and_stop():
|
def test_mixed_delay_and_stop(default_profiler_config):
|
||||||
"""Test manual stop during the delay period."""
|
"""Test manual stop during the delay period."""
|
||||||
envs.VLLM_PROFILER_DELAY_ITERS = 5
|
default_profiler_config.delay_iterations = 5
|
||||||
profiler = ConcreteWorkerProfiler()
|
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||||
|
|
||||||
profiler.start()
|
profiler.start()
|
||||||
profiler.step()
|
profiler.step()
|
||||||
|
|||||||
@ -9,6 +9,8 @@ import vllm.envs as envs
|
|||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer
|
from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer
|
||||||
|
|
||||||
|
_FP8_DTYPE = current_platform.fp8_dtype()
|
||||||
|
|
||||||
|
|
||||||
def is_aiter_found() -> bool:
|
def is_aiter_found() -> bool:
|
||||||
from importlib.util import find_spec
|
from importlib.util import find_spec
|
||||||
@ -467,6 +469,59 @@ def _rocm_aiter_rmsnorm2d_fwd_with_add_fake(
|
|||||||
return torch.empty_like(x), torch.empty_like(residual)
|
return torch.empty_like(x), torch.empty_like(residual)
|
||||||
|
|
||||||
|
|
||||||
|
def _rocm_aiter_per_tensor_quant_impl(
|
||||||
|
x: torch.Tensor,
|
||||||
|
quant_dtype: torch.dtype,
|
||||||
|
scale: torch.Tensor | None = None,
|
||||||
|
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
from aiter.ops.quant import per_tensor_quant_hip
|
||||||
|
|
||||||
|
return per_tensor_quant_hip(x, scale, quant_dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def _rocm_aiter_per_tensor_quant_fake(
|
||||||
|
x: torch.Tensor,
|
||||||
|
quant_dtype: torch.dtype,
|
||||||
|
scale: torch.Tensor | None = None,
|
||||||
|
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
return torch.empty_like(x, dtype=quant_dtype), torch.empty(
|
||||||
|
1, dtype=torch.float32, device=x.device
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _rocm_aiter_per_token_quant_impl(
|
||||||
|
x: torch.Tensor, quant_dtype: torch.dtype, scale: torch.Tensor | None = None
|
||||||
|
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
from aiter.ops.quant import dynamic_per_token_scaled_quant
|
||||||
|
|
||||||
|
assert quant_dtype in [torch.int8, _FP8_DTYPE]
|
||||||
|
|
||||||
|
out_shape = x.shape
|
||||||
|
out = torch.empty(x.shape, dtype=_FP8_DTYPE, device=x.device)
|
||||||
|
if scale is None:
|
||||||
|
scale = torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device)
|
||||||
|
dynamic_per_token_scaled_quant(
|
||||||
|
out,
|
||||||
|
x,
|
||||||
|
scale,
|
||||||
|
scale_ub=None,
|
||||||
|
shuffle_scale=False,
|
||||||
|
num_rows=None,
|
||||||
|
num_rows_factor=1,
|
||||||
|
)
|
||||||
|
return out, scale
|
||||||
|
|
||||||
|
|
||||||
|
def _rocm_aiter_per_token_quant_fake(
|
||||||
|
x: torch.Tensor, quant_dtype: torch.dtype, scale: torch.Tensor | None = None
|
||||||
|
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
out_shape = x.shape
|
||||||
|
return (
|
||||||
|
torch.empty(x.shape, dtype=_FP8_DTYPE, device=x.device),
|
||||||
|
torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Global flag to ensure ops are registered only once
|
# Global flag to ensure ops are registered only once
|
||||||
_OPS_REGISTERED = False
|
_OPS_REGISTERED = False
|
||||||
|
|
||||||
@ -665,6 +720,22 @@ class rocm_aiter_ops:
|
|||||||
dispatch_key=current_platform.dispatch_key,
|
dispatch_key=current_platform.dispatch_key,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
direct_register_custom_op(
|
||||||
|
op_name="rocm_aiter_per_tensor_quant",
|
||||||
|
op_func=_rocm_aiter_per_tensor_quant_impl,
|
||||||
|
mutates_args=[],
|
||||||
|
fake_impl=_rocm_aiter_per_tensor_quant_fake,
|
||||||
|
dispatch_key=current_platform.dispatch_key,
|
||||||
|
)
|
||||||
|
|
||||||
|
direct_register_custom_op(
|
||||||
|
op_name="rocm_aiter_per_token_quant",
|
||||||
|
op_func=_rocm_aiter_per_token_quant_impl,
|
||||||
|
mutates_args=["scale"],
|
||||||
|
fake_impl=_rocm_aiter_per_token_quant_fake,
|
||||||
|
dispatch_key=current_platform.dispatch_key,
|
||||||
|
)
|
||||||
|
|
||||||
_OPS_REGISTERED = True
|
_OPS_REGISTERED = True
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -859,6 +930,22 @@ class rocm_aiter_ops:
|
|||||||
kv_scale=kv_scale,
|
kv_scale=kv_scale,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def per_tensor_quant(
|
||||||
|
x: torch.Tensor,
|
||||||
|
quant_dtype: torch.dtype,
|
||||||
|
scale: torch.Tensor | None = None,
|
||||||
|
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
return torch.ops.vllm.rocm_aiter_per_tensor_quant(x, quant_dtype, scale)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def per_token_quant(
|
||||||
|
x: torch.Tensor,
|
||||||
|
quant_dtype: torch.dtype,
|
||||||
|
scale: torch.Tensor | None = None,
|
||||||
|
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
return torch.ops.vllm.rocm_aiter_per_token_quant(x, quant_dtype, scale)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def triton_fp4_gemm_dynamic_qaunt(
|
def triton_fp4_gemm_dynamic_qaunt(
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
|
|||||||
@ -1726,7 +1726,7 @@ def scaled_fp8_quant(
|
|||||||
output, input, scale, scale_ub
|
output, input, scale, scale_ub
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
scale = torch.empty((1, 1), device=input.device, dtype=torch.float32)
|
scale = torch.empty(1, device=input.device, dtype=torch.float32)
|
||||||
torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
|
torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
|
||||||
else:
|
else:
|
||||||
assert scale.numel() == 1, f"{scale.shape}"
|
assert scale.numel() == 1, f"{scale.shape}"
|
||||||
|
|||||||
@ -89,7 +89,10 @@ def maybe_get_vit_flash_attn_backend(
|
|||||||
if attn_backend == AttentionBackendEnum.ROCM_AITER_FA:
|
if attn_backend == AttentionBackendEnum.ROCM_AITER_FA:
|
||||||
from aiter import flash_attn_varlen_func
|
from aiter import flash_attn_varlen_func
|
||||||
else:
|
else:
|
||||||
from vllm.attention.utils.fa_utils import flash_attn_varlen_func
|
try:
|
||||||
|
from vllm.attention.utils.fa_utils import flash_attn_varlen_func
|
||||||
|
except ImportError:
|
||||||
|
flash_attn_varlen_func = None
|
||||||
else:
|
else:
|
||||||
flash_attn_varlen_func = None
|
flash_attn_varlen_func = None
|
||||||
|
|
||||||
|
|||||||
@ -12,7 +12,6 @@ from typing import Any
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
import vllm.envs as envs
|
|
||||||
from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
|
from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.inputs import PromptType
|
from vllm.inputs import PromptType
|
||||||
@ -79,12 +78,11 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
|||||||
|
|
||||||
|
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
|
|
||||||
raise OSError(
|
|
||||||
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
|
|
||||||
"Please set it to a valid path to use torch profiler."
|
|
||||||
)
|
|
||||||
engine_args = EngineArgs.from_cli_args(args)
|
engine_args = EngineArgs.from_cli_args(args)
|
||||||
|
if args.profile and not engine_args.profiler_config.profiler == "torch":
|
||||||
|
raise ValueError(
|
||||||
|
"The torch profiler is not enabled. Please provide profiler_config."
|
||||||
|
)
|
||||||
|
|
||||||
# Lazy import to avoid importing LLM when the bench command is not selected.
|
# Lazy import to avoid importing LLM when the bench command is not selected.
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
@ -144,7 +142,7 @@ def main(args: argparse.Namespace):
|
|||||||
run_to_completion(profile_dir=None)
|
run_to_completion(profile_dir=None)
|
||||||
|
|
||||||
if args.profile:
|
if args.profile:
|
||||||
profile_dir = envs.VLLM_TORCH_PROFILER_DIR
|
profile_dir = engine_args.profiler_config.torch_profiler_dir
|
||||||
print(f"Profiling (results will be saved to '{profile_dir}')...")
|
print(f"Profiling (results will be saved to '{profile_dir}')...")
|
||||||
run_to_completion(profile_dir=profile_dir)
|
run_to_completion(profile_dir=profile_dir)
|
||||||
return
|
return
|
||||||
|
|||||||
@ -1097,8 +1097,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--profile",
|
"--profile",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Use Torch Profiler. The endpoint must be launched with "
|
help="Use vLLM Profiling. --profiler-config must be provided on the server.",
|
||||||
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
|
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--save-result",
|
"--save-result",
|
||||||
|
|||||||
@ -655,8 +655,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
|||||||
"--profile",
|
"--profile",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
default=False,
|
default=False,
|
||||||
help="Use Torch Profiler. The env variable "
|
help="Use vLLM Profiling. --profiler-config must be provided on the server.",
|
||||||
"VLLM_TORCH_PROFILER_DIR must be set to enable profiler.",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# prefix repetition dataset
|
# prefix repetition dataset
|
||||||
|
|||||||
@ -1,6 +1,8 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import functools
|
import functools
|
||||||
import hashlib
|
import hashlib
|
||||||
import inspect
|
import inspect
|
||||||
@ -8,15 +10,17 @@ import json
|
|||||||
import types
|
import types
|
||||||
from collections.abc import Callable
|
from collections.abc import Callable
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from typing import Any
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch import fx
|
from torch import fx
|
||||||
from torch._subclasses.fake_tensor import FakeTensorMode, unset_fake_temporarily
|
from torch._subclasses.fake_tensor import FakeTensorMode, unset_fake_temporarily
|
||||||
|
|
||||||
from vllm.config.utils import Range
|
|
||||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from vllm.config.utils import Range
|
||||||
|
|
||||||
if is_torch_equal_or_newer("2.6"):
|
if is_torch_equal_or_newer("2.6"):
|
||||||
from torch._inductor.custom_graph_pass import CustomGraphPass
|
from torch._inductor.custom_graph_pass import CustomGraphPass
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -53,8 +53,27 @@ class PiecewiseBackend:
|
|||||||
self.is_last_graph = piecewise_compile_index == total_piecewise_compiles - 1
|
self.is_last_graph = piecewise_compile_index == total_piecewise_compiles - 1
|
||||||
|
|
||||||
self.is_full_graph = total_piecewise_compiles == 1
|
self.is_full_graph = total_piecewise_compiles == 1
|
||||||
|
# TODO: we need to generalize encoder compilation to other models
|
||||||
|
self.is_encoder_compilation = vllm_backend.prefix in [
|
||||||
|
"Qwen2_5_VisionPatchEmbed",
|
||||||
|
"Qwen2_5_VisionPatchMerger",
|
||||||
|
"Qwen2_5_VisionBlock",
|
||||||
|
]
|
||||||
|
|
||||||
self.compile_ranges = self.compilation_config.get_compile_ranges()
|
self.compile_ranges = self.compilation_config.get_compile_ranges()
|
||||||
|
if self.is_encoder_compilation:
|
||||||
|
# For encoder compilation we use the max int32 value
|
||||||
|
# to set the upper bound of the compile ranges
|
||||||
|
max_int32 = 2**31 - 1
|
||||||
|
last_compile_range = self.compile_ranges[-1]
|
||||||
|
assert (
|
||||||
|
last_compile_range.end
|
||||||
|
== vllm_config.scheduler_config.max_num_batched_tokens
|
||||||
|
)
|
||||||
|
self.compile_ranges[-1] = Range(
|
||||||
|
start=last_compile_range.start, end=max_int32
|
||||||
|
)
|
||||||
|
|
||||||
log_string = f"PiecewiseBackend: compile_ranges: {self.compile_ranges}"
|
log_string = f"PiecewiseBackend: compile_ranges: {self.compile_ranges}"
|
||||||
logger.debug_once(log_string)
|
logger.debug_once(log_string)
|
||||||
|
|
||||||
|
|||||||
@ -24,6 +24,7 @@ from vllm.config.multimodal import MultiModalConfig
|
|||||||
from vllm.config.observability import ObservabilityConfig
|
from vllm.config.observability import ObservabilityConfig
|
||||||
from vllm.config.parallel import EPLBConfig, ParallelConfig
|
from vllm.config.parallel import EPLBConfig, ParallelConfig
|
||||||
from vllm.config.pooler import PoolerConfig
|
from vllm.config.pooler import PoolerConfig
|
||||||
|
from vllm.config.profiler import ProfilerConfig
|
||||||
from vllm.config.scheduler import SchedulerConfig
|
from vllm.config.scheduler import SchedulerConfig
|
||||||
from vllm.config.speculative import SpeculativeConfig
|
from vllm.config.speculative import SpeculativeConfig
|
||||||
from vllm.config.speech_to_text import SpeechToTextConfig
|
from vllm.config.speech_to_text import SpeechToTextConfig
|
||||||
@ -89,6 +90,8 @@ __all__ = [
|
|||||||
"SpeechToTextConfig",
|
"SpeechToTextConfig",
|
||||||
# From vllm.config.structured_outputs
|
# From vllm.config.structured_outputs
|
||||||
"StructuredOutputsConfig",
|
"StructuredOutputsConfig",
|
||||||
|
# From vllm.config.profiler
|
||||||
|
"ProfilerConfig",
|
||||||
# From vllm.config.utils
|
# From vllm.config.utils
|
||||||
"ConfigType",
|
"ConfigType",
|
||||||
"SupportsMetricsInfo",
|
"SupportsMetricsInfo",
|
||||||
|
|||||||
199
vllm/config/profiler.py
Normal file
199
vllm/config/profiler.py
Normal file
@ -0,0 +1,199 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import Any, Literal
|
||||||
|
|
||||||
|
from pydantic import Field, model_validator
|
||||||
|
from pydantic.dataclasses import dataclass
|
||||||
|
from typing_extensions import Self
|
||||||
|
|
||||||
|
import vllm.envs as envs
|
||||||
|
from vllm.config.utils import config
|
||||||
|
from vllm.logger import init_logger
|
||||||
|
from vllm.utils.hashing import safe_hash
|
||||||
|
|
||||||
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
ProfilerKind = Literal["torch", "cuda"]
|
||||||
|
|
||||||
|
|
||||||
|
@config
@dataclass
class ProfilerConfig:
    """Dataclass which contains profiler config for the engine."""

    profiler: ProfilerKind | None = None
    """Which profiler to use. Defaults to None. Options are:

    - 'torch': Use PyTorch profiler.\n
    - 'cuda': Use CUDA profiler."""

    torch_profiler_dir: str = ""
    """Directory to save torch profiler traces. Both AsyncLLM's CPU traces and
    worker's traces (CPU & GPU) will be saved under this directory. Note that
    it must be an absolute path."""

    torch_profiler_with_stack: bool = True
    """If `True`, enables stack tracing in the torch profiler. Enabled by default."""

    torch_profiler_with_flops: bool = False
    """If `True`, enables FLOPS counting in the torch profiler. Disabled by default."""

    torch_profiler_use_gzip: bool = True
    """If `True`, saves torch profiler traces in gzip format. Enabled by default"""

    torch_profiler_dump_cuda_time_total: bool = True
    """If `True`, dumps total CUDA time in torch profiler traces. Enabled by default."""

    torch_profiler_record_shapes: bool = False
    """If `True`, records tensor shapes in the torch profiler. Disabled by default."""

    torch_profiler_with_memory: bool = False
    """If `True`, enables memory profiling in the torch profiler.
    Disabled by default."""

    ignore_frontend: bool = False
    """If `True`, disables the front-end profiling of AsyncLLM when using the
    'torch' profiler. This is needed to reduce overhead when using delay/limit options,
    since the front-end profiling does not track iterations and will capture the
    entire range.
    """

    delay_iterations: int = Field(default=0, ge=0)
    """Number of engine iterations to skip before starting profiling.
    Defaults to 0, meaning profiling starts immediately after receiving /start_profile.
    """

    max_iterations: int = Field(default=0, ge=0)
    """Maximum number of engine iterations to profile after starting profiling.
    Defaults to 0, meaning no limit.
    """

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    def _get_from_env_if_set(self, field_name: str, env_var_name: str) -> str | None:
        """Read a deprecated profiling env var, warning once when it is used.

        Args:
            field_name: Name of the config field that supersedes the env var;
                only used to build the deprecation message.
            env_var_name: Attribute name to look up on `vllm.envs`.

        Returns:
            The value read from `vllm.envs` when `envs.is_set(env_var_name)`
            is true, otherwise `None`.
        """
        if not envs.is_set(env_var_name):
            return None

        value = getattr(envs, env_var_name)
        logger.warning_once(
            "Using %s environment variable is deprecated and will be removed in "
            "v0.14.0 or v1.0.0, whichever is soonest. Please use "
            "--profiler-config.%s command line argument or "
            "ProfilerConfig(%s=...) config field instead.",
            env_var_name,
            field_name,
            field_name,
        )
        return value

    def _set_from_env_if_set(
        self,
        field_name: str,
        env_var_name: str,
        to_bool: bool = True,
        to_int: bool = False,
    ) -> None:
        """Overwrite `field_name` with the env var's value when it is set.

        Args:
            field_name: Attribute on this config to overwrite.
            env_var_name: Deprecated env var to read through `vllm.envs`.
            to_bool: When true, store `value == "1"` instead of the raw value.
            to_int: When true, pass the value through `int()` before storing.

        Raises:
            ValueError: From `int()` when `to_int` is set and the value is not
                an integer literal.
        """
        value = self._get_from_env_if_set(field_name, env_var_name)
        if value is not None:
            if to_bool:
                # Only the exact string "1" counts as enabled.
                value = value == "1"
            if to_int:
                value = int(value)
            # NOTE(review): this assignment happens inside an "after" validator,
            # so the Field(ge=0) bounds are presumably not re-checked for
            # env-derived integers - confirm.
            setattr(self, field_name, value)

    @model_validator(mode="after")
    def _validate_profiler_config(self) -> Self:
        """Apply deprecated env var overrides, then check field consistency.

        Returns:
            This instance, with `torch_profiler_dir` expanded to an absolute
            path unless it is a `gs://` location.

        Raises:
            ValueError: If `torch_profiler_dir` is set while the profiler is
                not 'torch', or the profiler is 'torch' without a directory.
        """
        maybe_use_cuda_profiler = self._get_from_env_if_set(
            "profiler", "VLLM_TORCH_CUDA_PROFILE"
        )
        if maybe_use_cuda_profiler is not None:
            # NOTE(review): any set value other than "1" resets `profiler` to
            # None, even one passed explicitly, and VLLM_TORCH_PROFILER_DIR is
            # not consulted on this path - confirm this is intended.
            self.profiler = "cuda" if maybe_use_cuda_profiler == "1" else None
        else:
            self._set_from_env_if_set(
                "torch_profiler_dir", "VLLM_TORCH_PROFILER_DIR", to_bool=False
            )
            # A non-empty trace directory selects the torch profiler.
            if self.torch_profiler_dir:
                self.profiler = "torch"
        self._set_from_env_if_set(
            "torch_profiler_record_shapes",
            "VLLM_TORCH_PROFILER_RECORD_SHAPES",
        )
        self._set_from_env_if_set(
            "torch_profiler_with_memory",
            "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY",
        )
        self._set_from_env_if_set(
            "torch_profiler_with_stack",
            "VLLM_TORCH_PROFILER_WITH_STACK",
        )
        self._set_from_env_if_set(
            "torch_profiler_with_flops",
            "VLLM_TORCH_PROFILER_WITH_FLOPS",
        )
        self._set_from_env_if_set(
            "ignore_frontend",
            "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM",
        )
        self._set_from_env_if_set(
            "torch_profiler_use_gzip",
            "VLLM_TORCH_PROFILER_USE_GZIP",
        )
        self._set_from_env_if_set(
            "torch_profiler_dump_cuda_time_total",
            "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL",
        )

        self._set_from_env_if_set(
            "delay_iterations", "VLLM_PROFILER_DELAY_ITERS", to_bool=False, to_int=True
        )
        self._set_from_env_if_set(
            "max_iterations", "VLLM_PROFILER_MAX_ITERS", to_bool=False, to_int=True
        )

        has_delay_or_limit = self.delay_iterations > 0 or self.max_iterations > 0
        if self.profiler == "torch" and has_delay_or_limit and not self.ignore_frontend:
            logger.warning_once(
                "Using 'torch' profiler with delay_iterations or max_iterations "
                "while ignore_frontend is False may result in high overhead."
            )

        profiler_dir = self.torch_profiler_dir
        if profiler_dir and self.profiler != "torch":
            raise ValueError(
                "torch_profiler_dir is only applicable when profiler is set to 'torch'"
            )
        if self.profiler == "torch" and not profiler_dir:
            raise ValueError("torch_profiler_dir must be set when profiler is 'torch'")

        if profiler_dir:
            # "gs://<name>..." is kept verbatim; a bare "gs://" or "gs:///..."
            # falls through to local path expansion below.
            is_gs_path = (
                profiler_dir.startswith("gs://")
                and profiler_dir[5:]
                and profiler_dir[5] != "/"
            )
            if not is_gs_path:
                self.torch_profiler_dir = os.path.abspath(
                    os.path.expanduser(profiler_dir)
                )

        return self
|
||||||
@ -39,6 +39,7 @@ from .lora import LoRAConfig
|
|||||||
from .model import ModelConfig
|
from .model import ModelConfig
|
||||||
from .observability import ObservabilityConfig
|
from .observability import ObservabilityConfig
|
||||||
from .parallel import ParallelConfig
|
from .parallel import ParallelConfig
|
||||||
|
from .profiler import ProfilerConfig
|
||||||
from .scheduler import SchedulerConfig
|
from .scheduler import SchedulerConfig
|
||||||
from .speculative import SpeculativeConfig
|
from .speculative import SpeculativeConfig
|
||||||
from .structured_outputs import StructuredOutputsConfig
|
from .structured_outputs import StructuredOutputsConfig
|
||||||
@ -218,6 +219,8 @@ class VllmConfig:
|
|||||||
You can specify the full compilation config like so:
|
You can specify the full compilation config like so:
|
||||||
`{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
|
`{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
|
||||||
"""
|
"""
|
||||||
|
profiler_config: ProfilerConfig = Field(default_factory=ProfilerConfig)
|
||||||
|
"""Profiling configuration."""
|
||||||
kv_transfer_config: KVTransferConfig | None = None
|
kv_transfer_config: KVTransferConfig | None = None
|
||||||
"""The configurations for distributed KV cache transfer."""
|
"""The configurations for distributed KV cache transfer."""
|
||||||
kv_events_config: KVEventsConfig | None = None
|
kv_events_config: KVEventsConfig | None = None
|
||||||
@ -296,6 +299,8 @@ class VllmConfig:
|
|||||||
vllm_factors.append("None")
|
vllm_factors.append("None")
|
||||||
if self.structured_outputs_config:
|
if self.structured_outputs_config:
|
||||||
vllm_factors.append(self.structured_outputs_config.compute_hash())
|
vllm_factors.append(self.structured_outputs_config.compute_hash())
|
||||||
|
if self.profiler_config:
|
||||||
|
vllm_factors.append(self.profiler_config.compute_hash())
|
||||||
else:
|
else:
|
||||||
vllm_factors.append("None")
|
vllm_factors.append("None")
|
||||||
vllm_factors.append(self.observability_config.compute_hash())
|
vllm_factors.append(self.observability_config.compute_hash())
|
||||||
|
|||||||
@ -50,6 +50,7 @@ from vllm.config import (
|
|||||||
ObservabilityConfig,
|
ObservabilityConfig,
|
||||||
ParallelConfig,
|
ParallelConfig,
|
||||||
PoolerConfig,
|
PoolerConfig,
|
||||||
|
ProfilerConfig,
|
||||||
SchedulerConfig,
|
SchedulerConfig,
|
||||||
SpeculativeConfig,
|
SpeculativeConfig,
|
||||||
StructuredOutputsConfig,
|
StructuredOutputsConfig,
|
||||||
@ -536,6 +537,8 @@ class EngineArgs:
|
|||||||
worker_cls: str = ParallelConfig.worker_cls
|
worker_cls: str = ParallelConfig.worker_cls
|
||||||
worker_extension_cls: str = ParallelConfig.worker_extension_cls
|
worker_extension_cls: str = ParallelConfig.worker_extension_cls
|
||||||
|
|
||||||
|
profiler_config: ProfilerConfig = get_field(VllmConfig, "profiler_config")
|
||||||
|
|
||||||
kv_transfer_config: KVTransferConfig | None = None
|
kv_transfer_config: KVTransferConfig | None = None
|
||||||
kv_events_config: KVEventsConfig | None = None
|
kv_events_config: KVEventsConfig | None = None
|
||||||
|
|
||||||
@ -1168,7 +1171,7 @@ class EngineArgs:
|
|||||||
vllm_group.add_argument(
|
vllm_group.add_argument(
|
||||||
"--structured-outputs-config", **vllm_kwargs["structured_outputs_config"]
|
"--structured-outputs-config", **vllm_kwargs["structured_outputs_config"]
|
||||||
)
|
)
|
||||||
|
vllm_group.add_argument("--profiler-config", **vllm_kwargs["profiler_config"])
|
||||||
vllm_group.add_argument(
|
vllm_group.add_argument(
|
||||||
"--optimization-level", **vllm_kwargs["optimization_level"]
|
"--optimization-level", **vllm_kwargs["optimization_level"]
|
||||||
)
|
)
|
||||||
@ -1786,6 +1789,7 @@ class EngineArgs:
|
|||||||
kv_transfer_config=self.kv_transfer_config,
|
kv_transfer_config=self.kv_transfer_config,
|
||||||
kv_events_config=self.kv_events_config,
|
kv_events_config=self.kv_events_config,
|
||||||
ec_transfer_config=self.ec_transfer_config,
|
ec_transfer_config=self.ec_transfer_config,
|
||||||
|
profiler_config=self.profiler_config,
|
||||||
additional_config=self.additional_config,
|
additional_config=self.additional_config,
|
||||||
optimization_level=self.optimization_level,
|
optimization_level=self.optimization_level,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -20,6 +20,7 @@ from vllm.beam_search import (
|
|||||||
from vllm.config import (
|
from vllm.config import (
|
||||||
CompilationConfig,
|
CompilationConfig,
|
||||||
PoolerConfig,
|
PoolerConfig,
|
||||||
|
ProfilerConfig,
|
||||||
StructuredOutputsConfig,
|
StructuredOutputsConfig,
|
||||||
is_init_field,
|
is_init_field,
|
||||||
)
|
)
|
||||||
@ -211,6 +212,7 @@ class LLM:
|
|||||||
structured_outputs_config: dict[str, Any]
|
structured_outputs_config: dict[str, Any]
|
||||||
| StructuredOutputsConfig
|
| StructuredOutputsConfig
|
||||||
| None = None,
|
| None = None,
|
||||||
|
profiler_config: dict[str, Any] | ProfilerConfig | None = None,
|
||||||
kv_cache_memory_bytes: int | None = None,
|
kv_cache_memory_bytes: int | None = None,
|
||||||
compilation_config: int | dict[str, Any] | CompilationConfig | None = None,
|
compilation_config: int | dict[str, Any] | CompilationConfig | None = None,
|
||||||
logits_processors: list[str | type[LogitsProcessor]] | None = None,
|
logits_processors: list[str | type[LogitsProcessor]] | None = None,
|
||||||
@ -282,6 +284,20 @@ class LLM:
|
|||||||
else:
|
else:
|
||||||
structured_outputs_instance = StructuredOutputsConfig()
|
structured_outputs_instance = StructuredOutputsConfig()
|
||||||
|
|
||||||
|
if profiler_config is not None:
|
||||||
|
if isinstance(profiler_config, dict):
|
||||||
|
profiler_config_instance = ProfilerConfig(
|
||||||
|
**{
|
||||||
|
k: v
|
||||||
|
for k, v in profiler_config.items()
|
||||||
|
if is_init_field(ProfilerConfig, k)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
profiler_config_instance = profiler_config
|
||||||
|
else:
|
||||||
|
profiler_config_instance = ProfilerConfig()
|
||||||
|
|
||||||
# warn about single-process data parallel usage.
|
# warn about single-process data parallel usage.
|
||||||
_dp_size = int(kwargs.get("data_parallel_size", 1))
|
_dp_size = int(kwargs.get("data_parallel_size", 1))
|
||||||
_distributed_executor_backend = kwargs.get("distributed_executor_backend")
|
_distributed_executor_backend = kwargs.get("distributed_executor_backend")
|
||||||
@ -324,6 +340,7 @@ class LLM:
|
|||||||
mm_processor_kwargs=mm_processor_kwargs,
|
mm_processor_kwargs=mm_processor_kwargs,
|
||||||
pooler_config=pooler_config,
|
pooler_config=pooler_config,
|
||||||
structured_outputs_config=structured_outputs_instance,
|
structured_outputs_config=structured_outputs_instance,
|
||||||
|
profiler_config=profiler_config_instance,
|
||||||
compilation_config=compilation_config_instance,
|
compilation_config=compilation_config_instance,
|
||||||
logits_processors=logits_processors,
|
logits_processors=logits_processors,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
|
|||||||
@ -99,12 +99,7 @@ class MistralToolParser(ToolParser):
|
|||||||
self.bot_token = "[TOOL_CALLS]"
|
self.bot_token = "[TOOL_CALLS]"
|
||||||
self.bot_token_id = self.vocab.get(self.bot_token)
|
self.bot_token_id = self.vocab.get(self.bot_token)
|
||||||
self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
|
self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
|
||||||
if not _is_pre_v11_tokeniser(self.model_tokenizer):
|
self._is_pre_v11 = _is_pre_v11_tokeniser(self.model_tokenizer)
|
||||||
self.fn_name_regex = re.compile(
|
|
||||||
r"([a-zA-Z0-9_-]+)(\{[\s\S]*?\}+)", re.DOTALL
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
self.fn_name_regex = None
|
|
||||||
|
|
||||||
if self.bot_token_id is None:
|
if self.bot_token_id is None:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
@ -148,23 +143,24 @@ class MistralToolParser(ToolParser):
|
|||||||
tool_content = model_output.replace(self.bot_token, "").strip()
|
tool_content = model_output.replace(self.bot_token, "").strip()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# we first try to directly load the json as parsing very nested
|
|
||||||
# jsons is difficult
|
|
||||||
try:
|
try:
|
||||||
if self.fn_name_regex:
|
if not self._is_pre_v11:
|
||||||
function_call_arr = []
|
function_call_arr = []
|
||||||
for single_tool_content in model_output.split(self.bot_token):
|
for single_tool_content in model_output.split(self.bot_token):
|
||||||
matches = self.fn_name_regex.findall(single_tool_content)
|
if "{" not in single_tool_content:
|
||||||
|
continue
|
||||||
|
|
||||||
for match in matches:
|
end_name = single_tool_content.find("{")
|
||||||
fn_name = match[0]
|
fn_name, args = (
|
||||||
args = match[1]
|
single_tool_content[:end_name],
|
||||||
|
single_tool_content[end_name:],
|
||||||
|
)
|
||||||
|
|
||||||
# fn_name is encoded outside serialized json dump
|
# fn_name is encoded outside serialized json dump
|
||||||
# only arguments are serialized
|
# only arguments are serialized
|
||||||
function_call_arr.append(
|
function_call_arr.append(
|
||||||
{"name": fn_name, "arguments": json.loads(args)}
|
{"name": fn_name, "arguments": json.loads(args)}
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
function_call_arr = json.loads(tool_content)
|
function_call_arr = json.loads(tool_content)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
|
|||||||
@ -5,7 +5,7 @@
|
|||||||
from fastapi import APIRouter, FastAPI, Request
|
from fastapi import APIRouter, FastAPI, Request
|
||||||
from fastapi.responses import Response
|
from fastapi.responses import Response
|
||||||
|
|
||||||
import vllm.envs as envs
|
from vllm.config import ProfilerConfig
|
||||||
from vllm.engine.protocol import EngineClient
|
from vllm.engine.protocol import EngineClient
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
@ -35,15 +35,12 @@ async def stop_profile(raw_request: Request):
|
|||||||
|
|
||||||
|
|
||||||
def attach_router(app: FastAPI):
|
def attach_router(app: FastAPI):
|
||||||
if envs.VLLM_TORCH_PROFILER_DIR:
|
profiler_config = getattr(app.state.args, "profiler_config", None)
|
||||||
|
assert profiler_config is None or isinstance(profiler_config, ProfilerConfig)
|
||||||
|
if profiler_config is not None and profiler_config.profiler is not None:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"Torch Profiler is enabled in the API server. This should ONLY be "
|
"Profiler with mode '%s' is enabled in the "
|
||||||
"used for local development!"
|
"API server. This should ONLY be used for local development!",
|
||||||
|
profiler_config.profiler,
|
||||||
)
|
)
|
||||||
elif envs.VLLM_TORCH_CUDA_PROFILE:
|
|
||||||
logger.warning_once(
|
|
||||||
"CUDA Profiler is enabled in the API server. This should ONLY be "
|
|
||||||
"used for local development!"
|
|
||||||
)
|
|
||||||
if envs.VLLM_TORCH_PROFILER_DIR or envs.VLLM_TORCH_CUDA_PROFILE:
|
|
||||||
app.include_router(router)
|
app.include_router(router)
|
||||||
|
|||||||
119
vllm/envs.py
119
vllm/envs.py
@ -75,6 +75,7 @@ if TYPE_CHECKING:
|
|||||||
VLLM_MM_INPUT_CACHE_GIB: int = 4
|
VLLM_MM_INPUT_CACHE_GIB: int = 4
|
||||||
VLLM_TARGET_DEVICE: str = "cuda"
|
VLLM_TARGET_DEVICE: str = "cuda"
|
||||||
VLLM_MAIN_CUDA_VERSION: str = "12.9"
|
VLLM_MAIN_CUDA_VERSION: str = "12.9"
|
||||||
|
VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest"
|
||||||
MAX_JOBS: str | None = None
|
MAX_JOBS: str | None = None
|
||||||
NVCC_THREADS: str | None = None
|
NVCC_THREADS: str | None = None
|
||||||
VLLM_USE_PRECOMPILED: bool = False
|
VLLM_USE_PRECOMPILED: bool = False
|
||||||
@ -88,20 +89,23 @@ if TYPE_CHECKING:
|
|||||||
VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds
|
VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds
|
||||||
VLLM_PLUGINS: list[str] | None = None
|
VLLM_PLUGINS: list[str] | None = None
|
||||||
VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None
|
VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None
|
||||||
VLLM_TORCH_CUDA_PROFILE: bool = False
|
# Deprecated env variables for profiling, kept for backward compatibility
|
||||||
|
# See also vllm/config/profiler.py and `--profiler-config` argument
|
||||||
|
VLLM_TORCH_CUDA_PROFILE: str | None = None
|
||||||
VLLM_TORCH_PROFILER_DIR: str | None = None
|
VLLM_TORCH_PROFILER_DIR: str | None = None
|
||||||
VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False
|
VLLM_TORCH_PROFILER_RECORD_SHAPES: str | None = None
|
||||||
VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False
|
VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: str | None = None
|
||||||
VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: bool = False
|
VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: str | None = None
|
||||||
|
VLLM_TORCH_PROFILER_WITH_STACK: str | None = None
|
||||||
|
VLLM_TORCH_PROFILER_WITH_FLOPS: str | None = None
|
||||||
|
VLLM_TORCH_PROFILER_USE_GZIP: str | None = None
|
||||||
|
VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: str | None = None
|
||||||
|
VLLM_PROFILER_DELAY_ITERS: str | None = None
|
||||||
|
VLLM_PROFILER_MAX_ITERS: str | None = None
|
||||||
|
# End of deprecated env variables for profiling
|
||||||
VLLM_USE_AOT_COMPILE: bool = False
|
VLLM_USE_AOT_COMPILE: bool = False
|
||||||
VLLM_USE_BYTECODE_HOOK: bool = False
|
VLLM_USE_BYTECODE_HOOK: bool = False
|
||||||
VLLM_FORCE_AOT_LOAD: bool = False
|
VLLM_FORCE_AOT_LOAD: bool = False
|
||||||
VLLM_TORCH_PROFILER_WITH_STACK: bool = True
|
|
||||||
VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False
|
|
||||||
VLLM_PROFILER_DELAY_ITERS: int = 0
|
|
||||||
VLLM_PROFILER_MAX_ITERS: int = 0
|
|
||||||
VLLM_TORCH_PROFILER_USE_GZIP: bool = True
|
|
||||||
VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: bool = True
|
|
||||||
VLLM_USE_TRITON_AWQ: bool = False
|
VLLM_USE_TRITON_AWQ: bool = False
|
||||||
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
|
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
|
||||||
VLLM_SKIP_P2P_CHECK: bool = False
|
VLLM_SKIP_P2P_CHECK: bool = False
|
||||||
@ -453,6 +457,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
# Main CUDA version of vLLM. This follows PyTorch but can be overridden.
|
# Main CUDA version of vLLM. This follows PyTorch but can be overridden.
|
||||||
"VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower()
|
"VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower()
|
||||||
or "12.9",
|
or "12.9",
|
||||||
|
# Controls PyTorch float32 matmul precision mode within vLLM workers.
|
||||||
|
# Valid options mirror torch.set_float32_matmul_precision
|
||||||
|
"VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices(
|
||||||
|
"VLLM_FLOAT32_MATMUL_PRECISION",
|
||||||
|
"highest",
|
||||||
|
["highest", "high", "medium"],
|
||||||
|
case_sensitive=False,
|
||||||
|
),
|
||||||
# Maximum number of compilation jobs to run in parallel.
|
# Maximum number of compilation jobs to run in parallel.
|
||||||
# By default this is the number of CPUs
|
# By default this is the number of CPUs
|
||||||
"MAX_JOBS": lambda: os.getenv("MAX_JOBS", None),
|
"MAX_JOBS": lambda: os.getenv("MAX_JOBS", None),
|
||||||
@ -842,71 +854,52 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
"VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv(
|
"VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv(
|
||||||
"VLLM_LORA_RESOLVER_CACHE_DIR", None
|
"VLLM_LORA_RESOLVER_CACHE_DIR", None
|
||||||
),
|
),
|
||||||
# Enables torch CUDA profiling if set.
|
# Enables torch CUDA profiling if set to 1.
|
||||||
# On NVIDIA GPUs, this will start/stop cudaProfilerApi when triggered.
|
# Deprecated, see profiler_config.
|
||||||
"VLLM_TORCH_CUDA_PROFILE": lambda: bool(
|
"VLLM_TORCH_CUDA_PROFILE": lambda: os.getenv("VLLM_TORCH_CUDA_PROFILE"),
|
||||||
os.getenv("VLLM_TORCH_CUDA_PROFILE", "0") != "0"
|
|
||||||
),
|
|
||||||
# Enables torch profiler if set.
|
# Enables torch profiler if set.
|
||||||
# Both AsyncLLM's CPU traces as well as workers'
|
# Deprecated, see profiler_config.
|
||||||
# traces (CPU & GPU) will be saved under this directory.
|
"VLLM_TORCH_PROFILER_DIR": lambda: os.getenv("VLLM_TORCH_PROFILER_DIR"),
|
||||||
# Note that it must be an absolute path.
|
# Enable torch profiler to record shapes if set to 1.
|
||||||
"VLLM_TORCH_PROFILER_DIR": lambda: (
|
# Deprecated, see profiler_config.
|
||||||
None
|
"VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: (
|
||||||
if (val := os.getenv("VLLM_TORCH_PROFILER_DIR")) is None
|
os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES")
|
||||||
else (
|
|
||||||
val
|
|
||||||
if val.startswith("gs://") and val[5:] and val[5] != "/"
|
|
||||||
else os.path.abspath(os.path.expanduser(val))
|
|
||||||
)
|
|
||||||
),
|
),
|
||||||
# Enable torch profiler to record shapes if set
|
# Enable torch profiler to profile memory if set to 1.
|
||||||
# VLLM_TORCH_PROFILER_RECORD_SHAPES=1. If not set, torch profiler will
|
# Deprecated, see profiler_config.
|
||||||
# not record shapes.
|
"VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: (
|
||||||
"VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: bool(
|
os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY")
|
||||||
os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0") != "0"
|
|
||||||
),
|
),
|
||||||
# Enable torch profiler to profile memory if set
|
# Enable torch profiler to profile stack if set to 1.
|
||||||
# VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1. If not set, torch profiler
|
# Deprecated, see profiler_config.
|
||||||
# will not profile memory.
|
"VLLM_TORCH_PROFILER_WITH_STACK": lambda: (
|
||||||
"VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: bool(
|
os.getenv("VLLM_TORCH_PROFILER_WITH_STACK")
|
||||||
os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0") != "0"
|
|
||||||
),
|
),
|
||||||
# Enable torch profiler to profile stack if set
|
# Enable torch profiler to profile flops if set to 1.
|
||||||
# VLLM_TORCH_PROFILER_WITH_STACK=1. If not set, torch profiler WILL
|
# Deprecated, see profiler_config.
|
||||||
# profile stack by default.
|
"VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: (
|
||||||
"VLLM_TORCH_PROFILER_WITH_STACK": lambda: bool(
|
os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS")
|
||||||
os.getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0"
|
|
||||||
),
|
),
|
||||||
# Enable torch profiler to profile flops if set
|
# Disable torch profiling of the AsyncLLMEngine process if set to 1.
|
||||||
# VLLM_TORCH_PROFILER_WITH_FLOPS=1. If not set, torch profiler will
|
# Deprecated, see profiler_config.
|
||||||
# not profile flops.
|
"VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: (
|
||||||
"VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool(
|
os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM")
|
||||||
os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"
|
|
||||||
),
|
|
||||||
# Disable torch profiling of the AsyncLLMEngine process.
|
|
||||||
# If set to 1, will not profile the engine process.
|
|
||||||
"VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: bool(
|
|
||||||
os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM", "0") != "0"
|
|
||||||
),
|
),
|
||||||
# Delay number of iterations before starting profiling when using
|
# Delay number of iterations before starting profiling when using
|
||||||
# the torch/torch CUDA profiler. If set to 0, will start profiling immediately.
|
# the torch/torch CUDA profiler. If set to 0, will start profiling immediately.
|
||||||
"VLLM_PROFILER_DELAY_ITERS": lambda: int(
|
# Deprecated, see profiler_config.
|
||||||
os.getenv("VLLM_PROFILER_DELAY_ITERS", "0")
|
"VLLM_PROFILER_DELAY_ITERS": lambda: (os.getenv("VLLM_PROFILER_DELAY_ITERS")),
|
||||||
),
|
|
||||||
# Maximum number of iterations to profile when using the torch/torch CUDA profiler.
|
# Maximum number of iterations to profile when using the torch/torch CUDA profiler.
|
||||||
# If set to 0, will not limit the number of iterations.
|
# If set to 0, will not limit the number of iterations.
|
||||||
"VLLM_PROFILER_MAX_ITERS": lambda: int(os.getenv("VLLM_PROFILER_MAX_ITERS", "0")),
|
"VLLM_PROFILER_MAX_ITERS": lambda: os.getenv("VLLM_PROFILER_MAX_ITERS"),
|
||||||
# Control whether torch profiler gzip-compresses profiling files.
|
# Control whether torch profiler gzip-compresses profiling files.
|
||||||
# Set VLLM_TORCH_PROFILER_USE_GZIP=0 to disable gzip (enabled by default).
|
# Deprecated, see profiler_config.
|
||||||
"VLLM_TORCH_PROFILER_USE_GZIP": lambda: bool(
|
"VLLM_TORCH_PROFILER_USE_GZIP": lambda: os.getenv("VLLM_TORCH_PROFILER_USE_GZIP"),
|
||||||
os.getenv("VLLM_TORCH_PROFILER_USE_GZIP", "1") != "0"
|
|
||||||
),
|
|
||||||
# Control whether torch profiler dumps the self_cuda_time_total table.
|
# Control whether torch profiler dumps the self_cuda_time_total table.
|
||||||
# Set VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0 to disable dumping
|
# Set to 0 to disable dumping the table.
|
||||||
# (enabled by default).
|
# Deprecated, see profiler_config.
|
||||||
"VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL": lambda: bool(
|
"VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL": lambda: (
|
||||||
os.getenv("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL", "1") != "0"
|
os.getenv("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL")
|
||||||
),
|
),
|
||||||
# If set, vLLM will use Triton implementations of AWQ.
|
# If set, vLLM will use Triton implementations of AWQ.
|
||||||
"VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),
|
"VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),
|
||||||
|
|||||||
@ -292,7 +292,7 @@ def set_forward_context(
|
|||||||
if num_tokens_across_dp is None:
|
if num_tokens_across_dp is None:
|
||||||
assert ubatch_slices is None
|
assert ubatch_slices is None
|
||||||
assert num_tokens is not None
|
assert num_tokens is not None
|
||||||
_, num_tokens_across_dp = coordinate_batch_across_dp(
|
_, num_tokens_across_dp, _ = coordinate_batch_across_dp(
|
||||||
num_tokens_unpadded=num_tokens,
|
num_tokens_unpadded=num_tokens,
|
||||||
parallel_config=vllm_config.parallel_config,
|
parallel_config=vllm_config.parallel_config,
|
||||||
allow_microbatching=False,
|
allow_microbatching=False,
|
||||||
|
|||||||
@ -935,7 +935,11 @@ def enable_batch_invariant_mode():
|
|||||||
|
|
||||||
# Batch invariant matmuls are no longer needed after cublas overrides
|
# Batch invariant matmuls are no longer needed after cublas overrides
|
||||||
if not is_torch_equal_or_newer("2.10.0.dev"):
|
if not is_torch_equal_or_newer("2.10.0.dev"):
|
||||||
if current_platform.is_device_capability(100):
|
if (
|
||||||
|
current_platform.is_device_capability(100)
|
||||||
|
or current_platform.is_device_capability(80)
|
||||||
|
or current_platform.is_device_capability(89)
|
||||||
|
):
|
||||||
# For PyTorch 2.9, B200 uses GEMV for bs=1
|
# For PyTorch 2.9, B200 uses GEMV for bs=1
|
||||||
# Requires https://github.com/pytorch/pytorch/pull/166735
|
# Requires https://github.com/pytorch/pytorch/pull/166735
|
||||||
_batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA")
|
_batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA")
|
||||||
|
|||||||
@ -895,6 +895,48 @@ def get_moe_configs(
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_block_size_k_divisible(
|
||||||
|
size_k: int, block_size_k: int, group_size: int
|
||||||
|
) -> int:
|
||||||
|
"""Ensure block_size_k is a divisor of size_k and divisible by group_size.
|
||||||
|
|
||||||
|
This ensures BLOCK_SIZE_K compatibility with MoeWNA16 CUDA kernel which
|
||||||
|
requires size_k % BLOCK_SIZE_K == 0 and BLOCK_SIZE_K % group_size == 0.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
size_k: The size_k dimension that must be divisible by result.
|
||||||
|
block_size_k: Preferred block size (will be adjusted if needed).
|
||||||
|
group_size: The result must be divisible by this.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A valid BLOCK_SIZE_K that divides size_k and is divisible by group_size.
|
||||||
|
"""
|
||||||
|
# Fast path: already valid
|
||||||
|
if size_k % block_size_k == 0 and block_size_k % group_size == 0:
|
||||||
|
return block_size_k
|
||||||
|
|
||||||
|
# Find the largest value that:
|
||||||
|
# 1. Divides size_k (size_k % candidate == 0)
|
||||||
|
# 2. Is divisible by group_size (candidate % group_size == 0)
|
||||||
|
# 3. Is <= block_size_k (prefer smaller values close to block_size_k)
|
||||||
|
#
|
||||||
|
# Strategy: Search from min(block_size_k, size_k) down to group_size,
|
||||||
|
# stepping by group_size to ensure divisibility by group_size
|
||||||
|
max_search = min(block_size_k, size_k)
|
||||||
|
start = (max_search // group_size) * group_size
|
||||||
|
for candidate in range(start, group_size - 1, -group_size):
|
||||||
|
if size_k % candidate == 0:
|
||||||
|
return candidate
|
||||||
|
|
||||||
|
# Fallback: if group_size divides size_k, use it
|
||||||
|
# This should always be true with correct group_size configuration
|
||||||
|
if size_k % group_size == 0:
|
||||||
|
return group_size
|
||||||
|
|
||||||
|
# This should not happen with correct group_size, but ensure divisibility
|
||||||
|
return size_k
|
||||||
|
|
||||||
|
|
||||||
def get_moe_wna16_block_config(
|
def get_moe_wna16_block_config(
|
||||||
config: dict[str, int],
|
config: dict[str, int],
|
||||||
use_moe_wna16_cuda: bool,
|
use_moe_wna16_cuda: bool,
|
||||||
@ -960,6 +1002,9 @@ def get_moe_wna16_block_config(
|
|||||||
# at the same time.
|
# at the same time.
|
||||||
block_size_n = 1024
|
block_size_n = 1024
|
||||||
|
|
||||||
|
# Ensure BLOCK_SIZE_K is a divisor of size_k for CUDA kernel compatibility
|
||||||
|
block_size_k = _ensure_block_size_k_divisible(size_k, block_size_k, group_size)
|
||||||
|
|
||||||
return {"BLOCK_SIZE_N": block_size_n, "BLOCK_SIZE_K": block_size_k}
|
return {"BLOCK_SIZE_N": block_size_n, "BLOCK_SIZE_K": block_size_k}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -5,6 +5,7 @@ import torch
|
|||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm._aiter_ops import rocm_aiter_ops
|
||||||
from vllm.model_executor.custom_op import CustomOp
|
from vllm.model_executor.custom_op import CustomOp
|
||||||
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
|
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
@ -45,10 +46,13 @@ class QuantFP8(CustomOp):
|
|||||||
super().__init__()
|
super().__init__()
|
||||||
self.static = static
|
self.static = static
|
||||||
self.group_shape = group_shape
|
self.group_shape = group_shape
|
||||||
|
self.use_per_token_if_dynamic = group_shape == GroupShape.PER_TOKEN
|
||||||
self.num_token_padding = num_token_padding
|
self.num_token_padding = num_token_padding
|
||||||
self.column_major_scales = column_major_scales
|
self.column_major_scales = column_major_scales
|
||||||
self.use_ue8m0 = use_ue8m0
|
self.use_ue8m0 = use_ue8m0
|
||||||
|
|
||||||
|
self.use_aiter = rocm_aiter_ops.is_linear_fp8_enaled()
|
||||||
|
|
||||||
self.is_group_quant = group_shape.is_per_group()
|
self.is_group_quant = group_shape.is_per_group()
|
||||||
if self.is_group_quant:
|
if self.is_group_quant:
|
||||||
assert not static, "Group quantization only supports dynamic mode"
|
assert not static, "Group quantization only supports dynamic mode"
|
||||||
@ -92,6 +96,33 @@ class QuantFP8(CustomOp):
|
|||||||
use_per_token_if_dynamic=self.use_per_token_if_dynamic,
|
use_per_token_if_dynamic=self.use_per_token_if_dynamic,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def forward_hip(
|
||||||
|
self,
|
||||||
|
x: torch.Tensor,
|
||||||
|
scale: torch.Tensor | None = None,
|
||||||
|
scale_ub: torch.Tensor | None = None,
|
||||||
|
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
use_aiter_quant = (
|
||||||
|
not self.is_group_quant
|
||||||
|
and self.use_aiter
|
||||||
|
and scale_ub is None
|
||||||
|
and x.is_contiguous()
|
||||||
|
)
|
||||||
|
use_aiter_per_tensor_quant = (
|
||||||
|
use_aiter_quant and self.group_shape == GroupShape.PER_TENSOR
|
||||||
|
)
|
||||||
|
use_aiter_per_token_quant = (
|
||||||
|
use_aiter_quant and self.group_shape == GroupShape.PER_TOKEN
|
||||||
|
)
|
||||||
|
|
||||||
|
if use_aiter_per_tensor_quant:
|
||||||
|
return rocm_aiter_ops.per_tensor_quant(x, _FP8_DTYPE, scale)
|
||||||
|
if use_aiter_per_token_quant:
|
||||||
|
return rocm_aiter_ops.per_token_quant(x, _FP8_DTYPE, scale)
|
||||||
|
|
||||||
|
# Fallback to CUDA implementation
|
||||||
|
return self.forward_cuda(x, scale, scale_ub)
|
||||||
|
|
||||||
def forward_native(
|
def forward_native(
|
||||||
self,
|
self,
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
|
|||||||
@ -60,7 +60,7 @@ class MoeWNA16Config(QuantizationConfig):
|
|||||||
|
|
||||||
if self.linear_quant_method == "gptq":
|
if self.linear_quant_method == "gptq":
|
||||||
self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible(full_config)
|
self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible(full_config)
|
||||||
elif self.linear_quant_method == "awq":
|
elif self.linear_quant_method in ("awq", "awq_marlin"):
|
||||||
capability_tuple = current_platform.get_device_capability()
|
capability_tuple = current_platform.get_device_capability()
|
||||||
device_capability = (
|
device_capability = (
|
||||||
-1 if capability_tuple is None else capability_tuple.to_int()
|
-1 if capability_tuple is None else capability_tuple.to_int()
|
||||||
@ -107,7 +107,7 @@ class MoeWNA16Config(QuantizationConfig):
|
|||||||
if linear_quant_method == "gptq":
|
if linear_quant_method == "gptq":
|
||||||
has_zp = not cls.get_from_keys(config, ["sym"])
|
has_zp = not cls.get_from_keys(config, ["sym"])
|
||||||
modules_to_not_convert = []
|
modules_to_not_convert = []
|
||||||
elif linear_quant_method == "awq":
|
elif linear_quant_method in ("awq", "awq_marlin"):
|
||||||
has_zp = cls.get_from_keys(config, ["zero_point"])
|
has_zp = cls.get_from_keys(config, ["zero_point"])
|
||||||
modules_to_not_convert = cls.get_from_keys_or(
|
modules_to_not_convert = cls.get_from_keys_or(
|
||||||
config, ["modules_to_not_convert"], None
|
config, ["modules_to_not_convert"], None
|
||||||
@ -184,7 +184,7 @@ class MoeWNA16Config(QuantizationConfig):
|
|||||||
return GPTQConfig.from_config(self.full_config).get_quant_method(
|
return GPTQConfig.from_config(self.full_config).get_quant_method(
|
||||||
layer, prefix
|
layer, prefix
|
||||||
)
|
)
|
||||||
elif self.linear_quant_method == "awq":
|
elif self.linear_quant_method in ("awq", "awq_marlin"):
|
||||||
if self.use_marlin and check_marlin_supports_layer(
|
if self.use_marlin and check_marlin_supports_layer(
|
||||||
layer, self.group_size
|
layer, self.group_size
|
||||||
):
|
):
|
||||||
@ -468,7 +468,8 @@ class MoeWNA16Method(FusedMoEMethodBase):
|
|||||||
shard_size = layer.intermediate_size_per_partition
|
shard_size = layer.intermediate_size_per_partition
|
||||||
|
|
||||||
# convert gptq and awq weight to a standard format
|
# convert gptq and awq weight to a standard format
|
||||||
if layer.quant_config.linear_quant_method == "awq":
|
# awq_marlin uses the same weight format as awq
|
||||||
|
if layer.quant_config.linear_quant_method in ("awq", "awq_marlin"):
|
||||||
assert layer.quant_config.weight_bits == 4
|
assert layer.quant_config.weight_bits == 4
|
||||||
if "weight" in weight_name:
|
if "weight" in weight_name:
|
||||||
loaded_weight = convert_awq_tensor(loaded_weight, "qweight")
|
loaded_weight = convert_awq_tensor(loaded_weight, "qweight")
|
||||||
|
|||||||
@ -367,6 +367,8 @@ class Qwen2MoeModel(nn.Module):
|
|||||||
self.embed_tokens = VocabParallelEmbedding(
|
self.embed_tokens = VocabParallelEmbedding(
|
||||||
config.vocab_size,
|
config.vocab_size,
|
||||||
config.hidden_size,
|
config.hidden_size,
|
||||||
|
quant_config=quant_config,
|
||||||
|
prefix=f"{prefix}.embed_tokens",
|
||||||
)
|
)
|
||||||
self.start_layer, self.end_layer, self.layers = make_layers(
|
self.start_layer, self.end_layer, self.layers = make_layers(
|
||||||
config.num_hidden_layers,
|
config.num_hidden_layers,
|
||||||
@ -512,6 +514,12 @@ class Qwen2MoeModel(nn.Module):
|
|||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
name = remapped_kv_scale_name
|
name = remapped_kv_scale_name
|
||||||
|
# GGUF: make sure that shared_expert_gate is a 2D tensor.
|
||||||
|
if (
|
||||||
|
"mlp.shared_expert_gate" in name
|
||||||
|
and len(loaded_weight.shape) == 1
|
||||||
|
):
|
||||||
|
loaded_weight = loaded_weight[None, :]
|
||||||
param = params_dict[name]
|
param = params_dict[name]
|
||||||
weight_loader = getattr(
|
weight_loader = getattr(
|
||||||
param, "weight_loader", default_weight_loader
|
param, "weight_loader", default_weight_loader
|
||||||
|
|||||||
@ -381,6 +381,8 @@ class RocmPlatform(Platform):
|
|||||||
compilation_config = vllm_config.compilation_config
|
compilation_config = vllm_config.compilation_config
|
||||||
parallel_config = vllm_config.parallel_config
|
parallel_config = vllm_config.parallel_config
|
||||||
is_eager_execution = compilation_config == CUDAGraphMode.NONE
|
is_eager_execution = compilation_config == CUDAGraphMode.NONE
|
||||||
|
use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled()
|
||||||
|
use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enaled()
|
||||||
|
|
||||||
if compilation_config.cudagraph_mode.has_full_cudagraphs():
|
if compilation_config.cudagraph_mode.has_full_cudagraphs():
|
||||||
# decode context parallel does not support full cudagraphs
|
# decode context parallel does not support full cudagraphs
|
||||||
@ -400,8 +402,6 @@ class RocmPlatform(Platform):
|
|||||||
)
|
)
|
||||||
compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||||
|
|
||||||
use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled()
|
|
||||||
|
|
||||||
if cache_config and cache_config.block_size is None:
|
if cache_config and cache_config.block_size is None:
|
||||||
cache_config.block_size = 16
|
cache_config.block_size = 16
|
||||||
|
|
||||||
@ -415,6 +415,9 @@ class RocmPlatform(Platform):
|
|||||||
):
|
):
|
||||||
compilation_config.custom_ops.append("+rms_norm")
|
compilation_config.custom_ops.append("+rms_norm")
|
||||||
|
|
||||||
|
if use_aiter_fp8_linear and "-quant_fp8" not in compilation_config.custom_ops:
|
||||||
|
compilation_config.custom_ops.append("+quant_fp8")
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def verify_model_arch(cls, model_arch: str) -> None:
|
def verify_model_arch(cls, model_arch: str) -> None:
|
||||||
if model_arch in _ROCM_UNSUPPORTED_MODELS:
|
if model_arch in _ROCM_UNSUPPORTED_MODELS:
|
||||||
|
|||||||
@ -3,26 +3,27 @@
|
|||||||
|
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from contextlib import nullcontext
|
from contextlib import nullcontext
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from typing_extensions import override
|
from typing_extensions import override
|
||||||
|
|
||||||
import vllm.envs as envs
|
from vllm.config import ProfilerConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class WorkerProfiler(ABC):
|
class WorkerProfiler(ABC):
|
||||||
def __init__(self) -> None:
|
def __init__(self, profiler_config: ProfilerConfig) -> None:
|
||||||
self._delay_iters = envs.VLLM_PROFILER_DELAY_ITERS
|
self._delay_iters = profiler_config.delay_iterations
|
||||||
if self._delay_iters > 0:
|
if self._delay_iters > 0:
|
||||||
logger.info_once(
|
logger.info_once(
|
||||||
"GPU profiling will start "
|
"GPU profiling will start "
|
||||||
f"{self._delay_iters} steps after start_profile."
|
f"{self._delay_iters} steps after start_profile."
|
||||||
)
|
)
|
||||||
|
|
||||||
self._max_iters = envs.VLLM_PROFILER_MAX_ITERS
|
self._max_iters = profiler_config.max_iterations
|
||||||
if self._max_iters > 0:
|
if self._max_iters > 0:
|
||||||
logger.info_once(
|
logger.info_once(
|
||||||
"GPU profiling will stop "
|
"GPU profiling will stop "
|
||||||
@ -133,12 +134,27 @@ class WorkerProfiler(ABC):
|
|||||||
return nullcontext()
|
return nullcontext()
|
||||||
|
|
||||||
|
|
||||||
|
TorchProfilerActivity = Literal["CPU", "CUDA", "XPU"]
|
||||||
|
TorchProfilerActivityMap = {
|
||||||
|
"CPU": torch.profiler.ProfilerActivity.CPU,
|
||||||
|
"CUDA": torch.profiler.ProfilerActivity.CUDA,
|
||||||
|
"XPU": torch.profiler.ProfilerActivity.XPU,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class TorchProfilerWrapper(WorkerProfiler):
|
class TorchProfilerWrapper(WorkerProfiler):
|
||||||
def __init__(self, worker_name: str, local_rank: int) -> None:
|
def __init__(
|
||||||
super().__init__()
|
self,
|
||||||
|
profiler_config: ProfilerConfig,
|
||||||
|
worker_name: str,
|
||||||
|
local_rank: int,
|
||||||
|
activities: list[TorchProfilerActivity],
|
||||||
|
) -> None:
|
||||||
|
super().__init__(profiler_config)
|
||||||
|
|
||||||
self.local_rank = local_rank
|
self.local_rank = local_rank
|
||||||
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
|
self.profiler_config = profiler_config
|
||||||
|
torch_profiler_trace_dir = profiler_config.torch_profiler_dir
|
||||||
if local_rank in (None, 0):
|
if local_rank in (None, 0):
|
||||||
logger.info(
|
logger.info(
|
||||||
"Torch profiling enabled. Traces will be saved to: %s",
|
"Torch profiling enabled. Traces will be saved to: %s",
|
||||||
@ -147,24 +163,23 @@ class TorchProfilerWrapper(WorkerProfiler):
|
|||||||
logger.debug(
|
logger.debug(
|
||||||
"Profiler config: record_shapes=%s,"
|
"Profiler config: record_shapes=%s,"
|
||||||
"profile_memory=%s,with_stack=%s,with_flops=%s",
|
"profile_memory=%s,with_stack=%s,with_flops=%s",
|
||||||
envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
|
profiler_config.torch_profiler_record_shapes,
|
||||||
envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
|
profiler_config.torch_profiler_with_memory,
|
||||||
envs.VLLM_TORCH_PROFILER_WITH_STACK,
|
profiler_config.torch_profiler_with_stack,
|
||||||
envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
|
profiler_config.torch_profiler_with_flops,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.dump_cpu_time_total = "CPU" in activities and len(activities) == 1
|
||||||
self.profiler = torch.profiler.profile(
|
self.profiler = torch.profiler.profile(
|
||||||
activities=[
|
activities=[TorchProfilerActivityMap[activity] for activity in activities],
|
||||||
torch.profiler.ProfilerActivity.CPU,
|
record_shapes=profiler_config.torch_profiler_record_shapes,
|
||||||
torch.profiler.ProfilerActivity.CUDA,
|
profile_memory=profiler_config.torch_profiler_with_memory,
|
||||||
],
|
with_stack=profiler_config.torch_profiler_with_stack,
|
||||||
record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
|
with_flops=profiler_config.torch_profiler_with_flops,
|
||||||
profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
|
|
||||||
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
|
|
||||||
with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
|
|
||||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||||
torch_profiler_trace_dir,
|
torch_profiler_trace_dir,
|
||||||
worker_name=worker_name,
|
worker_name=worker_name,
|
||||||
use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
|
use_gzip=profiler_config.torch_profiler_use_gzip,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -176,9 +191,10 @@ class TorchProfilerWrapper(WorkerProfiler):
|
|||||||
def _stop(self) -> None:
|
def _stop(self) -> None:
|
||||||
self.profiler.stop()
|
self.profiler.stop()
|
||||||
|
|
||||||
if envs.VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL:
|
profiler_config = self.profiler_config
|
||||||
rank = self.local_rank
|
rank = self.local_rank
|
||||||
profiler_dir = envs.VLLM_TORCH_PROFILER_DIR
|
if profiler_config.torch_profiler_dump_cuda_time_total:
|
||||||
|
profiler_dir = profiler_config.torch_profiler_dir
|
||||||
profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt"
|
profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt"
|
||||||
sort_key = "self_cuda_time_total"
|
sort_key = "self_cuda_time_total"
|
||||||
table = self.profiler.key_averages().table(sort_by=sort_key)
|
table = self.profiler.key_averages().table(sort_by=sort_key)
|
||||||
@ -189,6 +205,12 @@ class TorchProfilerWrapper(WorkerProfiler):
|
|||||||
# only print profiler results on rank 0
|
# only print profiler results on rank 0
|
||||||
if rank == 0:
|
if rank == 0:
|
||||||
print(table)
|
print(table)
|
||||||
|
if self.dump_cpu_time_total and rank == 0:
|
||||||
|
logger.info(
|
||||||
|
self.profiler.key_averages().table(
|
||||||
|
sort_by="self_cpu_time_total", row_limit=50
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def annotate_context_manager(self, name: str):
|
def annotate_context_manager(self, name: str):
|
||||||
@ -196,8 +218,8 @@ class TorchProfilerWrapper(WorkerProfiler):
|
|||||||
|
|
||||||
|
|
||||||
class CudaProfilerWrapper(WorkerProfiler):
|
class CudaProfilerWrapper(WorkerProfiler):
|
||||||
def __init__(self) -> None:
|
def __init__(self, profiler_config: ProfilerConfig) -> None:
|
||||||
super().__init__()
|
super().__init__(profiler_config)
|
||||||
# Note: lazy import to avoid dependency issues if CUDA is not available.
|
# Note: lazy import to avoid dependency issues if CUDA is not available.
|
||||||
import torch.cuda.profiler as cuda_profiler
|
import torch.cuda.profiler as cuda_profiler
|
||||||
|
|
||||||
@ -63,6 +63,31 @@ class ReasoningParser:
|
|||||||
True if the reasoning content ends in the input_ids.
|
True if the reasoning content ends in the input_ids.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def is_reasoning_end_streaming(
    self, input_ids: list[int], delta_ids: list[int]
) -> bool:
    """
    Check whether the reasoning content has ended on a decode step.

    Structured engines like `xgrammar` use this to decide, during a decode
    step, whether the reasoning section of the model output is finished.

    This default implementation ignores `delta_ids` and delegates to
    `is_reasoning_end` on the full `input_ids`.

    Parameters:
    input_ids: list[int]
        The entire model output.
    delta_ids: list[int]
        The last few computed tokens of the model output at the current
        decode step.

    Returns:
    bool
        True if the reasoning content has ended.
    """
    reasoning_ended = self.is_reasoning_end(input_ids)
    return reasoning_ended
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
|
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -74,6 +74,12 @@ class BaseThinkingReasoningParser(ReasoningParser):
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def is_reasoning_end_streaming(
    self, input_ids: list[int], delta_ids: list[int]
) -> bool:
    """
    Report whether the end token appears among the tokens in `delta_ids`.

    Only `delta_ids` is inspected; `input_ids` is not read here.
    """
    return self.end_token_id in delta_ids
|
||||||
|
|
||||||
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
|
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
|
||||||
"""
|
"""
|
||||||
Extract the content after the end tokens
|
Extract the content after the end tokens
|
||||||
|
|||||||
@ -35,6 +35,11 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
|
|||||||
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
|
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
|
||||||
return self._parser.is_reasoning_end(input_ids)
|
return self._parser.is_reasoning_end(input_ids)
|
||||||
|
|
||||||
|
def is_reasoning_end_streaming(
    self, input_ids: list[int], delta_ids: list[int]
) -> bool:
    """Forward the streaming end-of-reasoning check to the wrapped parser."""
    delegate = self._parser
    return delegate.is_reasoning_end_streaming(input_ids, delta_ids)
|
||||||
|
|
||||||
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
|
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
|
||||||
return self._parser.extract_content_ids(input_ids)
|
return self._parser.extract_content_ids(input_ids)
|
||||||
|
|
||||||
|
|||||||
@ -56,6 +56,11 @@ class Holo2ReasoningParser(ReasoningParser):
|
|||||||
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
|
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
|
||||||
return self._parser.is_reasoning_end(input_ids)
|
return self._parser.is_reasoning_end(input_ids)
|
||||||
|
|
||||||
|
def is_reasoning_end_streaming(
    self, input_ids: list[int], delta_ids: list[int]
) -> bool:
    """Forward the streaming end-of-reasoning check to the wrapped parser."""
    delegate = self._parser
    return delegate.is_reasoning_end_streaming(input_ids, delta_ids)
|
||||||
|
|
||||||
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
|
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
|
||||||
return self._parser.extract_content_ids(input_ids)
|
return self._parser.extract_content_ids(input_ids)
|
||||||
|
|
||||||
|
|||||||
@ -32,6 +32,11 @@ class IdentityReasoningParser(ReasoningParser):
|
|||||||
# Always return True, since we never treat reasoning specially
|
# Always return True, since we never treat reasoning specially
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def is_reasoning_end_streaming(
    self, input_ids: list[int], delta_ids: list[int]
) -> bool:
    """Always report that reasoning has ended, whatever tokens are passed."""
    return True
|
||||||
|
|
||||||
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
|
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
|
||||||
# Identity: return all tokens as content
|
# Identity: return all tokens as content
|
||||||
return input_ids
|
return input_ids
|
||||||
|
|||||||
@ -145,7 +145,7 @@ class CudagraphDispatcher:
|
|||||||
num_tokens: int,
|
num_tokens: int,
|
||||||
uniform_decode: bool,
|
uniform_decode: bool,
|
||||||
has_lora: bool,
|
has_lora: bool,
|
||||||
use_cascade_attn: bool = False,
|
disable_full: bool = False,
|
||||||
) -> tuple[CUDAGraphMode, BatchDescriptor]:
|
) -> tuple[CUDAGraphMode, BatchDescriptor]:
|
||||||
"""
|
"""
|
||||||
Given conditions(e.g.,batch descriptor and if using cascade attention),
|
Given conditions(e.g.,batch descriptor and if using cascade attention),
|
||||||
@ -165,7 +165,7 @@ class CudagraphDispatcher:
|
|||||||
)
|
)
|
||||||
relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs()
|
relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs()
|
||||||
|
|
||||||
if not use_cascade_attn:
|
if not disable_full:
|
||||||
# check if key exists for full cudagraph
|
# check if key exists for full cudagraph
|
||||||
if batch_desc in self.cudagraph_keys[CUDAGraphMode.FULL]:
|
if batch_desc in self.cudagraph_keys[CUDAGraphMode.FULL]:
|
||||||
return CUDAGraphMode.FULL, batch_desc
|
return CUDAGraphMode.FULL, batch_desc
|
||||||
|
|||||||
@ -166,32 +166,24 @@ class AsyncLLM(EngineClient):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
if (
|
if (
|
||||||
envs.VLLM_TORCH_PROFILER_DIR
|
vllm_config.profiler_config.profiler == "torch"
|
||||||
and not envs.VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM
|
and not vllm_config.profiler_config.ignore_frontend
|
||||||
):
|
):
|
||||||
|
profiler_dir = vllm_config.profiler_config.torch_profiler_dir
|
||||||
logger.info(
|
logger.info(
|
||||||
"Torch profiler enabled. AsyncLLM CPU traces will be collected under %s", # noqa: E501
|
"Torch profiler enabled. AsyncLLM CPU traces will be collected under %s", # noqa: E501
|
||||||
envs.VLLM_TORCH_PROFILER_DIR,
|
profiler_dir,
|
||||||
)
|
)
|
||||||
if envs.VLLM_PROFILER_MAX_ITERS > 0 or envs.VLLM_PROFILER_DELAY_ITERS > 0:
|
|
||||||
logger.warning_once(
|
|
||||||
"Torch profiler received max_iters or delay_iters setting. These "
|
|
||||||
"are not compatible with the AsyncLLM profiler and will be ignored "
|
|
||||||
"for the AsyncLLM process. Engine process profiling will still "
|
|
||||||
"respect these settings. Consider setting "
|
|
||||||
"VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM=1 to disable "
|
|
||||||
"AsyncLLM profiling."
|
|
||||||
)
|
|
||||||
worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm"
|
worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm"
|
||||||
self.profiler = torch.profiler.profile(
|
self.profiler = torch.profiler.profile(
|
||||||
activities=[
|
activities=[
|
||||||
torch.profiler.ProfilerActivity.CPU,
|
torch.profiler.ProfilerActivity.CPU,
|
||||||
],
|
],
|
||||||
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
|
with_stack=vllm_config.profiler_config.torch_profiler_with_stack,
|
||||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||||
envs.VLLM_TORCH_PROFILER_DIR,
|
profiler_dir,
|
||||||
worker_name=worker_name,
|
worker_name=worker_name,
|
||||||
use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
|
use_gzip=vllm_config.profiler_config.torch_profiler_use_gzip,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -1258,7 +1258,7 @@ class EagleProposer:
|
|||||||
num_tokens_padded: int,
|
num_tokens_padded: int,
|
||||||
) -> tuple[int, torch.Tensor]:
|
) -> tuple[int, torch.Tensor]:
|
||||||
# TODO(Flechman): support DBO ubatching
|
# TODO(Flechman): support DBO ubatching
|
||||||
should_ubatch, num_toks_across_dp = coordinate_batch_across_dp(
|
should_ubatch, num_toks_across_dp, _ = coordinate_batch_across_dp(
|
||||||
num_tokens_unpadded=num_tokens_unpadded,
|
num_tokens_unpadded=num_tokens_unpadded,
|
||||||
parallel_config=self.vllm_config.parallel_config,
|
parallel_config=self.vllm_config.parallel_config,
|
||||||
allow_microbatching=False,
|
allow_microbatching=False,
|
||||||
|
|||||||
@ -339,7 +339,9 @@ class StructuredOutputManager:
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
# Check if reasoning ends in *this* step
|
# Check if reasoning ends in *this* step
|
||||||
if self.reasoner.is_reasoning_end(request.all_token_ids):
|
if self.reasoner.is_reasoning_end_streaming(
|
||||||
|
request.all_token_ids, request.all_token_ids[request.num_computed_tokens :]
|
||||||
|
):
|
||||||
# Reasoning just ended, so we shouldn't advance til
|
# Reasoning just ended, so we shouldn't advance til
|
||||||
# next pass
|
# next pass
|
||||||
structured_req.reasoning_ended = True
|
structured_req.reasoning_ended = True
|
||||||
|
|||||||
@ -13,6 +13,7 @@ from vllm.logger import init_logger
|
|||||||
from vllm.model_executor.utils import set_random_seed
|
from vllm.model_executor.utils import set_random_seed
|
||||||
from vllm.platforms import CpuArchEnum, current_platform
|
from vllm.platforms import CpuArchEnum, current_platform
|
||||||
from vllm.platforms.cpu import CpuPlatform, LogicalCPUInfo
|
from vllm.platforms.cpu import CpuPlatform, LogicalCPUInfo
|
||||||
|
from vllm.profiler.wrapper import TorchProfilerWrapper
|
||||||
from vllm.v1.worker.cpu_model_runner import CPUModelRunner
|
from vllm.v1.worker.cpu_model_runner import CPUModelRunner
|
||||||
from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
|
from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
|
||||||
|
|
||||||
@ -38,30 +39,17 @@ class CPUWorker(Worker):
|
|||||||
|
|
||||||
self.parallel_config.disable_custom_all_reduce = True
|
self.parallel_config.disable_custom_all_reduce = True
|
||||||
|
|
||||||
# Torch profiler. Enabled and configured through env vars:
|
# Torch profiler. Enabled and configured through profiler_config.
|
||||||
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
|
|
||||||
self.profiler: Any | None = None
|
self.profiler: Any | None = None
|
||||||
if envs.VLLM_TORCH_PROFILER_DIR:
|
profiler_config = vllm_config.profiler_config
|
||||||
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
|
if profiler_config.profiler == "torch":
|
||||||
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
|
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
|
||||||
logger.info(
|
self.profiler = TorchProfilerWrapper(
|
||||||
"Profiling enabled. Traces will be saved to: %s",
|
profiler_config,
|
||||||
torch_profiler_trace_dir,
|
worker_name=worker_name,
|
||||||
|
local_rank=self.local_rank,
|
||||||
|
activities=["CPU"],
|
||||||
)
|
)
|
||||||
self.profiler = torch.profiler.profile(
|
|
||||||
activities=[
|
|
||||||
torch.profiler.ProfilerActivity.CPU,
|
|
||||||
],
|
|
||||||
record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
|
|
||||||
profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
|
|
||||||
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
|
|
||||||
with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
|
|
||||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
|
||||||
torch_profiler_trace_dir, worker_name=worker_name, use_gzip=False
|
|
||||||
),
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
self.profiler = None
|
|
||||||
|
|
||||||
def init_device(self):
|
def init_device(self):
|
||||||
# Setup OpenMP threads affinity.
|
# Setup OpenMP threads affinity.
|
||||||
@ -202,9 +190,3 @@ class CPUWorker(Worker):
|
|||||||
self.profiler.start()
|
self.profiler.start()
|
||||||
else:
|
else:
|
||||||
self.profiler.stop()
|
self.profiler.stop()
|
||||||
if self.local_rank == 0:
|
|
||||||
logger.info(
|
|
||||||
self.profiler.key_averages().table(
|
|
||||||
sort_by="self_cpu_time_total", row_limit=50
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|||||||
@ -40,16 +40,18 @@ def _run_ar(
|
|||||||
should_dp_pad: bool,
|
should_dp_pad: bool,
|
||||||
orig_num_tokens_per_ubatch: int,
|
orig_num_tokens_per_ubatch: int,
|
||||||
padded_num_tokens_per_ubatch: int,
|
padded_num_tokens_per_ubatch: int,
|
||||||
|
cudagraph_mode: int,
|
||||||
parallel_config: ParallelConfig,
|
parallel_config: ParallelConfig,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
dp_size = parallel_config.data_parallel_size
|
dp_size = parallel_config.data_parallel_size
|
||||||
dp_rank = parallel_config.data_parallel_rank
|
dp_rank = parallel_config.data_parallel_rank
|
||||||
device, group = _get_device_and_group(parallel_config)
|
device, group = _get_device_and_group(parallel_config)
|
||||||
tensor = torch.zeros(4, dp_size, device=device, dtype=torch.int32)
|
tensor = torch.zeros(5, dp_size, device=device, dtype=torch.int32)
|
||||||
tensor[0][dp_rank] = orig_num_tokens_per_ubatch
|
tensor[0][dp_rank] = orig_num_tokens_per_ubatch
|
||||||
tensor[1][dp_rank] = padded_num_tokens_per_ubatch
|
tensor[1][dp_rank] = padded_num_tokens_per_ubatch
|
||||||
tensor[2][dp_rank] = 1 if should_ubatch else 0
|
tensor[2][dp_rank] = 1 if should_ubatch else 0
|
||||||
tensor[3][dp_rank] = 1 if should_dp_pad else 0
|
tensor[3][dp_rank] = 1 if should_dp_pad else 0
|
||||||
|
tensor[4][dp_rank] = cudagraph_mode
|
||||||
dist.all_reduce(tensor, group=group)
|
dist.all_reduce(tensor, group=group)
|
||||||
return tensor
|
return tensor
|
||||||
|
|
||||||
@ -89,13 +91,23 @@ def _post_process_dp_padding(tensor: torch.Tensor, should_dp_pad: bool) -> torch
|
|||||||
return num_tokens_across_dp.cpu()
|
return num_tokens_across_dp.cpu()
|
||||||
|
|
||||||
|
|
||||||
|
def _post_process_cudagraph_mode(tensor: torch.Tensor) -> int:
|
||||||
|
"""
|
||||||
|
Synchronize cudagraph_mode across DP ranks by taking the minimum.
|
||||||
|
If any rank has NONE (0), all ranks use NONE.
|
||||||
|
This ensures all ranks send consistent values (all padded or all unpadded).
|
||||||
|
"""
|
||||||
|
return int(tensor[4, :].min().item())
|
||||||
|
|
||||||
|
|
||||||
def _synchronize_dp_ranks(
|
def _synchronize_dp_ranks(
|
||||||
num_tokens_unpadded: int,
|
num_tokens_unpadded: int,
|
||||||
num_tokens_padded: int,
|
num_tokens_padded: int,
|
||||||
should_attempt_ubatching: bool,
|
should_attempt_ubatching: bool,
|
||||||
should_attempt_dp_padding: bool,
|
should_attempt_dp_padding: bool,
|
||||||
|
cudagraph_mode: int,
|
||||||
parallel_config: ParallelConfig,
|
parallel_config: ParallelConfig,
|
||||||
) -> tuple[bool, torch.Tensor | None]:
|
) -> tuple[bool, torch.Tensor | None, int]:
|
||||||
"""
|
"""
|
||||||
1. Decides if each DP rank is going to microbatch. Either all ranks
|
1. Decides if each DP rank is going to microbatch. Either all ranks
|
||||||
run with microbatching or none of them do.
|
run with microbatching or none of them do.
|
||||||
@ -104,10 +116,13 @@ def _synchronize_dp_ranks(
|
|||||||
When running microbatched or if should_attempt_dp_padding is True, all
|
When running microbatched or if should_attempt_dp_padding is True, all
|
||||||
ranks will be padded out so that the run with the same number of tokens
|
ranks will be padded out so that the run with the same number of tokens
|
||||||
|
|
||||||
|
3. Synchronizes cudagraph_mode across ranks by taking the minimum.
|
||||||
|
|
||||||
Returns: tuple[
|
Returns: tuple[
|
||||||
should_ubatch: Are all DP ranks going to microbatch
|
should_ubatch: Are all DP ranks going to microbatch
|
||||||
num_tokens_after_padding: A tensor containing the total number of
|
num_tokens_after_padding: A tensor containing the total number of
|
||||||
tokens per-microbatch for each DP rank including any DP padding.
|
tokens per-microbatch for each DP rank including any DP padding.
|
||||||
|
synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks)
|
||||||
]
|
]
|
||||||
|
|
||||||
"""
|
"""
|
||||||
@ -121,6 +136,7 @@ def _synchronize_dp_ranks(
|
|||||||
should_dp_pad=should_attempt_dp_padding,
|
should_dp_pad=should_attempt_dp_padding,
|
||||||
orig_num_tokens_per_ubatch=num_tokens_unpadded,
|
orig_num_tokens_per_ubatch=num_tokens_unpadded,
|
||||||
padded_num_tokens_per_ubatch=num_tokens_padded,
|
padded_num_tokens_per_ubatch=num_tokens_padded,
|
||||||
|
cudagraph_mode=cudagraph_mode,
|
||||||
parallel_config=parallel_config,
|
parallel_config=parallel_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -148,7 +164,10 @@ def _synchronize_dp_ranks(
|
|||||||
should_dp_pad,
|
should_dp_pad,
|
||||||
)
|
)
|
||||||
|
|
||||||
return should_ubatch, num_tokens_after_padding
|
# Synchronize cudagraph_mode across ranks (take min)
|
||||||
|
synced_cudagraph_mode = _post_process_cudagraph_mode(tensor)
|
||||||
|
|
||||||
|
return should_ubatch, num_tokens_after_padding, synced_cudagraph_mode
|
||||||
|
|
||||||
|
|
||||||
def coordinate_batch_across_dp(
|
def coordinate_batch_across_dp(
|
||||||
@ -159,7 +178,8 @@ def coordinate_batch_across_dp(
|
|||||||
num_tokens_padded: int | None = None,
|
num_tokens_padded: int | None = None,
|
||||||
uniform_decode: bool | None = None,
|
uniform_decode: bool | None = None,
|
||||||
num_scheduled_tokens_per_request: np.ndarray | None = None,
|
num_scheduled_tokens_per_request: np.ndarray | None = None,
|
||||||
) -> tuple[bool, torch.Tensor | None]:
|
cudagraph_mode: int = 0,
|
||||||
|
) -> tuple[bool, torch.Tensor | None, int]:
|
||||||
"""
|
"""
|
||||||
Coordinates amongst all DP ranks to determine if and how the full batch
|
Coordinates amongst all DP ranks to determine if and how the full batch
|
||||||
should be split into microbatches.
|
should be split into microbatches.
|
||||||
@ -175,6 +195,7 @@ def coordinate_batch_across_dp(
|
|||||||
only contains single token decodes
|
only contains single token decodes
|
||||||
num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The
|
num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The
|
||||||
number of tokens per request.
|
number of tokens per request.
|
||||||
|
cudagraph_mode: The cudagraph mode for this rank (0=NONE, 1=PIECEWISE, 2=FULL)
|
||||||
|
|
||||||
Returns: tuple[
|
Returns: tuple[
|
||||||
ubatch_slices: if this is set then all DP ranks have agreed to
|
ubatch_slices: if this is set then all DP ranks have agreed to
|
||||||
@ -183,12 +204,13 @@ def coordinate_batch_across_dp(
|
|||||||
tokens per-microbatch for each DP rank including padding. Will be
|
tokens per-microbatch for each DP rank including padding. Will be
|
||||||
padded up to the max value across all DP ranks when allow_dp_padding
|
padded up to the max value across all DP ranks when allow_dp_padding
|
||||||
is True.
|
is True.
|
||||||
|
synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks)
|
||||||
]
|
]
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if parallel_config.data_parallel_size == 1:
|
if parallel_config.data_parallel_size == 1:
|
||||||
# Early exit.
|
# Early exit.
|
||||||
return False, None
|
return False, None, cudagraph_mode
|
||||||
|
|
||||||
# If the caller has explicitly enabled microbatching.
|
# If the caller has explicitly enabled microbatching.
|
||||||
should_attempt_ubatching = False
|
should_attempt_ubatching = False
|
||||||
@ -204,12 +226,15 @@ def coordinate_batch_across_dp(
|
|||||||
if num_tokens_padded is None:
|
if num_tokens_padded is None:
|
||||||
num_tokens_padded = num_tokens_unpadded
|
num_tokens_padded = num_tokens_unpadded
|
||||||
|
|
||||||
(should_ubatch, num_tokens_after_padding) = _synchronize_dp_ranks(
|
(should_ubatch, num_tokens_after_padding, synced_cudagraph_mode) = (
|
||||||
num_tokens_unpadded,
|
_synchronize_dp_ranks(
|
||||||
num_tokens_padded,
|
num_tokens_unpadded,
|
||||||
should_attempt_ubatching,
|
num_tokens_padded,
|
||||||
allow_dp_padding,
|
should_attempt_ubatching,
|
||||||
parallel_config,
|
allow_dp_padding,
|
||||||
|
cudagraph_mode,
|
||||||
|
parallel_config,
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
return (should_ubatch, num_tokens_after_padding)
|
return (should_ubatch, num_tokens_after_padding, synced_cudagraph_mode)
|
||||||
|
|||||||
@ -2,14 +2,15 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.v1.outputs import (
|
from vllm.v1.outputs import (
|
||||||
AsyncModelRunnerOutput,
|
AsyncModelRunnerOutput,
|
||||||
LogprobsTensors,
|
LogprobsTensors,
|
||||||
ModelRunnerOutput,
|
ModelRunnerOutput,
|
||||||
SamplerOutput,
|
|
||||||
)
|
)
|
||||||
|
from vllm.v1.worker.gpu.sample.output import SamplerOutput
|
||||||
|
|
||||||
|
|
||||||
class AsyncOutput(AsyncModelRunnerOutput):
|
class AsyncOutput(AsyncModelRunnerOutput):
|
||||||
@ -34,29 +35,18 @@ class AsyncOutput(AsyncModelRunnerOutput):
|
|||||||
with torch.cuda.stream(self.copy_stream):
|
with torch.cuda.stream(self.copy_stream):
|
||||||
self.copy_stream.wait_stream(default_stream)
|
self.copy_stream.wait_stream(default_stream)
|
||||||
|
|
||||||
# NOTE(woosuk): We must ensure that CPU tensors are not freed
|
self.sampled_token_ids = async_copy_to_np(sampler_output.sampled_token_ids)
|
||||||
# before the device-to-host copy is fully completed. For instance,
|
|
||||||
# operations like
|
|
||||||
# self.sampled_token_np = ...to("cpu", non_blocking=True).numpy()
|
|
||||||
# are unsafe because the underlying CPU tensor can be prematurely freed and
|
|
||||||
# reused by other tensors before the asynchronous copy finishes, potentially
|
|
||||||
# causing race conditions. To prevent this, we delay freeing by holding
|
|
||||||
# references until the copy event signals completion.
|
|
||||||
# Likewise, we also need to keep the reference to the GPU tensors.
|
|
||||||
# This is done by keeping the reference to sampler_output and
|
|
||||||
# model_runner_output.
|
|
||||||
self.sampled_token_ids = sampler_output.sampled_token_ids.to(
|
|
||||||
"cpu", non_blocking=True
|
|
||||||
)
|
|
||||||
if sampler_output.logprobs_tensors is not None:
|
if sampler_output.logprobs_tensors is not None:
|
||||||
self.logprobs_tensors: LogprobsTensors | None = (
|
self.logprobs_tensors: LogprobsTensors | None = (
|
||||||
sampler_output.logprobs_tensors.to_cpu_nonblocking()
|
sampler_output.logprobs_tensors.to_cpu_nonblocking()
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
self.logprobs_tensors = None
|
self.logprobs_tensors = None
|
||||||
self.num_sampled_tokens_cpu = num_sampled_tokens.to(
|
if sampler_output.num_nans is not None:
|
||||||
"cpu", non_blocking=True
|
self.num_nans = async_copy_to_np(sampler_output.num_nans)
|
||||||
)
|
else:
|
||||||
|
self.num_nans = None
|
||||||
|
self.num_sampled_tokens_np = async_copy_to_np(num_sampled_tokens)
|
||||||
self.prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {}
|
self.prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {}
|
||||||
if self.model_runner_output.prompt_logprobs_dict:
|
if self.model_runner_output.prompt_logprobs_dict:
|
||||||
for k, v in self.model_runner_output.prompt_logprobs_dict.items():
|
for k, v in self.model_runner_output.prompt_logprobs_dict.items():
|
||||||
@ -68,7 +58,6 @@ class AsyncOutput(AsyncModelRunnerOutput):
|
|||||||
|
|
||||||
def get_output(self) -> ModelRunnerOutput:
|
def get_output(self) -> ModelRunnerOutput:
|
||||||
self.copy_event.synchronize()
|
self.copy_event.synchronize()
|
||||||
num_sampled_tokens_np = self.num_sampled_tokens_cpu.numpy()
|
|
||||||
|
|
||||||
# NOTE(woosuk): The following code is to ensure compatibility with
|
# NOTE(woosuk): The following code is to ensure compatibility with
|
||||||
# the existing model runner.
|
# the existing model runner.
|
||||||
@ -76,10 +65,18 @@ class AsyncOutput(AsyncModelRunnerOutput):
|
|||||||
# rather than Python lists.
|
# rather than Python lists.
|
||||||
sampled_token_ids: list[list[int]] = self.sampled_token_ids.tolist()
|
sampled_token_ids: list[list[int]] = self.sampled_token_ids.tolist()
|
||||||
num_reqs = len(sampled_token_ids)
|
num_reqs = len(sampled_token_ids)
|
||||||
|
num_sampled_tokens = self.num_sampled_tokens_np.tolist()
|
||||||
for i in range(num_reqs):
|
for i in range(num_reqs):
|
||||||
del sampled_token_ids[i][num_sampled_tokens_np[i] :]
|
del sampled_token_ids[i][num_sampled_tokens[i] :]
|
||||||
self.model_runner_output.sampled_token_ids = sampled_token_ids
|
self.model_runner_output.sampled_token_ids = sampled_token_ids
|
||||||
|
|
||||||
|
if self.num_nans is not None:
|
||||||
|
num_nans = self.num_nans.tolist()
|
||||||
|
self.model_runner_output.num_nans_in_logits = {
|
||||||
|
req_id: num_nans[i]
|
||||||
|
for i, req_id in enumerate(self.model_runner_output.req_ids)
|
||||||
|
}
|
||||||
|
|
||||||
if self.logprobs_tensors is not None:
|
if self.logprobs_tensors is not None:
|
||||||
self.model_runner_output.logprobs = self.logprobs_tensors.tolists()
|
self.model_runner_output.logprobs = self.logprobs_tensors.tolists()
|
||||||
self.model_runner_output.prompt_logprobs_dict = self.prompt_logprobs_dict
|
self.model_runner_output.prompt_logprobs_dict = self.prompt_logprobs_dict
|
||||||
@ -95,3 +92,7 @@ def async_barrier(event: torch.cuda.Event | None):
|
|||||||
finally:
|
finally:
|
||||||
if event is not None:
|
if event is not None:
|
||||||
event.record()
|
event.record()
|
||||||
|
|
||||||
|
|
||||||
|
def async_copy_to_np(x: torch.Tensor) -> np.ndarray:
|
||||||
|
return x.to("cpu", non_blocking=True).numpy()
|
||||||
|
|||||||
0
vllm/v1/worker/gpu/metrics/__init__.py
Normal file
0
vllm/v1/worker/gpu/metrics/__init__.py
Normal file
42
vllm/v1/worker/gpu/metrics/logits.py
Normal file
42
vllm/v1/worker/gpu/metrics/logits.py
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import torch
|
||||||
|
from torch._inductor.runtime.triton_helpers import libdevice
|
||||||
|
|
||||||
|
from vllm.triton_utils import tl, triton
|
||||||
|
|
||||||
|
|
||||||
|
@triton.jit
|
||||||
|
def _num_nans_kernel(
|
||||||
|
logits_ptr,
|
||||||
|
logits_stride,
|
||||||
|
num_nans_ptr,
|
||||||
|
vocab_size,
|
||||||
|
BLOCK_SIZE: tl.constexpr,
|
||||||
|
):
|
||||||
|
req_idx = tl.program_id(0)
|
||||||
|
num_nans = 0
|
||||||
|
for i in range(0, vocab_size, BLOCK_SIZE):
|
||||||
|
block = i + tl.arange(0, BLOCK_SIZE)
|
||||||
|
mask = block < vocab_size
|
||||||
|
logits = tl.load(
|
||||||
|
logits_ptr + req_idx * logits_stride + block, mask=mask, other=0
|
||||||
|
)
|
||||||
|
logits = logits.to(tl.float32)
|
||||||
|
is_nan = libdevice.isnan(logits).to(tl.int1)
|
||||||
|
num_nans += tl.sum(is_nan).to(tl.int32)
|
||||||
|
tl.store(num_nans_ptr + req_idx, num_nans)
|
||||||
|
|
||||||
|
|
||||||
|
def get_num_nans(logits: torch.Tensor) -> torch.Tensor:
|
||||||
|
num_reqs, vocab_size = logits.shape
|
||||||
|
BLOCK_SIZE = 8192
|
||||||
|
num_nans = torch.empty(num_reqs, dtype=torch.int32, device=logits.device)
|
||||||
|
_num_nans_kernel[(num_reqs,)](
|
||||||
|
logits,
|
||||||
|
logits.stride(0),
|
||||||
|
num_nans,
|
||||||
|
vocab_size,
|
||||||
|
BLOCK_SIZE=BLOCK_SIZE,
|
||||||
|
)
|
||||||
|
return num_nans
|
||||||
@ -25,7 +25,6 @@ from vllm.v1.outputs import (
|
|||||||
LogprobsTensors,
|
LogprobsTensors,
|
||||||
ModelRunnerOutput,
|
ModelRunnerOutput,
|
||||||
)
|
)
|
||||||
from vllm.v1.sample.sampler import SamplerOutput
|
|
||||||
from vllm.v1.worker.gpu.async_utils import AsyncOutput, async_barrier
|
from vllm.v1.worker.gpu.async_utils import AsyncOutput, async_barrier
|
||||||
from vllm.v1.worker.gpu.attn_utils import (
|
from vllm.v1.worker.gpu.attn_utils import (
|
||||||
build_attn_metadata,
|
build_attn_metadata,
|
||||||
@ -53,6 +52,7 @@ from vllm.v1.worker.gpu.sample.metadata import (
|
|||||||
SamplingMetadata,
|
SamplingMetadata,
|
||||||
expand_sampling_metadata,
|
expand_sampling_metadata,
|
||||||
)
|
)
|
||||||
|
from vllm.v1.worker.gpu.sample.output import SamplerOutput
|
||||||
from vllm.v1.worker.gpu.sample.sampler import Sampler
|
from vllm.v1.worker.gpu.sample.sampler import Sampler
|
||||||
from vllm.v1.worker.gpu.spec_decode import init_speculator
|
from vllm.v1.worker.gpu.spec_decode import init_speculator
|
||||||
from vllm.v1.worker.gpu.spec_decode.rejection_sample import rejection_sample
|
from vllm.v1.worker.gpu.spec_decode.rejection_sample import rejection_sample
|
||||||
|
|||||||
@ -39,9 +39,7 @@ def _min_p_kernel(
|
|||||||
tl.store(logits_ptr + req_idx * logits_stride + block, logits, mask=mask)
|
tl.store(logits_ptr + req_idx * logits_stride + block, logits, mask=mask)
|
||||||
|
|
||||||
|
|
||||||
def apply_min_p(logits: torch.Tensor, min_p: torch.Tensor | None) -> None:
|
def apply_min_p(logits: torch.Tensor, min_p: torch.Tensor) -> None:
|
||||||
if min_p is None:
|
|
||||||
return
|
|
||||||
num_reqs, vocab_size = logits.shape
|
num_reqs, vocab_size = logits.shape
|
||||||
BLOCK_SIZE = 1024
|
BLOCK_SIZE = 1024
|
||||||
_min_p_kernel[(num_reqs,)](
|
_min_p_kernel[(num_reqs,)](
|
||||||
|
|||||||
14
vllm/v1/worker/gpu/sample/output.py
Normal file
14
vllm/v1/worker/gpu/sample/output.py
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm.v1.outputs import LogprobsTensors
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class SamplerOutput:
|
||||||
|
sampled_token_ids: torch.Tensor
|
||||||
|
logprobs_tensors: LogprobsTensors | None
|
||||||
|
num_nans: torch.Tensor | None
|
||||||
@ -62,6 +62,7 @@ def _penalties_and_temperature_kernel(
|
|||||||
mask=packed_block < tl.cdiv(vocab_size, 32),
|
mask=packed_block < tl.cdiv(vocab_size, 32),
|
||||||
)
|
)
|
||||||
prompt_bin_mask = (packed_mask[:, None] >> (tl.arange(0, 32)[None, :])) & 1
|
prompt_bin_mask = (packed_mask[:, None] >> (tl.arange(0, 32)[None, :])) & 1
|
||||||
|
prompt_bin_mask = prompt_bin_mask.to(tl.int1)
|
||||||
prompt_bin_mask = prompt_bin_mask.reshape(BLOCK_SIZE)
|
prompt_bin_mask = prompt_bin_mask.reshape(BLOCK_SIZE)
|
||||||
|
|
||||||
# If token appears in prompt or output, apply, otherwise use 1.0 for no-op.
|
# If token appears in prompt or output, apply, otherwise use 1.0 for no-op.
|
||||||
|
|||||||
@ -3,13 +3,15 @@
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
import vllm.envs as envs
|
||||||
from vllm.config.model import LogprobsMode
|
from vllm.config.model import LogprobsMode
|
||||||
from vllm.v1.outputs import SamplerOutput
|
|
||||||
from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
|
from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
|
||||||
|
from vllm.v1.worker.gpu.metrics.logits import get_num_nans
|
||||||
from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
|
from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
|
||||||
from vllm.v1.worker.gpu.sample.logprob import compute_topk_logprobs
|
from vllm.v1.worker.gpu.sample.logprob import compute_topk_logprobs
|
||||||
from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
|
from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
|
||||||
from vllm.v1.worker.gpu.sample.min_p import apply_min_p
|
from vllm.v1.worker.gpu.sample.min_p import apply_min_p
|
||||||
|
from vllm.v1.worker.gpu.sample.output import SamplerOutput
|
||||||
from vllm.v1.worker.gpu.sample.penalties import apply_penalties_and_temperature
|
from vllm.v1.worker.gpu.sample.penalties import apply_penalties_and_temperature
|
||||||
|
|
||||||
|
|
||||||
@ -21,12 +23,16 @@ class Sampler:
|
|||||||
if logprobs_mode not in ["processed_logprobs", "raw_logprobs"]:
|
if logprobs_mode not in ["processed_logprobs", "raw_logprobs"]:
|
||||||
raise NotImplementedError(f"Unsupported logprobs_mode: {logprobs_mode}")
|
raise NotImplementedError(f"Unsupported logprobs_mode: {logprobs_mode}")
|
||||||
self.logprobs_mode = logprobs_mode
|
self.logprobs_mode = logprobs_mode
|
||||||
|
self.compute_nans = envs.VLLM_COMPUTE_NANS_IN_LOGITS # False by default.
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
logits: torch.Tensor,
|
logits: torch.Tensor,
|
||||||
sampling_metadata: SamplingMetadata,
|
sampling_metadata: SamplingMetadata,
|
||||||
) -> SamplerOutput:
|
) -> SamplerOutput:
|
||||||
|
# NOTE(woosuk): We intentionally compute num_nans before sampling to make clear
|
||||||
|
# that num_nans is computed before applying penalties and temperature.
|
||||||
|
num_nans = get_num_nans(logits) if self.compute_nans else None
|
||||||
sampled, processed_logits = self.sample(logits, sampling_metadata)
|
sampled, processed_logits = self.sample(logits, sampling_metadata)
|
||||||
if sampling_metadata.max_num_logprobs is not None:
|
if sampling_metadata.max_num_logprobs is not None:
|
||||||
logits = (
|
logits = (
|
||||||
@ -49,6 +55,7 @@ class Sampler:
|
|||||||
# token per request.
|
# token per request.
|
||||||
sampled_token_ids=sampled.view(-1, 1),
|
sampled_token_ids=sampled.view(-1, 1),
|
||||||
logprobs_tensors=logprobs_tensors,
|
logprobs_tensors=logprobs_tensors,
|
||||||
|
num_nans=num_nans,
|
||||||
)
|
)
|
||||||
return sampler_output
|
return sampler_output
|
||||||
|
|
||||||
@ -63,7 +70,8 @@ class Sampler:
|
|||||||
# Apply penalties and temperature in place.
|
# Apply penalties and temperature in place.
|
||||||
apply_penalties_and_temperature(logits, sampling_metadata)
|
apply_penalties_and_temperature(logits, sampling_metadata)
|
||||||
# Apply min_p in place.
|
# Apply min_p in place.
|
||||||
apply_min_p(logits, sampling_metadata.min_p)
|
if sampling_metadata.min_p is not None:
|
||||||
|
apply_min_p(logits, sampling_metadata.min_p)
|
||||||
# Apply top_k and/or top_p. This might return a new tensor.
|
# Apply top_k and/or top_p. This might return a new tensor.
|
||||||
logits = apply_top_k_top_p(
|
logits = apply_top_k_top_p(
|
||||||
logits, sampling_metadata.top_k, sampling_metadata.top_p
|
logits, sampling_metadata.top_k, sampling_metadata.top_p
|
||||||
|
|||||||
@ -2788,17 +2788,19 @@ class GPUModelRunner(
|
|||||||
)
|
)
|
||||||
|
|
||||||
dispatch_cudagraph = (
|
dispatch_cudagraph = (
|
||||||
lambda num_tokens: self.cudagraph_dispatcher.dispatch(
|
lambda num_tokens, disable_full: self.cudagraph_dispatcher.dispatch(
|
||||||
num_tokens=num_tokens,
|
num_tokens=num_tokens,
|
||||||
has_lora=has_lora,
|
has_lora=has_lora,
|
||||||
use_cascade_attn=use_cascade_attn,
|
|
||||||
uniform_decode=uniform_decode,
|
uniform_decode=uniform_decode,
|
||||||
|
disable_full=disable_full,
|
||||||
)
|
)
|
||||||
if not force_eager
|
if not force_eager
|
||||||
else (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded))
|
else (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded))
|
||||||
)
|
)
|
||||||
|
|
||||||
cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded)
|
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
|
||||||
|
num_tokens_padded, use_cascade_attn
|
||||||
|
)
|
||||||
num_tokens_padded = batch_descriptor.num_tokens
|
num_tokens_padded = batch_descriptor.num_tokens
|
||||||
|
|
||||||
# Extra coordination when running data-parallel since we need to coordinate
|
# Extra coordination when running data-parallel since we need to coordinate
|
||||||
@ -2813,23 +2815,28 @@ class GPUModelRunner(
|
|||||||
self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
|
self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
|
||||||
)
|
)
|
||||||
|
|
||||||
should_ubatch, num_tokens_across_dp = coordinate_batch_across_dp(
|
should_ubatch, num_tokens_across_dp, synced_cudagraph_mode = (
|
||||||
num_tokens_unpadded=num_tokens,
|
coordinate_batch_across_dp(
|
||||||
parallel_config=self.parallel_config,
|
num_tokens_unpadded=num_tokens,
|
||||||
allow_microbatching=allow_microbatching,
|
parallel_config=self.parallel_config,
|
||||||
allow_dp_padding=allow_dp_padding,
|
allow_microbatching=allow_microbatching,
|
||||||
num_tokens_padded=num_tokens_padded,
|
allow_dp_padding=allow_dp_padding,
|
||||||
uniform_decode=uniform_decode,
|
num_tokens_padded=num_tokens_padded,
|
||||||
num_scheduled_tokens_per_request=num_scheduled_tokens_np,
|
uniform_decode=uniform_decode,
|
||||||
|
num_scheduled_tokens_per_request=num_scheduled_tokens_np,
|
||||||
|
cudagraph_mode=cudagraph_mode.value,
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract DP padding if there is any
|
# Extract DP-synced values
|
||||||
if num_tokens_across_dp is not None:
|
if num_tokens_across_dp is not None:
|
||||||
dp_rank = self.parallel_config.data_parallel_rank
|
dp_rank = self.parallel_config.data_parallel_rank
|
||||||
num_tokens_padded = int(num_tokens_across_dp[dp_rank].item())
|
num_tokens_padded = int(num_tokens_across_dp[dp_rank].item())
|
||||||
|
# Re-dispatch with DP padding so we have the correct batch_descriptor
|
||||||
# Re-dispatch with DP padding
|
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
|
||||||
cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded)
|
num_tokens_padded,
|
||||||
|
disable_full=synced_cudagraph_mode <= CUDAGraphMode.PIECEWISE.value,
|
||||||
|
)
|
||||||
# Assert to make sure the agreed upon token count is correct otherwise
|
# Assert to make sure the agreed upon token count is correct otherwise
|
||||||
# num_tokens_across_dp will no-longer be valid
|
# num_tokens_across_dp will no-longer be valid
|
||||||
assert batch_descriptor.num_tokens == num_tokens_padded
|
assert batch_descriptor.num_tokens == num_tokens_padded
|
||||||
@ -4161,10 +4168,19 @@ class GPUModelRunner(
|
|||||||
|
|
||||||
if self.speculative_config and self.speculative_config.use_eagle():
|
if self.speculative_config and self.speculative_config.use_eagle():
|
||||||
assert isinstance(self.drafter, EagleProposer)
|
assert isinstance(self.drafter, EagleProposer)
|
||||||
|
# Eagle currently only supports PIECEWISE cudagraphs.
|
||||||
|
# Therefore only use cudagraphs if the main model uses PIECEWISE
|
||||||
|
# NOTE(lucas): this is a hack, need to clean up.
|
||||||
use_cudagraphs = (
|
use_cudagraphs = (
|
||||||
cudagraph_runtime_mode.has_mode(CUDAGraphMode.PIECEWISE)
|
(
|
||||||
and not self.speculative_config.enforce_eager
|
is_graph_capturing
|
||||||
)
|
and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
|
||||||
|
)
|
||||||
|
or (
|
||||||
|
not is_graph_capturing
|
||||||
|
and cudagraph_runtime_mode != CUDAGraphMode.NONE
|
||||||
|
)
|
||||||
|
) and not self.speculative_config.enforce_eager
|
||||||
|
|
||||||
# Note(gnovack) - We need to disable cudagraphs for one of the two
|
# Note(gnovack) - We need to disable cudagraphs for one of the two
|
||||||
# lora cases when cudagraph_specialize_lora is enabled. This is a
|
# lora cases when cudagraph_specialize_lora is enabled. This is a
|
||||||
|
|||||||
@ -38,7 +38,7 @@ from vllm.model_executor import set_random_seed
|
|||||||
from vllm.model_executor.models.interfaces import is_mixture_of_experts
|
from vllm.model_executor.models.interfaces import is_mixture_of_experts
|
||||||
from vllm.model_executor.warmup.kernel_warmup import kernel_warmup
|
from vllm.model_executor.warmup.kernel_warmup import kernel_warmup
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.profiler.gpu_profiler import CudaProfilerWrapper, TorchProfilerWrapper
|
from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.tasks import SupportedTask
|
from vllm.tasks import SupportedTask
|
||||||
from vllm.utils.mem_constants import GiB_bytes
|
from vllm.utils.mem_constants import GiB_bytes
|
||||||
@ -79,6 +79,10 @@ class Worker(WorkerBase):
|
|||||||
is_driver_worker=is_driver_worker,
|
is_driver_worker=is_driver_worker,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# configure float32 matmul precision according to vLLM env.
|
||||||
|
precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
|
||||||
|
torch.set_float32_matmul_precision(precision)
|
||||||
|
|
||||||
if self.model_config.trust_remote_code:
|
if self.model_config.trust_remote_code:
|
||||||
# note: lazy import to avoid importing torch before initializing
|
# note: lazy import to avoid importing torch before initializing
|
||||||
from vllm.utils.import_utils import init_cached_hf_modules
|
from vllm.utils.import_utils import init_cached_hf_modules
|
||||||
@ -88,17 +92,19 @@ class Worker(WorkerBase):
|
|||||||
# Buffers saved before sleep
|
# Buffers saved before sleep
|
||||||
self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
|
self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
|
||||||
|
|
||||||
# Torch/CUDA profiler. Enabled and configured through env vars:
|
# Torch/CUDA profiler. Enabled and configured through profiler_config.
|
||||||
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
|
|
||||||
# VLLM_TORCH_CUDA_PROFILE=1
|
|
||||||
self.profiler: Any | None = None
|
self.profiler: Any | None = None
|
||||||
if envs.VLLM_TORCH_PROFILER_DIR:
|
profiler_config = vllm_config.profiler_config
|
||||||
|
if profiler_config.profiler == "torch":
|
||||||
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
|
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
|
||||||
self.profiler = TorchProfilerWrapper(
|
self.profiler = TorchProfilerWrapper(
|
||||||
worker_name=worker_name, local_rank=self.local_rank
|
profiler_config,
|
||||||
|
worker_name=worker_name,
|
||||||
|
local_rank=self.local_rank,
|
||||||
|
activities=["CPU", "CUDA"],
|
||||||
)
|
)
|
||||||
elif envs.VLLM_TORCH_CUDA_PROFILE:
|
elif profiler_config.profiler == "cuda":
|
||||||
self.profiler = CudaProfilerWrapper()
|
self.profiler = CudaProfilerWrapper(profiler_config)
|
||||||
else:
|
else:
|
||||||
self.profiler = None
|
self.profiler = None
|
||||||
|
|
||||||
|
|||||||
@ -98,10 +98,10 @@ class TPUWorker:
|
|||||||
# MP runtime is initialized.
|
# MP runtime is initialized.
|
||||||
self.profiler = None
|
self.profiler = None
|
||||||
self.profile_dir = None
|
self.profile_dir = None
|
||||||
if envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1:
|
if vllm_config.profiler_config.profiler == "torch" and self.rank < 1:
|
||||||
# For TPU, we can only have 1 active profiler session for 1 profiler
|
# For TPU, we can only have 1 active profiler session for 1 profiler
|
||||||
# server. So we only profile on rank0.
|
# server. So we only profile on rank0.
|
||||||
self.profile_dir = envs.VLLM_TORCH_PROFILER_DIR
|
self.profile_dir = vllm_config.profiler_config.torch_profiler_dir
|
||||||
logger.info(
|
logger.info(
|
||||||
"Profiling enabled. Traces will be saved to: %s", self.profile_dir
|
"Profiling enabled. Traces will be saved to: %s", self.profile_dir
|
||||||
)
|
)
|
||||||
|
|||||||
@ -6,12 +6,12 @@ from typing import Any
|
|||||||
import torch
|
import torch
|
||||||
import torch.distributed
|
import torch.distributed
|
||||||
|
|
||||||
import vllm.envs as envs
|
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.distributed import get_world_group
|
from vllm.distributed import get_world_group
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor import set_random_seed
|
from vllm.model_executor import set_random_seed
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
from vllm.profiler.wrapper import TorchProfilerWrapper
|
||||||
from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
|
from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
|
||||||
from vllm.v1.worker.xpu_model_runner import XPUModelRunner
|
from vllm.v1.worker.xpu_model_runner import XPUModelRunner
|
||||||
|
|
||||||
@ -36,41 +36,17 @@ class XPUWorker(Worker):
|
|||||||
assert device_config.device_type == "xpu"
|
assert device_config.device_type == "xpu"
|
||||||
assert current_platform.is_xpu()
|
assert current_platform.is_xpu()
|
||||||
|
|
||||||
# Torch profiler. Enabled and configured through env vars:
|
# Torch profiler. Enabled and configured through profiler_config.
|
||||||
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
|
|
||||||
self.profiler: Any | None = None
|
self.profiler: Any | None = None
|
||||||
if envs.VLLM_TORCH_PROFILER_DIR:
|
profiler_config = vllm_config.profiler_config
|
||||||
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
|
if profiler_config.profiler == "torch":
|
||||||
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
|
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
|
||||||
logger.info(
|
self.profiler = TorchProfilerWrapper(
|
||||||
"Profiling enabled. Traces will be saved to: %s",
|
profiler_config,
|
||||||
torch_profiler_trace_dir,
|
worker_name=worker_name,
|
||||||
|
local_rank=self.local_rank,
|
||||||
|
activities=["CPU", "XPU"],
|
||||||
)
|
)
|
||||||
logger.debug(
|
|
||||||
"Profiler config: record_shapes=%s,"
|
|
||||||
"profile_memory=%s,with_stack=%s,with_flops=%s",
|
|
||||||
envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
|
|
||||||
envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
|
|
||||||
envs.VLLM_TORCH_PROFILER_WITH_STACK,
|
|
||||||
envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
|
|
||||||
)
|
|
||||||
self.profiler = torch.profiler.profile(
|
|
||||||
activities=[
|
|
||||||
torch.profiler.ProfilerActivity.CPU,
|
|
||||||
torch.profiler.ProfilerActivity.XPU,
|
|
||||||
],
|
|
||||||
record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
|
|
||||||
profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
|
|
||||||
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
|
|
||||||
with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
|
|
||||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
|
||||||
torch_profiler_trace_dir,
|
|
||||||
worker_name=worker_name,
|
|
||||||
use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
self.profiler = None
|
|
||||||
|
|
||||||
# we provide this function due to `torch.xpu.mem_get_info()` doesn't
|
# we provide this function due to `torch.xpu.mem_get_info()` doesn't
|
||||||
# return correct free_gpu_memory on intel client GPU. We need to
|
# return correct free_gpu_memory on intel client GPU. We need to
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user