mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-23 09:33:40 +08:00
Merge branch 'main' into imarkov/eplb_optimizations
This commit is contained in:
commit
60f744d7ce
@ -398,7 +398,8 @@ steps:
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- vllm/v1/attention
|
||||
- vllm/model_executor/layers
|
||||
- tests/v1/determinism/
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
@ -440,23 +441,29 @@ steps:
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
source_file_dependencies:
|
||||
- vllm/entrypoints
|
||||
- vllm/multimodal
|
||||
- examples/
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
# for basic
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_pooling.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
@ -718,6 +725,18 @@ steps:
|
||||
- uv pip install --system conch-triton-kernels
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
timeout_in_minutes: 75
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
autorun_on_main: true
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||
|
||||
- label: OpenAI API correctness # 10min
|
||||
timeout_in_minutes: 15
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
@ -727,7 +746,7 @@ steps:
|
||||
- csrc/
|
||||
- vllm/entrypoints/openai/
|
||||
- vllm/model_executor/models/whisper.py
|
||||
commands: # LMEval
|
||||
commands: # LMEval+Transcription WER check
|
||||
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
|
||||
- pytest -s entrypoints/openai/correctness/
|
||||
|
||||
@ -963,6 +982,19 @@ steps:
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
|
||||
timeout_in_minutes: 180
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- vllm/multimodal/
|
||||
- vllm/inputs/
|
||||
- vllm/v1/core/
|
||||
commands:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 1 # 60min
|
||||
timeout_in_minutes: 120
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@ -1098,7 +1130,6 @@ steps:
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- vllm/model_executor/layers/fused_moe/layer.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
- tests/compile/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||
@ -1132,12 +1163,25 @@ steps:
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
- tests/compile/fullgraph/test_full_graph.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
- label: Blackwell Quantized MoE Test
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
@ -1155,6 +1199,16 @@ steps:
|
||||
commands:
|
||||
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
||||
|
||||
- label: Blackwell LM Eval Small Models
|
||||
timeout_in_minutes: 120
|
||||
gpu: b200
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
||||
|
||||
##### 1 GPU test #####
|
||||
##### multi gpus test #####
|
||||
|
||||
@ -1397,6 +1451,39 @@ steps:
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
|
||||
- label: LM Eval Large Models # optional
|
||||
gpu: a100
|
||||
optional: true
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
##### H100 test #####
|
||||
- label: LM Eval Large Models (H100) # optional
|
||||
gpu: h100
|
||||
optional: true
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
|
||||
|
||||
##### H200 test #####
|
||||
- label: Distributed Tests (H200) # optional
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@ -1440,29 +1527,6 @@ steps:
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||
|
||||
- label: Blackwell LM Eval Small Models
|
||||
timeout_in_minutes: 120
|
||||
gpu: b200
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
||||
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
|
||||
timeout_in_minutes: 70
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- vllm/multimodal/
|
||||
- vllm/inputs/
|
||||
- vllm/v1/core/
|
||||
commands:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||
|
||||
- label: LM Eval Large Models (4 Card)
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
@ -1478,21 +1542,6 @@ steps:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
- label: LM Eval Large Models (H100) # optional
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
|
||||
- label: ROCm LM Eval Large Models (8 Card)
|
||||
mirror_hardwares: [amdproduction]
|
||||
agent_pool: mi325_8
|
||||
@ -1517,6 +1566,20 @@ steps:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
##### RL Integration Tests #####
|
||||
- label: Prime-RL Integration Test # 15min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_2
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 30
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- .buildkite/scripts/run-prime-rl-test.sh
|
||||
commands:
|
||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||
- label: DeepSeek V2-Lite Accuracy
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
@ -1550,17 +1613,26 @@ steps:
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
##### RL Integration Tests #####
|
||||
- label: Prime-RL Integration Test # 15min
|
||||
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_2
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- .buildkite/scripts/run-prime-rl-test.sh
|
||||
commands:
|
||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||
|
||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||
|
||||
@ -468,7 +468,9 @@ steps:
|
||||
# tests covered elsewhere.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
|
||||
# However, find does not normally propagate error codes, so we combine it with xargs
|
||||
# (using -0 for proper path handling)
|
||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||
timeout_in_minutes: 30
|
||||
@ -482,7 +484,9 @@ steps:
|
||||
# as it is a heavy test that is covered in other steps.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
|
||||
# However, find does not normally propagate error codes, so we combine it with xargs
|
||||
# (using -0 for proper path handling)
|
||||
- "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||
|
||||
- label: PyTorch Fullgraph Test # 27min
|
||||
timeout_in_minutes: 40
|
||||
|
||||
2
.github/workflows/cleanup_pr_body.yml
vendored
2
.github/workflows/cleanup_pr_body.yml
vendored
@ -13,7 +13,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
|
||||
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
||||
|
||||
2
.github/workflows/macos-smoke-test.yml
vendored
2
.github/workflows/macos-smoke-test.yml
vendored
@ -12,7 +12,7 @@ jobs:
|
||||
timeout-minutes: 30
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v6.0.1
|
||||
|
||||
- uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
|
||||
2
.github/workflows/pre-commit.yml
vendored
2
.github/workflows/pre-commit.yml
vendored
@ -16,7 +16,7 @@ jobs:
|
||||
pre-commit:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
|
||||
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
|
||||
- uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
||||
with:
|
||||
python-version: "3.12"
|
||||
|
||||
2
.github/workflows/stale.yml
vendored
2
.github/workflows/stale.yml
vendored
@ -15,7 +15,7 @@ jobs:
|
||||
actions: write
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
|
||||
- uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
|
||||
with:
|
||||
# Increasing this value ensures that changes to this workflow
|
||||
# propagate to all issues and PRs in days rather than months
|
||||
|
||||
@ -96,8 +96,9 @@ start_server() {
|
||||
# This correctly passes each element as a separate argument.
|
||||
if [[ -n "$profile_dir" ]]; then
|
||||
# Start server with profiling enabled
|
||||
VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
|
||||
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
||||
local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
|
||||
VLLM_SERVER_DEV_MODE=1 \
|
||||
vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
||||
else
|
||||
# Start server without profiling
|
||||
VLLM_SERVER_DEV_MODE=1 \
|
||||
|
||||
@ -963,8 +963,7 @@ def create_argument_parser():
|
||||
parser.add_argument(
|
||||
"--profile",
|
||||
action="store_true",
|
||||
help="Use Torch Profiler. The endpoint must be launched with "
|
||||
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
|
||||
help="Use vLLM Profiling. --profiler-config must be provided on the server.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--result-dir",
|
||||
|
||||
@ -15,6 +15,7 @@ API documentation for vLLM's configuration classes.
|
||||
- [vllm.config.MultiModalConfig][]
|
||||
- [vllm.config.PoolerConfig][]
|
||||
- [vllm.config.StructuredOutputsConfig][]
|
||||
- [vllm.config.ProfilerConfig][]
|
||||
- [vllm.config.ObservabilityConfig][]
|
||||
- [vllm.config.KVTransferConfig][]
|
||||
- [vllm.config.CompilationConfig][]
|
||||
|
||||
@ -5,16 +5,15 @@
|
||||
|
||||
## Profile with PyTorch Profiler
|
||||
|
||||
We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`. Additionally, you can control the profiling content by specifying the following environment variables:
|
||||
We support tracing vLLM workers using the `torch.profiler` module. You can enable the torch profiler by setting `--profiler-config`
|
||||
when launching the server, and setting the entries `profiler` to `'torch'` and `torch_profiler_dir` to the directory where you want to save the traces. Additionally, you can control the profiling content by specifying the following additional arguments in the config:
|
||||
|
||||
- `VLLM_TORCH_PROFILER_RECORD_SHAPES=1` to enable recording Tensor Shapes, off by default
|
||||
- `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default
|
||||
- `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default
|
||||
- `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default
|
||||
- `VLLM_TORCH_PROFILER_USE_GZIP=0` to disable gzip-compressing profiling files, on by default
|
||||
- `VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0` to disable dumping and printing the aggregated CUDA self time table, on by default
|
||||
|
||||
The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
|
||||
- `torch_profiler_record_shapes` to enable recording Tensor Shapes, off by default
|
||||
- `torch_profiler_with_memory` to record memory, off by default
|
||||
- `torch_profiler_with_stack` to enable recording stack information, on by default
|
||||
- `torch_profiler_with_flops` to enable recording FLOPs, off by default
|
||||
- `torch_profiler_use_gzip` to control gzip-compressing profiling files, on by default
|
||||
- `torch_profiler_dump_cuda_time_total` to control dumping and printing the aggregated CUDA self time table, on by default
|
||||
|
||||
When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag.
|
||||
|
||||
@ -40,8 +39,7 @@ Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline
|
||||
#### OpenAI Server
|
||||
|
||||
```bash
|
||||
VLLM_TORCH_PROFILER_DIR=./vllm_profile \
|
||||
vllm serve meta-llama/Llama-3.1-8B-Instruct
|
||||
vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}'
|
||||
```
|
||||
|
||||
vllm bench command:
|
||||
@ -104,13 +102,12 @@ To profile the server, you will want to prepend your `vllm serve` command with `
|
||||
|
||||
```bash
|
||||
# server
|
||||
VLLM_TORCH_CUDA_PROFILE=1 \
|
||||
nsys profile \
|
||||
--trace-fork-before-exec=true \
|
||||
--cuda-graph-trace=node \
|
||||
--capture-range=cudaProfilerApi \
|
||||
--capture-range-end repeat \
|
||||
vllm serve meta-llama/Llama-3.1-8B-Instruct
|
||||
vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config.profiler cuda
|
||||
|
||||
# client
|
||||
vllm bench serve \
|
||||
|
||||
@ -299,6 +299,9 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner
|
||||
|
||||
def is_reasoning_end(self, input_ids: list[int]) -> bool:
|
||||
return self.end_token_id in input_ids
|
||||
|
||||
def is_reasoning_end_streaming(self, input_ids: list[int], delta_ids: list[int]) -> bool:
|
||||
return self.end_token_id in delta_ids
|
||||
...
|
||||
```
|
||||
|
||||
|
||||
@ -1,14 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import time
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# enable torch profiler, can also be set on cmd line
|
||||
os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"
|
||||
|
||||
# Sample prompts.
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
@ -22,7 +18,14 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
def main():
|
||||
# Create an LLM.
|
||||
llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
|
||||
llm = LLM(
|
||||
model="facebook/opt-125m",
|
||||
tensor_parallel_size=1,
|
||||
profiler_config={
|
||||
"profiler": "torch",
|
||||
"torch_profiler_dir": "./vllm_profile",
|
||||
},
|
||||
)
|
||||
|
||||
llm.start_profile()
|
||||
|
||||
|
||||
@ -17,7 +17,6 @@ def test_compile():
|
||||
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
|
||||
@pytest.mark.forked
|
||||
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
|
||||
@pytest.mark.xfail
|
||||
def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch):
|
||||
"""Test that Qwen2.5-VL vision submodules are compiled.
|
||||
|
||||
|
||||
@ -80,6 +80,8 @@ def test_compile_ranges(use_fresh_inductor_cache):
|
||||
vllm_config = VllmConfig(
|
||||
scheduler_config=SchedulerConfig(
|
||||
max_num_batched_tokens=8192,
|
||||
max_model_len=8192,
|
||||
is_encoder_decoder=False,
|
||||
),
|
||||
compilation_config=CompilationConfig(
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
@ -112,6 +114,8 @@ def test_compile_config_get_compile_ranges():
|
||||
VllmConfig(
|
||||
scheduler_config=SchedulerConfig(
|
||||
max_num_batched_tokens=8192,
|
||||
max_model_len=8192,
|
||||
is_encoder_decoder=False,
|
||||
),
|
||||
compilation_config=compilation_config,
|
||||
)
|
||||
@ -134,6 +138,8 @@ def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache):
|
||||
)
|
||||
scheduler_config = SchedulerConfig(
|
||||
max_num_batched_tokens=8192,
|
||||
max_model_len=8192,
|
||||
is_encoder_decoder=False,
|
||||
)
|
||||
torch.set_default_device("cuda")
|
||||
|
||||
|
||||
@ -5,9 +5,14 @@ import copy
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
|
||||
from vllm.compilation.inductor_pass import (
|
||||
CallableInductorPass,
|
||||
InductorPass,
|
||||
pass_context,
|
||||
)
|
||||
from vllm.compilation.pass_manager import PostGradPassManager
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
from vllm.config.utils import Range
|
||||
|
||||
|
||||
# dummy custom pass that doesn't inherit
|
||||
@ -42,35 +47,37 @@ class ProperPass(InductorPass):
|
||||
],
|
||||
)
|
||||
def test_pass_manager_uuid(callable):
|
||||
# Some passes need dtype to be set
|
||||
config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
|
||||
# Set the pass context as PassManager uuid uses it
|
||||
with pass_context(Range(start=1, end=8)):
|
||||
# Some passes need dtype to be set
|
||||
config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
|
||||
|
||||
pass_manager = PostGradPassManager()
|
||||
pass_manager.configure(config)
|
||||
pass_manager = PostGradPassManager()
|
||||
pass_manager.configure(config)
|
||||
|
||||
# Check that UUID is different if the same pass is added 2x
|
||||
pass_manager.add(callable)
|
||||
uuid1 = pass_manager.uuid()
|
||||
pass_manager.add(callable)
|
||||
uuid2 = pass_manager.uuid()
|
||||
assert uuid1 != uuid2
|
||||
# Check that UUID is different if the same pass is added 2x
|
||||
pass_manager.add(callable)
|
||||
uuid1 = pass_manager.uuid()
|
||||
pass_manager.add(callable)
|
||||
uuid2 = pass_manager.uuid()
|
||||
assert uuid1 != uuid2
|
||||
|
||||
# UUID should be the same as the original one,
|
||||
# as we constructed in the same way.
|
||||
pass_manager2 = PostGradPassManager()
|
||||
pass_manager2.configure(config)
|
||||
pass_manager2.add(callable)
|
||||
assert uuid1 == pass_manager2.uuid()
|
||||
# UUID should be the same as the original one,
|
||||
# as we constructed in the same way.
|
||||
pass_manager2 = PostGradPassManager()
|
||||
pass_manager2.configure(config)
|
||||
pass_manager2.add(callable)
|
||||
assert uuid1 == pass_manager2.uuid()
|
||||
|
||||
# UUID should be different due to config change
|
||||
config2 = copy.deepcopy(config)
|
||||
config2.compilation_config.pass_config.fuse_norm_quant = (
|
||||
not config2.compilation_config.pass_config.fuse_norm_quant
|
||||
)
|
||||
config2.compilation_config.pass_config.fuse_act_quant = (
|
||||
not config2.compilation_config.pass_config.fuse_act_quant
|
||||
)
|
||||
pass_manager3 = PostGradPassManager()
|
||||
pass_manager3.configure(config2)
|
||||
pass_manager3.add(callable)
|
||||
assert uuid1 != pass_manager3.uuid()
|
||||
# UUID should be different due to config change
|
||||
config2 = copy.deepcopy(config)
|
||||
config2.compilation_config.pass_config.fuse_norm_quant = (
|
||||
not config2.compilation_config.pass_config.fuse_norm_quant
|
||||
)
|
||||
config2.compilation_config.pass_config.fuse_act_quant = (
|
||||
not config2.compilation_config.pass_config.fuse_act_quant
|
||||
)
|
||||
pass_manager3 = PostGradPassManager()
|
||||
pass_manager3.configure(config2)
|
||||
pass_manager3.add(callable)
|
||||
assert uuid1 != pass_manager3.uuid()
|
||||
|
||||
@ -26,7 +26,14 @@ def clear_cache():
|
||||
_cached_get_attn_backend.cache_clear()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
|
||||
devices = ["cpu"]
|
||||
if current_platform.is_cuda():
|
||||
devices.append("cuda")
|
||||
if current_platform.is_rocm():
|
||||
devices.append("hip")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", devices)
|
||||
def test_mha_attn_platform(device: str):
|
||||
"""
|
||||
Test the attention selector across different platforms and devices.
|
||||
@ -46,7 +53,7 @@ def test_mha_attn_platform(device: str):
|
||||
patch("vllm.model_executor.models.vision.current_platform", RocmPlatform()),
|
||||
):
|
||||
attn = MultiHeadAttention(16, 64, scale=1)
|
||||
assert attn.attn_backend == AttentionBackendEnum.TORCH_SDPA
|
||||
assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN
|
||||
else:
|
||||
# Test CUDA with head_size=64 (divisible by 32)
|
||||
# - should use vLLM's FlashAttention
|
||||
|
||||
@ -103,7 +103,7 @@ def ref_dynamic_per_tensor_fp8_quant(
|
||||
.clamp(fp8_traits_min, fp8_traits_max)
|
||||
.to(FP8_DTYPE)
|
||||
)
|
||||
return ref_out, ref_scale.view((1, 1))
|
||||
return ref_out, ref_scale.view(1)
|
||||
|
||||
|
||||
def native_w8a8_block_matmul(
|
||||
|
||||
@ -132,6 +132,41 @@ class TestBaseThinkingReasoningParserMethods:
|
||||
is False
|
||||
)
|
||||
|
||||
def test_is_reasoning_end_streaming(self, test_tokenizer):
|
||||
"""Test the is_reasoning_end_streaming method."""
|
||||
parser = TestThinkingReasoningParser(test_tokenizer)
|
||||
end_token_id = parser.end_token_id
|
||||
start_token_id = parser.start_token_id
|
||||
|
||||
assert (
|
||||
parser.is_reasoning_end_streaming([1, 2, end_token_id], [end_token_id])
|
||||
is True
|
||||
)
|
||||
assert parser.is_reasoning_end_streaming([1, 2, 3, 4], [4]) is False
|
||||
assert parser.is_reasoning_end_streaming([], []) is False
|
||||
assert (
|
||||
parser.is_reasoning_end_streaming(
|
||||
[1, start_token_id, 2, end_token_id], [end_token_id]
|
||||
)
|
||||
is True
|
||||
)
|
||||
assert (
|
||||
parser.is_reasoning_end_streaming([1, start_token_id, 2, 3], [3]) is False
|
||||
)
|
||||
assert (
|
||||
parser.is_reasoning_end_streaming(
|
||||
[1, start_token_id, 2, end_token_id, 2, start_token_id, 2],
|
||||
[2],
|
||||
)
|
||||
is False
|
||||
)
|
||||
assert (
|
||||
parser.is_reasoning_end_streaming(
|
||||
[1, start_token_id, 2, end_token_id, 2, 2], [2]
|
||||
)
|
||||
is False
|
||||
)
|
||||
|
||||
def test_extract_content_ids(self, test_tokenizer):
|
||||
"""Test the extract_content_ids method."""
|
||||
parser = TestThinkingReasoningParser(test_tokenizer)
|
||||
|
||||
@ -40,6 +40,7 @@ def test_identity_reasoning_parser_basic(tokenizer):
|
||||
input_tokens = tokenizer.tokenize(input_text)
|
||||
input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
|
||||
assert parser.is_reasoning_end(input_ids) is True
|
||||
assert parser.is_reasoning_end_streaming(input_ids, input_ids) is True
|
||||
|
||||
# Test extract_content_ids returns all input_ids
|
||||
assert parser.extract_content_ids(input_ids) == input_ids
|
||||
|
||||
@ -615,6 +615,7 @@ def test_extract_tool_calls_streaming(
|
||||
"single_tool_weather",
|
||||
"multiple_tool_calls",
|
||||
"content_before_tool",
|
||||
"complex",
|
||||
],
|
||||
argnames=["model_output", "expected_tool_calls", "expected_content"],
|
||||
argvalues=[
|
||||
@ -673,6 +674,21 @@ def test_extract_tool_calls_streaming(
|
||||
],
|
||||
"bla",
|
||||
),
|
||||
(
|
||||
# Complex
|
||||
"""[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501
|
||||
[
|
||||
ToolCall(
|
||||
function=FunctionCall(
|
||||
name="bash",
|
||||
arguments=json.dumps(
|
||||
{"command": "print(\"hello world!\")\nre.compile(r'{}')"}
|
||||
),
|
||||
)
|
||||
)
|
||||
],
|
||||
"",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_extract_tool_calls_streaming_one_chunk(
|
||||
|
||||
@ -161,10 +161,10 @@ class TestCudagraphDispatcher:
|
||||
assert rt_mode == CUDAGraphMode.NONE
|
||||
assert key == BatchDescriptor(num_tokens=15)
|
||||
|
||||
# 4. Cascade attention should have a fall back mode
|
||||
# 4. disable_full should have a fallback mode (e.g., cascade attention)
|
||||
desc_full_exact = BatchDescriptor(num_tokens=8, uniform=False)
|
||||
rt_mode, key = dispatcher.dispatch(
|
||||
num_tokens=8, uniform_decode=False, has_lora=False, use_cascade_attn=True
|
||||
num_tokens=8, uniform_decode=False, has_lora=False, disable_full=True
|
||||
)
|
||||
if "PIECEWISE" in cudagraph_mode_str: # string contains check
|
||||
assert rt_mode == CUDAGraphMode.PIECEWISE
|
||||
|
||||
@ -10,6 +10,7 @@ from utils import (
|
||||
BACKENDS,
|
||||
_extract_step_logprobs,
|
||||
_random_prompt,
|
||||
is_device_capability_below_90,
|
||||
resolve_model_name,
|
||||
skip_unsupported,
|
||||
)
|
||||
@ -17,6 +18,8 @@ from utils import (
|
||||
import vllm.model_executor.layers.batch_invariant as batch_invariant
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90()
|
||||
|
||||
|
||||
@skip_unsupported
|
||||
@pytest.mark.timeout(1000)
|
||||
@ -190,6 +193,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
|
||||
max_model_len=8192,
|
||||
dtype="bfloat16", # not everything is supported
|
||||
gpu_memory_utilization=0.9,
|
||||
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
|
||||
)
|
||||
|
||||
# Use more realistic prompts for better token generation
|
||||
@ -393,6 +397,8 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
|
||||
gpu_memory_utilization=0.9,
|
||||
max_model_len=2048,
|
||||
dtype="bfloat16",
|
||||
enable_prefix_caching=False,
|
||||
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
|
||||
)
|
||||
|
||||
prompt = "the capital of france is"
|
||||
@ -459,6 +465,7 @@ def test_logprobs_without_batch_invariance_should_fail(
|
||||
max_num_seqs=32,
|
||||
max_model_len=8192,
|
||||
dtype="bfloat16",
|
||||
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
|
||||
)
|
||||
|
||||
# build ragged prompts to change shapes significantly across BS=1 vs BS=N
|
||||
@ -682,6 +689,7 @@ def test_decode_logprobs_match_prefill_logprobs(
|
||||
max_num_seqs=32,
|
||||
max_model_len=8192,
|
||||
dtype="bfloat16",
|
||||
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
|
||||
)
|
||||
|
||||
# Use a few test prompts
|
||||
@ -925,6 +933,8 @@ def LLM_with_max_seqs(
|
||||
max_model_len=max_model_len,
|
||||
dtype="bfloat16",
|
||||
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
|
||||
enable_prefix_caching=False,
|
||||
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
|
||||
# Enable for MOE models
|
||||
# enable_expert_parallel=True,
|
||||
)
|
||||
|
||||
@ -11,8 +11,10 @@ from vllm.platforms import current_platform
|
||||
from vllm.utils.flashinfer import has_flashinfer
|
||||
|
||||
skip_unsupported = pytest.mark.skipif(
|
||||
not (current_platform.is_cuda() and current_platform.has_device_capability(90)),
|
||||
reason="Requires CUDA and >= Hopper (SM90)",
|
||||
not (current_platform.is_cuda() and current_platform.has_device_capability(80)),
|
||||
# Supports testing on Ampere and Ada Lovelace devices.
|
||||
# Note: For devices with SM < 90, batch invariance does not support CUDA Graphs.
|
||||
reason="Requires CUDA and >= Ampere (SM80)",
|
||||
)
|
||||
|
||||
BACKENDS: list[str] = [
|
||||
@ -97,3 +99,7 @@ def _extract_step_logprobs(request_output):
|
||||
return t, inner.token_ids
|
||||
|
||||
return None, None
|
||||
|
||||
|
||||
def is_device_capability_below_90() -> bool:
    """Report whether the current platform lacks device capability 90.

    Returns:
        The negation of ``current_platform.has_device_capability(90)``.
    """
    meets_capability_90 = current_platform.has_device_capability(90)
    return not meets_capability_90
|
||||
|
||||
@ -124,6 +124,8 @@ def run_tests(
|
||||
with monkeypatch.context() as m:
|
||||
# avoid precision errors
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
|
||||
# lock matmul precision to full FP32
|
||||
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
|
||||
# m.setenv("VLLM_BATCH_INVARIANT", "1")
|
||||
outputs: list[tuple[str, list, list]] = []
|
||||
for n, (
|
||||
|
||||
@ -70,6 +70,7 @@ class TestReasoningStructuredOutput:
|
||||
request.use_structured_output = True
|
||||
request.prompt_token_ids = [1, 2, 3, 4, 5]
|
||||
request.all_token_ids = [1, 2, 3, 4, 5, 6, 7, 8]
|
||||
request.num_computed_tokens = 5
|
||||
return request
|
||||
|
||||
def test_should_fill_bitmask_with_enable_in_reasoning(
|
||||
|
||||
@ -2,8 +2,8 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.profiler.gpu_profiler import WorkerProfiler
|
||||
from vllm.config import ProfilerConfig
|
||||
from vllm.profiler.wrapper import WorkerProfiler
|
||||
|
||||
|
||||
class ConcreteWorkerProfiler(WorkerProfiler):
|
||||
@ -11,11 +11,11 @@ class ConcreteWorkerProfiler(WorkerProfiler):
|
||||
A basic implementation of a worker profiler for testing purposes.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, profiler_config: ProfilerConfig):
|
||||
self.start_call_count = 0
|
||||
self.stop_call_count = 0
|
||||
self.should_fail_start = False
|
||||
super().__init__()
|
||||
super().__init__(profiler_config)
|
||||
|
||||
def _start(self) -> None:
|
||||
if self.should_fail_start:
|
||||
@ -26,17 +26,19 @@ class ConcreteWorkerProfiler(WorkerProfiler):
|
||||
self.stop_call_count += 1
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def reset_mocks():
|
||||
"""Fixture to reset mocks and env variables before each test."""
|
||||
envs.VLLM_PROFILER_DELAY_ITERS = 0
|
||||
envs.VLLM_PROFILER_MAX_ITERS = 0
|
||||
@pytest.fixture
def default_profiler_config():
    """Build a fresh torch ``ProfilerConfig`` with zero delay and zero max iterations.

    Each test receives its own instance, so tests are free to adjust
    ``delay_iterations`` / ``max_iterations`` on it before constructing a
    profiler.
    """
    settings = {
        "profiler": "torch",
        "torch_profiler_dir": "/tmp/mock",
        "delay_iterations": 0,
        "max_iterations": 0,
    }
    return ProfilerConfig(**settings)
|
||||
|
||||
|
||||
def test_immediate_start_stop():
|
||||
def test_immediate_start_stop(default_profiler_config):
|
||||
"""Test standard start without delay."""
|
||||
profiler = ConcreteWorkerProfiler()
|
||||
|
||||
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||
profiler.start()
|
||||
assert profiler._running is True
|
||||
assert profiler._active is True
|
||||
@ -48,10 +50,10 @@ def test_immediate_start_stop():
|
||||
assert profiler.stop_call_count == 1
|
||||
|
||||
|
||||
def test_delayed_start():
|
||||
def test_delayed_start(default_profiler_config):
|
||||
"""Test that profiler waits for N steps before actually starting."""
|
||||
envs.VLLM_PROFILER_DELAY_ITERS = 2
|
||||
profiler = ConcreteWorkerProfiler()
|
||||
default_profiler_config.delay_iterations = 2
|
||||
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||
|
||||
# User requests start
|
||||
profiler.start()
|
||||
@ -71,10 +73,10 @@ def test_delayed_start():
|
||||
assert profiler.start_call_count == 1
|
||||
|
||||
|
||||
def test_max_iterations():
|
||||
def test_max_iterations(default_profiler_config):
|
||||
"""Test that profiler stops automatically after max iterations."""
|
||||
envs.VLLM_PROFILER_MAX_ITERS = 2
|
||||
profiler = ConcreteWorkerProfiler()
|
||||
default_profiler_config.max_iterations = 2
|
||||
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||
|
||||
profiler.start()
|
||||
assert profiler._running is True
|
||||
@ -95,12 +97,11 @@ def test_max_iterations():
|
||||
assert profiler.stop_call_count == 1
|
||||
|
||||
|
||||
def test_delayed_start_and_max_iters():
|
||||
def test_delayed_start_and_max_iters(default_profiler_config):
|
||||
"""Test combined delayed start and max iterations."""
|
||||
envs.VLLM_PROFILER_DELAY_ITERS = 2
|
||||
envs.VLLM_PROFILER_MAX_ITERS = 2
|
||||
profiler = ConcreteWorkerProfiler()
|
||||
|
||||
default_profiler_config.delay_iterations = 2
|
||||
default_profiler_config.max_iterations = 2
|
||||
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||
profiler.start()
|
||||
|
||||
# Step 1
|
||||
@ -127,9 +128,9 @@ def test_delayed_start_and_max_iters():
|
||||
assert profiler.stop_call_count == 1
|
||||
|
||||
|
||||
def test_idempotency():
|
||||
def test_idempotency(default_profiler_config):
|
||||
"""Test that calling start/stop multiple times doesn't break logic."""
|
||||
profiler = ConcreteWorkerProfiler()
|
||||
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||
|
||||
# Double Start
|
||||
profiler.start()
|
||||
@ -142,10 +143,10 @@ def test_idempotency():
|
||||
assert profiler.stop_call_count == 1 # Should only stop once
|
||||
|
||||
|
||||
def test_step_inactive():
|
||||
def test_step_inactive(default_profiler_config):
|
||||
"""Test that stepping while inactive does nothing."""
|
||||
envs.VLLM_PROFILER_DELAY_ITERS = 2
|
||||
profiler = ConcreteWorkerProfiler()
|
||||
default_profiler_config.delay_iterations = 2
|
||||
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||
|
||||
# Not started yet
|
||||
profiler.step()
|
||||
@ -155,9 +156,9 @@ def test_step_inactive():
|
||||
assert profiler.start_call_count == 0
|
||||
|
||||
|
||||
def test_start_failure():
|
||||
def test_start_failure(default_profiler_config):
|
||||
"""Test behavior when the underlying _start method raises exception."""
|
||||
profiler = ConcreteWorkerProfiler()
|
||||
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||
profiler.should_fail_start = True
|
||||
|
||||
profiler.start()
|
||||
@ -168,9 +169,9 @@ def test_start_failure():
|
||||
assert profiler.start_call_count == 0 # Logic failed inside start
|
||||
|
||||
|
||||
def test_shutdown():
|
||||
def test_shutdown(default_profiler_config):
|
||||
"""Test that shutdown calls stop only if running."""
|
||||
profiler = ConcreteWorkerProfiler()
|
||||
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||
|
||||
# Case 1: Not running
|
||||
profiler.shutdown()
|
||||
@ -182,10 +183,10 @@ def test_shutdown():
|
||||
assert profiler.stop_call_count == 1
|
||||
|
||||
|
||||
def test_mixed_delay_and_stop():
|
||||
def test_mixed_delay_and_stop(default_profiler_config):
|
||||
"""Test manual stop during the delay period."""
|
||||
envs.VLLM_PROFILER_DELAY_ITERS = 5
|
||||
profiler = ConcreteWorkerProfiler()
|
||||
default_profiler_config.delay_iterations = 5
|
||||
profiler = ConcreteWorkerProfiler(default_profiler_config)
|
||||
|
||||
profiler.start()
|
||||
profiler.step()
|
||||
|
||||
@ -9,6 +9,8 @@ import vllm.envs as envs
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer
|
||||
|
||||
_FP8_DTYPE = current_platform.fp8_dtype()
|
||||
|
||||
|
||||
def is_aiter_found() -> bool:
|
||||
from importlib.util import find_spec
|
||||
@ -467,6 +469,59 @@ def _rocm_aiter_rmsnorm2d_fwd_with_add_fake(
|
||||
return torch.empty_like(x), torch.empty_like(residual)
|
||||
|
||||
|
||||
def _rocm_aiter_per_tensor_quant_impl(
    x: torch.Tensor,
    quant_dtype: torch.dtype,
    scale: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Delegate per-tensor quantization of ``x`` to AITER.

    Args:
        x: Tensor to quantize.
        quant_dtype: Target dtype passed through to AITER.
        scale: Optional scale tensor forwarded to AITER as-is.
            NOTE(review): presumably AITER computes the scale itself when this
            is ``None`` — confirm against the AITER implementation.

    Returns:
        Whatever ``per_tensor_quant_hip`` returns (annotated as an
        ``(output, scale)`` pair).
    """
    # Imported lazily so this module can be loaded without AITER installed.
    from aiter.ops.quant import per_tensor_quant_hip

    # AITER's positional order is (x, scale, dtype), unlike this wrapper's.
    quantized = per_tensor_quant_hip(x, scale, quant_dtype)
    return quantized
|
||||
|
||||
|
||||
def _rocm_aiter_per_tensor_quant_fake(
|
||||
x: torch.Tensor,
|
||||
quant_dtype: torch.dtype,
|
||||
scale: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
return torch.empty_like(x, dtype=quant_dtype), torch.empty(
|
||||
1, dtype=torch.float32, device=x.device
|
||||
)
|
||||
|
||||
|
||||
def _rocm_aiter_per_token_quant_impl(
|
||||
x: torch.Tensor, quant_dtype: torch.dtype, scale: torch.Tensor | None = None
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
from aiter.ops.quant import dynamic_per_token_scaled_quant
|
||||
|
||||
assert quant_dtype in [torch.int8, _FP8_DTYPE]
|
||||
|
||||
out_shape = x.shape
|
||||
out = torch.empty(x.shape, dtype=_FP8_DTYPE, device=x.device)
|
||||
if scale is None:
|
||||
scale = torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device)
|
||||
dynamic_per_token_scaled_quant(
|
||||
out,
|
||||
x,
|
||||
scale,
|
||||
scale_ub=None,
|
||||
shuffle_scale=False,
|
||||
num_rows=None,
|
||||
num_rows_factor=1,
|
||||
)
|
||||
return out, scale
|
||||
|
||||
|
||||
def _rocm_aiter_per_token_quant_fake(
|
||||
x: torch.Tensor, quant_dtype: torch.dtype, scale: torch.Tensor | None = None
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
out_shape = x.shape
|
||||
return (
|
||||
torch.empty(x.shape, dtype=_FP8_DTYPE, device=x.device),
|
||||
torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device),
|
||||
)
|
||||
|
||||
|
||||
# Global flag to ensure ops are registered only once
|
||||
_OPS_REGISTERED = False
|
||||
|
||||
@ -665,6 +720,22 @@ class rocm_aiter_ops:
|
||||
dispatch_key=current_platform.dispatch_key,
|
||||
)
|
||||
|
||||
direct_register_custom_op(
|
||||
op_name="rocm_aiter_per_tensor_quant",
|
||||
op_func=_rocm_aiter_per_tensor_quant_impl,
|
||||
mutates_args=[],
|
||||
fake_impl=_rocm_aiter_per_tensor_quant_fake,
|
||||
dispatch_key=current_platform.dispatch_key,
|
||||
)
|
||||
|
||||
direct_register_custom_op(
|
||||
op_name="rocm_aiter_per_token_quant",
|
||||
op_func=_rocm_aiter_per_token_quant_impl,
|
||||
mutates_args=["scale"],
|
||||
fake_impl=_rocm_aiter_per_token_quant_fake,
|
||||
dispatch_key=current_platform.dispatch_key,
|
||||
)
|
||||
|
||||
_OPS_REGISTERED = True
|
||||
|
||||
@staticmethod
|
||||
@ -859,6 +930,22 @@ class rocm_aiter_ops:
|
||||
kv_scale=kv_scale,
|
||||
)
|
||||
|
||||
@staticmethod
def per_tensor_quant(
    x: torch.Tensor,
    quant_dtype: torch.dtype,
    scale: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Invoke the ``rocm_aiter_per_tensor_quant`` custom op.

    The arguments are forwarded unchanged to ``torch.ops.vllm`` and the op's
    result is returned as-is.
    """
    quant_op = torch.ops.vllm.rocm_aiter_per_tensor_quant
    return quant_op(x, quant_dtype, scale)
|
||||
|
||||
@staticmethod
def per_token_quant(
    x: torch.Tensor,
    quant_dtype: torch.dtype,
    scale: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Invoke the ``rocm_aiter_per_token_quant`` custom op.

    The arguments are forwarded unchanged to ``torch.ops.vllm`` and the op's
    result is returned as-is.
    """
    quant_op = torch.ops.vllm.rocm_aiter_per_token_quant
    return quant_op(x, quant_dtype, scale)
|
||||
|
||||
@staticmethod
|
||||
def triton_fp4_gemm_dynamic_qaunt(
|
||||
x: torch.Tensor,
|
||||
|
||||
@ -1726,7 +1726,7 @@ def scaled_fp8_quant(
|
||||
output, input, scale, scale_ub
|
||||
)
|
||||
else:
|
||||
scale = torch.empty((1, 1), device=input.device, dtype=torch.float32)
|
||||
scale = torch.empty(1, device=input.device, dtype=torch.float32)
|
||||
torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
|
||||
else:
|
||||
assert scale.numel() == 1, f"{scale.shape}"
|
||||
|
||||
@ -89,7 +89,10 @@ def maybe_get_vit_flash_attn_backend(
|
||||
if attn_backend == AttentionBackendEnum.ROCM_AITER_FA:
|
||||
from aiter import flash_attn_varlen_func
|
||||
else:
|
||||
from vllm.attention.utils.fa_utils import flash_attn_varlen_func
|
||||
try:
|
||||
from vllm.attention.utils.fa_utils import flash_attn_varlen_func
|
||||
except ImportError:
|
||||
flash_attn_varlen_func = None
|
||||
else:
|
||||
flash_attn_varlen_func = None
|
||||
|
||||
|
||||
@ -12,7 +12,6 @@ from typing import Any
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.inputs import PromptType
|
||||
@ -79,12 +78,11 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
|
||||
raise OSError(
|
||||
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
|
||||
"Please set it to a valid path to use torch profiler."
|
||||
)
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
if args.profile and not engine_args.profiler_config.profiler == "torch":
|
||||
raise ValueError(
|
||||
"The torch profiler is not enabled. Please provide profiler_config."
|
||||
)
|
||||
|
||||
# Lazy import to avoid importing LLM when the bench command is not selected.
|
||||
from vllm import LLM, SamplingParams
|
||||
@ -144,7 +142,7 @@ def main(args: argparse.Namespace):
|
||||
run_to_completion(profile_dir=None)
|
||||
|
||||
if args.profile:
|
||||
profile_dir = envs.VLLM_TORCH_PROFILER_DIR
|
||||
profile_dir = engine_args.profiler_config.torch_profiler_dir
|
||||
print(f"Profiling (results will be saved to '{profile_dir}')...")
|
||||
run_to_completion(profile_dir=profile_dir)
|
||||
return
|
||||
|
||||
@ -1097,8 +1097,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
||||
parser.add_argument(
|
||||
"--profile",
|
||||
action="store_true",
|
||||
help="Use Torch Profiler. The endpoint must be launched with "
|
||||
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
|
||||
help="Use vLLM Profiling. --profiler-config must be provided on the server.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--save-result",
|
||||
|
||||
@ -655,8 +655,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
||||
"--profile",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Use Torch Profiler. The env variable "
|
||||
"VLLM_TORCH_PROFILER_DIR must be set to enable profiler.",
|
||||
help="Use vLLM Profiling. --profiler-config must be provided on the server.",
|
||||
)
|
||||
|
||||
# prefix repetition dataset
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import functools
|
||||
import hashlib
|
||||
import inspect
|
||||
@ -8,15 +10,17 @@ import json
|
||||
import types
|
||||
from collections.abc import Callable
|
||||
from contextlib import contextmanager
|
||||
from typing import Any
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import torch
|
||||
from torch import fx
|
||||
from torch._subclasses.fake_tensor import FakeTensorMode, unset_fake_temporarily
|
||||
|
||||
from vllm.config.utils import Range
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config.utils import Range
|
||||
|
||||
if is_torch_equal_or_newer("2.6"):
|
||||
from torch._inductor.custom_graph_pass import CustomGraphPass
|
||||
else:
|
||||
|
||||
@ -53,8 +53,27 @@ class PiecewiseBackend:
|
||||
self.is_last_graph = piecewise_compile_index == total_piecewise_compiles - 1
|
||||
|
||||
self.is_full_graph = total_piecewise_compiles == 1
|
||||
# TODO: we need to generalize encoder compilation to other models
|
||||
self.is_encoder_compilation = vllm_backend.prefix in [
|
||||
"Qwen2_5_VisionPatchEmbed",
|
||||
"Qwen2_5_VisionPatchMerger",
|
||||
"Qwen2_5_VisionBlock",
|
||||
]
|
||||
|
||||
self.compile_ranges = self.compilation_config.get_compile_ranges()
|
||||
if self.is_encoder_compilation:
|
||||
# For encoder compilation we use the max int32 value
|
||||
# to set the upper bound of the compile ranges
|
||||
max_int32 = 2**31 - 1
|
||||
last_compile_range = self.compile_ranges[-1]
|
||||
assert (
|
||||
last_compile_range.end
|
||||
== vllm_config.scheduler_config.max_num_batched_tokens
|
||||
)
|
||||
self.compile_ranges[-1] = Range(
|
||||
start=last_compile_range.start, end=max_int32
|
||||
)
|
||||
|
||||
log_string = f"PiecewiseBackend: compile_ranges: {self.compile_ranges}"
|
||||
logger.debug_once(log_string)
|
||||
|
||||
|
||||
@ -24,6 +24,7 @@ from vllm.config.multimodal import MultiModalConfig
|
||||
from vllm.config.observability import ObservabilityConfig
|
||||
from vllm.config.parallel import EPLBConfig, ParallelConfig
|
||||
from vllm.config.pooler import PoolerConfig
|
||||
from vllm.config.profiler import ProfilerConfig
|
||||
from vllm.config.scheduler import SchedulerConfig
|
||||
from vllm.config.speculative import SpeculativeConfig
|
||||
from vllm.config.speech_to_text import SpeechToTextConfig
|
||||
@ -89,6 +90,8 @@ __all__ = [
|
||||
"SpeechToTextConfig",
|
||||
# From vllm.config.structured_outputs
|
||||
"StructuredOutputsConfig",
|
||||
# From vllm.config.profiler
|
||||
"ProfilerConfig",
|
||||
# From vllm.config.utils
|
||||
"ConfigType",
|
||||
"SupportsMetricsInfo",
|
||||
|
||||
199
vllm/config/profiler.py
Normal file
199
vllm/config/profiler.py
Normal file
@ -0,0 +1,199 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import Field, model_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
from typing_extensions import Self
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
ProfilerKind = Literal["torch", "cuda"]
|
||||
|
||||
|
||||
@config
|
||||
@dataclass
|
||||
class ProfilerConfig:
|
||||
"""Dataclass which contains profiler config for the engine."""
|
||||
|
||||
profiler: ProfilerKind | None = None
|
||||
"""Which profiler to use. Defaults to None. Options are:
|
||||
|
||||
- 'torch': Use PyTorch profiler.\n
|
||||
- 'cuda': Use CUDA profiler."""
|
||||
|
||||
torch_profiler_dir: str = ""
|
||||
"""Directory to save torch profiler traces. Both AsyncLLM's CPU traces and
|
||||
worker's traces (CPU & GPU) will be saved under this directory. Note that
|
||||
it must be an absolute path."""
|
||||
|
||||
torch_profiler_with_stack: bool = True
|
||||
"""If `True`, enables stack tracing in the torch profiler. Enabled by default."""
|
||||
|
||||
torch_profiler_with_flops: bool = False
|
||||
"""If `True`, enables FLOPS counting in the torch profiler. Disabled by default."""
|
||||
|
||||
torch_profiler_use_gzip: bool = True
|
||||
"""If `True`, saves torch profiler traces in gzip format. Enabled by default"""
|
||||
|
||||
torch_profiler_dump_cuda_time_total: bool = True
|
||||
"""If `True`, dumps total CUDA time in torch profiler traces. Enabled by default."""
|
||||
|
||||
torch_profiler_record_shapes: bool = False
|
||||
"""If `True`, records tensor shapes in the torch profiler. Disabled by default."""
|
||||
|
||||
torch_profiler_with_memory: bool = False
|
||||
"""If `True`, enables memory profiling in the torch profiler.
|
||||
Disabled by default."""
|
||||
|
||||
ignore_frontend: bool = False
|
||||
"""If `True`, disables the front-end profiling of AsyncLLM when using the
|
||||
'torch' profiler. This is needed to reduce overhead when using delay/limit options,
|
||||
since the front-end profiling does not track iterations and will capture the
|
||||
entire range.
|
||||
"""
|
||||
|
||||
delay_iterations: int = Field(default=0, ge=0)
|
||||
"""Number of engine iterations to skip before starting profiling.
|
||||
Defaults to 0, meaning profiling starts immediately after receiving /start_profile.
|
||||
"""
|
||||
|
||||
max_iterations: int = Field(default=0, ge=0)
|
||||
"""Maximum number of engine iterations to profile after starting profiling.
|
||||
Defaults to 0, meaning no limit.
|
||||
"""
|
||||
|
||||
def compute_hash(self) -> str:
    """Return a hash of the settings that affect the computation graph.

    No profiler setting contributes to the hash: the factor list is
    deliberately empty, so every instance hashes to the same value.

    WARNING: when adding a field to this config, include it in ``factors``
    if it changes the structure of the computation graph (from input
    ids/embeddings to the final hidden states).
    """
    # Intentionally empty: this config does not affect the computation graph.
    factors: list[Any] = []
    encoded_factors = str(factors).encode()
    digest = safe_hash(encoded_factors, usedforsecurity=False)
    return digest.hexdigest()
|
||||
|
||||
def _get_from_env_if_set(self, field_name: str, env_var_name: str) -> Any | None:
    """Return the value of a deprecated environment variable, if it is set.

    When the variable is set, a one-time deprecation warning is logged that
    points at the replacement ``--profiler-config.<field_name>`` option.

    Args:
        field_name: Name of the ``ProfilerConfig`` field that supersedes the
            environment variable; used only in the warning text.
        env_var_name: Name of the attribute to read from ``vllm.envs``.

    Returns:
        ``getattr(envs, env_var_name)`` when ``envs.is_set`` reports the
        variable as set, otherwise ``None``. (The previous ``-> None``
        annotation did not reflect that a value is returned.)
    """
    if not envs.is_set(env_var_name):
        return None

    value = getattr(envs, env_var_name)
    logger.warning_once(
        "Using %s environment variable is deprecated and will be removed in "
        "v0.14.0 or v1.0.0, whichever is soonest. Please use "
        "--profiler-config.%s command line argument or "
        "ProfilerConfig(%s=...) config field instead.",
        env_var_name,
        field_name,
        field_name,
    )
    return value
|
||||
|
||||
def _set_from_env_if_set(
    self,
    field_name: str,
    env_var_name: str,
    to_bool: bool = True,
    to_int: bool = False,
) -> None:
    """Overwrite ``field_name`` from a deprecated env var when it is set.

    Args:
        field_name: Attribute on ``self`` to assign.
        env_var_name: Environment variable looked up through
            ``_get_from_env_if_set`` (which also logs the deprecation warning).
        to_bool: Convert the value to ``value == "1"`` before assigning.
        to_int: Convert the (possibly already converted) value with ``int``.

    The field is left untouched when the lookup yields ``None``.
    """
    env_value = self._get_from_env_if_set(field_name, env_var_name)
    if env_value is None:
        return

    if to_bool:
        # NOTE(review): assumes the lookup yields the raw string; a value that
        # is already a bool would never equal "1" — confirm against vllm.envs.
        env_value = env_value == "1"
    if to_int:
        env_value = int(env_value)
    setattr(self, field_name, env_value)
|
||||
|
||||
@model_validator(mode="after")
|
||||
def _validate_profiler_config(self) -> Self:
|
||||
maybe_use_cuda_profiler = self._get_from_env_if_set(
|
||||
"profiler", "VLLM_TORCH_CUDA_PROFILE"
|
||||
)
|
||||
if maybe_use_cuda_profiler is not None:
|
||||
self.profiler = "cuda" if maybe_use_cuda_profiler == "1" else None
|
||||
else:
|
||||
self._set_from_env_if_set(
|
||||
"torch_profiler_dir", "VLLM_TORCH_PROFILER_DIR", to_bool=False
|
||||
)
|
||||
if self.torch_profiler_dir:
|
||||
self.profiler = "torch"
|
||||
self._set_from_env_if_set(
|
||||
"torch_profiler_record_shapes",
|
||||
"VLLM_TORCH_PROFILER_RECORD_SHAPES",
|
||||
)
|
||||
self._set_from_env_if_set(
|
||||
"torch_profiler_with_memory",
|
||||
"VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY",
|
||||
)
|
||||
self._set_from_env_if_set(
|
||||
"torch_profiler_with_stack",
|
||||
"VLLM_TORCH_PROFILER_WITH_STACK",
|
||||
)
|
||||
self._set_from_env_if_set(
|
||||
"torch_profiler_with_flops",
|
||||
"VLLM_TORCH_PROFILER_WITH_FLOPS",
|
||||
)
|
||||
self._set_from_env_if_set(
|
||||
"ignore_frontend",
|
||||
"VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM",
|
||||
)
|
||||
self._set_from_env_if_set(
|
||||
"torch_profiler_use_gzip",
|
||||
"VLLM_TORCH_PROFILER_USE_GZIP",
|
||||
)
|
||||
self._set_from_env_if_set(
|
||||
"torch_profiler_dump_cuda_time_total",
|
||||
"VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL",
|
||||
)
|
||||
|
||||
self._set_from_env_if_set(
|
||||
"delay_iterations", "VLLM_PROFILER_DELAY_ITERS", to_bool=False, to_int=True
|
||||
)
|
||||
self._set_from_env_if_set(
|
||||
"max_iterations", "VLLM_PROFILER_MAX_ITERS", to_bool=False, to_int=True
|
||||
)
|
||||
|
||||
has_delay_or_limit = self.delay_iterations > 0 or self.max_iterations > 0
|
||||
if self.profiler == "torch" and has_delay_or_limit and not self.ignore_frontend:
|
||||
logger.warning_once(
|
||||
"Using 'torch' profiler with delay_iterations or max_iterations "
|
||||
"while ignore_frontend is False may result in high overhead."
|
||||
)
|
||||
|
||||
profiler_dir = self.torch_profiler_dir
|
||||
if profiler_dir and self.profiler != "torch":
|
||||
raise ValueError(
|
||||
"torch_profiler_dir is only applicable when profiler is set to 'torch'"
|
||||
)
|
||||
if self.profiler == "torch" and not profiler_dir:
|
||||
raise ValueError("torch_profiler_dir must be set when profiler is 'torch'")
|
||||
|
||||
if profiler_dir:
|
||||
is_gs_path = (
|
||||
profiler_dir.startswith("gs://")
|
||||
and profiler_dir[5:]
|
||||
and profiler_dir[5] != "/"
|
||||
)
|
||||
if not is_gs_path:
|
||||
self.torch_profiler_dir = os.path.abspath(
|
||||
os.path.expanduser(profiler_dir)
|
||||
)
|
||||
|
||||
return self
|
||||
@ -39,6 +39,7 @@ from .lora import LoRAConfig
|
||||
from .model import ModelConfig
|
||||
from .observability import ObservabilityConfig
|
||||
from .parallel import ParallelConfig
|
||||
from .profiler import ProfilerConfig
|
||||
from .scheduler import SchedulerConfig
|
||||
from .speculative import SpeculativeConfig
|
||||
from .structured_outputs import StructuredOutputsConfig
|
||||
@ -218,6 +219,8 @@ class VllmConfig:
|
||||
You can specify the full compilation config like so:
|
||||
`{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
|
||||
"""
|
||||
profiler_config: ProfilerConfig = Field(default_factory=ProfilerConfig)
|
||||
"""Profiling configuration."""
|
||||
kv_transfer_config: KVTransferConfig | None = None
|
||||
"""The configurations for distributed KV cache transfer."""
|
||||
kv_events_config: KVEventsConfig | None = None
|
||||
@ -296,6 +299,8 @@ class VllmConfig:
|
||||
vllm_factors.append("None")
|
||||
if self.structured_outputs_config:
|
||||
vllm_factors.append(self.structured_outputs_config.compute_hash())
|
||||
if self.profiler_config:
|
||||
vllm_factors.append(self.profiler_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
vllm_factors.append(self.observability_config.compute_hash())
|
||||
|
||||
@ -50,6 +50,7 @@ from vllm.config import (
|
||||
ObservabilityConfig,
|
||||
ParallelConfig,
|
||||
PoolerConfig,
|
||||
ProfilerConfig,
|
||||
SchedulerConfig,
|
||||
SpeculativeConfig,
|
||||
StructuredOutputsConfig,
|
||||
@ -536,6 +537,8 @@ class EngineArgs:
|
||||
worker_cls: str = ParallelConfig.worker_cls
|
||||
worker_extension_cls: str = ParallelConfig.worker_extension_cls
|
||||
|
||||
profiler_config: ProfilerConfig = get_field(VllmConfig, "profiler_config")
|
||||
|
||||
kv_transfer_config: KVTransferConfig | None = None
|
||||
kv_events_config: KVEventsConfig | None = None
|
||||
|
||||
@ -1168,7 +1171,7 @@ class EngineArgs:
|
||||
vllm_group.add_argument(
|
||||
"--structured-outputs-config", **vllm_kwargs["structured_outputs_config"]
|
||||
)
|
||||
|
||||
vllm_group.add_argument("--profiler-config", **vllm_kwargs["profiler_config"])
|
||||
vllm_group.add_argument(
|
||||
"--optimization-level", **vllm_kwargs["optimization_level"]
|
||||
)
|
||||
@ -1786,6 +1789,7 @@ class EngineArgs:
|
||||
kv_transfer_config=self.kv_transfer_config,
|
||||
kv_events_config=self.kv_events_config,
|
||||
ec_transfer_config=self.ec_transfer_config,
|
||||
profiler_config=self.profiler_config,
|
||||
additional_config=self.additional_config,
|
||||
optimization_level=self.optimization_level,
|
||||
)
|
||||
|
||||
@ -20,6 +20,7 @@ from vllm.beam_search import (
|
||||
from vllm.config import (
|
||||
CompilationConfig,
|
||||
PoolerConfig,
|
||||
ProfilerConfig,
|
||||
StructuredOutputsConfig,
|
||||
is_init_field,
|
||||
)
|
||||
@ -211,6 +212,7 @@ class LLM:
|
||||
structured_outputs_config: dict[str, Any]
|
||||
| StructuredOutputsConfig
|
||||
| None = None,
|
||||
profiler_config: dict[str, Any] | ProfilerConfig | None = None,
|
||||
kv_cache_memory_bytes: int | None = None,
|
||||
compilation_config: int | dict[str, Any] | CompilationConfig | None = None,
|
||||
logits_processors: list[str | type[LogitsProcessor]] | None = None,
|
||||
@ -282,6 +284,20 @@ class LLM:
|
||||
else:
|
||||
structured_outputs_instance = StructuredOutputsConfig()
|
||||
|
||||
if profiler_config is not None:
|
||||
if isinstance(profiler_config, dict):
|
||||
profiler_config_instance = ProfilerConfig(
|
||||
**{
|
||||
k: v
|
||||
for k, v in profiler_config.items()
|
||||
if is_init_field(ProfilerConfig, k)
|
||||
}
|
||||
)
|
||||
else:
|
||||
profiler_config_instance = profiler_config
|
||||
else:
|
||||
profiler_config_instance = ProfilerConfig()
|
||||
|
||||
# warn about single-process data parallel usage.
|
||||
_dp_size = int(kwargs.get("data_parallel_size", 1))
|
||||
_distributed_executor_backend = kwargs.get("distributed_executor_backend")
|
||||
@ -324,6 +340,7 @@ class LLM:
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
pooler_config=pooler_config,
|
||||
structured_outputs_config=structured_outputs_instance,
|
||||
profiler_config=profiler_config_instance,
|
||||
compilation_config=compilation_config_instance,
|
||||
logits_processors=logits_processors,
|
||||
**kwargs,
|
||||
|
||||
@ -99,12 +99,7 @@ class MistralToolParser(ToolParser):
|
||||
self.bot_token = "[TOOL_CALLS]"
|
||||
self.bot_token_id = self.vocab.get(self.bot_token)
|
||||
self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
|
||||
if not _is_pre_v11_tokeniser(self.model_tokenizer):
|
||||
self.fn_name_regex = re.compile(
|
||||
r"([a-zA-Z0-9_-]+)(\{[\s\S]*?\}+)", re.DOTALL
|
||||
)
|
||||
else:
|
||||
self.fn_name_regex = None
|
||||
self._is_pre_v11 = _is_pre_v11_tokeniser(self.model_tokenizer)
|
||||
|
||||
if self.bot_token_id is None:
|
||||
raise RuntimeError(
|
||||
@ -148,23 +143,24 @@ class MistralToolParser(ToolParser):
|
||||
tool_content = model_output.replace(self.bot_token, "").strip()
|
||||
|
||||
try:
|
||||
# we first try to directly load the json as parsing very nested
|
||||
# jsons is difficult
|
||||
try:
|
||||
if self.fn_name_regex:
|
||||
if not self._is_pre_v11:
|
||||
function_call_arr = []
|
||||
for single_tool_content in model_output.split(self.bot_token):
|
||||
matches = self.fn_name_regex.findall(single_tool_content)
|
||||
if "{" not in single_tool_content:
|
||||
continue
|
||||
|
||||
for match in matches:
|
||||
fn_name = match[0]
|
||||
args = match[1]
|
||||
end_name = single_tool_content.find("{")
|
||||
fn_name, args = (
|
||||
single_tool_content[:end_name],
|
||||
single_tool_content[end_name:],
|
||||
)
|
||||
|
||||
# fn_name is encoded outside serialized json dump
|
||||
# only arguments are serialized
|
||||
function_call_arr.append(
|
||||
{"name": fn_name, "arguments": json.loads(args)}
|
||||
)
|
||||
# fn_name is encoded outside serialized json dump
|
||||
# only arguments are serialized
|
||||
function_call_arr.append(
|
||||
{"name": fn_name, "arguments": json.loads(args)}
|
||||
)
|
||||
else:
|
||||
function_call_arr = json.loads(tool_content)
|
||||
except json.JSONDecodeError:
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
from fastapi import APIRouter, FastAPI, Request
|
||||
from fastapi.responses import Response
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import ProfilerConfig
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.logger import init_logger
|
||||
|
||||
@ -35,15 +35,12 @@ async def stop_profile(raw_request: Request):
|
||||
|
||||
|
||||
def attach_router(app: FastAPI):
|
||||
if envs.VLLM_TORCH_PROFILER_DIR:
|
||||
profiler_config = getattr(app.state.args, "profiler_config", None)
|
||||
assert profiler_config is None or isinstance(profiler_config, ProfilerConfig)
|
||||
if profiler_config is not None and profiler_config.profiler is not None:
|
||||
logger.warning_once(
|
||||
"Torch Profiler is enabled in the API server. This should ONLY be "
|
||||
"used for local development!"
|
||||
"Profiler with mode '%s' is enabled in the "
|
||||
"API server. This should ONLY be used for local development!",
|
||||
profiler_config.profiler,
|
||||
)
|
||||
elif envs.VLLM_TORCH_CUDA_PROFILE:
|
||||
logger.warning_once(
|
||||
"CUDA Profiler is enabled in the API server. This should ONLY be "
|
||||
"used for local development!"
|
||||
)
|
||||
if envs.VLLM_TORCH_PROFILER_DIR or envs.VLLM_TORCH_CUDA_PROFILE:
|
||||
app.include_router(router)
|
||||
|
||||
119
vllm/envs.py
119
vllm/envs.py
@ -75,6 +75,7 @@ if TYPE_CHECKING:
|
||||
VLLM_MM_INPUT_CACHE_GIB: int = 4
|
||||
VLLM_TARGET_DEVICE: str = "cuda"
|
||||
VLLM_MAIN_CUDA_VERSION: str = "12.9"
|
||||
VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest"
|
||||
MAX_JOBS: str | None = None
|
||||
NVCC_THREADS: str | None = None
|
||||
VLLM_USE_PRECOMPILED: bool = False
|
||||
@ -88,20 +89,23 @@ if TYPE_CHECKING:
|
||||
VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds
|
||||
VLLM_PLUGINS: list[str] | None = None
|
||||
VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None
|
||||
VLLM_TORCH_CUDA_PROFILE: bool = False
|
||||
# Deprecated env variables for profiling, kept for backward compatibility
|
||||
# See also vllm/config/profiler.py and `--profiler-config` argument
|
||||
VLLM_TORCH_CUDA_PROFILE: str | None = None
|
||||
VLLM_TORCH_PROFILER_DIR: str | None = None
|
||||
VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False
|
||||
VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False
|
||||
VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: bool = False
|
||||
VLLM_TORCH_PROFILER_RECORD_SHAPES: str | None = None
|
||||
VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: str | None = None
|
||||
VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: str | None = None
|
||||
VLLM_TORCH_PROFILER_WITH_STACK: str | None = None
|
||||
VLLM_TORCH_PROFILER_WITH_FLOPS: str | None = None
|
||||
VLLM_TORCH_PROFILER_USE_GZIP: str | None = None
|
||||
VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: str | None = None
|
||||
VLLM_PROFILER_DELAY_ITERS: str | None = None
|
||||
VLLM_PROFILER_MAX_ITERS: str | None = None
|
||||
# End of deprecated env variables for profiling
|
||||
VLLM_USE_AOT_COMPILE: bool = False
|
||||
VLLM_USE_BYTECODE_HOOK: bool = False
|
||||
VLLM_FORCE_AOT_LOAD: bool = False
|
||||
VLLM_TORCH_PROFILER_WITH_STACK: bool = True
|
||||
VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False
|
||||
VLLM_PROFILER_DELAY_ITERS: int = 0
|
||||
VLLM_PROFILER_MAX_ITERS: int = 0
|
||||
VLLM_TORCH_PROFILER_USE_GZIP: bool = True
|
||||
VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: bool = True
|
||||
VLLM_USE_TRITON_AWQ: bool = False
|
||||
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
|
||||
VLLM_SKIP_P2P_CHECK: bool = False
|
||||
@ -453,6 +457,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# Main CUDA version of vLLM. This follows PyTorch but can be overridden.
|
||||
"VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower()
|
||||
or "12.9",
|
||||
# Controls PyTorch float32 matmul precision mode within vLLM workers.
|
||||
# Valid options mirror torch.set_float32_matmul_precision
|
||||
"VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices(
|
||||
"VLLM_FLOAT32_MATMUL_PRECISION",
|
||||
"highest",
|
||||
["highest", "high", "medium"],
|
||||
case_sensitive=False,
|
||||
),
|
||||
# Maximum number of compilation jobs to run in parallel.
|
||||
# By default this is the number of CPUs
|
||||
"MAX_JOBS": lambda: os.getenv("MAX_JOBS", None),
|
||||
@ -842,71 +854,52 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv(
|
||||
"VLLM_LORA_RESOLVER_CACHE_DIR", None
|
||||
),
|
||||
# Enables torch CUDA profiling if set.
|
||||
# On NVIDIA GPUs, this will start/stop cudaProfilerApi when triggered.
|
||||
"VLLM_TORCH_CUDA_PROFILE": lambda: bool(
|
||||
os.getenv("VLLM_TORCH_CUDA_PROFILE", "0") != "0"
|
||||
),
|
||||
# Enables torch CUDA profiling if set to 1.
|
||||
# Deprecated, see profiler_config.
|
||||
"VLLM_TORCH_CUDA_PROFILE": lambda: os.getenv("VLLM_TORCH_CUDA_PROFILE"),
|
||||
# Enables torch profiler if set.
|
||||
# Both AsyncLLM's CPU traces as well as workers'
|
||||
# traces (CPU & GPU) will be saved under this directory.
|
||||
# Note that it must be an absolute path.
|
||||
"VLLM_TORCH_PROFILER_DIR": lambda: (
|
||||
None
|
||||
if (val := os.getenv("VLLM_TORCH_PROFILER_DIR")) is None
|
||||
else (
|
||||
val
|
||||
if val.startswith("gs://") and val[5:] and val[5] != "/"
|
||||
else os.path.abspath(os.path.expanduser(val))
|
||||
)
|
||||
# Deprecated, see profiler_config.
|
||||
"VLLM_TORCH_PROFILER_DIR": lambda: os.getenv("VLLM_TORCH_PROFILER_DIR"),
|
||||
# Enable torch profiler to record shapes if set to 1.
|
||||
# Deprecated, see profiler_config.
|
||||
"VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: (
|
||||
os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES")
|
||||
),
|
||||
# Enable torch profiler to record shapes if set
|
||||
# VLLM_TORCH_PROFILER_RECORD_SHAPES=1. If not set, torch profiler will
|
||||
# not record shapes.
|
||||
"VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: bool(
|
||||
os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0") != "0"
|
||||
# Enable torch profiler to profile memory if set to 1.
|
||||
# Deprecated, see profiler_config.
|
||||
"VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: (
|
||||
os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY")
|
||||
),
|
||||
# Enable torch profiler to profile memory if set
|
||||
# VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1. If not set, torch profiler
|
||||
# will not profile memory.
|
||||
"VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: bool(
|
||||
os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0") != "0"
|
||||
# Enable torch profiler to profile stack if set to 1.
|
||||
# Deprecated, see profiler_config.
|
||||
"VLLM_TORCH_PROFILER_WITH_STACK": lambda: (
|
||||
os.getenv("VLLM_TORCH_PROFILER_WITH_STACK")
|
||||
),
|
||||
# Enable torch profiler to profile stack if set
|
||||
# VLLM_TORCH_PROFILER_WITH_STACK=1. If not set, torch profiler WILL
|
||||
# profile stack by default.
|
||||
"VLLM_TORCH_PROFILER_WITH_STACK": lambda: bool(
|
||||
os.getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0"
|
||||
# Enable torch profiler to profile flops if set to 1.
|
||||
# Deprecated, see profiler_config.
|
||||
"VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: (
|
||||
os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS")
|
||||
),
|
||||
# Enable torch profiler to profile flops if set
|
||||
# VLLM_TORCH_PROFILER_WITH_FLOPS=1. If not set, torch profiler will
|
||||
# not profile flops.
|
||||
"VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool(
|
||||
os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"
|
||||
),
|
||||
# Disable torch profiling of the AsyncLLMEngine process.
|
||||
# If set to 1, will not profile the engine process.
|
||||
"VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: bool(
|
||||
os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM", "0") != "0"
|
||||
# Disable torch profiling of the AsyncLLMEngine process if set to 1.
|
||||
# Deprecated, see profiler_config.
|
||||
"VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: (
|
||||
os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM")
|
||||
),
|
||||
# Delay number of iterations before starting profiling when using
|
||||
# the torch/torch CUDA profiler. If set to 0, will start profiling immediately.
|
||||
"VLLM_PROFILER_DELAY_ITERS": lambda: int(
|
||||
os.getenv("VLLM_PROFILER_DELAY_ITERS", "0")
|
||||
),
|
||||
# Deprecated, see profiler_config.
|
||||
"VLLM_PROFILER_DELAY_ITERS": lambda: (os.getenv("VLLM_PROFILER_DELAY_ITERS")),
|
||||
# Maximum number of iterations to profile when using the torch/torch CUDA profiler.
|
||||
# If set to 0, will not limit the number of iterations.
|
||||
"VLLM_PROFILER_MAX_ITERS": lambda: int(os.getenv("VLLM_PROFILER_MAX_ITERS", "0")),
|
||||
"VLLM_PROFILER_MAX_ITERS": lambda: os.getenv("VLLM_PROFILER_MAX_ITERS"),
|
||||
# Control whether torch profiler gzip-compresses profiling files.
|
||||
# Set VLLM_TORCH_PROFILER_USE_GZIP=0 to disable gzip (enabled by default).
|
||||
"VLLM_TORCH_PROFILER_USE_GZIP": lambda: bool(
|
||||
os.getenv("VLLM_TORCH_PROFILER_USE_GZIP", "1") != "0"
|
||||
),
|
||||
# Deprecated, see profiler_config.
|
||||
"VLLM_TORCH_PROFILER_USE_GZIP": lambda: os.getenv("VLLM_TORCH_PROFILER_USE_GZIP"),
|
||||
# Control whether torch profiler dumps the self_cuda_time_total table.
|
||||
# Set VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0 to disable dumping
|
||||
# (enabled by default).
|
||||
"VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL": lambda: bool(
|
||||
os.getenv("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL", "1") != "0"
|
||||
# Set to 0 to disable dumping the table.
|
||||
# Deprecated, see profiler_config.
|
||||
"VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL": lambda: (
|
||||
os.getenv("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL")
|
||||
),
|
||||
# If set, vLLM will use Triton implementations of AWQ.
|
||||
"VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),
|
||||
|
||||
@ -292,7 +292,7 @@ def set_forward_context(
|
||||
if num_tokens_across_dp is None:
|
||||
assert ubatch_slices is None
|
||||
assert num_tokens is not None
|
||||
_, num_tokens_across_dp = coordinate_batch_across_dp(
|
||||
_, num_tokens_across_dp, _ = coordinate_batch_across_dp(
|
||||
num_tokens_unpadded=num_tokens,
|
||||
parallel_config=vllm_config.parallel_config,
|
||||
allow_microbatching=False,
|
||||
|
||||
@ -935,7 +935,11 @@ def enable_batch_invariant_mode():
|
||||
|
||||
# Batch invariant matmuls are no longer needed after cublas overrides
|
||||
if not is_torch_equal_or_newer("2.10.0.dev"):
|
||||
if current_platform.is_device_capability(100):
|
||||
if (
|
||||
current_platform.is_device_capability(100)
|
||||
or current_platform.is_device_capability(80)
|
||||
or current_platform.is_device_capability(89)
|
||||
):
|
||||
# For PyTorch 2.9, B200 uses GEMV for bs=1
|
||||
# Requires https://github.com/pytorch/pytorch/pull/166735
|
||||
_batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA")
|
||||
|
||||
@ -895,6 +895,48 @@ def get_moe_configs(
|
||||
return None
|
||||
|
||||
|
||||
def _ensure_block_size_k_divisible(
|
||||
size_k: int, block_size_k: int, group_size: int
|
||||
) -> int:
|
||||
"""Ensure block_size_k is a divisor of size_k and divisible by group_size.
|
||||
|
||||
This ensures BLOCK_SIZE_K compatibility with MoeWNA16 CUDA kernel which
|
||||
requires size_k % BLOCK_SIZE_K == 0 and BLOCK_SIZE_K % group_size == 0.
|
||||
|
||||
Args:
|
||||
size_k: The size_k dimension that must be divisible by result.
|
||||
block_size_k: Preferred block size (will be adjusted if needed).
|
||||
group_size: The result must be divisible by this.
|
||||
|
||||
Returns:
|
||||
A valid BLOCK_SIZE_K that divides size_k and is divisible by group_size.
|
||||
"""
|
||||
# Fast path: already valid
|
||||
if size_k % block_size_k == 0 and block_size_k % group_size == 0:
|
||||
return block_size_k
|
||||
|
||||
# Find the largest value that:
|
||||
# 1. Divides size_k (size_k % candidate == 0)
|
||||
# 2. Is divisible by group_size (candidate % group_size == 0)
|
||||
# 3. Is <= block_size_k (prefer smaller values close to block_size_k)
|
||||
#
|
||||
# Strategy: Search from min(block_size_k, size_k) down to group_size,
|
||||
# stepping by group_size to ensure divisibility by group_size
|
||||
max_search = min(block_size_k, size_k)
|
||||
start = (max_search // group_size) * group_size
|
||||
for candidate in range(start, group_size - 1, -group_size):
|
||||
if size_k % candidate == 0:
|
||||
return candidate
|
||||
|
||||
# Fallback: if group_size divides size_k, use it
|
||||
# This should always be true with correct group_size configuration
|
||||
if size_k % group_size == 0:
|
||||
return group_size
|
||||
|
||||
# This should not happen with correct group_size, but ensure divisibility
|
||||
return size_k
|
||||
|
||||
|
||||
def get_moe_wna16_block_config(
|
||||
config: dict[str, int],
|
||||
use_moe_wna16_cuda: bool,
|
||||
@ -960,6 +1002,9 @@ def get_moe_wna16_block_config(
|
||||
# at the same time.
|
||||
block_size_n = 1024
|
||||
|
||||
# Ensure BLOCK_SIZE_K is a divisor of size_k for CUDA kernel compatibility
|
||||
block_size_k = _ensure_block_size_k_divisible(size_k, block_size_k, group_size)
|
||||
|
||||
return {"BLOCK_SIZE_N": block_size_n, "BLOCK_SIZE_K": block_size_k}
|
||||
|
||||
|
||||
|
||||
@ -5,6 +5,7 @@ import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm._aiter_ops import rocm_aiter_ops
|
||||
from vllm.model_executor.custom_op import CustomOp
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
|
||||
from vllm.platforms import current_platform
|
||||
@ -45,10 +46,13 @@ class QuantFP8(CustomOp):
|
||||
super().__init__()
|
||||
self.static = static
|
||||
self.group_shape = group_shape
|
||||
self.use_per_token_if_dynamic = group_shape == GroupShape.PER_TOKEN
|
||||
self.num_token_padding = num_token_padding
|
||||
self.column_major_scales = column_major_scales
|
||||
self.use_ue8m0 = use_ue8m0
|
||||
|
||||
self.use_aiter = rocm_aiter_ops.is_linear_fp8_enaled()
|
||||
|
||||
self.is_group_quant = group_shape.is_per_group()
|
||||
if self.is_group_quant:
|
||||
assert not static, "Group quantization only supports dynamic mode"
|
||||
@ -92,6 +96,33 @@ class QuantFP8(CustomOp):
|
||||
use_per_token_if_dynamic=self.use_per_token_if_dynamic,
|
||||
)
|
||||
|
||||
def forward_hip(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
scale: torch.Tensor | None = None,
|
||||
scale_ub: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
use_aiter_quant = (
|
||||
not self.is_group_quant
|
||||
and self.use_aiter
|
||||
and scale_ub is None
|
||||
and x.is_contiguous()
|
||||
)
|
||||
use_aiter_per_tensor_quant = (
|
||||
use_aiter_quant and self.group_shape == GroupShape.PER_TENSOR
|
||||
)
|
||||
use_aiter_per_token_quant = (
|
||||
use_aiter_quant and self.group_shape == GroupShape.PER_TOKEN
|
||||
)
|
||||
|
||||
if use_aiter_per_tensor_quant:
|
||||
return rocm_aiter_ops.per_tensor_quant(x, _FP8_DTYPE, scale)
|
||||
if use_aiter_per_token_quant:
|
||||
return rocm_aiter_ops.per_token_quant(x, _FP8_DTYPE, scale)
|
||||
|
||||
# Fallback to CUDA implementation
|
||||
return self.forward_cuda(x, scale, scale_ub)
|
||||
|
||||
def forward_native(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
|
||||
@ -60,7 +60,7 @@ class MoeWNA16Config(QuantizationConfig):
|
||||
|
||||
if self.linear_quant_method == "gptq":
|
||||
self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible(full_config)
|
||||
elif self.linear_quant_method == "awq":
|
||||
elif self.linear_quant_method in ("awq", "awq_marlin"):
|
||||
capability_tuple = current_platform.get_device_capability()
|
||||
device_capability = (
|
||||
-1 if capability_tuple is None else capability_tuple.to_int()
|
||||
@ -107,7 +107,7 @@ class MoeWNA16Config(QuantizationConfig):
|
||||
if linear_quant_method == "gptq":
|
||||
has_zp = not cls.get_from_keys(config, ["sym"])
|
||||
modules_to_not_convert = []
|
||||
elif linear_quant_method == "awq":
|
||||
elif linear_quant_method in ("awq", "awq_marlin"):
|
||||
has_zp = cls.get_from_keys(config, ["zero_point"])
|
||||
modules_to_not_convert = cls.get_from_keys_or(
|
||||
config, ["modules_to_not_convert"], None
|
||||
@ -184,7 +184,7 @@ class MoeWNA16Config(QuantizationConfig):
|
||||
return GPTQConfig.from_config(self.full_config).get_quant_method(
|
||||
layer, prefix
|
||||
)
|
||||
elif self.linear_quant_method == "awq":
|
||||
elif self.linear_quant_method in ("awq", "awq_marlin"):
|
||||
if self.use_marlin and check_marlin_supports_layer(
|
||||
layer, self.group_size
|
||||
):
|
||||
@ -468,7 +468,8 @@ class MoeWNA16Method(FusedMoEMethodBase):
|
||||
shard_size = layer.intermediate_size_per_partition
|
||||
|
||||
# convert gptq and awq weight to a standard format
|
||||
if layer.quant_config.linear_quant_method == "awq":
|
||||
# awq_marlin uses the same weight format as awq
|
||||
if layer.quant_config.linear_quant_method in ("awq", "awq_marlin"):
|
||||
assert layer.quant_config.weight_bits == 4
|
||||
if "weight" in weight_name:
|
||||
loaded_weight = convert_awq_tensor(loaded_weight, "qweight")
|
||||
|
||||
@ -367,6 +367,8 @@ class Qwen2MoeModel(nn.Module):
|
||||
self.embed_tokens = VocabParallelEmbedding(
|
||||
config.vocab_size,
|
||||
config.hidden_size,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.embed_tokens",
|
||||
)
|
||||
self.start_layer, self.end_layer, self.layers = make_layers(
|
||||
config.num_hidden_layers,
|
||||
@ -512,6 +514,12 @@ class Qwen2MoeModel(nn.Module):
|
||||
continue
|
||||
else:
|
||||
name = remapped_kv_scale_name
|
||||
# GGUF: make sure that shared_expert_gate is a 2D tensor.
|
||||
if (
|
||||
"mlp.shared_expert_gate" in name
|
||||
and len(loaded_weight.shape) == 1
|
||||
):
|
||||
loaded_weight = loaded_weight[None, :]
|
||||
param = params_dict[name]
|
||||
weight_loader = getattr(
|
||||
param, "weight_loader", default_weight_loader
|
||||
|
||||
@ -381,6 +381,8 @@ class RocmPlatform(Platform):
|
||||
compilation_config = vllm_config.compilation_config
|
||||
parallel_config = vllm_config.parallel_config
|
||||
is_eager_execution = compilation_config == CUDAGraphMode.NONE
|
||||
use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled()
|
||||
use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enaled()
|
||||
|
||||
if compilation_config.cudagraph_mode.has_full_cudagraphs():
|
||||
# decode context parallel does not support full cudagraphs
|
||||
@ -400,8 +402,6 @@ class RocmPlatform(Platform):
|
||||
)
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||
|
||||
use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled()
|
||||
|
||||
if cache_config and cache_config.block_size is None:
|
||||
cache_config.block_size = 16
|
||||
|
||||
@ -415,6 +415,9 @@ class RocmPlatform(Platform):
|
||||
):
|
||||
compilation_config.custom_ops.append("+rms_norm")
|
||||
|
||||
if use_aiter_fp8_linear and "-quant_fp8" not in compilation_config.custom_ops:
|
||||
compilation_config.custom_ops.append("+quant_fp8")
|
||||
|
||||
@classmethod
|
||||
def verify_model_arch(cls, model_arch: str) -> None:
|
||||
if model_arch in _ROCM_UNSUPPORTED_MODELS:
|
||||
|
||||
@ -3,26 +3,27 @@
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from contextlib import nullcontext
|
||||
from typing import Literal
|
||||
|
||||
import torch
|
||||
from typing_extensions import override
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import ProfilerConfig
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class WorkerProfiler(ABC):
|
||||
def __init__(self) -> None:
|
||||
self._delay_iters = envs.VLLM_PROFILER_DELAY_ITERS
|
||||
def __init__(self, profiler_config: ProfilerConfig) -> None:
|
||||
self._delay_iters = profiler_config.delay_iterations
|
||||
if self._delay_iters > 0:
|
||||
logger.info_once(
|
||||
"GPU profiling will start "
|
||||
f"{self._delay_iters} steps after start_profile."
|
||||
)
|
||||
|
||||
self._max_iters = envs.VLLM_PROFILER_MAX_ITERS
|
||||
self._max_iters = profiler_config.max_iterations
|
||||
if self._max_iters > 0:
|
||||
logger.info_once(
|
||||
"GPU profiling will stop "
|
||||
@ -133,12 +134,27 @@ class WorkerProfiler(ABC):
|
||||
return nullcontext()
|
||||
|
||||
|
||||
TorchProfilerActivity = Literal["CPU", "CUDA", "XPU"]
|
||||
TorchProfilerActivityMap = {
|
||||
"CPU": torch.profiler.ProfilerActivity.CPU,
|
||||
"CUDA": torch.profiler.ProfilerActivity.CUDA,
|
||||
"XPU": torch.profiler.ProfilerActivity.XPU,
|
||||
}
|
||||
|
||||
|
||||
class TorchProfilerWrapper(WorkerProfiler):
|
||||
def __init__(self, worker_name: str, local_rank: int) -> None:
|
||||
super().__init__()
|
||||
def __init__(
|
||||
self,
|
||||
profiler_config: ProfilerConfig,
|
||||
worker_name: str,
|
||||
local_rank: int,
|
||||
activities: list[TorchProfilerActivity],
|
||||
) -> None:
|
||||
super().__init__(profiler_config)
|
||||
|
||||
self.local_rank = local_rank
|
||||
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
|
||||
self.profiler_config = profiler_config
|
||||
torch_profiler_trace_dir = profiler_config.torch_profiler_dir
|
||||
if local_rank in (None, 0):
|
||||
logger.info(
|
||||
"Torch profiling enabled. Traces will be saved to: %s",
|
||||
@ -147,24 +163,23 @@ class TorchProfilerWrapper(WorkerProfiler):
|
||||
logger.debug(
|
||||
"Profiler config: record_shapes=%s,"
|
||||
"profile_memory=%s,with_stack=%s,with_flops=%s",
|
||||
envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
|
||||
envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
|
||||
envs.VLLM_TORCH_PROFILER_WITH_STACK,
|
||||
envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
|
||||
profiler_config.torch_profiler_record_shapes,
|
||||
profiler_config.torch_profiler_with_memory,
|
||||
profiler_config.torch_profiler_with_stack,
|
||||
profiler_config.torch_profiler_with_flops,
|
||||
)
|
||||
|
||||
self.dump_cpu_time_total = "CPU" in activities and len(activities) == 1
|
||||
self.profiler = torch.profiler.profile(
|
||||
activities=[
|
||||
torch.profiler.ProfilerActivity.CPU,
|
||||
torch.profiler.ProfilerActivity.CUDA,
|
||||
],
|
||||
record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
|
||||
profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
|
||||
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
|
||||
with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
|
||||
activities=[TorchProfilerActivityMap[activity] for activity in activities],
|
||||
record_shapes=profiler_config.torch_profiler_record_shapes,
|
||||
profile_memory=profiler_config.torch_profiler_with_memory,
|
||||
with_stack=profiler_config.torch_profiler_with_stack,
|
||||
with_flops=profiler_config.torch_profiler_with_flops,
|
||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||
torch_profiler_trace_dir,
|
||||
worker_name=worker_name,
|
||||
use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
|
||||
use_gzip=profiler_config.torch_profiler_use_gzip,
|
||||
),
|
||||
)
|
||||
|
||||
@ -176,9 +191,10 @@ class TorchProfilerWrapper(WorkerProfiler):
|
||||
def _stop(self) -> None:
|
||||
self.profiler.stop()
|
||||
|
||||
if envs.VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL:
|
||||
rank = self.local_rank
|
||||
profiler_dir = envs.VLLM_TORCH_PROFILER_DIR
|
||||
profiler_config = self.profiler_config
|
||||
rank = self.local_rank
|
||||
if profiler_config.torch_profiler_dump_cuda_time_total:
|
||||
profiler_dir = profiler_config.torch_profiler_dir
|
||||
profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt"
|
||||
sort_key = "self_cuda_time_total"
|
||||
table = self.profiler.key_averages().table(sort_by=sort_key)
|
||||
@ -189,6 +205,12 @@ class TorchProfilerWrapper(WorkerProfiler):
|
||||
# only print profiler results on rank 0
|
||||
if rank == 0:
|
||||
print(table)
|
||||
if self.dump_cpu_time_total and rank == 0:
|
||||
logger.info(
|
||||
self.profiler.key_averages().table(
|
||||
sort_by="self_cpu_time_total", row_limit=50
|
||||
)
|
||||
)
|
||||
|
||||
@override
|
||||
def annotate_context_manager(self, name: str):
|
||||
@ -196,8 +218,8 @@ class TorchProfilerWrapper(WorkerProfiler):
|
||||
|
||||
|
||||
class CudaProfilerWrapper(WorkerProfiler):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
def __init__(self, profiler_config: ProfilerConfig) -> None:
|
||||
super().__init__(profiler_config)
|
||||
# Note: lazy import to avoid dependency issues if CUDA is not available.
|
||||
import torch.cuda.profiler as cuda_profiler
|
||||
|
||||
@ -63,6 +63,31 @@ class ReasoningParser:
|
||||
True if the reasoning content ends in the input_ids.
|
||||
"""
|
||||
|
||||
def is_reasoning_end_streaming(
|
||||
self, input_ids: list[int], delta_ids: list[int]
|
||||
) -> bool:
|
||||
"""
|
||||
Check if the reasoning content ends in the input_ids on a
|
||||
decode step.
|
||||
|
||||
It is used in structured engines like `xgrammar` to check if the
|
||||
reasoning content ends in the model output during a decode step.
|
||||
`input_ids` the entire model output and `delta_ids` are the last few
|
||||
computed tokens of the model output (like during a decode step).
|
||||
|
||||
Parameters:
|
||||
input_ids: list[int]
|
||||
The entire model output.
|
||||
delta_ids: list[int]
|
||||
The last few computed tokens of the model output at the current decode step.
|
||||
|
||||
Returns:
|
||||
bool
|
||||
True if the reasoning content ends in the `delta_ids` on a
|
||||
decode step.
|
||||
"""
|
||||
return self.is_reasoning_end(input_ids)
|
||||
|
||||
@abstractmethod
|
||||
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
|
||||
"""
|
||||
|
||||
@ -74,6 +74,12 @@ class BaseThinkingReasoningParser(ReasoningParser):
|
||||
return True
|
||||
return False
|
||||
|
||||
def is_reasoning_end_streaming(
|
||||
self, input_ids: list[int], delta_ids: list[int]
|
||||
) -> bool:
|
||||
end_token_id = self.end_token_id
|
||||
return end_token_id in delta_ids
|
||||
|
||||
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
|
||||
"""
|
||||
Extract the content after the end tokens
|
||||
|
||||
@ -35,6 +35,11 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
|
||||
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
|
||||
return self._parser.is_reasoning_end(input_ids)
|
||||
|
||||
def is_reasoning_end_streaming(
|
||||
self, input_ids: list[int], delta_ids: list[int]
|
||||
) -> bool:
|
||||
return self._parser.is_reasoning_end_streaming(input_ids, delta_ids)
|
||||
|
||||
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
|
||||
return self._parser.extract_content_ids(input_ids)
|
||||
|
||||
|
||||
@ -56,6 +56,11 @@ class Holo2ReasoningParser(ReasoningParser):
|
||||
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
|
||||
return self._parser.is_reasoning_end(input_ids)
|
||||
|
||||
def is_reasoning_end_streaming(
|
||||
self, input_ids: list[int], delta_ids: list[int]
|
||||
) -> bool:
|
||||
return self._parser.is_reasoning_end_streaming(input_ids, delta_ids)
|
||||
|
||||
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
|
||||
return self._parser.extract_content_ids(input_ids)
|
||||
|
||||
|
||||
@ -32,6 +32,11 @@ class IdentityReasoningParser(ReasoningParser):
|
||||
# Always return True, since we never treat reasoning specially
|
||||
return True
|
||||
|
||||
def is_reasoning_end_streaming(
|
||||
self, input_ids: list[int], delta_ids: list[int]
|
||||
) -> bool:
|
||||
return True
|
||||
|
||||
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
|
||||
# Identity: return all tokens as content
|
||||
return input_ids
|
||||
|
||||
@ -145,7 +145,7 @@ class CudagraphDispatcher:
|
||||
num_tokens: int,
|
||||
uniform_decode: bool,
|
||||
has_lora: bool,
|
||||
use_cascade_attn: bool = False,
|
||||
disable_full: bool = False,
|
||||
) -> tuple[CUDAGraphMode, BatchDescriptor]:
|
||||
"""
|
||||
Given conditions(e.g.,batch descriptor and if using cascade attention),
|
||||
@ -165,7 +165,7 @@ class CudagraphDispatcher:
|
||||
)
|
||||
relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs()
|
||||
|
||||
if not use_cascade_attn:
|
||||
if not disable_full:
|
||||
# check if key exists for full cudagraph
|
||||
if batch_desc in self.cudagraph_keys[CUDAGraphMode.FULL]:
|
||||
return CUDAGraphMode.FULL, batch_desc
|
||||
|
||||
@ -166,32 +166,24 @@ class AsyncLLM(EngineClient):
|
||||
pass
|
||||
|
||||
if (
|
||||
envs.VLLM_TORCH_PROFILER_DIR
|
||||
and not envs.VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM
|
||||
vllm_config.profiler_config.profiler == "torch"
|
||||
and not vllm_config.profiler_config.ignore_frontend
|
||||
):
|
||||
profiler_dir = vllm_config.profiler_config.torch_profiler_dir
|
||||
logger.info(
|
||||
"Torch profiler enabled. AsyncLLM CPU traces will be collected under %s", # noqa: E501
|
||||
envs.VLLM_TORCH_PROFILER_DIR,
|
||||
profiler_dir,
|
||||
)
|
||||
if envs.VLLM_PROFILER_MAX_ITERS > 0 or envs.VLLM_PROFILER_DELAY_ITERS > 0:
|
||||
logger.warning_once(
|
||||
"Torch profiler received max_iters or delay_iters setting. These "
|
||||
"are not compatible with the AsyncLLM profiler and will be ignored "
|
||||
"for the AsyncLLM process. Engine process profiling will still "
|
||||
"respect these settings. Consider setting "
|
||||
"VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM=1 to disable "
|
||||
"AsyncLLM profiling."
|
||||
)
|
||||
worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm"
|
||||
self.profiler = torch.profiler.profile(
|
||||
activities=[
|
||||
torch.profiler.ProfilerActivity.CPU,
|
||||
],
|
||||
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
|
||||
with_stack=vllm_config.profiler_config.torch_profiler_with_stack,
|
||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||
envs.VLLM_TORCH_PROFILER_DIR,
|
||||
profiler_dir,
|
||||
worker_name=worker_name,
|
||||
use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
|
||||
use_gzip=vllm_config.profiler_config.torch_profiler_use_gzip,
|
||||
),
|
||||
)
|
||||
else:
|
||||
|
||||
@ -1258,7 +1258,7 @@ class EagleProposer:
|
||||
num_tokens_padded: int,
|
||||
) -> tuple[int, torch.Tensor]:
|
||||
# TODO(Flechman): support DBO ubatching
|
||||
should_ubatch, num_toks_across_dp = coordinate_batch_across_dp(
|
||||
should_ubatch, num_toks_across_dp, _ = coordinate_batch_across_dp(
|
||||
num_tokens_unpadded=num_tokens_unpadded,
|
||||
parallel_config=self.vllm_config.parallel_config,
|
||||
allow_microbatching=False,
|
||||
|
||||
@ -339,7 +339,9 @@ class StructuredOutputManager:
|
||||
return True
|
||||
|
||||
# Check if reasoning ends in *this* step
|
||||
if self.reasoner.is_reasoning_end(request.all_token_ids):
|
||||
if self.reasoner.is_reasoning_end_streaming(
|
||||
request.all_token_ids, request.all_token_ids[request.num_computed_tokens :]
|
||||
):
|
||||
# Reasoning just ended, so we shouldn't advance til
|
||||
# next pass
|
||||
structured_req.reasoning_ended = True
|
||||
|
||||
@ -13,6 +13,7 @@ from vllm.logger import init_logger
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
from vllm.platforms import CpuArchEnum, current_platform
|
||||
from vllm.platforms.cpu import CpuPlatform, LogicalCPUInfo
|
||||
from vllm.profiler.wrapper import TorchProfilerWrapper
|
||||
from vllm.v1.worker.cpu_model_runner import CPUModelRunner
|
||||
from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
|
||||
|
||||
@ -38,30 +39,17 @@ class CPUWorker(Worker):
|
||||
|
||||
self.parallel_config.disable_custom_all_reduce = True
|
||||
|
||||
# Torch profiler. Enabled and configured through env vars:
|
||||
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
|
||||
# Torch profiler. Enabled and configured through profiler_config.
|
||||
self.profiler: Any | None = None
|
||||
if envs.VLLM_TORCH_PROFILER_DIR:
|
||||
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
|
||||
profiler_config = vllm_config.profiler_config
|
||||
if profiler_config.profiler == "torch":
|
||||
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
|
||||
logger.info(
|
||||
"Profiling enabled. Traces will be saved to: %s",
|
||||
torch_profiler_trace_dir,
|
||||
self.profiler = TorchProfilerWrapper(
|
||||
profiler_config,
|
||||
worker_name=worker_name,
|
||||
local_rank=self.local_rank,
|
||||
activities=["CPU"],
|
||||
)
|
||||
self.profiler = torch.profiler.profile(
|
||||
activities=[
|
||||
torch.profiler.ProfilerActivity.CPU,
|
||||
],
|
||||
record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
|
||||
profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
|
||||
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
|
||||
with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
|
||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||
torch_profiler_trace_dir, worker_name=worker_name, use_gzip=False
|
||||
),
|
||||
)
|
||||
else:
|
||||
self.profiler = None
|
||||
|
||||
def init_device(self):
|
||||
# Setup OpenMP threads affinity.
|
||||
@ -202,9 +190,3 @@ class CPUWorker(Worker):
|
||||
self.profiler.start()
|
||||
else:
|
||||
self.profiler.stop()
|
||||
if self.local_rank == 0:
|
||||
logger.info(
|
||||
self.profiler.key_averages().table(
|
||||
sort_by="self_cpu_time_total", row_limit=50
|
||||
)
|
||||
)
|
||||
|
||||
@ -40,16 +40,18 @@ def _run_ar(
|
||||
should_dp_pad: bool,
|
||||
orig_num_tokens_per_ubatch: int,
|
||||
padded_num_tokens_per_ubatch: int,
|
||||
cudagraph_mode: int,
|
||||
parallel_config: ParallelConfig,
|
||||
) -> torch.Tensor:
|
||||
dp_size = parallel_config.data_parallel_size
|
||||
dp_rank = parallel_config.data_parallel_rank
|
||||
device, group = _get_device_and_group(parallel_config)
|
||||
tensor = torch.zeros(4, dp_size, device=device, dtype=torch.int32)
|
||||
tensor = torch.zeros(5, dp_size, device=device, dtype=torch.int32)
|
||||
tensor[0][dp_rank] = orig_num_tokens_per_ubatch
|
||||
tensor[1][dp_rank] = padded_num_tokens_per_ubatch
|
||||
tensor[2][dp_rank] = 1 if should_ubatch else 0
|
||||
tensor[3][dp_rank] = 1 if should_dp_pad else 0
|
||||
tensor[4][dp_rank] = cudagraph_mode
|
||||
dist.all_reduce(tensor, group=group)
|
||||
return tensor
|
||||
|
||||
@ -89,13 +91,23 @@ def _post_process_dp_padding(tensor: torch.Tensor, should_dp_pad: bool) -> torch
|
||||
return num_tokens_across_dp.cpu()
|
||||
|
||||
|
||||
def _post_process_cudagraph_mode(tensor: torch.Tensor) -> int:
|
||||
"""
|
||||
Synchronize cudagraph_mode across DP ranks by taking the minimum.
|
||||
If any rank has NONE (0), all ranks use NONE.
|
||||
This ensures all ranks send consistent values (all padded or all unpadded).
|
||||
"""
|
||||
return int(tensor[4, :].min().item())
|
||||
|
||||
|
||||
def _synchronize_dp_ranks(
|
||||
num_tokens_unpadded: int,
|
||||
num_tokens_padded: int,
|
||||
should_attempt_ubatching: bool,
|
||||
should_attempt_dp_padding: bool,
|
||||
cudagraph_mode: int,
|
||||
parallel_config: ParallelConfig,
|
||||
) -> tuple[bool, torch.Tensor | None]:
|
||||
) -> tuple[bool, torch.Tensor | None, int]:
|
||||
"""
|
||||
1. Decides if each DP rank is going to microbatch. Either all ranks
|
||||
run with microbatching or none of them do.
|
||||
@ -104,10 +116,13 @@ def _synchronize_dp_ranks(
|
||||
When running microbatched or if should_attempt_dp_padding is True, all
|
||||
ranks will be padded out so that they run with the same number of tokens
|
||||
|
||||
3. Synchronizes cudagraph_mode across ranks by taking the minimum.
|
||||
|
||||
Returns: tuple[
|
||||
should_ubatch: Are all DP ranks going to microbatch
|
||||
num_tokens_after_padding: A tensor containing the total number of
|
||||
tokens per-microbatch for each DP rank including any DP padding.
|
||||
synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks)
|
||||
]
|
||||
|
||||
"""
|
||||
@ -121,6 +136,7 @@ def _synchronize_dp_ranks(
|
||||
should_dp_pad=should_attempt_dp_padding,
|
||||
orig_num_tokens_per_ubatch=num_tokens_unpadded,
|
||||
padded_num_tokens_per_ubatch=num_tokens_padded,
|
||||
cudagraph_mode=cudagraph_mode,
|
||||
parallel_config=parallel_config,
|
||||
)
|
||||
|
||||
@ -148,7 +164,10 @@ def _synchronize_dp_ranks(
|
||||
should_dp_pad,
|
||||
)
|
||||
|
||||
return should_ubatch, num_tokens_after_padding
|
||||
# Synchronize cudagraph_mode across ranks (take min)
|
||||
synced_cudagraph_mode = _post_process_cudagraph_mode(tensor)
|
||||
|
||||
return should_ubatch, num_tokens_after_padding, synced_cudagraph_mode
|
||||
|
||||
|
||||
def coordinate_batch_across_dp(
|
||||
@ -159,7 +178,8 @@ def coordinate_batch_across_dp(
|
||||
num_tokens_padded: int | None = None,
|
||||
uniform_decode: bool | None = None,
|
||||
num_scheduled_tokens_per_request: np.ndarray | None = None,
|
||||
) -> tuple[bool, torch.Tensor | None]:
|
||||
cudagraph_mode: int = 0,
|
||||
) -> tuple[bool, torch.Tensor | None, int]:
|
||||
"""
|
||||
Coordinates amongst all DP ranks to determine if and how the full batch
|
||||
should be split into microbatches.
|
||||
@ -175,6 +195,7 @@ def coordinate_batch_across_dp(
|
||||
only contains single token decodes
|
||||
num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The
|
||||
number of tokens per request.
|
||||
cudagraph_mode: The cudagraph mode for this rank (0=NONE, 1=PIECEWISE, 2=FULL)
|
||||
|
||||
Returns: tuple[
|
||||
ubatch_slices: if this is set then all DP ranks have agreed to
|
||||
@ -183,12 +204,13 @@ def coordinate_batch_across_dp(
|
||||
tokens per-microbatch for each DP rank including padding. Will be
|
||||
padded up to the max value across all DP ranks when allow_dp_padding
|
||||
is True.
|
||||
synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks)
|
||||
]
|
||||
|
||||
"""
|
||||
if parallel_config.data_parallel_size == 1:
|
||||
# Early exit.
|
||||
return False, None
|
||||
return False, None, cudagraph_mode
|
||||
|
||||
# If the caller has explicitly enabled microbatching.
|
||||
should_attempt_ubatching = False
|
||||
@ -204,12 +226,15 @@ def coordinate_batch_across_dp(
|
||||
if num_tokens_padded is None:
|
||||
num_tokens_padded = num_tokens_unpadded
|
||||
|
||||
(should_ubatch, num_tokens_after_padding) = _synchronize_dp_ranks(
|
||||
num_tokens_unpadded,
|
||||
num_tokens_padded,
|
||||
should_attempt_ubatching,
|
||||
allow_dp_padding,
|
||||
parallel_config,
|
||||
(should_ubatch, num_tokens_after_padding, synced_cudagraph_mode) = (
|
||||
_synchronize_dp_ranks(
|
||||
num_tokens_unpadded,
|
||||
num_tokens_padded,
|
||||
should_attempt_ubatching,
|
||||
allow_dp_padding,
|
||||
cudagraph_mode,
|
||||
parallel_config,
|
||||
)
|
||||
)
|
||||
|
||||
return (should_ubatch, num_tokens_after_padding)
|
||||
return (should_ubatch, num_tokens_after_padding, synced_cudagraph_mode)
|
||||
|
||||
@ -2,14 +2,15 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from contextlib import contextmanager
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from vllm.v1.outputs import (
|
||||
AsyncModelRunnerOutput,
|
||||
LogprobsTensors,
|
||||
ModelRunnerOutput,
|
||||
SamplerOutput,
|
||||
)
|
||||
from vllm.v1.worker.gpu.sample.output import SamplerOutput
|
||||
|
||||
|
||||
class AsyncOutput(AsyncModelRunnerOutput):
|
||||
@ -34,29 +35,18 @@ class AsyncOutput(AsyncModelRunnerOutput):
|
||||
with torch.cuda.stream(self.copy_stream):
|
||||
self.copy_stream.wait_stream(default_stream)
|
||||
|
||||
# NOTE(woosuk): We must ensure that CPU tensors are not freed
|
||||
# before the device-to-host copy is fully completed. For instance,
|
||||
# operations like
|
||||
# self.sampled_token_np = ...to("cpu", non_blocking=True).numpy()
|
||||
# are unsafe because the underlying CPU tensor can be prematurely freed and
|
||||
# reused by other tensors before the asynchronous copy finishes, potentially
|
||||
# causing race conditions. To prevent this, we delay freeing by holding
|
||||
# references until the copy event signals completion.
|
||||
# Likewise, we also need to keep the reference to the GPU tensors.
|
||||
# This is done by keeping the reference to sampler_output and
|
||||
# model_runner_output.
|
||||
self.sampled_token_ids = sampler_output.sampled_token_ids.to(
|
||||
"cpu", non_blocking=True
|
||||
)
|
||||
self.sampled_token_ids = async_copy_to_np(sampler_output.sampled_token_ids)
|
||||
if sampler_output.logprobs_tensors is not None:
|
||||
self.logprobs_tensors: LogprobsTensors | None = (
|
||||
sampler_output.logprobs_tensors.to_cpu_nonblocking()
|
||||
)
|
||||
else:
|
||||
self.logprobs_tensors = None
|
||||
self.num_sampled_tokens_cpu = num_sampled_tokens.to(
|
||||
"cpu", non_blocking=True
|
||||
)
|
||||
if sampler_output.num_nans is not None:
|
||||
self.num_nans = async_copy_to_np(sampler_output.num_nans)
|
||||
else:
|
||||
self.num_nans = None
|
||||
self.num_sampled_tokens_np = async_copy_to_np(num_sampled_tokens)
|
||||
self.prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {}
|
||||
if self.model_runner_output.prompt_logprobs_dict:
|
||||
for k, v in self.model_runner_output.prompt_logprobs_dict.items():
|
||||
@ -68,7 +58,6 @@ class AsyncOutput(AsyncModelRunnerOutput):
|
||||
|
||||
def get_output(self) -> ModelRunnerOutput:
|
||||
self.copy_event.synchronize()
|
||||
num_sampled_tokens_np = self.num_sampled_tokens_cpu.numpy()
|
||||
|
||||
# NOTE(woosuk): The following code is to ensure compatibility with
|
||||
# the existing model runner.
|
||||
@ -76,10 +65,18 @@ class AsyncOutput(AsyncModelRunnerOutput):
|
||||
# rather than Python lists.
|
||||
sampled_token_ids: list[list[int]] = self.sampled_token_ids.tolist()
|
||||
num_reqs = len(sampled_token_ids)
|
||||
num_sampled_tokens = self.num_sampled_tokens_np.tolist()
|
||||
for i in range(num_reqs):
|
||||
del sampled_token_ids[i][num_sampled_tokens_np[i] :]
|
||||
del sampled_token_ids[i][num_sampled_tokens[i] :]
|
||||
self.model_runner_output.sampled_token_ids = sampled_token_ids
|
||||
|
||||
if self.num_nans is not None:
|
||||
num_nans = self.num_nans.tolist()
|
||||
self.model_runner_output.num_nans_in_logits = {
|
||||
req_id: num_nans[i]
|
||||
for i, req_id in enumerate(self.model_runner_output.req_ids)
|
||||
}
|
||||
|
||||
if self.logprobs_tensors is not None:
|
||||
self.model_runner_output.logprobs = self.logprobs_tensors.tolists()
|
||||
self.model_runner_output.prompt_logprobs_dict = self.prompt_logprobs_dict
|
||||
@ -95,3 +92,7 @@ def async_barrier(event: torch.cuda.Event | None):
|
||||
finally:
|
||||
if event is not None:
|
||||
event.record()
|
||||
|
||||
|
||||
def async_copy_to_np(x: torch.Tensor) -> np.ndarray:
    """Request a non-blocking copy of ``x`` to the CPU and view it as NumPy.

    The transfer is issued with ``non_blocking=True``, so this function does
    not itself wait for the copy to complete.

    NOTE(review): for a tensor living on an accelerator, the returned array
    should presumably only be read after the caller has synchronized with the
    copy (e.g. via a recorded event) -- confirm at each call site.

    Args:
        x: Tensor to copy to host memory.

    Returns:
        NumPy array obtained from the CPU copy of ``x`` via ``.numpy()``.
    """
    host_copy = x.to("cpu", non_blocking=True)
    return host_copy.numpy()
|
||||
|
||||
0
vllm/v1/worker/gpu/metrics/__init__.py
Normal file
0
vllm/v1/worker/gpu/metrics/__init__.py
Normal file
42
vllm/v1/worker/gpu/metrics/logits.py
Normal file
42
vllm/v1/worker/gpu/metrics/logits.py
Normal file
@ -0,0 +1,42 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import torch
|
||||
from torch._inductor.runtime.triton_helpers import libdevice
|
||||
|
||||
from vllm.triton_utils import tl, triton
|
||||
|
||||
|
||||
@triton.jit
def _num_nans_kernel(
    logits_ptr,
    logits_stride,
    num_nans_ptr,
    vocab_size,
    BLOCK_SIZE: tl.constexpr,
):
    """Count the NaN entries of one logits row and store the count.

    Each program instance handles the row selected by ``tl.program_id(0)``,
    walking it in chunks of ``BLOCK_SIZE`` elements and writing the total to
    ``num_nans_ptr + req_idx``.

    Elements within a row are addressed with unit stride (``row base +
    offset``); only the row-to-row stride is taken from ``logits_stride``.
    """
    req_idx = tl.program_id(0)
    # Base address of this program's row; invariant across the chunk loop.
    row_ptr = logits_ptr + req_idx * logits_stride
    num_nans = 0
    for start in range(0, vocab_size, BLOCK_SIZE):
        offsets = start + tl.arange(0, BLOCK_SIZE)
        in_bounds = offsets < vocab_size
        # Lanes past the end of the row read 0, which is never counted as NaN.
        values = tl.load(row_ptr + offsets, mask=in_bounds, other=0)
        values = values.to(tl.float32)
        nan_flags = libdevice.isnan(values).to(tl.int1)
        num_nans += tl.sum(nan_flags).to(tl.int32)
    tl.store(num_nans_ptr + req_idx, num_nans)
|
||||
|
||||
|
||||
def get_num_nans(logits: torch.Tensor) -> torch.Tensor:
    """Return the number of NaN values in each row of ``logits``.

    Launches ``_num_nans_kernel`` with one program per row.

    Args:
        logits: 2-D tensor unpacked as ``(num_reqs, vocab_size)``. The kernel
            addresses elements within a row with unit stride, so the last
            dimension is presumably expected to be contiguous -- TODO confirm.

    Returns:
        An int32 tensor of length ``num_reqs`` on the same device as
        ``logits``, holding one NaN count per row.
    """
    num_reqs, vocab_size = logits.shape
    # Every entry is written by the kernel, so no zero-initialisation is done.
    counts = torch.empty(num_reqs, dtype=torch.int32, device=logits.device)
    grid = (num_reqs,)
    _num_nans_kernel[grid](
        logits,
        logits.stride(0),
        counts,
        vocab_size,
        BLOCK_SIZE=8192,
    )
    return counts
|
||||
@ -25,7 +25,6 @@ from vllm.v1.outputs import (
|
||||
LogprobsTensors,
|
||||
ModelRunnerOutput,
|
||||
)
|
||||
from vllm.v1.sample.sampler import SamplerOutput
|
||||
from vllm.v1.worker.gpu.async_utils import AsyncOutput, async_barrier
|
||||
from vllm.v1.worker.gpu.attn_utils import (
|
||||
build_attn_metadata,
|
||||
@ -53,6 +52,7 @@ from vllm.v1.worker.gpu.sample.metadata import (
|
||||
SamplingMetadata,
|
||||
expand_sampling_metadata,
|
||||
)
|
||||
from vllm.v1.worker.gpu.sample.output import SamplerOutput
|
||||
from vllm.v1.worker.gpu.sample.sampler import Sampler
|
||||
from vllm.v1.worker.gpu.spec_decode import init_speculator
|
||||
from vllm.v1.worker.gpu.spec_decode.rejection_sample import rejection_sample
|
||||
|
||||
@ -39,9 +39,7 @@ def _min_p_kernel(
|
||||
tl.store(logits_ptr + req_idx * logits_stride + block, logits, mask=mask)
|
||||
|
||||
|
||||
def apply_min_p(logits: torch.Tensor, min_p: torch.Tensor | None) -> None:
|
||||
if min_p is None:
|
||||
return
|
||||
def apply_min_p(logits: torch.Tensor, min_p: torch.Tensor) -> None:
|
||||
num_reqs, vocab_size = logits.shape
|
||||
BLOCK_SIZE = 1024
|
||||
_min_p_kernel[(num_reqs,)](
|
||||
|
||||
14
vllm/v1/worker/gpu/sample/output.py
Normal file
14
vllm/v1/worker/gpu/sample/output.py
Normal file
@ -0,0 +1,14 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.v1.outputs import LogprobsTensors
|
||||
|
||||
|
||||
@dataclass
class SamplerOutput:
    """Container for the tensors produced by one sampling step."""

    # Sampled token ids. NOTE(review): the sampler in this package appears to
    # pass a ``view(-1, 1)`` tensor here -- confirm the expected shape.
    sampled_token_ids: torch.Tensor
    # Log-probability tensors for the step, or None when none were produced.
    logprobs_tensors: LogprobsTensors | None
    # NaN counts for the logits, or None when they were not computed.
    num_nans: torch.Tensor | None
|
||||
@ -62,6 +62,7 @@ def _penalties_and_temperature_kernel(
|
||||
mask=packed_block < tl.cdiv(vocab_size, 32),
|
||||
)
|
||||
prompt_bin_mask = (packed_mask[:, None] >> (tl.arange(0, 32)[None, :])) & 1
|
||||
prompt_bin_mask = prompt_bin_mask.to(tl.int1)
|
||||
prompt_bin_mask = prompt_bin_mask.reshape(BLOCK_SIZE)
|
||||
|
||||
# If token appears in prompt or output, apply, otherwise use 1.0 for no-op.
|
||||
|
||||
@ -3,13 +3,15 @@
|
||||
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config.model import LogprobsMode
|
||||
from vllm.v1.outputs import SamplerOutput
|
||||
from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
|
||||
from vllm.v1.worker.gpu.metrics.logits import get_num_nans
|
||||
from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
|
||||
from vllm.v1.worker.gpu.sample.logprob import compute_topk_logprobs
|
||||
from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
|
||||
from vllm.v1.worker.gpu.sample.min_p import apply_min_p
|
||||
from vllm.v1.worker.gpu.sample.output import SamplerOutput
|
||||
from vllm.v1.worker.gpu.sample.penalties import apply_penalties_and_temperature
|
||||
|
||||
|
||||
@ -21,12 +23,16 @@ class Sampler:
|
||||
if logprobs_mode not in ["processed_logprobs", "raw_logprobs"]:
|
||||
raise NotImplementedError(f"Unsupported logprobs_mode: {logprobs_mode}")
|
||||
self.logprobs_mode = logprobs_mode
|
||||
self.compute_nans = envs.VLLM_COMPUTE_NANS_IN_LOGITS # False by default.
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
logits: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata,
|
||||
) -> SamplerOutput:
|
||||
# NOTE(woosuk): We intentionally compute num_nans before sampling to make clear
|
||||
# that num_nans is computed before applying penalties and temperature.
|
||||
num_nans = get_num_nans(logits) if self.compute_nans else None
|
||||
sampled, processed_logits = self.sample(logits, sampling_metadata)
|
||||
if sampling_metadata.max_num_logprobs is not None:
|
||||
logits = (
|
||||
@ -49,6 +55,7 @@ class Sampler:
|
||||
# token per request.
|
||||
sampled_token_ids=sampled.view(-1, 1),
|
||||
logprobs_tensors=logprobs_tensors,
|
||||
num_nans=num_nans,
|
||||
)
|
||||
return sampler_output
|
||||
|
||||
@ -63,7 +70,8 @@ class Sampler:
|
||||
# Apply penalties and temperature in place.
|
||||
apply_penalties_and_temperature(logits, sampling_metadata)
|
||||
# Apply min_p in place.
|
||||
apply_min_p(logits, sampling_metadata.min_p)
|
||||
if sampling_metadata.min_p is not None:
|
||||
apply_min_p(logits, sampling_metadata.min_p)
|
||||
# Apply top_k and/or top_p. This might return a new tensor.
|
||||
logits = apply_top_k_top_p(
|
||||
logits, sampling_metadata.top_k, sampling_metadata.top_p
|
||||
|
||||
@ -2788,17 +2788,19 @@ class GPUModelRunner(
|
||||
)
|
||||
|
||||
dispatch_cudagraph = (
|
||||
lambda num_tokens: self.cudagraph_dispatcher.dispatch(
|
||||
lambda num_tokens, disable_full: self.cudagraph_dispatcher.dispatch(
|
||||
num_tokens=num_tokens,
|
||||
has_lora=has_lora,
|
||||
use_cascade_attn=use_cascade_attn,
|
||||
uniform_decode=uniform_decode,
|
||||
disable_full=disable_full,
|
||||
)
|
||||
if not force_eager
|
||||
else (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded))
|
||||
)
|
||||
|
||||
cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded)
|
||||
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
|
||||
num_tokens_padded, use_cascade_attn
|
||||
)
|
||||
num_tokens_padded = batch_descriptor.num_tokens
|
||||
|
||||
# Extra coordination when running data-parallel since we need to coordinate
|
||||
@ -2813,23 +2815,28 @@ class GPUModelRunner(
|
||||
self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
|
||||
)
|
||||
|
||||
should_ubatch, num_tokens_across_dp = coordinate_batch_across_dp(
|
||||
num_tokens_unpadded=num_tokens,
|
||||
parallel_config=self.parallel_config,
|
||||
allow_microbatching=allow_microbatching,
|
||||
allow_dp_padding=allow_dp_padding,
|
||||
num_tokens_padded=num_tokens_padded,
|
||||
uniform_decode=uniform_decode,
|
||||
num_scheduled_tokens_per_request=num_scheduled_tokens_np,
|
||||
should_ubatch, num_tokens_across_dp, synced_cudagraph_mode = (
|
||||
coordinate_batch_across_dp(
|
||||
num_tokens_unpadded=num_tokens,
|
||||
parallel_config=self.parallel_config,
|
||||
allow_microbatching=allow_microbatching,
|
||||
allow_dp_padding=allow_dp_padding,
|
||||
num_tokens_padded=num_tokens_padded,
|
||||
uniform_decode=uniform_decode,
|
||||
num_scheduled_tokens_per_request=num_scheduled_tokens_np,
|
||||
cudagraph_mode=cudagraph_mode.value,
|
||||
)
|
||||
)
|
||||
|
||||
# Extract DP padding if there is any
|
||||
# Extract DP-synced values
|
||||
if num_tokens_across_dp is not None:
|
||||
dp_rank = self.parallel_config.data_parallel_rank
|
||||
num_tokens_padded = int(num_tokens_across_dp[dp_rank].item())
|
||||
|
||||
# Re-dispatch with DP padding
|
||||
cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded)
|
||||
# Re-dispatch with DP padding so we have the correct batch_descriptor
|
||||
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
|
||||
num_tokens_padded,
|
||||
disable_full=synced_cudagraph_mode <= CUDAGraphMode.PIECEWISE.value,
|
||||
)
|
||||
# Assert to make sure the agreed upon token count is correct otherwise
|
||||
# num_tokens_across_dp will no-longer be valid
|
||||
assert batch_descriptor.num_tokens == num_tokens_padded
|
||||
@ -4161,10 +4168,19 @@ class GPUModelRunner(
|
||||
|
||||
if self.speculative_config and self.speculative_config.use_eagle():
|
||||
assert isinstance(self.drafter, EagleProposer)
|
||||
# Eagle currently only supports PIECEWISE cudagraphs.
|
||||
# Therefore only use cudagraphs if the main model uses PIECEWISE
|
||||
# NOTE(lucas): this is a hack, need to clean up.
|
||||
use_cudagraphs = (
|
||||
cudagraph_runtime_mode.has_mode(CUDAGraphMode.PIECEWISE)
|
||||
and not self.speculative_config.enforce_eager
|
||||
)
|
||||
(
|
||||
is_graph_capturing
|
||||
and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
|
||||
)
|
||||
or (
|
||||
not is_graph_capturing
|
||||
and cudagraph_runtime_mode != CUDAGraphMode.NONE
|
||||
)
|
||||
) and not self.speculative_config.enforce_eager
|
||||
|
||||
# Note(gnovack) - We need to disable cudagraphs for one of the two
|
||||
# lora cases when cudagraph_specialize_lora is enabled. This is a
|
||||
|
||||
@ -38,7 +38,7 @@ from vllm.model_executor import set_random_seed
|
||||
from vllm.model_executor.models.interfaces import is_mixture_of_experts
|
||||
from vllm.model_executor.warmup.kernel_warmup import kernel_warmup
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.profiler.gpu_profiler import CudaProfilerWrapper, TorchProfilerWrapper
|
||||
from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.tasks import SupportedTask
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
@ -79,6 +79,10 @@ class Worker(WorkerBase):
|
||||
is_driver_worker=is_driver_worker,
|
||||
)
|
||||
|
||||
# configure float32 matmul precision according to vLLM env.
|
||||
precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
|
||||
torch.set_float32_matmul_precision(precision)
|
||||
|
||||
if self.model_config.trust_remote_code:
|
||||
# note: lazy import to avoid importing torch before initializing
|
||||
from vllm.utils.import_utils import init_cached_hf_modules
|
||||
@ -88,17 +92,19 @@ class Worker(WorkerBase):
|
||||
# Buffers saved before sleep
|
||||
self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
|
||||
|
||||
# Torch/CUDA profiler. Enabled and configured through env vars:
|
||||
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
|
||||
# VLLM_TORCH_CUDA_PROFILE=1
|
||||
# Torch/CUDA profiler. Enabled and configured through profiler_config.
|
||||
self.profiler: Any | None = None
|
||||
if envs.VLLM_TORCH_PROFILER_DIR:
|
||||
profiler_config = vllm_config.profiler_config
|
||||
if profiler_config.profiler == "torch":
|
||||
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
|
||||
self.profiler = TorchProfilerWrapper(
|
||||
worker_name=worker_name, local_rank=self.local_rank
|
||||
profiler_config,
|
||||
worker_name=worker_name,
|
||||
local_rank=self.local_rank,
|
||||
activities=["CPU", "CUDA"],
|
||||
)
|
||||
elif envs.VLLM_TORCH_CUDA_PROFILE:
|
||||
self.profiler = CudaProfilerWrapper()
|
||||
elif profiler_config.profiler == "cuda":
|
||||
self.profiler = CudaProfilerWrapper(profiler_config)
|
||||
else:
|
||||
self.profiler = None
|
||||
|
||||
|
||||
@ -98,10 +98,10 @@ class TPUWorker:
|
||||
# MP runtime is initialized.
|
||||
self.profiler = None
|
||||
self.profile_dir = None
|
||||
if envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1:
|
||||
if vllm_config.profiler_config.profiler == "torch" and self.rank < 1:
|
||||
# For TPU, we can only have 1 active profiler session for 1 profiler
|
||||
# server. So we only profile on rank0.
|
||||
self.profile_dir = envs.VLLM_TORCH_PROFILER_DIR
|
||||
self.profile_dir = vllm_config.profiler_config.torch_profiler_dir
|
||||
logger.info(
|
||||
"Profiling enabled. Traces will be saved to: %s", self.profile_dir
|
||||
)
|
||||
|
||||
@ -6,12 +6,12 @@ from typing import Any
|
||||
import torch
|
||||
import torch.distributed
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed import get_world_group
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor import set_random_seed
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.profiler.wrapper import TorchProfilerWrapper
|
||||
from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
|
||||
from vllm.v1.worker.xpu_model_runner import XPUModelRunner
|
||||
|
||||
@ -36,41 +36,17 @@ class XPUWorker(Worker):
|
||||
assert device_config.device_type == "xpu"
|
||||
assert current_platform.is_xpu()
|
||||
|
||||
# Torch profiler. Enabled and configured through env vars:
|
||||
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
|
||||
# Torch profiler. Enabled and configured through profiler_config.
|
||||
self.profiler: Any | None = None
|
||||
if envs.VLLM_TORCH_PROFILER_DIR:
|
||||
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
|
||||
profiler_config = vllm_config.profiler_config
|
||||
if profiler_config.profiler == "torch":
|
||||
worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
|
||||
logger.info(
|
||||
"Profiling enabled. Traces will be saved to: %s",
|
||||
torch_profiler_trace_dir,
|
||||
self.profiler = TorchProfilerWrapper(
|
||||
profiler_config,
|
||||
worker_name=worker_name,
|
||||
local_rank=self.local_rank,
|
||||
activities=["CPU", "XPU"],
|
||||
)
|
||||
logger.debug(
|
||||
"Profiler config: record_shapes=%s,"
|
||||
"profile_memory=%s,with_stack=%s,with_flops=%s",
|
||||
envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
|
||||
envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
|
||||
envs.VLLM_TORCH_PROFILER_WITH_STACK,
|
||||
envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
|
||||
)
|
||||
self.profiler = torch.profiler.profile(
|
||||
activities=[
|
||||
torch.profiler.ProfilerActivity.CPU,
|
||||
torch.profiler.ProfilerActivity.XPU,
|
||||
],
|
||||
record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
|
||||
profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
|
||||
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
|
||||
with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
|
||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||
torch_profiler_trace_dir,
|
||||
worker_name=worker_name,
|
||||
use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP,
|
||||
),
|
||||
)
|
||||
else:
|
||||
self.profiler = None
|
||||
|
||||
# we provide this function due to `torch.xpu.mem_get_info()` doesn't
|
||||
# return correct free_gpu_memory on intel client GPU. We need to
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user