diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 6950ad774edd8..4038d32834e68 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -398,7 +398,8 @@ steps: timeout_in_minutes: 25 gpu: h100 source_file_dependencies: - - vllm/ + - vllm/v1/attention + - vllm/model_executor/layers - tests/v1/determinism/ commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn @@ -440,23 +441,29 @@ steps: working_dir: "/vllm-workspace/examples" source_file_dependencies: - vllm/entrypoints + - vllm/multimodal - examples/ commands: - pip install tensorizer # for tensorizer test + # for basic + - python3 offline_inference/basic/chat.py - python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/chat.py - - python3 offline_inference/prefix_caching.py - - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + # for multi-modal models - python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language_pooling.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py + # for pooling models + - python3 pooling/pooling/vision_language_pooling.py --seed 0 + # for features demo + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 @@ -718,6 +725,18 @@ steps: - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py +- label: LM Eval Small Models # 53min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + autorun_on_main: true + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + - label: OpenAI API correctness # 10min timeout_in_minutes: 15 mirror_hardwares: [amdexperimental, amdproduction] @@ -727,7 +746,7 @@ steps: - csrc/ - vllm/entrypoints/openai/ - vllm/model_executor/models/whisper.py - commands: # LMEval + commands: # LMEval+Transcription WER check # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 - pytest -s entrypoints/openai/correctness/ @@ -963,6 +982,19 @@ steps: - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work +- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + - label: Multi-Modal Models Test (Extended) 1 # 60min timeout_in_minutes: 120 mirror_hardwares: [amdexperimental] @@ -1098,7 +1130,6 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - - vllm/model_executor/layers/fused_moe/layer.py - tests/compile/test_fusion_attn.py - tests/compile/test_silu_mul_quant_fusion.py - tests/compile/distributed/test_fusion_all_reduce.py @@ -1132,12 +1163,25 @@ steps: - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - tests/compile/distributed/test_fusions_e2e.py - - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi # Run all e2e fusion tests - pytest -v -s tests/compile/distributed/test_fusions_e2e.py +- label: Blackwell GPT-OSS Eval + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + - label: Blackwell Quantized MoE Test timeout_in_minutes: 60 working_dir: "/vllm-workspace/" @@ -1155,6 +1199,16 @@ steps: commands: - pytest -s -v tests/quantization/test_blackwell_moe.py +- label: Blackwell LM Eval Small Models + timeout_in_minutes: 120 + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 + ##### 1 GPU test ##### ##### multi gpus test ##### @@ -1397,6 +1451,39 @@ steps: - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py + +- label: LM Eval Large Models # optional + gpu: a100 + optional: true + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +##### H100 test ##### +- label: LM Eval Large Models (H100) # optional + gpu: h100 + optional: true + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 + + ##### H200 test ##### - label: Distributed Tests (H200) # optional mirror_hardwares: [amdexperimental] @@ -1440,29 +1527,6 @@ steps: commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 -- label: Blackwell LM Eval Small Models - timeout_in_minutes: 120 - gpu: b200 - optional: true # run on nightlies - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 - -- label: Multi-Modal Accuracy Eval (Small Models) # 10min - timeout_in_minutes: 70 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - vllm/multimodal/ - - vllm/inputs/ - - vllm/v1/core/ - commands: - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 - - label: LM Eval Large Models (4 Card) mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_4 @@ -1478,21 +1542,6 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 -- label: LM Eval Large Models (H100) # optional - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 - - label: ROCm LM Eval Large Models (8 Card) mirror_hardwares: [amdproduction] agent_pool: mi325_8 @@ -1517,6 +1566,20 @@ steps: - uv pip install --system 'gpt-oss[eval]==0.0.5' - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 +##### RL Integration Tests ##### +- label: Prime-RL Integration Test # 15min + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + timeout_in_minutes: 30 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/ + - .buildkite/scripts/run-prime-rl-test.sh + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh - label: DeepSeek V2-Lite Accuracy mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_4 @@ -1550,17 +1613,26 @@ steps: commands: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 -##### RL Integration Tests ##### -- label: Prime-RL Integration Test # 15min +- label: DeepSeek V2-Lite Async EPLB Accuracy + timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] - agent_pool: mi325_2 + agent_pool: mi325_4 # grade: Blocking - timeout_in_minutes: 30 + gpu: h100 optional: true - num_gpus: 2 + num_gpus: 4 working_dir: "/vllm-workspace" - source_file_dependencies: - - vllm/ - - .buildkite/scripts/run-prime-rl-test.sh commands: - - bash .buildkite/scripts/run-prime-rl-test.sh + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030 + +- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0a99994e243ae..8fc3587f7813c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -468,7 +468,9 @@ steps: # tests covered elsewhere. # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + # However, find does not normally propagate error codes, so we combine it with xargs + # (using -0 for proper path handling) + - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -482,7 +484,9 @@ steps: # as it is a heavy test that is covered in other steps. # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" + # However, find does not normally propagate error codes, so we combine it with xargs + # (using -0 for proper path handling) + - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index 56fbe5ca704a1..df8910837715d 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -13,7 +13,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - name: Set up Python uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml index 3a12c4b3a8300..e80a5c0cc80f9 100644 --- a/.github/workflows/macos-smoke-test.yml +++ b/.github/workflows/macos-smoke-test.yml @@ -12,7 +12,7 @@ jobs: timeout-minutes: 30 steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v6.0.1 - uses: astral-sh/setup-uv@v7 with: diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index a03b979ad761d..1041653c2f57e 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -16,7 +16,7 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 with: python-version: "3.12" diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index c8a52f1a63269..44bf71db5e9de 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -15,7 +15,7 @@ jobs: actions: write runs-on: ubuntu-latest steps: - - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0 + - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1 with: # Increasing this value ensures that changes to this workflow # propagate to all issues and PRs in days rather than months diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index 56b721cbb4021..25baa9cbda39c 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -96,8 +96,9 @@ start_server() { # This correctly passes each element as a separate argument. if [[ -n "$profile_dir" ]]; then # Start server with profiling enabled - VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \ - vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & + local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}" + VLLM_SERVER_DEV_MODE=1 \ + vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 & else # Start server without profiling VLLM_SERVER_DEV_MODE=1 \ diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index df122b4c5e8db..a4e1b163dcca9 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -963,8 +963,7 @@ def create_argument_parser(): parser.add_argument( "--profile", action="store_true", - help="Use Torch Profiler. The endpoint must be launched with " - "VLLM_TORCH_PROFILER_DIR to enable profiler.", + help="Use vLLM Profiling. --profiler-config must be provided on the server.", ) parser.add_argument( "--result-dir", diff --git a/docs/api/README.md b/docs/api/README.md index d3a141f327308..d51329ec2faa3 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -15,6 +15,7 @@ API documentation for vLLM's configuration classes. - [vllm.config.MultiModalConfig][] - [vllm.config.PoolerConfig][] - [vllm.config.StructuredOutputsConfig][] +- [vllm.config.ProfilerConfig][] - [vllm.config.ObservabilityConfig][] - [vllm.config.KVTransferConfig][] - [vllm.config.CompilationConfig][] diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 65382afbe4f21..cbce14ce992ec 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -5,16 +5,15 @@ ## Profile with PyTorch Profiler -We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`. Additionally, you can control the profiling content by specifying the following environment variables: +We support tracing vLLM workers using the `torch.profiler` module. You can enable the torch profiler by setting `--profiler-config` +when launching the server, and setting the entries `profiler` to `'torch'` and `torch_profiler_dir` to the directory where you want to save the traces. Additionally, you can control the profiling content by specifying the following additional arguments in the config: -- `VLLM_TORCH_PROFILER_RECORD_SHAPES=1` to enable recording Tensor Shapes, off by default -- `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default -- `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default -- `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default -- `VLLM_TORCH_PROFILER_USE_GZIP=0` to disable gzip-compressing profiling files, on by default -- `VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0` to disable dumping and printing the aggregated CUDA self time table, on by default - -The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set. +- `torch_profiler_record_shapes` to enable recording Tensor Shapes, off by default +- `torch_profiler_with_memory` to record memory, off by default +- `torch_profiler_with_stack` to enable recording stack information, on by default +- `torch_profiler_with_flops` to enable recording FLOPs, off by default +- `torch_profiler_use_gzip` to control gzip-compressing profiling files, on by default +- `torch_profiler_dump_cuda_time_total` to control dumping and printing the aggregated CUDA self time table, on by default When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag. @@ -40,8 +39,7 @@ Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline #### OpenAI Server ```bash -VLLM_TORCH_PROFILER_DIR=./vllm_profile \ - vllm serve meta-llama/Llama-3.1-8B-Instruct +vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}' ``` vllm bench command: @@ -104,13 +102,12 @@ To profile the server, you will want to prepend your `vllm serve` command with ` ```bash # server -VLLM_TORCH_CUDA_PROFILE=1 \ nsys profile \ --trace-fork-before-exec=true \ --cuda-graph-trace=node \ --capture-range=cudaProfilerApi \ --capture-range-end repeat \ - vllm serve meta-llama/Llama-3.1-8B-Instruct + vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config.profiler cuda # client vllm bench serve \ diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 3315c0949afca..93cca23856a9b 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -299,6 +299,9 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner def is_reasoning_end(self, input_ids: list[int]) -> bool: return self.end_token_id in input_ids + + def is_reasoning_end_streaming(self, input_ids: list[int], delta_ids: list[int]) -> bool: + return self.end_token_id in delta_token_ids ... ``` diff --git a/examples/offline_inference/simple_profiling.py b/examples/offline_inference/simple_profiling.py index 46858fffadc52..e8a75cd03befb 100644 --- a/examples/offline_inference/simple_profiling.py +++ b/examples/offline_inference/simple_profiling.py @@ -1,14 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os import time from vllm import LLM, SamplingParams -# enable torch profiler, can also be set on cmd line -os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile" - # Sample prompts. prompts = [ "Hello, my name is", @@ -22,7 +18,14 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) def main(): # Create an LLM. - llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1) + llm = LLM( + model="facebook/opt-125m", + tensor_parallel_size=1, + profiler_config={ + "profiler": "torch", + "torch_profiler_dir": "./vllm_profile", + }, + ) llm.start_profile() diff --git a/tests/compile/fullgraph/test_multimodal_compile.py b/tests/compile/fullgraph/test_multimodal_compile.py index e2897b2271b8f..621f6a51a918f 100644 --- a/tests/compile/fullgraph/test_multimodal_compile.py +++ b/tests/compile/fullgraph/test_multimodal_compile.py @@ -17,7 +17,6 @@ def test_compile(): # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 @pytest.mark.forked @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda") -@pytest.mark.xfail def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch): """Test that Qwen2.5-VL vision submodules are compiled. diff --git a/tests/compile/test_compile_ranges.py b/tests/compile/test_compile_ranges.py index d849a8617ebd2..14ae8233f1131 100644 --- a/tests/compile/test_compile_ranges.py +++ b/tests/compile/test_compile_ranges.py @@ -80,6 +80,8 @@ def test_compile_ranges(use_fresh_inductor_cache): vllm_config = VllmConfig( scheduler_config=SchedulerConfig( max_num_batched_tokens=8192, + max_model_len=8192, + is_encoder_decoder=False, ), compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, @@ -112,6 +114,8 @@ def test_compile_config_get_compile_ranges(): VllmConfig( scheduler_config=SchedulerConfig( max_num_batched_tokens=8192, + max_model_len=8192, + is_encoder_decoder=False, ), compilation_config=compilation_config, ) @@ -134,6 +138,8 @@ def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache): ) scheduler_config = SchedulerConfig( max_num_batched_tokens=8192, + max_model_len=8192, + is_encoder_decoder=False, ) torch.set_default_device("cuda") diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py index 6d0ba6b655031..6ed77b0085f51 100644 --- a/tests/compile/test_pass_manager.py +++ b/tests/compile/test_pass_manager.py @@ -5,9 +5,14 @@ import copy import pytest import torch -from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass +from vllm.compilation.inductor_pass import ( + CallableInductorPass, + InductorPass, + pass_context, +) from vllm.compilation.pass_manager import PostGradPassManager from vllm.config import ModelConfig, VllmConfig +from vllm.config.utils import Range # dummy custom pass that doesn't inherit @@ -42,35 +47,37 @@ class ProperPass(InductorPass): ], ) def test_pass_manager_uuid(callable): - # Some passes need dtype to be set - config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16)) + # Set the pass context as PassManager uuid uses it + with pass_context(Range(start=1, end=8)): + # Some passes need dtype to be set + config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16)) - pass_manager = PostGradPassManager() - pass_manager.configure(config) + pass_manager = PostGradPassManager() + pass_manager.configure(config) - # Check that UUID is different if the same pass is added 2x - pass_manager.add(callable) - uuid1 = pass_manager.uuid() - pass_manager.add(callable) - uuid2 = pass_manager.uuid() - assert uuid1 != uuid2 + # Check that UUID is different if the same pass is added 2x + pass_manager.add(callable) + uuid1 = pass_manager.uuid() + pass_manager.add(callable) + uuid2 = pass_manager.uuid() + assert uuid1 != uuid2 - # UUID should be the same as the original one, - # as we constructed in the same way. - pass_manager2 = PostGradPassManager() - pass_manager2.configure(config) - pass_manager2.add(callable) - assert uuid1 == pass_manager2.uuid() + # UUID should be the same as the original one, + # as we constructed in the same way. + pass_manager2 = PostGradPassManager() + pass_manager2.configure(config) + pass_manager2.add(callable) + assert uuid1 == pass_manager2.uuid() - # UUID should be different due to config change - config2 = copy.deepcopy(config) - config2.compilation_config.pass_config.fuse_norm_quant = ( - not config2.compilation_config.pass_config.fuse_norm_quant - ) - config2.compilation_config.pass_config.fuse_act_quant = ( - not config2.compilation_config.pass_config.fuse_act_quant - ) - pass_manager3 = PostGradPassManager() - pass_manager3.configure(config2) - pass_manager3.add(callable) - assert uuid1 != pass_manager3.uuid() + # UUID should be different due to config change + config2 = copy.deepcopy(config) + config2.compilation_config.pass_config.fuse_norm_quant = ( + not config2.compilation_config.pass_config.fuse_norm_quant + ) + config2.compilation_config.pass_config.fuse_act_quant = ( + not config2.compilation_config.pass_config.fuse_act_quant + ) + pass_manager3 = PostGradPassManager() + pass_manager3.configure(config2) + pass_manager3.add(callable) + assert uuid1 != pass_manager3.uuid() diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index ae3c63cc62d6b..639abdf6f0487 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -26,7 +26,14 @@ def clear_cache(): _cached_get_attn_backend.cache_clear() -@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"]) +devices = ["cpu"] +if current_platform.is_cuda(): + devices.append("cuda") +if current_platform.is_rocm(): + devices.append("hip") + + +@pytest.mark.parametrize("device", devices) def test_mha_attn_platform(device: str): """ Test the attention selector between different platform and device. @@ -46,7 +53,7 @@ def test_mha_attn_platform(device: str): patch("vllm.model_executor.models.vision.current_platform", RocmPlatform()), ): attn = MultiHeadAttention(16, 64, scale=1) - assert attn.attn_backend == AttentionBackendEnum.TORCH_SDPA + assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN else: # Test CUDA with head_size=64 (divisible by 32) # - should use vLLM's FlashAttention diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 830d43569e98b..e29f66dca313f 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -103,7 +103,7 @@ def ref_dynamic_per_tensor_fp8_quant( .clamp(fp8_traits_min, fp8_traits_max) .to(FP8_DTYPE) ) - return ref_out, ref_scale.view((1, 1)) + return ref_out, ref_scale.view(1) def native_w8a8_block_matmul( diff --git a/tests/reasoning/test_base_thinking_reasoning_parser.py b/tests/reasoning/test_base_thinking_reasoning_parser.py index 34e9483de54b3..165e91a2c79f2 100644 --- a/tests/reasoning/test_base_thinking_reasoning_parser.py +++ b/tests/reasoning/test_base_thinking_reasoning_parser.py @@ -132,6 +132,41 @@ class TestBaseThinkingReasoningParserMethods: is False ) + def test_is_reasoning_end_streaming(self, test_tokenizer): + """Test the is_reasoning_end_streaming method.""" + parser = TestThinkingReasoningParser(test_tokenizer) + end_token_id = parser.end_token_id + start_token_id = parser.start_token_id + + assert ( + parser.is_reasoning_end_streaming([1, 2, end_token_id], [end_token_id]) + is True + ) + assert parser.is_reasoning_end_streaming([1, 2, 3, 4], [4]) is False + assert parser.is_reasoning_end_streaming([], []) is False + assert ( + parser.is_reasoning_end_streaming( + [1, start_token_id, 2, end_token_id], [end_token_id] + ) + is True + ) + assert ( + parser.is_reasoning_end_streaming([1, start_token_id, 2, 3], [3]) is False + ) + assert ( + parser.is_reasoning_end_streaming( + [1, start_token_id, 2, end_token_id, 2, start_token_id, 2], + [2], + ) + is False + ) + assert ( + parser.is_reasoning_end_streaming( + [1, start_token_id, 2, end_token_id, 2, 2], [2] + ) + is False + ) + def test_extract_content_ids(self, test_tokenizer): """Test the extract_content_ids method.""" parser = TestThinkingReasoningParser(test_tokenizer) diff --git a/tests/reasoning/test_deepseekv3_reasoning_parser.py b/tests/reasoning/test_deepseekv3_reasoning_parser.py index 6e8f0e8dcc9b9..874fdef778110 100644 --- a/tests/reasoning/test_deepseekv3_reasoning_parser.py +++ b/tests/reasoning/test_deepseekv3_reasoning_parser.py @@ -40,6 +40,7 @@ def test_identity_reasoning_parser_basic(tokenizer): input_tokens = tokenizer.tokenize(input_text) input_ids = tokenizer.convert_tokens_to_ids(input_tokens) assert parser.is_reasoning_end(input_ids) is True + assert parser.is_reasoning_end_streaming(input_ids, input_ids) is True # Test extract_content_ids returns all input_ids assert parser.extract_content_ids(input_ids) == input_ids diff --git a/tests/tool_use/test_mistral_tool_parser.py b/tests/tool_use/test_mistral_tool_parser.py index e5deb7f40eb35..2dd0399cb8eeb 100644 --- a/tests/tool_use/test_mistral_tool_parser.py +++ b/tests/tool_use/test_mistral_tool_parser.py @@ -615,6 +615,7 @@ def test_extract_tool_calls_streaming( "single_tool_weather", "multiple_tool_calls", "content_before_tool", + "complex", ], argnames=["model_output", "expected_tool_calls", "expected_content"], argvalues=[ @@ -673,6 +674,21 @@ def test_extract_tool_calls_streaming( ], "bla", ), + ( + # Complex + """[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501 + [ + ToolCall( + function=FunctionCall( + name="bash", + arguments=json.dumps( + {"command": "print(\"hello world!\")\nre.compile(r'{}')"} + ), + ) + ) + ], + "", + ), ], ) def test_extract_tool_calls_streaming_one_chunk( diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py index b86534d3d4381..0e71d6c63ce68 100644 --- a/tests/v1/cudagraph/test_cudagraph_dispatch.py +++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py @@ -161,10 +161,10 @@ class TestCudagraphDispatcher: assert rt_mode == CUDAGraphMode.NONE assert key == BatchDescriptor(num_tokens=15) - # 4. Cascade attention should have a fall back mode + # 4. disable_full should have a fall back mode (e.g., cascade attention) desc_full_exact = BatchDescriptor(num_tokens=8, uniform=False) rt_mode, key = dispatcher.dispatch( - num_tokens=8, uniform_decode=False, has_lora=False, use_cascade_attn=True + num_tokens=8, uniform_decode=False, has_lora=False, disable_full=True ) if "PIECEWISE" in cudagraph_mode_str: # string contains check assert rt_mode == CUDAGraphMode.PIECEWISE diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py index fc953a66f0820..1c45e7fe366ff 100644 --- a/tests/v1/determinism/test_batch_invariance.py +++ b/tests/v1/determinism/test_batch_invariance.py @@ -10,6 +10,7 @@ from utils import ( BACKENDS, _extract_step_logprobs, _random_prompt, + is_device_capability_below_90, resolve_model_name, skip_unsupported, ) @@ -17,6 +18,8 @@ from utils import ( import vllm.model_executor.layers.batch_invariant as batch_invariant from vllm import LLM, SamplingParams +IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90() + @skip_unsupported @pytest.mark.timeout(1000) @@ -190,6 +193,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( max_model_len=8192, dtype="bfloat16", # not everything is supported gpu_memory_utilization=0.9, + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, ) # Use more realistic prompts for better token generation @@ -393,6 +397,8 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): gpu_memory_utilization=0.9, max_model_len=2048, dtype="bfloat16", + enable_prefix_caching=False, + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, ) prompt = "the capital of france is" @@ -459,6 +465,7 @@ def test_logprobs_without_batch_invariance_should_fail( max_num_seqs=32, max_model_len=8192, dtype="bfloat16", + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, ) # build ragged prompts to change shapes significantly across BS=1 vs BS=N @@ -682,6 +689,7 @@ def test_decode_logprobs_match_prefill_logprobs( max_num_seqs=32, max_model_len=8192, dtype="bfloat16", + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, ) # Use a few test prompts @@ -925,6 +933,8 @@ def LLM_with_max_seqs( max_model_len=max_model_len, dtype="bfloat16", tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), + enable_prefix_caching=False, + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, # Enable for MOE models # enable_expert_parallel=True, ) diff --git a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py index 6aab50cf84ab6..a8013ed229cfc 100644 --- a/tests/v1/determinism/utils.py +++ b/tests/v1/determinism/utils.py @@ -11,8 +11,10 @@ from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer skip_unsupported = pytest.mark.skipif( - not (current_platform.is_cuda() and current_platform.has_device_capability(90)), - reason="Requires CUDA and >= Hopper (SM90)", + not (current_platform.is_cuda() and current_platform.has_device_capability(80)), + # Supports testing on Ampere and Ada Lovelace devices. + # Note: For devices with SM < 90, batch invariance does not support CUDA Graphs. + reason="Requires CUDA and >= Ampere (SM80)", ) BACKENDS: list[str] = [ @@ -97,3 +99,7 @@ def _extract_step_logprobs(request_output): return t, inner.token_ids return None, None + + +def is_device_capability_below_90() -> bool: + return not current_platform.has_device_capability(90) diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index 945276376d665..838d05f0486c1 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -124,6 +124,8 @@ def run_tests( with monkeypatch.context() as m: # avoid precision errors m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") + # lock matmul precision to full FP32 + m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest") # m.setenv("VLLM_BATCH_INVARIANT", "1") outputs: list[tuple[str, list, list]] = [] for n, ( diff --git a/tests/v1/structured_output/test_reasoning_structured_output.py b/tests/v1/structured_output/test_reasoning_structured_output.py index 70047a993c3f9..ba52af3ad604d 100644 --- a/tests/v1/structured_output/test_reasoning_structured_output.py +++ b/tests/v1/structured_output/test_reasoning_structured_output.py @@ -70,6 +70,7 @@ class TestReasoningStructuredOutput: request.use_structured_output = True request.prompt_token_ids = [1, 2, 3, 4, 5] request.all_token_ids = [1, 2, 3, 4, 5, 6, 7, 8] + request.num_computed_tokens = 5 return request def test_should_fill_bitmask_with_enable_in_reasoning( diff --git a/tests/v1/worker/test_gpu_profiler.py b/tests/v1/worker/test_gpu_profiler.py index f7255fae05a4e..933ea42f18cd5 100644 --- a/tests/v1/worker/test_gpu_profiler.py +++ b/tests/v1/worker/test_gpu_profiler.py @@ -2,8 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -import vllm.envs as envs -from vllm.profiler.gpu_profiler import WorkerProfiler +from vllm.config import ProfilerConfig +from vllm.profiler.wrapper import WorkerProfiler class ConcreteWorkerProfiler(WorkerProfiler): @@ -11,11 +11,11 @@ class ConcreteWorkerProfiler(WorkerProfiler): A basic implementation of a worker profiler for testing purposes. """ - def __init__(self): + def __init__(self, profiler_config: ProfilerConfig): self.start_call_count = 0 self.stop_call_count = 0 self.should_fail_start = False - super().__init__() + super().__init__(profiler_config) def _start(self) -> None: if self.should_fail_start: @@ -26,17 +26,19 @@ class ConcreteWorkerProfiler(WorkerProfiler): self.stop_call_count += 1 -@pytest.fixture(autouse=True) -def reset_mocks(): - """Fixture to reset mocks and env variables before each test.""" - envs.VLLM_PROFILER_DELAY_ITERS = 0 - envs.VLLM_PROFILER_MAX_ITERS = 0 +@pytest.fixture +def default_profiler_config(): + return ProfilerConfig( + profiler="torch", + torch_profiler_dir="/tmp/mock", + delay_iterations=0, + max_iterations=0, + ) -def test_immediate_start_stop(): +def test_immediate_start_stop(default_profiler_config): """Test standard start without delay.""" - profiler = ConcreteWorkerProfiler() - + profiler = ConcreteWorkerProfiler(default_profiler_config) profiler.start() assert profiler._running is True assert profiler._active is True @@ -48,10 +50,10 @@ def test_immediate_start_stop(): assert profiler.stop_call_count == 1 -def test_delayed_start(): +def test_delayed_start(default_profiler_config): """Test that profiler waits for N steps before actually starting.""" - envs.VLLM_PROFILER_DELAY_ITERS = 2 - profiler = ConcreteWorkerProfiler() + default_profiler_config.delay_iterations = 2 + profiler = ConcreteWorkerProfiler(default_profiler_config) # User requests start profiler.start() @@ -71,10 +73,10 @@ def test_delayed_start(): assert profiler.start_call_count == 1 -def test_max_iterations(): +def test_max_iterations(default_profiler_config): """Test that profiler stops automatically after max iterations.""" - envs.VLLM_PROFILER_MAX_ITERS = 2 - profiler = ConcreteWorkerProfiler() + default_profiler_config.max_iterations = 2 + profiler = ConcreteWorkerProfiler(default_profiler_config) profiler.start() assert profiler._running is True @@ -95,12 +97,11 @@ def test_max_iterations(): assert profiler.stop_call_count == 1 -def test_delayed_start_and_max_iters(): +def test_delayed_start_and_max_iters(default_profiler_config): """Test combined delayed start and max iterations.""" - envs.VLLM_PROFILER_DELAY_ITERS = 2 - envs.VLLM_PROFILER_MAX_ITERS = 2 - profiler = ConcreteWorkerProfiler() - + default_profiler_config.delay_iterations = 2 + default_profiler_config.max_iterations = 2 + profiler = ConcreteWorkerProfiler(default_profiler_config) profiler.start() # Step 1 @@ -127,9 +128,9 @@ def test_delayed_start_and_max_iters(): assert profiler.stop_call_count == 1 -def test_idempotency(): +def test_idempotency(default_profiler_config): """Test that calling start/stop multiple times doesn't break logic.""" - profiler = ConcreteWorkerProfiler() + profiler = ConcreteWorkerProfiler(default_profiler_config) # Double Start profiler.start() @@ -142,10 +143,10 @@ def test_idempotency(): assert profiler.stop_call_count == 1 # Should only stop once -def test_step_inactive(): +def test_step_inactive(default_profiler_config): """Test that stepping while inactive does nothing.""" - envs.VLLM_PROFILER_DELAY_ITERS = 2 - profiler = ConcreteWorkerProfiler() + default_profiler_config.delay_iterations = 2 + profiler = ConcreteWorkerProfiler(default_profiler_config) # Not started yet profiler.step() @@ -155,9 +156,9 @@ def test_step_inactive(): assert profiler.start_call_count == 0 -def test_start_failure(): +def test_start_failure(default_profiler_config): """Test behavior when the underlying _start method raises exception.""" - profiler = ConcreteWorkerProfiler() + profiler = ConcreteWorkerProfiler(default_profiler_config) profiler.should_fail_start = True profiler.start() @@ -168,9 +169,9 @@ def test_start_failure(): assert profiler.start_call_count == 0 # Logic failed inside start -def test_shutdown(): +def test_shutdown(default_profiler_config): """Test that shutdown calls stop only if running.""" - profiler = ConcreteWorkerProfiler() + profiler = ConcreteWorkerProfiler(default_profiler_config) # Case 1: Not running profiler.shutdown() @@ -182,10 +183,10 @@ def test_shutdown(): assert profiler.stop_call_count == 1 -def test_mixed_delay_and_stop(): +def test_mixed_delay_and_stop(default_profiler_config): """Test manual stop during the delay period.""" - envs.VLLM_PROFILER_DELAY_ITERS = 5 - profiler = ConcreteWorkerProfiler() + default_profiler_config.delay_iterations = 5 + profiler = ConcreteWorkerProfiler(default_profiler_config) profiler.start() profiler.step() diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index 35920d826578e..94bbc9b00225e 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -9,6 +9,8 @@ import vllm.envs as envs from vllm.platforms import current_platform from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer +_FP8_DTYPE = current_platform.fp8_dtype() + def is_aiter_found() -> bool: from importlib.util import find_spec @@ -467,6 +469,59 @@ def _rocm_aiter_rmsnorm2d_fwd_with_add_fake( return torch.empty_like(x), torch.empty_like(residual) +def _rocm_aiter_per_tensor_quant_impl( + x: torch.Tensor, + quant_dtype: torch.dtype, + scale: torch.Tensor | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + from aiter.ops.quant import per_tensor_quant_hip + + return per_tensor_quant_hip(x, scale, quant_dtype) + + +def _rocm_aiter_per_tensor_quant_fake( + x: torch.Tensor, + quant_dtype: torch.dtype, + scale: torch.Tensor | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + return torch.empty_like(x, dtype=quant_dtype), torch.empty( + 1, dtype=torch.float32, device=x.device + ) + + +def _rocm_aiter_per_token_quant_impl( + x: torch.Tensor, quant_dtype: torch.dtype, scale: torch.Tensor | None = None +) -> tuple[torch.Tensor, torch.Tensor]: + from aiter.ops.quant import dynamic_per_token_scaled_quant + + assert quant_dtype in [torch.int8, _FP8_DTYPE] + + out_shape = x.shape + out = torch.empty(x.shape, dtype=_FP8_DTYPE, device=x.device) + if scale is None: + scale = torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device) + dynamic_per_token_scaled_quant( + out, + x, + scale, + scale_ub=None, + shuffle_scale=False, + num_rows=None, + num_rows_factor=1, + ) + return out, scale + + +def _rocm_aiter_per_token_quant_fake( + x: torch.Tensor, quant_dtype: torch.dtype, scale: torch.Tensor | None = None +) -> tuple[torch.Tensor, torch.Tensor]: + out_shape = x.shape + return ( + torch.empty(x.shape, dtype=_FP8_DTYPE, device=x.device), + torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device), + ) + + # Global flag to ensure ops are registered only once _OPS_REGISTERED = False @@ -665,6 +720,22 @@ class rocm_aiter_ops: dispatch_key=current_platform.dispatch_key, ) + direct_register_custom_op( + op_name="rocm_aiter_per_tensor_quant", + op_func=_rocm_aiter_per_tensor_quant_impl, + mutates_args=[], + fake_impl=_rocm_aiter_per_tensor_quant_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_per_token_quant", + op_func=_rocm_aiter_per_token_quant_impl, + mutates_args=["scale"], + fake_impl=_rocm_aiter_per_token_quant_fake, + dispatch_key=current_platform.dispatch_key, + ) + _OPS_REGISTERED = True @staticmethod @@ -859,6 +930,22 @@ class rocm_aiter_ops: kv_scale=kv_scale, ) + @staticmethod + def per_tensor_quant( + x: torch.Tensor, + quant_dtype: torch.dtype, + scale: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + return torch.ops.vllm.rocm_aiter_per_tensor_quant(x, quant_dtype, scale) + + @staticmethod + def per_token_quant( + x: torch.Tensor, + quant_dtype: torch.dtype, + scale: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + return torch.ops.vllm.rocm_aiter_per_token_quant(x, quant_dtype, scale) + @staticmethod def triton_fp4_gemm_dynamic_qaunt( x: torch.Tensor, diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 6bbfe11b6e925..6d862c5812560 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1726,7 +1726,7 @@ def scaled_fp8_quant( output, input, scale, scale_ub ) else: - scale = torch.empty((1, 1), device=input.device, dtype=torch.float32) + scale = torch.empty(1, device=input.device, dtype=torch.float32) torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) else: assert scale.numel() == 1, f"{scale.shape}" diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 7e5adfe0742d3..c77fc0fad0038 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -89,7 +89,10 @@ def maybe_get_vit_flash_attn_backend( if attn_backend == AttentionBackendEnum.ROCM_AITER_FA: from aiter import flash_attn_varlen_func else: - from vllm.attention.utils.fa_utils import flash_attn_varlen_func + try: + from vllm.attention.utils.fa_utils import flash_attn_varlen_func + except ImportError: + flash_attn_varlen_func = None else: flash_attn_varlen_func = None diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py index b4f1751837f48..99c1c846f19af 100644 --- a/vllm/benchmarks/latency.py +++ b/vllm/benchmarks/latency.py @@ -12,7 +12,6 @@ from typing import Any import numpy as np from tqdm import tqdm -import vllm.envs as envs from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType @@ -79,12 +78,11 @@ def add_cli_args(parser: argparse.ArgumentParser): def main(args: argparse.Namespace): - if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: - raise OSError( - "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " - "Please set it to a valid path to use torch profiler." - ) engine_args = EngineArgs.from_cli_args(args) + if args.profile and not engine_args.profiler_config.profiler == "torch": + raise ValueError( + "The torch profiler is not enabled. Please provide profiler_config." + ) # Lazy import to avoid importing LLM when the bench command is not selected. from vllm import LLM, SamplingParams @@ -144,7 +142,7 @@ def main(args: argparse.Namespace): run_to_completion(profile_dir=None) if args.profile: - profile_dir = envs.VLLM_TORCH_PROFILER_DIR + profile_dir = engine_args.profiler_config.torch_profiler_dir print(f"Profiling (results will be saved to '{profile_dir}')...") run_to_completion(profile_dir=profile_dir) return diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 568290aa894ff..2e2054a8a4b13 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1097,8 +1097,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--profile", action="store_true", - help="Use Torch Profiler. The endpoint must be launched with " - "VLLM_TORCH_PROFILER_DIR to enable profiler.", + help="Use vLLM Profiling. --profiler-config must be provided on the server.", ) parser.add_argument( "--save-result", diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index ea693613fdd16..d824e982b7489 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -655,8 +655,7 @@ def add_cli_args(parser: argparse.ArgumentParser): "--profile", action="store_true", default=False, - help="Use Torch Profiler. The env variable " - "VLLM_TORCH_PROFILER_DIR must be set to enable profiler.", + help="Use vLLM Profiling. --profiler-config must be provided on the server.", ) # prefix repetition dataset diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index 8159b817f637a..dbf154eeb86a4 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from __future__ import annotations + import functools import hashlib import inspect @@ -8,15 +10,17 @@ import json import types from collections.abc import Callable from contextlib import contextmanager -from typing import Any +from typing import TYPE_CHECKING, Any import torch from torch import fx from torch._subclasses.fake_tensor import FakeTensorMode, unset_fake_temporarily -from vllm.config.utils import Range from vllm.utils.torch_utils import is_torch_equal_or_newer +if TYPE_CHECKING: + from vllm.config.utils import Range + if is_torch_equal_or_newer("2.6"): from torch._inductor.custom_graph_pass import CustomGraphPass else: diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 129b9b5deea31..a15c693767a51 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -53,8 +53,27 @@ class PiecewiseBackend: self.is_last_graph = piecewise_compile_index == total_piecewise_compiles - 1 self.is_full_graph = total_piecewise_compiles == 1 + # TODO: we need to generalize encoder compilation to other models + self.is_encoder_compilation = vllm_backend.prefix in [ + "Qwen2_5_VisionPatchEmbed", + "Qwen2_5_VisionPatchMerger", + "Qwen2_5_VisionBlock", + ] self.compile_ranges = self.compilation_config.get_compile_ranges() + if self.is_encoder_compilation: + # For encoder compilation we use the max int32 value + # to set the upper bound of the compile ranges + max_int32 = 2**31 - 1 + last_compile_range = self.compile_ranges[-1] + assert ( + last_compile_range.end + == vllm_config.scheduler_config.max_num_batched_tokens + ) + self.compile_ranges[-1] = Range( + start=last_compile_range.start, end=max_int32 + ) + log_string = f"PiecewiseBackend: compile_ranges: {self.compile_ranges}" logger.debug_once(log_string) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 0f84f3ca9d3e3..0e91dd57420a8 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -24,6 +24,7 @@ from vllm.config.multimodal import MultiModalConfig from vllm.config.observability import ObservabilityConfig from vllm.config.parallel import EPLBConfig, ParallelConfig from vllm.config.pooler import PoolerConfig +from vllm.config.profiler import ProfilerConfig from vllm.config.scheduler import SchedulerConfig from vllm.config.speculative import SpeculativeConfig from vllm.config.speech_to_text import SpeechToTextConfig @@ -89,6 +90,8 @@ __all__ = [ "SpeechToTextConfig", # From vllm.config.structured_outputs "StructuredOutputsConfig", + # From vllm.config.profiler + "ProfilerConfig", # From vllm.config.utils "ConfigType", "SupportsMetricsInfo", diff --git a/vllm/config/profiler.py b/vllm/config/profiler.py new file mode 100644 index 0000000000000..76cc546f3c9e2 --- /dev/null +++ b/vllm/config/profiler.py @@ -0,0 +1,199 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +from typing import Any, Literal + +from pydantic import Field, model_validator +from pydantic.dataclasses import dataclass +from typing_extensions import Self + +import vllm.envs as envs +from vllm.config.utils import config +from vllm.logger import init_logger +from vllm.utils.hashing import safe_hash + +logger = init_logger(__name__) + +ProfilerKind = Literal["torch", "cuda"] + + +@config +@dataclass +class ProfilerConfig: + """Dataclass which contains profiler config for the engine.""" + + profiler: ProfilerKind | None = None + """Which profiler to use. Defaults to None. Options are: + + - 'torch': Use PyTorch profiler.\n + - 'cuda': Use CUDA profiler.""" + + torch_profiler_dir: str = "" + """Directory to save torch profiler traces. Both AsyncLLM's CPU traces and + worker's traces (CPU & GPU) will be saved under this directory. Note that + it must be an absolute path.""" + + torch_profiler_with_stack: bool = True + """If `True`, enables stack tracing in the torch profiler. Enabled by default.""" + + torch_profiler_with_flops: bool = False + """If `True`, enables FLOPS counting in the torch profiler. Disabled by default.""" + + torch_profiler_use_gzip: bool = True + """If `True`, saves torch profiler traces in gzip format. Enabled by default""" + + torch_profiler_dump_cuda_time_total: bool = True + """If `True`, dumps total CUDA time in torch profiler traces. Enabled by default.""" + + torch_profiler_record_shapes: bool = False + """If `True`, records tensor shapes in the torch profiler. Disabled by default.""" + + torch_profiler_with_memory: bool = False + """If `True`, enables memory profiling in the torch profiler. + Disabled by default.""" + + ignore_frontend: bool = False + """If `True`, disables the front-end profiling of AsyncLLM when using the + 'torch' profiler. This is needed to reduce overhead when using delay/limit options, + since the front-end profiling does not track iterations and will capture the + entire range. + """ + + delay_iterations: int = Field(default=0, ge=0) + """Number of engine iterations to skip before starting profiling. + Defaults to 0, meaning profiling starts immediately after receiving /start_profile. + """ + + max_iterations: int = Field(default=0, ge=0) + """Maximum number of engine iterations to profile after starting profiling. + Defaults to 0, meaning no limit. + """ + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: list[Any] = [] + hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest() + return hash_str + + def _get_from_env_if_set(self, field_name: str, env_var_name: str) -> None: + """Get field from env var if set, with deprecation warning.""" + + if envs.is_set(env_var_name): + value = getattr(envs, env_var_name) + logger.warning_once( + "Using %s environment variable is deprecated and will be removed in " + "v0.14.0 or v1.0.0, whichever is soonest. Please use " + "--profiler-config.%s command line argument or " + "ProfilerConfig(%s=...) config field instead.", + env_var_name, + field_name, + field_name, + ) + return value + return None + + def _set_from_env_if_set( + self, + field_name: str, + env_var_name: str, + to_bool: bool = True, + to_int: bool = False, + ) -> None: + """Set field from env var if set, with deprecation warning.""" + value = self._get_from_env_if_set(field_name, env_var_name) + if value is not None: + if to_bool: + value = value == "1" + if to_int: + value = int(value) + setattr(self, field_name, value) + + @model_validator(mode="after") + def _validate_profiler_config(self) -> Self: + maybe_use_cuda_profiler = self._get_from_env_if_set( + "profiler", "VLLM_TORCH_CUDA_PROFILE" + ) + if maybe_use_cuda_profiler is not None: + self.profiler = "cuda" if maybe_use_cuda_profiler == "1" else None + else: + self._set_from_env_if_set( + "torch_profiler_dir", "VLLM_TORCH_PROFILER_DIR", to_bool=False + ) + if self.torch_profiler_dir: + self.profiler = "torch" + self._set_from_env_if_set( + "torch_profiler_record_shapes", + "VLLM_TORCH_PROFILER_RECORD_SHAPES", + ) + self._set_from_env_if_set( + "torch_profiler_with_memory", + "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", + ) + self._set_from_env_if_set( + "torch_profiler_with_stack", + "VLLM_TORCH_PROFILER_WITH_STACK", + ) + self._set_from_env_if_set( + "torch_profiler_with_flops", + "VLLM_TORCH_PROFILER_WITH_FLOPS", + ) + self._set_from_env_if_set( + "ignore_frontend", + "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM", + ) + self._set_from_env_if_set( + "torch_profiler_use_gzip", + "VLLM_TORCH_PROFILER_USE_GZIP", + ) + self._set_from_env_if_set( + "torch_profiler_dump_cuda_time_total", + "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL", + ) + + self._set_from_env_if_set( + "delay_iterations", "VLLM_PROFILER_DELAY_ITERS", to_bool=False, to_int=True + ) + self._set_from_env_if_set( + "max_iterations", "VLLM_PROFILER_MAX_ITERS", to_bool=False, to_int=True + ) + + has_delay_or_limit = self.delay_iterations > 0 or self.max_iterations > 0 + if self.profiler == "torch" and has_delay_or_limit and not self.ignore_frontend: + logger.warning_once( + "Using 'torch' profiler with delay_iterations or max_iterations " + "while ignore_frontend is False may result in high overhead." + ) + + profiler_dir = self.torch_profiler_dir + if profiler_dir and self.profiler != "torch": + raise ValueError( + "torch_profiler_dir is only applicable when profiler is set to 'torch'" + ) + if self.profiler == "torch" and not profiler_dir: + raise ValueError("torch_profiler_dir must be set when profiler is 'torch'") + + if profiler_dir: + is_gs_path = ( + profiler_dir.startswith("gs://") + and profiler_dir[5:] + and profiler_dir[5] != "/" + ) + if not is_gs_path: + self.torch_profiler_dir = os.path.abspath( + os.path.expanduser(profiler_dir) + ) + + return self diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index a74413536407b..614a3226cb711 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -39,6 +39,7 @@ from .lora import LoRAConfig from .model import ModelConfig from .observability import ObservabilityConfig from .parallel import ParallelConfig +from .profiler import ProfilerConfig from .scheduler import SchedulerConfig from .speculative import SpeculativeConfig from .structured_outputs import StructuredOutputsConfig @@ -218,6 +219,8 @@ class VllmConfig: You can specify the full compilation config like so: `{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}` """ + profiler_config: ProfilerConfig = Field(default_factory=ProfilerConfig) + """Profiling configuration.""" kv_transfer_config: KVTransferConfig | None = None """The configurations for distributed KV cache transfer.""" kv_events_config: KVEventsConfig | None = None @@ -296,6 +299,8 @@ class VllmConfig: vllm_factors.append("None") if self.structured_outputs_config: vllm_factors.append(self.structured_outputs_config.compute_hash()) + if self.profiler_config: + vllm_factors.append(self.profiler_config.compute_hash()) else: vllm_factors.append("None") vllm_factors.append(self.observability_config.compute_hash()) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 096ac15afcfab..d0bfab4e6569f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -50,6 +50,7 @@ from vllm.config import ( ObservabilityConfig, ParallelConfig, PoolerConfig, + ProfilerConfig, SchedulerConfig, SpeculativeConfig, StructuredOutputsConfig, @@ -536,6 +537,8 @@ class EngineArgs: worker_cls: str = ParallelConfig.worker_cls worker_extension_cls: str = ParallelConfig.worker_extension_cls + profiler_config: ProfilerConfig = get_field(VllmConfig, "profiler_config") + kv_transfer_config: KVTransferConfig | None = None kv_events_config: KVEventsConfig | None = None @@ -1168,7 +1171,7 @@ class EngineArgs: vllm_group.add_argument( "--structured-outputs-config", **vllm_kwargs["structured_outputs_config"] ) - + vllm_group.add_argument("--profiler-config", **vllm_kwargs["profiler_config"]) vllm_group.add_argument( "--optimization-level", **vllm_kwargs["optimization_level"] ) @@ -1786,6 +1789,7 @@ class EngineArgs: kv_transfer_config=self.kv_transfer_config, kv_events_config=self.kv_events_config, ec_transfer_config=self.ec_transfer_config, + profiler_config=self.profiler_config, additional_config=self.additional_config, optimization_level=self.optimization_level, ) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 913324fd5f9c3..5d5c4a1cdb77b 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -20,6 +20,7 @@ from vllm.beam_search import ( from vllm.config import ( CompilationConfig, PoolerConfig, + ProfilerConfig, StructuredOutputsConfig, is_init_field, ) @@ -211,6 +212,7 @@ class LLM: structured_outputs_config: dict[str, Any] | StructuredOutputsConfig | None = None, + profiler_config: dict[str, Any] | ProfilerConfig | None = None, kv_cache_memory_bytes: int | None = None, compilation_config: int | dict[str, Any] | CompilationConfig | None = None, logits_processors: list[str | type[LogitsProcessor]] | None = None, @@ -282,6 +284,20 @@ class LLM: else: structured_outputs_instance = StructuredOutputsConfig() + if profiler_config is not None: + if isinstance(profiler_config, dict): + profiler_config_instance = ProfilerConfig( + **{ + k: v + for k, v in profiler_config.items() + if is_init_field(ProfilerConfig, k) + } + ) + else: + profiler_config_instance = profiler_config + else: + profiler_config_instance = ProfilerConfig() + # warn about single-process data parallel usage. _dp_size = int(kwargs.get("data_parallel_size", 1)) _distributed_executor_backend = kwargs.get("distributed_executor_backend") @@ -324,6 +340,7 @@ class LLM: mm_processor_kwargs=mm_processor_kwargs, pooler_config=pooler_config, structured_outputs_config=structured_outputs_instance, + profiler_config=profiler_config_instance, compilation_config=compilation_config_instance, logits_processors=logits_processors, **kwargs, diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index aa5089ffe84d7..bc827f045606c 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -99,12 +99,7 @@ class MistralToolParser(ToolParser): self.bot_token = "[TOOL_CALLS]" self.bot_token_id = self.vocab.get(self.bot_token) self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL) - if not _is_pre_v11_tokeniser(self.model_tokenizer): - self.fn_name_regex = re.compile( - r"([a-zA-Z0-9_-]+)(\{[\s\S]*?\}+)", re.DOTALL - ) - else: - self.fn_name_regex = None + self._is_pre_v11 = _is_pre_v11_tokeniser(self.model_tokenizer) if self.bot_token_id is None: raise RuntimeError( @@ -148,23 +143,24 @@ class MistralToolParser(ToolParser): tool_content = model_output.replace(self.bot_token, "").strip() try: - # we first try to directly load the json as parsing very nested - # jsons is difficult try: - if self.fn_name_regex: + if not self._is_pre_v11: function_call_arr = [] for single_tool_content in model_output.split(self.bot_token): - matches = self.fn_name_regex.findall(single_tool_content) + if "{" not in single_tool_content: + continue - for match in matches: - fn_name = match[0] - args = match[1] + end_name = single_tool_content.find("{") + fn_name, args = ( + single_tool_content[:end_name], + single_tool_content[end_name:], + ) - # fn_name is encoded outside serialized json dump - # only arguments are serialized - function_call_arr.append( - {"name": fn_name, "arguments": json.loads(args)} - ) + # fn_name is encoded outside serialized json dump + # only arguments are serialized + function_call_arr.append( + {"name": fn_name, "arguments": json.loads(args)} + ) else: function_call_arr = json.loads(tool_content) except json.JSONDecodeError: diff --git a/vllm/entrypoints/serve/profile/api_router.py b/vllm/entrypoints/serve/profile/api_router.py index 166f13764eb36..eeed6b45ef4e9 100644 --- a/vllm/entrypoints/serve/profile/api_router.py +++ b/vllm/entrypoints/serve/profile/api_router.py @@ -5,7 +5,7 @@ from fastapi import APIRouter, FastAPI, Request from fastapi.responses import Response -import vllm.envs as envs +from vllm.config import ProfilerConfig from vllm.engine.protocol import EngineClient from vllm.logger import init_logger @@ -35,15 +35,12 @@ async def stop_profile(raw_request: Request): def attach_router(app: FastAPI): - if envs.VLLM_TORCH_PROFILER_DIR: + profiler_config = getattr(app.state.args, "profiler_config", None) + assert profiler_config is None or isinstance(profiler_config, ProfilerConfig) + if profiler_config is not None and profiler_config.profiler is not None: logger.warning_once( - "Torch Profiler is enabled in the API server. This should ONLY be " - "used for local development!" + "Profiler with mode '%s' is enabled in the " + "API server. This should ONLY be used for local development!", + profiler_config.profiler, ) - elif envs.VLLM_TORCH_CUDA_PROFILE: - logger.warning_once( - "CUDA Profiler is enabled in the API server. This should ONLY be " - "used for local development!" - ) - if envs.VLLM_TORCH_PROFILER_DIR or envs.VLLM_TORCH_CUDA_PROFILE: app.include_router(router) diff --git a/vllm/envs.py b/vllm/envs.py index 0eb34cbb83963..86d3925710227 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -75,6 +75,7 @@ if TYPE_CHECKING: VLLM_MM_INPUT_CACHE_GIB: int = 4 VLLM_TARGET_DEVICE: str = "cuda" VLLM_MAIN_CUDA_VERSION: str = "12.9" + VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest" MAX_JOBS: str | None = None NVCC_THREADS: str | None = None VLLM_USE_PRECOMPILED: bool = False @@ -88,20 +89,23 @@ if TYPE_CHECKING: VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds VLLM_PLUGINS: list[str] | None = None VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None - VLLM_TORCH_CUDA_PROFILE: bool = False + # Deprecated env variables for profiling, kept for backward compatibility + # See also vllm/config/profiler.py and `--profiler-config` argument + VLLM_TORCH_CUDA_PROFILE: str | None = None VLLM_TORCH_PROFILER_DIR: str | None = None - VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False - VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False - VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: bool = False + VLLM_TORCH_PROFILER_RECORD_SHAPES: str | None = None + VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: str | None = None + VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: str | None = None + VLLM_TORCH_PROFILER_WITH_STACK: str | None = None + VLLM_TORCH_PROFILER_WITH_FLOPS: str | None = None + VLLM_TORCH_PROFILER_USE_GZIP: str | None = None + VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: str | None = None + VLLM_PROFILER_DELAY_ITERS: str | None = None + VLLM_PROFILER_MAX_ITERS: str | None = None + # End of deprecated env variables for profiling VLLM_USE_AOT_COMPILE: bool = False VLLM_USE_BYTECODE_HOOK: bool = False VLLM_FORCE_AOT_LOAD: bool = False - VLLM_TORCH_PROFILER_WITH_STACK: bool = True - VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False - VLLM_PROFILER_DELAY_ITERS: int = 0 - VLLM_PROFILER_MAX_ITERS: int = 0 - VLLM_TORCH_PROFILER_USE_GZIP: bool = True - VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: bool = True VLLM_USE_TRITON_AWQ: bool = False VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_SKIP_P2P_CHECK: bool = False @@ -453,6 +457,14 @@ environment_variables: dict[str, Callable[[], Any]] = { # Main CUDA version of vLLM. This follows PyTorch but can be overridden. "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() or "12.9", + # Controls PyTorch float32 matmul precision mode within vLLM workers. + # Valid options mirror torch.set_float32_matmul_precision + "VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices( + "VLLM_FLOAT32_MATMUL_PRECISION", + "highest", + ["highest", "high", "medium"], + case_sensitive=False, + ), # Maximum number of compilation jobs to run in parallel. # By default this is the number of CPUs "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None), @@ -842,71 +854,52 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv( "VLLM_LORA_RESOLVER_CACHE_DIR", None ), - # Enables torch CUDA profiling if set. - # On NVIDIA GPUs, this will start/stop cudaProfilerApi when triggered. - "VLLM_TORCH_CUDA_PROFILE": lambda: bool( - os.getenv("VLLM_TORCH_CUDA_PROFILE", "0") != "0" - ), + # Enables torch CUDA profiling if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_CUDA_PROFILE": lambda: os.getenv("VLLM_TORCH_CUDA_PROFILE"), # Enables torch profiler if set. - # Both AsyncLLM's CPU traces as well as workers' - # traces (CPU & GPU) will be saved under this directory. - # Note that it must be an absolute path. - "VLLM_TORCH_PROFILER_DIR": lambda: ( - None - if (val := os.getenv("VLLM_TORCH_PROFILER_DIR")) is None - else ( - val - if val.startswith("gs://") and val[5:] and val[5] != "/" - else os.path.abspath(os.path.expanduser(val)) - ) + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_DIR": lambda: os.getenv("VLLM_TORCH_PROFILER_DIR"), + # Enable torch profiler to record shapes if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES") ), - # Enable torch profiler to record shapes if set - # VLLM_TORCH_PROFILER_RECORD_SHAPES=1. If not set, torch profiler will - # not record shapes. - "VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0") != "0" + # Enable torch profiler to profile memory if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY") ), - # Enable torch profiler to profile memory if set - # VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1. If not set, torch profiler - # will not profile memory. - "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0") != "0" + # Enable torch profiler to profile stack if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_WITH_STACK": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_WITH_STACK") ), - # Enable torch profiler to profile stack if set - # VLLM_TORCH_PROFILER_WITH_STACK=1. If not set, torch profiler WILL - # profile stack by default. - "VLLM_TORCH_PROFILER_WITH_STACK": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0" + # Enable torch profiler to profile flops if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS") ), - # Enable torch profiler to profile flops if set - # VLLM_TORCH_PROFILER_WITH_FLOPS=1. If not set, torch profiler will - # not profile flops. - "VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0" - ), - # Disable torch profiling of the AsyncLLMEngine process. - # If set to 1, will not profile the engine process. - "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM", "0") != "0" + # Disable torch profiling of the AsyncLLMEngine process if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM") ), # Delay number of iterations before starting profiling when using # the torch/torch CUDA profiler. If set to 0, will start profiling immediately. - "VLLM_PROFILER_DELAY_ITERS": lambda: int( - os.getenv("VLLM_PROFILER_DELAY_ITERS", "0") - ), + # Deprecated, see profiler_config. + "VLLM_PROFILER_DELAY_ITERS": lambda: (os.getenv("VLLM_PROFILER_DELAY_ITERS")), # Maximum number of iterations to profile when using the torch/torch CUDA profiler. # If set to 0, will not limit the number of iterations. - "VLLM_PROFILER_MAX_ITERS": lambda: int(os.getenv("VLLM_PROFILER_MAX_ITERS", "0")), + "VLLM_PROFILER_MAX_ITERS": lambda: os.getenv("VLLM_PROFILER_MAX_ITERS"), # Control whether torch profiler gzip-compresses profiling files. - # Set VLLM_TORCH_PROFILER_USE_GZIP=0 to disable gzip (enabled by default). - "VLLM_TORCH_PROFILER_USE_GZIP": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_USE_GZIP", "1") != "0" - ), + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_USE_GZIP": lambda: os.getenv("VLLM_TORCH_PROFILER_USE_GZIP"), # Control whether torch profiler dumps the self_cuda_time_total table. - # Set VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0 to disable dumping - # (enabled by default). - "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL", "1") != "0" + # Set to 0 to disable dumping the table. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL") ), # If set, vLLM will use Triton implementations of AWQ. "VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))), diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 173d366267e87..033cc1f544b3b 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -292,7 +292,7 @@ def set_forward_context( if num_tokens_across_dp is None: assert ubatch_slices is None assert num_tokens is not None - _, num_tokens_across_dp = coordinate_batch_across_dp( + _, num_tokens_across_dp, _ = coordinate_batch_across_dp( num_tokens_unpadded=num_tokens, parallel_config=vllm_config.parallel_config, allow_microbatching=False, diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 4cab47f4192a2..b14e7dad77f9a 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -935,7 +935,11 @@ def enable_batch_invariant_mode(): # Batch invariant matmuls are no longer needed after cublas overrides if not is_torch_equal_or_newer("2.10.0.dev"): - if current_platform.is_device_capability(100): + if ( + current_platform.is_device_capability(100) + or current_platform.is_device_capability(80) + or current_platform.is_device_capability(89) + ): # For PyTorch 2.9, B200 uses GEMV for bs=1 # Requires https://github.com/pytorch/pytorch/pull/166735 _batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA") diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index f3c158ee2f9dc..0b83a3f5c4803 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -895,6 +895,48 @@ def get_moe_configs( return None +def _ensure_block_size_k_divisible( + size_k: int, block_size_k: int, group_size: int +) -> int: + """Ensure block_size_k is a divisor of size_k and divisible by group_size. + + This ensures BLOCK_SIZE_K compatibility with MoeWNA16 CUDA kernel which + requires size_k % BLOCK_SIZE_K == 0 and BLOCK_SIZE_K % group_size == 0. + + Args: + size_k: The size_k dimension that must be divisible by result. + block_size_k: Preferred block size (will be adjusted if needed). + group_size: The result must be divisible by this. + + Returns: + A valid BLOCK_SIZE_K that divides size_k and is divisible by group_size. + """ + # Fast path: already valid + if size_k % block_size_k == 0 and block_size_k % group_size == 0: + return block_size_k + + # Find the largest value that: + # 1. Divides size_k (size_k % candidate == 0) + # 2. Is divisible by group_size (candidate % group_size == 0) + # 3. Is <= block_size_k (prefer smaller values close to block_size_k) + # + # Strategy: Search from min(block_size_k, size_k) down to group_size, + # stepping by group_size to ensure divisibility by group_size + max_search = min(block_size_k, size_k) + start = (max_search // group_size) * group_size + for candidate in range(start, group_size - 1, -group_size): + if size_k % candidate == 0: + return candidate + + # Fallback: if group_size divides size_k, use it + # This should always be true with correct group_size configuration + if size_k % group_size == 0: + return group_size + + # This should not happen with correct group_size, but ensure divisibility + return size_k + + def get_moe_wna16_block_config( config: dict[str, int], use_moe_wna16_cuda: bool, @@ -960,6 +1002,9 @@ def get_moe_wna16_block_config( # at the same time. block_size_n = 1024 + # Ensure BLOCK_SIZE_K is a divisor of size_k for CUDA kernel compatibility + block_size_k = _ensure_block_size_k_divisible(size_k, block_size_k, group_size) + return {"BLOCK_SIZE_N": block_size_n, "BLOCK_SIZE_K": block_size_k} diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py index 7ded8eea79060..a5db086fb4729 100644 --- a/vllm/model_executor/layers/quantization/input_quant_fp8.py +++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py @@ -5,6 +5,7 @@ import torch import torch.nn.functional as F from vllm import _custom_ops as ops +from vllm._aiter_ops import rocm_aiter_ops from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.platforms import current_platform @@ -45,10 +46,13 @@ class QuantFP8(CustomOp): super().__init__() self.static = static self.group_shape = group_shape + self.use_per_token_if_dynamic = group_shape == GroupShape.PER_TOKEN self.num_token_padding = num_token_padding self.column_major_scales = column_major_scales self.use_ue8m0 = use_ue8m0 + self.use_aiter = rocm_aiter_ops.is_linear_fp8_enaled() + self.is_group_quant = group_shape.is_per_group() if self.is_group_quant: assert not static, "Group quantization only supports dynamic mode" @@ -92,6 +96,33 @@ class QuantFP8(CustomOp): use_per_token_if_dynamic=self.use_per_token_if_dynamic, ) + def forward_hip( + self, + x: torch.Tensor, + scale: torch.Tensor | None = None, + scale_ub: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + use_aiter_quant = ( + not self.is_group_quant + and self.use_aiter + and scale_ub is None + and x.is_contiguous() + ) + use_aiter_per_tensor_quant = ( + use_aiter_quant and self.group_shape == GroupShape.PER_TENSOR + ) + use_aiter_per_token_quant = ( + use_aiter_quant and self.group_shape == GroupShape.PER_TOKEN + ) + + if use_aiter_per_tensor_quant: + return rocm_aiter_ops.per_tensor_quant(x, _FP8_DTYPE, scale) + if use_aiter_per_token_quant: + return rocm_aiter_ops.per_token_quant(x, _FP8_DTYPE, scale) + + # Fallback to CUDA implementation + return self.forward_cuda(x, scale, scale_ub) + def forward_native( self, x: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index cf348290a2716..8570b8c339987 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -60,7 +60,7 @@ class MoeWNA16Config(QuantizationConfig): if self.linear_quant_method == "gptq": self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible(full_config) - elif self.linear_quant_method == "awq": + elif self.linear_quant_method in ("awq", "awq_marlin"): capability_tuple = current_platform.get_device_capability() device_capability = ( -1 if capability_tuple is None else capability_tuple.to_int() @@ -107,7 +107,7 @@ class MoeWNA16Config(QuantizationConfig): if linear_quant_method == "gptq": has_zp = not cls.get_from_keys(config, ["sym"]) modules_to_not_convert = [] - elif linear_quant_method == "awq": + elif linear_quant_method in ("awq", "awq_marlin"): has_zp = cls.get_from_keys(config, ["zero_point"]) modules_to_not_convert = cls.get_from_keys_or( config, ["modules_to_not_convert"], None @@ -184,7 +184,7 @@ class MoeWNA16Config(QuantizationConfig): return GPTQConfig.from_config(self.full_config).get_quant_method( layer, prefix ) - elif self.linear_quant_method == "awq": + elif self.linear_quant_method in ("awq", "awq_marlin"): if self.use_marlin and check_marlin_supports_layer( layer, self.group_size ): @@ -468,7 +468,8 @@ class MoeWNA16Method(FusedMoEMethodBase): shard_size = layer.intermediate_size_per_partition # convert gptq and awq weight to a standard format - if layer.quant_config.linear_quant_method == "awq": + # awq_marlin uses the same weight format as awq + if layer.quant_config.linear_quant_method in ("awq", "awq_marlin"): assert layer.quant_config.weight_bits == 4 if "weight" in weight_name: loaded_weight = convert_awq_tensor(loaded_weight, "qweight") diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 5a428740082f6..cbc618f1abd08 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -367,6 +367,8 @@ class Qwen2MoeModel(nn.Module): self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens", ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, @@ -512,6 +514,12 @@ class Qwen2MoeModel(nn.Module): continue else: name = remapped_kv_scale_name + # GGUF: make sure that shared_expert_gate is a 2D tensor. + if ( + "mlp.shared_expert_gate" in name + and len(loaded_weight.shape) == 1 + ): + loaded_weight = loaded_weight[None, :] param = params_dict[name] weight_loader = getattr( param, "weight_loader", default_weight_loader diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index ff0fc78517876..f7adecbd88746 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -381,6 +381,8 @@ class RocmPlatform(Platform): compilation_config = vllm_config.compilation_config parallel_config = vllm_config.parallel_config is_eager_execution = compilation_config == CUDAGraphMode.NONE + use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled() + use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enaled() if compilation_config.cudagraph_mode.has_full_cudagraphs(): # decode context parallel does not support full cudagraphs @@ -400,8 +402,6 @@ class RocmPlatform(Platform): ) compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE - use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled() - if cache_config and cache_config.block_size is None: cache_config.block_size = 16 @@ -415,6 +415,9 @@ class RocmPlatform(Platform): ): compilation_config.custom_ops.append("+rms_norm") + if use_aiter_fp8_linear and "-quant_fp8" not in compilation_config.custom_ops: + compilation_config.custom_ops.append("+quant_fp8") + @classmethod def verify_model_arch(cls, model_arch: str) -> None: if model_arch in _ROCM_UNSUPPORTED_MODELS: diff --git a/vllm/profiler/gpu_profiler.py b/vllm/profiler/wrapper.py similarity index 73% rename from vllm/profiler/gpu_profiler.py rename to vllm/profiler/wrapper.py index 798c615221b9f..a44a6a5eea0dd 100644 --- a/vllm/profiler/gpu_profiler.py +++ b/vllm/profiler/wrapper.py @@ -3,26 +3,27 @@ from abc import ABC, abstractmethod from contextlib import nullcontext +from typing import Literal import torch from typing_extensions import override -import vllm.envs as envs +from vllm.config import ProfilerConfig from vllm.logger import init_logger logger = init_logger(__name__) class WorkerProfiler(ABC): - def __init__(self) -> None: - self._delay_iters = envs.VLLM_PROFILER_DELAY_ITERS + def __init__(self, profiler_config: ProfilerConfig) -> None: + self._delay_iters = profiler_config.delay_iterations if self._delay_iters > 0: logger.info_once( "GPU profiling will start " f"{self._delay_iters} steps after start_profile." ) - self._max_iters = envs.VLLM_PROFILER_MAX_ITERS + self._max_iters = profiler_config.max_iterations if self._max_iters > 0: logger.info_once( "GPU profiling will stop " @@ -133,12 +134,27 @@ class WorkerProfiler(ABC): return nullcontext() +TorchProfilerActivity = Literal["CPU", "CUDA", "XPU"] +TorchProfilerActivityMap = { + "CPU": torch.profiler.ProfilerActivity.CPU, + "CUDA": torch.profiler.ProfilerActivity.CUDA, + "XPU": torch.profiler.ProfilerActivity.XPU, +} + + class TorchProfilerWrapper(WorkerProfiler): - def __init__(self, worker_name: str, local_rank: int) -> None: - super().__init__() + def __init__( + self, + profiler_config: ProfilerConfig, + worker_name: str, + local_rank: int, + activities: list[TorchProfilerActivity], + ) -> None: + super().__init__(profiler_config) self.local_rank = local_rank - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + self.profiler_config = profiler_config + torch_profiler_trace_dir = profiler_config.torch_profiler_dir if local_rank in (None, 0): logger.info( "Torch profiling enabled. Traces will be saved to: %s", @@ -147,24 +163,23 @@ class TorchProfilerWrapper(WorkerProfiler): logger.debug( "Profiler config: record_shapes=%s," "profile_memory=%s,with_stack=%s,with_flops=%s", - envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - envs.VLLM_TORCH_PROFILER_WITH_STACK, - envs.VLLM_TORCH_PROFILER_WITH_FLOPS, + profiler_config.torch_profiler_record_shapes, + profiler_config.torch_profiler_with_memory, + profiler_config.torch_profiler_with_stack, + profiler_config.torch_profiler_with_flops, ) + + self.dump_cpu_time_total = "CPU" in activities and len(activities) == 1 self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, - with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, + activities=[TorchProfilerActivityMap[activity] for activity in activities], + record_shapes=profiler_config.torch_profiler_record_shapes, + profile_memory=profiler_config.torch_profiler_with_memory, + with_stack=profiler_config.torch_profiler_with_stack, + with_flops=profiler_config.torch_profiler_with_flops, on_trace_ready=torch.profiler.tensorboard_trace_handler( torch_profiler_trace_dir, worker_name=worker_name, - use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP, + use_gzip=profiler_config.torch_profiler_use_gzip, ), ) @@ -176,9 +191,10 @@ class TorchProfilerWrapper(WorkerProfiler): def _stop(self) -> None: self.profiler.stop() - if envs.VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: - rank = self.local_rank - profiler_dir = envs.VLLM_TORCH_PROFILER_DIR + profiler_config = self.profiler_config + rank = self.local_rank + if profiler_config.torch_profiler_dump_cuda_time_total: + profiler_dir = profiler_config.torch_profiler_dir profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt" sort_key = "self_cuda_time_total" table = self.profiler.key_averages().table(sort_by=sort_key) @@ -189,6 +205,12 @@ class TorchProfilerWrapper(WorkerProfiler): # only print profiler results on rank 0 if rank == 0: print(table) + if self.dump_cpu_time_total and rank == 0: + logger.info( + self.profiler.key_averages().table( + sort_by="self_cpu_time_total", row_limit=50 + ) + ) @override def annotate_context_manager(self, name: str): @@ -196,8 +218,8 @@ class TorchProfilerWrapper(WorkerProfiler): class CudaProfilerWrapper(WorkerProfiler): - def __init__(self) -> None: - super().__init__() + def __init__(self, profiler_config: ProfilerConfig) -> None: + super().__init__(profiler_config) # Note: lazy import to avoid dependency issues if CUDA is not available. import torch.cuda.profiler as cuda_profiler diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index d0661d1f23b06..bf593ca4e52a0 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -63,6 +63,31 @@ class ReasoningParser: True if the reasoning content ends in the input_ids. """ + def is_reasoning_end_streaming( + self, input_ids: list[int], delta_ids: list[int] + ) -> bool: + """ + Check if the reasoning content ends in the input_ids on a + decode step. + + It is used in structured engines like `xgrammar` to check if the + reasoning content ends in the model output during a decode step. + `input_ids` the entire model output and `delta_ids` are the last few + computed tokens of the model output (like during a decode step). + + Parameters: + input_ids: list[int] + The entire model output. + delta_ids: list[int] + The last few computed tokens of the model output at the current decode step. + + Returns: + bool + True if the reasoning content ends in the `delta_ids` on a + decode step. + """ + return self.is_reasoning_end(input_ids) + @abstractmethod def extract_content_ids(self, input_ids: list[int]) -> list[int]: """ diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py index e78ac4a5ebb37..43067ca4afe05 100644 --- a/vllm/reasoning/basic_parsers.py +++ b/vllm/reasoning/basic_parsers.py @@ -74,6 +74,12 @@ class BaseThinkingReasoningParser(ReasoningParser): return True return False + def is_reasoning_end_streaming( + self, input_ids: list[int], delta_ids: list[int] + ) -> bool: + end_token_id = self.end_token_id + return end_token_id in delta_ids + def extract_content_ids(self, input_ids: list[int]) -> list[int]: """ Extract the content after the end tokens diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py index afdf73262aca0..6604f70badbcf 100644 --- a/vllm/reasoning/deepseek_v3_reasoning_parser.py +++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py @@ -35,6 +35,11 @@ class DeepSeekV3ReasoningParser(ReasoningParser): def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: return self._parser.is_reasoning_end(input_ids) + def is_reasoning_end_streaming( + self, input_ids: list[int], delta_ids: list[int] + ) -> bool: + return self._parser.is_reasoning_end_streaming(input_ids, delta_ids) + def extract_content_ids(self, input_ids: list[int]) -> list[int]: return self._parser.extract_content_ids(input_ids) diff --git a/vllm/reasoning/holo2_reasoning_parser.py b/vllm/reasoning/holo2_reasoning_parser.py index 76de1c077c88b..f80190d28d6aa 100644 --- a/vllm/reasoning/holo2_reasoning_parser.py +++ b/vllm/reasoning/holo2_reasoning_parser.py @@ -56,6 +56,11 @@ class Holo2ReasoningParser(ReasoningParser): def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: return self._parser.is_reasoning_end(input_ids) + def is_reasoning_end_streaming( + self, input_ids: list[int], delta_ids: list[int] + ) -> bool: + return self._parser.is_reasoning_end_streaming(input_ids, delta_ids) + def extract_content_ids(self, input_ids: list[int]) -> list[int]: return self._parser.extract_content_ids(input_ids) diff --git a/vllm/reasoning/identity_reasoning_parser.py b/vllm/reasoning/identity_reasoning_parser.py index e92f8add0391a..e998e071efcf6 100644 --- a/vllm/reasoning/identity_reasoning_parser.py +++ b/vllm/reasoning/identity_reasoning_parser.py @@ -32,6 +32,11 @@ class IdentityReasoningParser(ReasoningParser): # Always return True, since we never treat reasoning specially return True + def is_reasoning_end_streaming( + self, input_ids: list[int], delta_ids: list[int] + ) -> bool: + return True + def extract_content_ids(self, input_ids: list[int]) -> list[int]: # Identity: return all tokens as content return input_ids diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index ef0f8d9e67452..8a3500c0aac6b 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -145,7 +145,7 @@ class CudagraphDispatcher: num_tokens: int, uniform_decode: bool, has_lora: bool, - use_cascade_attn: bool = False, + disable_full: bool = False, ) -> tuple[CUDAGraphMode, BatchDescriptor]: """ Given conditions(e.g.,batch descriptor and if using cascade attention), @@ -165,7 +165,7 @@ class CudagraphDispatcher: ) relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs() - if not use_cascade_attn: + if not disable_full: # check if key exists for full cudagraph if batch_desc in self.cudagraph_keys[CUDAGraphMode.FULL]: return CUDAGraphMode.FULL, batch_desc diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index fd7e04dc02082..931d13be3d9b6 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -166,32 +166,24 @@ class AsyncLLM(EngineClient): pass if ( - envs.VLLM_TORCH_PROFILER_DIR - and not envs.VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM + vllm_config.profiler_config.profiler == "torch" + and not vllm_config.profiler_config.ignore_frontend ): + profiler_dir = vllm_config.profiler_config.torch_profiler_dir logger.info( "Torch profiler enabled. AsyncLLM CPU traces will be collected under %s", # noqa: E501 - envs.VLLM_TORCH_PROFILER_DIR, + profiler_dir, ) - if envs.VLLM_PROFILER_MAX_ITERS > 0 or envs.VLLM_PROFILER_DELAY_ITERS > 0: - logger.warning_once( - "Torch profiler received max_iters or delay_iters setting. These " - "are not compatible with the AsyncLLM profiler and will be ignored " - "for the AsyncLLM process. Engine process profiling will still " - "respect these settings. Consider setting " - "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM=1 to disable " - "AsyncLLM profiling." - ) worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm" self.profiler = torch.profiler.profile( activities=[ torch.profiler.ProfilerActivity.CPU, ], - with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, + with_stack=vllm_config.profiler_config.torch_profiler_with_stack, on_trace_ready=torch.profiler.tensorboard_trace_handler( - envs.VLLM_TORCH_PROFILER_DIR, + profiler_dir, worker_name=worker_name, - use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP, + use_gzip=vllm_config.profiler_config.torch_profiler_use_gzip, ), ) else: diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 31428db2d3afc..9f7859a5c3565 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1258,7 +1258,7 @@ class EagleProposer: num_tokens_padded: int, ) -> tuple[int, torch.Tensor]: # TODO(Flechman): support DBO ubatching - should_ubatch, num_toks_across_dp = coordinate_batch_across_dp( + should_ubatch, num_toks_across_dp, _ = coordinate_batch_across_dp( num_tokens_unpadded=num_tokens_unpadded, parallel_config=self.vllm_config.parallel_config, allow_microbatching=False, diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 5ee88178cdf60..4dd478804049b 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -339,7 +339,9 @@ class StructuredOutputManager: return True # Check if reasoning ends in *this* step - if self.reasoner.is_reasoning_end(request.all_token_ids): + if self.reasoner.is_reasoning_end_streaming( + request.all_token_ids, request.all_token_ids[request.num_computed_tokens :] + ): # Reasoning just ended, so we shouldn't advance til # next pass structured_req.reasoning_ended = True diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index b080fea1d2dd6..e54b995ab908f 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -13,6 +13,7 @@ from vllm.logger import init_logger from vllm.model_executor.utils import set_random_seed from vllm.platforms import CpuArchEnum, current_platform from vllm.platforms.cpu import CpuPlatform, LogicalCPUInfo +from vllm.profiler.wrapper import TorchProfilerWrapper from vllm.v1.worker.cpu_model_runner import CPUModelRunner from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment @@ -38,30 +39,17 @@ class CPUWorker(Worker): self.parallel_config.disable_custom_all_reduce = True - # Torch profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + # Torch profiler. Enabled and configured through profiler_config. self.profiler: Any | None = None - if envs.VLLM_TORCH_PROFILER_DIR: - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + profiler_config = vllm_config.profiler_config + if profiler_config.profiler == "torch": worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" - logger.info( - "Profiling enabled. Traces will be saved to: %s", - torch_profiler_trace_dir, + self.profiler = TorchProfilerWrapper( + profiler_config, + worker_name=worker_name, + local_rank=self.local_rank, + activities=["CPU"], ) - self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - ], - record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, - with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, - on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, worker_name=worker_name, use_gzip=False - ), - ) - else: - self.profiler = None def init_device(self): # Setup OpenMP threads affinity. @@ -202,9 +190,3 @@ class CPUWorker(Worker): self.profiler.start() else: self.profiler.stop() - if self.local_rank == 0: - logger.info( - self.profiler.key_averages().table( - sort_by="self_cpu_time_total", row_limit=50 - ) - ) diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py index 5da55d740c347..1b9646e1980a8 100644 --- a/vllm/v1/worker/dp_utils.py +++ b/vllm/v1/worker/dp_utils.py @@ -40,16 +40,18 @@ def _run_ar( should_dp_pad: bool, orig_num_tokens_per_ubatch: int, padded_num_tokens_per_ubatch: int, + cudagraph_mode: int, parallel_config: ParallelConfig, ) -> torch.Tensor: dp_size = parallel_config.data_parallel_size dp_rank = parallel_config.data_parallel_rank device, group = _get_device_and_group(parallel_config) - tensor = torch.zeros(4, dp_size, device=device, dtype=torch.int32) + tensor = torch.zeros(5, dp_size, device=device, dtype=torch.int32) tensor[0][dp_rank] = orig_num_tokens_per_ubatch tensor[1][dp_rank] = padded_num_tokens_per_ubatch tensor[2][dp_rank] = 1 if should_ubatch else 0 tensor[3][dp_rank] = 1 if should_dp_pad else 0 + tensor[4][dp_rank] = cudagraph_mode dist.all_reduce(tensor, group=group) return tensor @@ -89,13 +91,23 @@ def _post_process_dp_padding(tensor: torch.Tensor, should_dp_pad: bool) -> torch return num_tokens_across_dp.cpu() +def _post_process_cudagraph_mode(tensor: torch.Tensor) -> int: + """ + Synchronize cudagraph_mode across DP ranks by taking the minimum. + If any rank has NONE (0), all ranks use NONE. + This ensures all ranks send consistent values (all padded or all unpadded). + """ + return int(tensor[4, :].min().item()) + + def _synchronize_dp_ranks( num_tokens_unpadded: int, num_tokens_padded: int, should_attempt_ubatching: bool, should_attempt_dp_padding: bool, + cudagraph_mode: int, parallel_config: ParallelConfig, -) -> tuple[bool, torch.Tensor | None]: +) -> tuple[bool, torch.Tensor | None, int]: """ 1. Decides if each DP rank is going to microbatch. Either all ranks run with microbatching or none of them do. @@ -104,10 +116,13 @@ def _synchronize_dp_ranks( When running microbatched or if should_attempt_dp_padding is True, all ranks will be padded out so that the run with the same number of tokens + 3. Synchronizes cudagraph_mode across ranks by taking the minimum. + Returns: tuple[ should_ubatch: Are all DP ranks going to microbatch num_tokens_after_padding: A tensor containing the total number of tokens per-microbatch for each DP rank including any DP padding. + synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks) ] """ @@ -121,6 +136,7 @@ def _synchronize_dp_ranks( should_dp_pad=should_attempt_dp_padding, orig_num_tokens_per_ubatch=num_tokens_unpadded, padded_num_tokens_per_ubatch=num_tokens_padded, + cudagraph_mode=cudagraph_mode, parallel_config=parallel_config, ) @@ -148,7 +164,10 @@ def _synchronize_dp_ranks( should_dp_pad, ) - return should_ubatch, num_tokens_after_padding + # Synchronize cudagraph_mode across ranks (take min) + synced_cudagraph_mode = _post_process_cudagraph_mode(tensor) + + return should_ubatch, num_tokens_after_padding, synced_cudagraph_mode def coordinate_batch_across_dp( @@ -159,7 +178,8 @@ def coordinate_batch_across_dp( num_tokens_padded: int | None = None, uniform_decode: bool | None = None, num_scheduled_tokens_per_request: np.ndarray | None = None, -) -> tuple[bool, torch.Tensor | None]: + cudagraph_mode: int = 0, +) -> tuple[bool, torch.Tensor | None, int]: """ Coordinates amongst all DP ranks to determine if and how the full batch should be split into microbatches. @@ -175,6 +195,7 @@ def coordinate_batch_across_dp( only contains single token decodes num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The number of tokens per request. + cudagraph_mode: The cudagraph mode for this rank (0=NONE, 1=PIECEWISE, 2=FULL) Returns: tuple[ ubatch_slices: if this is set then all DP ranks have agreed to @@ -183,12 +204,13 @@ def coordinate_batch_across_dp( tokens per-microbatch for each DP rank including padding. Will be padded up to the max value across all DP ranks when allow_dp_padding is True. + synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks) ] """ if parallel_config.data_parallel_size == 1: # Early exit. - return False, None + return False, None, cudagraph_mode # If the caller has explicitly enabled microbatching. should_attempt_ubatching = False @@ -204,12 +226,15 @@ def coordinate_batch_across_dp( if num_tokens_padded is None: num_tokens_padded = num_tokens_unpadded - (should_ubatch, num_tokens_after_padding) = _synchronize_dp_ranks( - num_tokens_unpadded, - num_tokens_padded, - should_attempt_ubatching, - allow_dp_padding, - parallel_config, + (should_ubatch, num_tokens_after_padding, synced_cudagraph_mode) = ( + _synchronize_dp_ranks( + num_tokens_unpadded, + num_tokens_padded, + should_attempt_ubatching, + allow_dp_padding, + cudagraph_mode, + parallel_config, + ) ) - return (should_ubatch, num_tokens_after_padding) + return (should_ubatch, num_tokens_after_padding, synced_cudagraph_mode) diff --git a/vllm/v1/worker/gpu/async_utils.py b/vllm/v1/worker/gpu/async_utils.py index f6bc607c1ae67..a2e3decad0486 100644 --- a/vllm/v1/worker/gpu/async_utils.py +++ b/vllm/v1/worker/gpu/async_utils.py @@ -2,14 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import contextmanager +import numpy as np import torch from vllm.v1.outputs import ( AsyncModelRunnerOutput, LogprobsTensors, ModelRunnerOutput, - SamplerOutput, ) +from vllm.v1.worker.gpu.sample.output import SamplerOutput class AsyncOutput(AsyncModelRunnerOutput): @@ -34,29 +35,18 @@ class AsyncOutput(AsyncModelRunnerOutput): with torch.cuda.stream(self.copy_stream): self.copy_stream.wait_stream(default_stream) - # NOTE(woosuk): We must ensure that CPU tensors are not freed - # before the device-to-host copy is fully completed. For instance, - # operations like - # self.sampled_token_np = ...to("cpu", non_blocking=True).numpy() - # are unsafe because the underlying CPU tensor can be prematurely freed and - # reused by other tensors before the asynchronous copy finishes, potentially - # causing race conditions. To prevent this, we delay freeing by holding - # references until the copy event signals completion. - # Likewise, we also need to keep the reference to the GPU tensors. - # This is done by keeping the reference to sampler_output and - # model_runner_output. - self.sampled_token_ids = sampler_output.sampled_token_ids.to( - "cpu", non_blocking=True - ) + self.sampled_token_ids = async_copy_to_np(sampler_output.sampled_token_ids) if sampler_output.logprobs_tensors is not None: self.logprobs_tensors: LogprobsTensors | None = ( sampler_output.logprobs_tensors.to_cpu_nonblocking() ) else: self.logprobs_tensors = None - self.num_sampled_tokens_cpu = num_sampled_tokens.to( - "cpu", non_blocking=True - ) + if sampler_output.num_nans is not None: + self.num_nans = async_copy_to_np(sampler_output.num_nans) + else: + self.num_nans = None + self.num_sampled_tokens_np = async_copy_to_np(num_sampled_tokens) self.prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {} if self.model_runner_output.prompt_logprobs_dict: for k, v in self.model_runner_output.prompt_logprobs_dict.items(): @@ -68,7 +58,6 @@ class AsyncOutput(AsyncModelRunnerOutput): def get_output(self) -> ModelRunnerOutput: self.copy_event.synchronize() - num_sampled_tokens_np = self.num_sampled_tokens_cpu.numpy() # NOTE(woosuk): The following code is to ensure compatibility with # the existing model runner. @@ -76,10 +65,18 @@ class AsyncOutput(AsyncModelRunnerOutput): # rather than Python lists. sampled_token_ids: list[list[int]] = self.sampled_token_ids.tolist() num_reqs = len(sampled_token_ids) + num_sampled_tokens = self.num_sampled_tokens_np.tolist() for i in range(num_reqs): - del sampled_token_ids[i][num_sampled_tokens_np[i] :] + del sampled_token_ids[i][num_sampled_tokens[i] :] self.model_runner_output.sampled_token_ids = sampled_token_ids + if self.num_nans is not None: + num_nans = self.num_nans.tolist() + self.model_runner_output.num_nans_in_logits = { + req_id: num_nans[i] + for i, req_id in enumerate(self.model_runner_output.req_ids) + } + if self.logprobs_tensors is not None: self.model_runner_output.logprobs = self.logprobs_tensors.tolists() self.model_runner_output.prompt_logprobs_dict = self.prompt_logprobs_dict @@ -95,3 +92,7 @@ def async_barrier(event: torch.cuda.Event | None): finally: if event is not None: event.record() + + +def async_copy_to_np(x: torch.Tensor) -> np.ndarray: + return x.to("cpu", non_blocking=True).numpy() diff --git a/vllm/v1/worker/gpu/metrics/__init__.py b/vllm/v1/worker/gpu/metrics/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/worker/gpu/metrics/logits.py b/vllm/v1/worker/gpu/metrics/logits.py new file mode 100644 index 0000000000000..fd7b30beaa1f8 --- /dev/null +++ b/vllm/v1/worker/gpu/metrics/logits.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch +from torch._inductor.runtime.triton_helpers import libdevice + +from vllm.triton_utils import tl, triton + + +@triton.jit +def _num_nans_kernel( + logits_ptr, + logits_stride, + num_nans_ptr, + vocab_size, + BLOCK_SIZE: tl.constexpr, +): + req_idx = tl.program_id(0) + num_nans = 0 + for i in range(0, vocab_size, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + mask = block < vocab_size + logits = tl.load( + logits_ptr + req_idx * logits_stride + block, mask=mask, other=0 + ) + logits = logits.to(tl.float32) + is_nan = libdevice.isnan(logits).to(tl.int1) + num_nans += tl.sum(is_nan).to(tl.int32) + tl.store(num_nans_ptr + req_idx, num_nans) + + +def get_num_nans(logits: torch.Tensor) -> torch.Tensor: + num_reqs, vocab_size = logits.shape + BLOCK_SIZE = 8192 + num_nans = torch.empty(num_reqs, dtype=torch.int32, device=logits.device) + _num_nans_kernel[(num_reqs,)]( + logits, + logits.stride(0), + num_nans, + vocab_size, + BLOCK_SIZE=BLOCK_SIZE, + ) + return num_nans diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index 464f7b7bd3532..9f4c6edfb6aa9 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -25,7 +25,6 @@ from vllm.v1.outputs import ( LogprobsTensors, ModelRunnerOutput, ) -from vllm.v1.sample.sampler import SamplerOutput from vllm.v1.worker.gpu.async_utils import AsyncOutput, async_barrier from vllm.v1.worker.gpu.attn_utils import ( build_attn_metadata, @@ -53,6 +52,7 @@ from vllm.v1.worker.gpu.sample.metadata import ( SamplingMetadata, expand_sampling_metadata, ) +from vllm.v1.worker.gpu.sample.output import SamplerOutput from vllm.v1.worker.gpu.sample.sampler import Sampler from vllm.v1.worker.gpu.spec_decode import init_speculator from vllm.v1.worker.gpu.spec_decode.rejection_sample import rejection_sample diff --git a/vllm/v1/worker/gpu/sample/min_p.py b/vllm/v1/worker/gpu/sample/min_p.py index 0638818006f50..c98a42cb2b1bb 100644 --- a/vllm/v1/worker/gpu/sample/min_p.py +++ b/vllm/v1/worker/gpu/sample/min_p.py @@ -39,9 +39,7 @@ def _min_p_kernel( tl.store(logits_ptr + req_idx * logits_stride + block, logits, mask=mask) -def apply_min_p(logits: torch.Tensor, min_p: torch.Tensor | None) -> None: - if min_p is None: - return +def apply_min_p(logits: torch.Tensor, min_p: torch.Tensor) -> None: num_reqs, vocab_size = logits.shape BLOCK_SIZE = 1024 _min_p_kernel[(num_reqs,)]( diff --git a/vllm/v1/worker/gpu/sample/output.py b/vllm/v1/worker/gpu/sample/output.py new file mode 100644 index 0000000000000..13e8cf1d6c1ec --- /dev/null +++ b/vllm/v1/worker/gpu/sample/output.py @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass + +import torch + +from vllm.v1.outputs import LogprobsTensors + + +@dataclass +class SamplerOutput: + sampled_token_ids: torch.Tensor + logprobs_tensors: LogprobsTensors | None + num_nans: torch.Tensor | None diff --git a/vllm/v1/worker/gpu/sample/penalties.py b/vllm/v1/worker/gpu/sample/penalties.py index c8d4b7d81841d..b4fcc822ecfce 100644 --- a/vllm/v1/worker/gpu/sample/penalties.py +++ b/vllm/v1/worker/gpu/sample/penalties.py @@ -62,6 +62,7 @@ def _penalties_and_temperature_kernel( mask=packed_block < tl.cdiv(vocab_size, 32), ) prompt_bin_mask = (packed_mask[:, None] >> (tl.arange(0, 32)[None, :])) & 1 + prompt_bin_mask = prompt_bin_mask.to(tl.int1) prompt_bin_mask = prompt_bin_mask.reshape(BLOCK_SIZE) # If token appears in prompt or output, apply, otherwise use 1.0 for no-op. diff --git a/vllm/v1/worker/gpu/sample/sampler.py b/vllm/v1/worker/gpu/sample/sampler.py index 9a4224d8fddef..84a3e18671b2c 100644 --- a/vllm/v1/worker/gpu/sample/sampler.py +++ b/vllm/v1/worker/gpu/sample/sampler.py @@ -3,13 +3,15 @@ import torch +import vllm.envs as envs from vllm.config.model import LogprobsMode -from vllm.v1.outputs import SamplerOutput from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p +from vllm.v1.worker.gpu.metrics.logits import get_num_nans from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample from vllm.v1.worker.gpu.sample.logprob import compute_topk_logprobs from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata from vllm.v1.worker.gpu.sample.min_p import apply_min_p +from vllm.v1.worker.gpu.sample.output import SamplerOutput from vllm.v1.worker.gpu.sample.penalties import apply_penalties_and_temperature @@ -21,12 +23,16 @@ class Sampler: if logprobs_mode not in ["processed_logprobs", "raw_logprobs"]: raise NotImplementedError(f"Unsupported logprobs_mode: {logprobs_mode}") self.logprobs_mode = logprobs_mode + self.compute_nans = envs.VLLM_COMPUTE_NANS_IN_LOGITS # False by default. def __call__( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> SamplerOutput: + # NOTE(woosuk): We intentionally compute num_nans before sampling to make clear + # that num_nans is computed before applying penalties and temperature. + num_nans = get_num_nans(logits) if self.compute_nans else None sampled, processed_logits = self.sample(logits, sampling_metadata) if sampling_metadata.max_num_logprobs is not None: logits = ( @@ -49,6 +55,7 @@ class Sampler: # token per request. sampled_token_ids=sampled.view(-1, 1), logprobs_tensors=logprobs_tensors, + num_nans=num_nans, ) return sampler_output @@ -63,7 +70,8 @@ class Sampler: # Apply penalties and temperature in place. apply_penalties_and_temperature(logits, sampling_metadata) # Apply min_p in place. - apply_min_p(logits, sampling_metadata.min_p) + if sampling_metadata.min_p is not None: + apply_min_p(logits, sampling_metadata.min_p) # Apply top_k and/or top_p. This might return a new tensor. logits = apply_top_k_top_p( logits, sampling_metadata.top_k, sampling_metadata.top_p diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 22a3f9d8d2dda..7398defd74a38 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2788,17 +2788,19 @@ class GPUModelRunner( ) dispatch_cudagraph = ( - lambda num_tokens: self.cudagraph_dispatcher.dispatch( + lambda num_tokens, disable_full: self.cudagraph_dispatcher.dispatch( num_tokens=num_tokens, has_lora=has_lora, - use_cascade_attn=use_cascade_attn, uniform_decode=uniform_decode, + disable_full=disable_full, ) if not force_eager else (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded)) ) - cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded) + cudagraph_mode, batch_descriptor = dispatch_cudagraph( + num_tokens_padded, use_cascade_attn + ) num_tokens_padded = batch_descriptor.num_tokens # Extra coordination when running data-parallel since we need to coordinate @@ -2813,23 +2815,28 @@ class GPUModelRunner( self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE ) - should_ubatch, num_tokens_across_dp = coordinate_batch_across_dp( - num_tokens_unpadded=num_tokens, - parallel_config=self.parallel_config, - allow_microbatching=allow_microbatching, - allow_dp_padding=allow_dp_padding, - num_tokens_padded=num_tokens_padded, - uniform_decode=uniform_decode, - num_scheduled_tokens_per_request=num_scheduled_tokens_np, + should_ubatch, num_tokens_across_dp, synced_cudagraph_mode = ( + coordinate_batch_across_dp( + num_tokens_unpadded=num_tokens, + parallel_config=self.parallel_config, + allow_microbatching=allow_microbatching, + allow_dp_padding=allow_dp_padding, + num_tokens_padded=num_tokens_padded, + uniform_decode=uniform_decode, + num_scheduled_tokens_per_request=num_scheduled_tokens_np, + cudagraph_mode=cudagraph_mode.value, + ) ) - # Extract DP padding if there is any + # Extract DP-synced values if num_tokens_across_dp is not None: dp_rank = self.parallel_config.data_parallel_rank num_tokens_padded = int(num_tokens_across_dp[dp_rank].item()) - - # Re-dispatch with DP padding - cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded) + # Re-dispatch with DP padding so we have the correct batch_descriptor + cudagraph_mode, batch_descriptor = dispatch_cudagraph( + num_tokens_padded, + disable_full=synced_cudagraph_mode <= CUDAGraphMode.PIECEWISE.value, + ) # Assert to make sure the agreed upon token count is correct otherwise # num_tokens_across_dp will no-longer be valid assert batch_descriptor.num_tokens == num_tokens_padded @@ -4161,10 +4168,19 @@ class GPUModelRunner( if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) + # Eagle currently only supports PIECEWISE cudagraphs. + # Therefore only use cudagraphs if the main model uses PIECEWISE + # NOTE(lucas): this is a hack, need to clean up. use_cudagraphs = ( - cudagraph_runtime_mode.has_mode(CUDAGraphMode.PIECEWISE) - and not self.speculative_config.enforce_eager - ) + ( + is_graph_capturing + and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE + ) + or ( + not is_graph_capturing + and cudagraph_runtime_mode != CUDAGraphMode.NONE + ) + ) and not self.speculative_config.enforce_eager # Note(gnovack) - We need to disable cudagraphs for one of the two # lora cases when cudagraph_specialize_lora is enabled. This is a diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index a46ec2bd118fe..f2b6a1f76b0b9 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -38,7 +38,7 @@ from vllm.model_executor import set_random_seed from vllm.model_executor.models.interfaces import is_mixture_of_experts from vllm.model_executor.warmup.kernel_warmup import kernel_warmup from vllm.platforms import current_platform -from vllm.profiler.gpu_profiler import CudaProfilerWrapper, TorchProfilerWrapper +from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper from vllm.sequence import IntermediateTensors from vllm.tasks import SupportedTask from vllm.utils.mem_constants import GiB_bytes @@ -79,6 +79,10 @@ class Worker(WorkerBase): is_driver_worker=is_driver_worker, ) + # configure float32 matmul precision according to vLLM env. + precision = envs.VLLM_FLOAT32_MATMUL_PRECISION + torch.set_float32_matmul_precision(precision) + if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing from vllm.utils.import_utils import init_cached_hf_modules @@ -88,17 +92,19 @@ class Worker(WorkerBase): # Buffers saved before sleep self._sleep_saved_buffers: dict[str, torch.Tensor] = {} - # Torch/CUDA profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace - # VLLM_TORCH_CUDA_PROFILE=1 + # Torch/CUDA profiler. Enabled and configured through profiler_config. self.profiler: Any | None = None - if envs.VLLM_TORCH_PROFILER_DIR: + profiler_config = vllm_config.profiler_config + if profiler_config.profiler == "torch": worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" self.profiler = TorchProfilerWrapper( - worker_name=worker_name, local_rank=self.local_rank + profiler_config, + worker_name=worker_name, + local_rank=self.local_rank, + activities=["CPU", "CUDA"], ) - elif envs.VLLM_TORCH_CUDA_PROFILE: - self.profiler = CudaProfilerWrapper() + elif profiler_config.profiler == "cuda": + self.profiler = CudaProfilerWrapper(profiler_config) else: self.profiler = None diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index ce18ca6c37165..7a10ac198985e 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -98,10 +98,10 @@ class TPUWorker: # MP runtime is initialized. self.profiler = None self.profile_dir = None - if envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1: + if vllm_config.profiler_config.profiler == "torch" and self.rank < 1: # For TPU, we can only have 1 active profiler session for 1 profiler # server. So we only profile on rank0. - self.profile_dir = envs.VLLM_TORCH_PROFILER_DIR + self.profile_dir = vllm_config.profiler_config.torch_profiler_dir logger.info( "Profiling enabled. Traces will be saved to: %s", self.profile_dir ) diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index 267369c730368..1faa1a24ff0ea 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -6,12 +6,12 @@ from typing import Any import torch import torch.distributed -import vllm.envs as envs from vllm.config import VllmConfig from vllm.distributed import get_world_group from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.platforms import current_platform +from vllm.profiler.wrapper import TorchProfilerWrapper from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment from vllm.v1.worker.xpu_model_runner import XPUModelRunner @@ -36,41 +36,17 @@ class XPUWorker(Worker): assert device_config.device_type == "xpu" assert current_platform.is_xpu() - # Torch profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + # Torch profiler. Enabled and configured through profiler_config. self.profiler: Any | None = None - if envs.VLLM_TORCH_PROFILER_DIR: - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + profiler_config = vllm_config.profiler_config + if profiler_config.profiler == "torch": worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" - logger.info( - "Profiling enabled. Traces will be saved to: %s", - torch_profiler_trace_dir, + self.profiler = TorchProfilerWrapper( + profiler_config, + worker_name=worker_name, + local_rank=self.local_rank, + activities=["CPU", "XPU"], ) - logger.debug( - "Profiler config: record_shapes=%s," - "profile_memory=%s,with_stack=%s,with_flops=%s", - envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - envs.VLLM_TORCH_PROFILER_WITH_STACK, - envs.VLLM_TORCH_PROFILER_WITH_FLOPS, - ) - self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.XPU, - ], - record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, - with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, - on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, - worker_name=worker_name, - use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP, - ), - ) - else: - self.profiler = None # we provide this function due to `torch.xpu.mem_get_info()` doesn't # return correct free_gpu_memory on intel client GPU. We need to