diff --git a/.buildkite/scripts/ci-clean-log.sh b/.buildkite/scripts/ci-clean-log.sh new file mode 100644 index 0000000000000..69d8a3a288316 --- /dev/null +++ b/.buildkite/scripts/ci-clean-log.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Usage: ./ci_clean_log.sh ci.log +# This script strips timestamps and color codes from CI log files. + +# Check if argument is given +if [ $# -lt 1 ]; then + echo "Usage: $0 ci.log" + exit 1 +fi + +INPUT_FILE="$1" + +# Strip timestamps +sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE" + +# Strip colorization +sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE" diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 61aa7df13b4d5..8db8c3a05fb30 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -24,13 +24,22 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . # Run the image, setting --shm-size=4g for tensor parallel. 
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" -docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 function cpu_tests() { set -e export NUMA_NODE=$2 + # list packages + docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " + set -e + pip list" + + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pip list" + # offline inference docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " set -e @@ -43,7 +52,10 @@ function cpu_tests() { pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model pytest -v -s tests/models/language/generation -m cpu_model pytest -v -s tests/models/language/pooling -m cpu_model - pytest -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_mllama.py -m cpu_model" + pytest -v 
-s tests/models/multimodal/generation \ + --ignore=tests/models/multimodal/generation/test_mllama.py \ + --ignore=tests/models/multimodal/generation/test_pixtral.py \ + -m cpu_model" # Run compressed-tensor test docker exec cpu-test-"$NUMA_NODE" bash -c " @@ -69,7 +81,7 @@ function cpu_tests() { set -e python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 - python3 benchmarks/benchmark_serving.py \ + VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \ --backend vllm \ --dataset-name random \ --model facebook/opt-125m \ diff --git a/.buildkite/scripts/rerun-test.sh b/.buildkite/scripts/rerun-test.sh new file mode 100644 index 0000000000000..d79c0d5f381b1 --- /dev/null +++ b/.buildkite/scripts/rerun-test.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# Usage: ./rerun_test.sh path/to/test.py::test_name + +# Check if argument is given +if [ $# -lt 1 ]; then + echo "Usage: $0 path/to/test.py::test_name" + echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]" + exit 1 +fi + +TEST=$1 +COUNT=1 + +while pytest -sv "$TEST"; do + COUNT=$((COUNT + 1)) + echo "RUN NUMBER ${COUNT}" +done diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b739851cb9052..8f39862708689 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -177,6 +177,11 @@ steps: - tests/tracing commands: - pytest -v -s metrics + - "pip install \ + 'opentelemetry-sdk>=1.26.0' \ + 'opentelemetry-api>=1.26.0' \ + 'opentelemetry-exporter-otlp>=1.26.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1'" - pytest -v -s tracing ##### fast check tests ##### @@ -305,6 +310,7 @@ steps: commands: - pytest -v -s compile/test_pass_manager.py - pytest -v -s compile/test_fusion.py + - pytest -v -s compile/test_fusion_attn.py - pytest -v -s compile/test_silu_mul_quant_fusion.py - pytest -v -s 
compile/test_sequence_parallelism.py - pytest -v -s compile/test_async_tp.py @@ -669,7 +675,7 @@ steps: - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins - label: Multi-step Tests (4 GPUs) # 36min - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml index f05be2ba8707a..8c5c28cd77cff 100644 --- a/.github/ISSUE_TEMPLATE/400-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml @@ -8,6 +8,16 @@ body: attributes: value: > #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: markdown + attributes: + value: | + ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as: + - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys) + - Passwords or authentication credentials + - Private URLs or endpoints + - Personal or confidential data + + Consider redacting or replacing sensitive values with placeholders like `` when sharing configuration or code examples. - type: textarea attributes: label: Your current environment diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index c1d1e07bf628f..017ec7ca82da7 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -2,6 +2,7 @@ - [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". - [ ] The test plan, such as providing test command. 
- [ ] The test results, such as pasting the results comparison before and after, or e2e results +- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model. PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED. @@ -11,5 +12,7 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE B ## Test Result +## (Optional) Documentation Update + **BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) diff --git a/.github/mergify.yml b/.github/mergify.yml index e595060c325a5..20b4a8fc2dbc3 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -36,6 +36,20 @@ pull_request_rules: add: - frontend +- name: label-llama + description: Automatically apply llama label + conditions: + - or: + - files~=^examples/.*llama.*\.py + - files~=^tests/.*llama.*\.py + - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py + - files~=^vllm/model_executor/models/.*llama.*\.py + - files~=^vllm/transformers_utils/configs/.*llama.*\.py + actions: + label: + add: + - llama + - name: label-multi-modality description: Automatically apply multi-modality label conditions: @@ -51,6 +65,26 @@ pull_request_rules: add: - multi-modality +- name: label-rocm + description: Automatically apply rocm label + conditions: + - or: + - files~=^csrc/rocm/ + - files~=^docker/Dockerfile.rocm + - files~=^requirements/rocm.*\.txt + - files~=^vllm/attention/backends/rocm.*\.py + - files~=^vllm/attention/ops/rocm.*\.py + - files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py + - files~=^vllm/v1/attention/backends/mla/rocm.*\.py + - files~=^tests/kernels/.*_rocm.*\.py + - files=vllm/platforms/rocm.py + - title~=(?i)AMD + - title~=(?i)ROCm + actions: + label: + add: + - rocm + - name: label-structured-output description: Automatically apply structured-output label conditions: diff --git a/.gitignore b/.gitignore index 
e49d1d6ba6191..88a42a5c0f644 100644 --- a/.gitignore +++ b/.gitignore @@ -200,5 +200,5 @@ benchmarks/**/*.json actionlint shellcheck*/ -# Ingore moe/marlin_moe gen code +# Ignore moe/marlin_moe gen code csrc/moe/marlin_moe_wna16/kernel_* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a105b0e14c4af..7534ae55907e6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,12 +20,10 @@ repos: args: [--output-format, github, --fix] - id: ruff-format files: ^(.buildkite|benchmarks|examples)/.* -- repo: https://github.com/codespell-project/codespell - rev: v2.4.1 +- repo: https://github.com/crate-ci/typos + rev: v1.32.0 hooks: - - id: codespell - additional_dependencies: ['tomli'] - args: ['--toml', 'pyproject.toml'] + - id: typos - repo: https://github.com/PyCQA/isort rev: 6.0.1 hooks: @@ -145,6 +143,13 @@ repos: types: [python] pass_filenames: false additional_dependencies: [regex] + - id: check-pickle-imports + name: Prevent new pickle/cloudpickle imports + entry: python tools/check_pickle_imports.py + language: python + types: [python] + pass_filenames: false + additional_dependencies: [pathspec, regex] # Keep `suggestion` last - id: suggestion name: Suggestion diff --git a/CMakeLists.txt b/CMakeLists.txt index afaed7cd18214..d75f0d3212476 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -308,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. # 9.0 for latest bf16 atomicAdd PTX - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}") if (MARLIN_ARCHS) # @@ -454,7 +454,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # kernels for the remaining archs that are not already built for 3x. 
# (Build 8.9 for FP8) cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS - "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}") + "7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) if (SCALED_MM_2X_ARCHS) @@ -542,10 +542,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # CUTLASS MoE kernels - # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works + # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled # if it's possible to compile MoE kernels that use its output. - cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu" "csrc/quantization/cutlass_w8a8/moe/moe_data.cu") @@ -684,7 +684,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") # 9.0 for latest bf16 atomicAdd PTX - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}") if (MARLIN_MOE_ARCHS) # diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index c06857247eeed..4d2ea126b24a5 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -123,7 +123,7 @@ def main(args: argparse.Namespace): save_to_pytorch_benchmark_format(args, results) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the latency of processing a single batch of " "requests till completion." @@ -171,6 +171,12 @@ if __name__ == "__main__": # V1 enables prefix caching by default which skews the latency # numbers. We need to disable prefix caching by default. 
parser.set_defaults(enable_prefix_caching=False) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: raise OSError( diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 00869fa94e71a..6e0f3b51c9d28 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -142,7 +142,7 @@ def main(args): ) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the performance with or " "without automatic prefix caching." @@ -192,5 +192,11 @@ if __name__ == "__main__": ) parser = EngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 3e4704f0b8205..b5e2613de1cd4 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -218,7 +218,7 @@ def main(args): ) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the performance with or without " "automatic prefix caching." 
@@ -268,5 +268,11 @@ if __name__ == "__main__": ) parser = EngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 5496703f23ccb..bb453791c1862 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -161,7 +161,7 @@ def main(args: argparse.Namespace): json.dump(results, f, indent=4) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument( "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm" @@ -204,6 +204,12 @@ if __name__ == "__main__": ) parser = EngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 81428fb7dae12..f38e45b261138 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -875,7 +875,7 @@ def main(args: argparse.Namespace): save_to_pytorch_benchmark_format(args, result_json, file_name) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the online serving throughput." 
) @@ -1225,6 +1225,10 @@ if __name__ == "__main__": "script chooses a LoRA module at random.", ) - args = parser.parse_args() + return parser + +if __name__ == "__main__": + parser = create_argument_parser() + args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index c1501ad52c25a..e23a5a9e2233d 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -850,7 +850,7 @@ def main(args: argparse.Namespace): json.dump(results, outfile, indent=4) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser( description="Benchmark the online serving throughput." ) @@ -1034,5 +1034,10 @@ if __name__ == "__main__": help="Ratio of Structured Outputs requests", ) + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index d19753d40e497..401ebe0bdb265 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -595,7 +595,7 @@ def validate_args(args): ) -if __name__ == "__main__": +def create_argument_parser(): parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument( "--backend", @@ -717,6 +717,12 @@ if __name__ == "__main__": ) parser = AsyncEngineArgs.add_cli_args(parser) + + return parser + + +if __name__ == "__main__": + parser = create_argument_parser() args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/bench_fp8_gemm.py index b964ed242edf8..d17443871cf66 100644 --- a/benchmarks/kernels/bench_fp8_gemm.py +++ b/benchmarks/kernels/bench_fp8_gemm.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project import argparse import copy import itertools @@ -11,6 +10,80 @@ from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant from vllm.triton_utils import triton +PROVIDER_CFGS = { + "torch-bf16": dict(enabled=True), + "fp8-tensor-w-token-a": dict( + w="tensor", a="token", no_a_quant=False, enabled=False + ), + "fp8-tensor-w-tensor-a": dict( + w="tensor", a="tensor", no_a_quant=False, enabled=True + ), + "fp8-channel-w-token-a": dict( + w="channel", a="token", no_a_quant=False, enabled=True + ), + "fp8-channel-w-tensor-a": dict( + w="channel", a="tensor", no_a_quant=False, enabled=False + ), + "fp8-tensor-w-token-a-noquant": dict( + w="tensor", a="token", no_a_quant=True, enabled=False + ), + "fp8-tensor-w-tensor-a-noquant": dict( + w="tensor", a="tensor", no_a_quant=True, enabled=True + ), + "fp8-channel-w-token-a-noquant": dict( + w="channel", a="token", no_a_quant=True, enabled=True + ), + "fp8-channel-w-tensor-a-noquant": dict( + w="channel", a="tensor", no_a_quant=True, enabled=False + ), +} + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] + + +def _quant_weight_fp8(b: torch.Tensor, w_type: str, device: str): + if w_type == "tensor": + scale_b = torch.ones(1, device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + else: + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, use_per_token_if_dynamic=True) + return b_fp8.t(), scale_b_fp8 + + +def build_fp8_runner(cfg, a, b, dtype, device): + b_fp8, scale_b_fp8 = _quant_weight_fp8(b, cfg["w"], device) + + scale_a_const = ( + torch.ones(1, device=device, dtype=torch.float32) + if cfg["a"] == "tensor" + else None + ) + + if cfg["no_a_quant"]: + if cfg["a"] == "tensor": + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const) + else: + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True) + + def run(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, 
scale_b_fp8, dtype) + + return run + + if cfg["a"] == "tensor": + + def run(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + else: + + def run(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + return run + @triton.testing.perf_report( triton.testing.Benchmark( @@ -18,28 +91,8 @@ from vllm.triton_utils import triton x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], x_log=False, line_arg="provider", - line_vals=[ - "torch-bf16", - # "fp8-tensor-w-token-a", - "fp8-tensor-w-tensor-a", - "fp8-channel-w-token-a", - # "fp8-channel-w-tensor-a", - # "fp8-tensor-w-token-a-noquant", - "fp8-tensor-w-tensor-a-noquant", - "fp8-channel-w-token-a-noquant", - # "fp8-channel-w-tensor-a-noquant", - ], - line_names=[ - "torch-bf16", - # "fp8-tensor-w-token-a", - "fp8-tensor-w-tensor-a", - "fp8-channel-w-token-a", - # "fp8-channel-w-tensor-a", - # "fp8-tensor-w-token-a-noquant", - "fp8-tensor-w-tensor-a-noquant", - "fp8-channel-w-token-a-noquant", - # "fp8-channel-w-tensor-a-noquant", - ], + line_vals=_enabled, + line_names=_enabled, ylabel="TFLOP/s (larger is better)", plot_name="BF16 vs FP8 GEMMs", args={}, @@ -50,144 +103,34 @@ def benchmark(batch_size, provider, N, K): device = "cuda" dtype = torch.bfloat16 - # Create input tensors a = torch.randn((M, K), device=device, dtype=dtype) b = torch.randn((N, K), device=device, dtype=dtype) quantiles = [0.5, 0.2, 0.8] - if "torch-bf16" in provider: + if provider == "torch-bf16": ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( lambda: torch.nn.functional.linear(a, b), quantiles=quantiles ) - - elif "fp8" in provider: - # Weights are always quantized ahead of time - if "noquant" in provider: - # For no quantization, we just measure the GEMM - if "tensor-w-token-a" in provider: - # Dynamic per-token quant for A, per-tensor 
quant for B - b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b) - assert scale_b_fp8.numel() == 1 - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( - a, use_per_token_if_dynamic=True - ) - - def run_quant(): - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - elif "tensor-w-tensor-a" in provider: - # Static per-tensor quantization with fixed scales - # for both A and B - scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) - scale_b = torch.tensor([1.0], device=device, dtype=torch.float32) - b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) - assert scale_b_fp8.numel() == 1 - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) - - def run_quant(): - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - elif "channel-w-token-a" in provider: - # Static per-channel quantization for weights, per-token - # quant for A - scale_b = torch.tensor((N,), device=device, dtype=torch.float32) - b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) - scale_b_fp8 = scale_b_fp8.expand(N).contiguous() - assert scale_b_fp8.numel() == N - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( - a, use_per_token_if_dynamic=True - ) - - def run_quant(): - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - elif "channel-w-tensor-a" in provider: - # Static per-channel quantization for weights, per-tensor - # quant for A - scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) - scale_b = torch.tensor((N,), device=device, dtype=torch.float32) - b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) - scale_b_fp8 = scale_b_fp8.expand(N).contiguous() - assert scale_b_fp8.numel() == N - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) - - def run_quant(): - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - else: - # In these cases, we quantize the activations during the GEMM call - if "tensor-w-token-a" in provider: - # Dynamic per-token quant for A, per-tensor quant for B - b_fp8, 
scale_b_fp8 = vllm_scaled_fp8_quant(b) - assert scale_b_fp8.numel() == 1 - - def run_quant(): - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( - a, use_per_token_if_dynamic=True - ) - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - elif "tensor-w-tensor-a" in provider: - # Static per-tensor quantization with fixed scales - # for both A and B - scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) - scale_b = torch.tensor([1.0], device=device, dtype=torch.float32) - b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) - assert scale_b_fp8.numel() == 1 - - def run_quant(): - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - elif "channel-w-token-a" in provider: - # Static per-channel quantization for weights, per-token - # quant for A - scale_b = torch.tensor((N,), device=device, dtype=torch.float32) - b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) - scale_b_fp8 = scale_b_fp8.expand(N).contiguous() - assert scale_b_fp8.numel() == N - - def run_quant(): - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( - a, use_per_token_if_dynamic=True - ) - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - elif "channel-w-tensor-a" in provider: - # Static per-channel quantization for weights, per-tensor - # quant for A - scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) - scale_b = torch.tensor((N,), device=device, dtype=torch.float32) - b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) - scale_b_fp8 = scale_b_fp8.expand(N).contiguous() - assert scale_b_fp8.numel() == N - - def run_quant(): - a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) - return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) - - b_fp8 = b_fp8.t() - + else: + cfg = PROVIDER_CFGS[provider] + run_quant = build_fp8_runner(cfg, a, b, dtype, device) ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( lambda: run_quant(), 
quantiles=quantiles ) - # Calculate TFLOP/s, two flops per multiply-add - tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3) - return tflops(ms), tflops(max_ms), tflops(min_ms) + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) def prepare_shapes(args): - KN_model_names = [] - models_tps = list(itertools.product(args.models, args.tp_sizes)) - for model, tp_size in models_tps: - assert model in WEIGHT_SHAPES - for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]): - KN[tp_split_dim] = KN[tp_split_dim] // tp_size + out = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size KN.append(model) - KN_model_names.append(KN) - return KN_model_names + out.append(KN) + return out if __name__ == "__main__": @@ -197,21 +140,13 @@ if __name__ == "__main__": nargs="+", type=str, default=["meta-llama/Llama-3.1-8B-Instruct"], - choices=[*WEIGHT_SHAPES.keys()], - help="List of models to benchmark", - ) - parser.add_argument( - "--tp-sizes", - nargs="+", - type=int, - default=[1], - help="List of tensor parallel sizes", + choices=list(WEIGHT_SHAPES.keys()), ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) args = parser.parse_args() - KN_model_names = prepare_shapes(args) - for K, N, model_name in KN_model_names: - print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:") + for K, N, model in prepare_shapes(args): + print(f"{model}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:") benchmark.run( print_data=True, show_plots=True, diff --git a/benchmarks/kernels/bench_int8_gemm.py b/benchmarks/kernels/bench_int8_gemm.py new file mode 100644 index 0000000000000..e9c6d64404d0d --- /dev/null +++ b/benchmarks/kernels/bench_int8_gemm.py @@ -0,0 +1,169 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse 
+import copy +import itertools + +import torch +from weight_shapes import WEIGHT_SHAPES + +from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm +from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant +from vllm.triton_utils import triton + +PROVIDER_CFGS = { + "torch-bf16": dict(enabled=True), + "int8-tensor-w-token-a": dict( + w="tensor", a="token", no_a_quant=False, enabled=False + ), + "int8-tensor-w-tensor-a": dict( + w="tensor", a="tensor", no_a_quant=False, enabled=True + ), + "int8-channel-w-token-a": dict( + w="channel", a="token", no_a_quant=False, enabled=True + ), + "int8-channel-w-tensor-a": dict( + w="channel", a="tensor", no_a_quant=False, enabled=False + ), + "int8-tensor-w-token-a-noquant": dict( + w="tensor", a="token", no_a_quant=True, enabled=False + ), + "int8-tensor-w-tensor-a-noquant": dict( + w="tensor", a="tensor", no_a_quant=True, enabled=True + ), + "int8-channel-w-token-a-noquant": dict( + w="channel", a="token", no_a_quant=True, enabled=True + ), + "int8-channel-w-tensor-a-noquant": dict( + w="channel", a="tensor", no_a_quant=True, enabled=False + ), +} + + +def _quant_weight(b, w_type, device): + if w_type == "tensor": + scale_b = torch.ones(1, device=device, dtype=torch.float32) + b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b) + assert scale_b_int8.numel() == 1 + else: # channel + b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b) + assert scale_b_int8.numel() == b.shape[0] + return b_int8.t(), scale_b_int8 + + +def build_int8_runner(cfg, a, b, dtype, device): + # quant before running the kernel + b_int8, scale_b_int8 = _quant_weight(b, cfg["w"], device) + + scale_a_const = None + if cfg["a"] == "tensor": + scale_a_const = torch.ones(1, device=device, dtype=torch.float32) + + # no quant, create activation ahead + if cfg["no_a_quant"]: + if cfg["a"] == "tensor": + a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const) + else: # token + a_int8, scale_a_int8, _ = 
vllm_scaled_int8_quant(a) + + def run_quant(): + return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype) + + return run_quant + + # dynamic quant, create activation inside + if cfg["a"] == "tensor": + + def run_quant(): + a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const) + return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype) + + else: # token + + def run_quant(): + a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a) + return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype) + + return run_quant + + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v.get("enabled")] + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], + x_log=False, + line_arg="provider", + line_vals=_enabled, + line_names=[k for k in _enabled], + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs INT8 GEMMs", + args={}, + ) +) +def benchmark(batch_size, provider, N, K): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + a = torch.randn((M, K), device=device, dtype=dtype) + b = torch.randn((N, K), device=device, dtype=dtype) + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch-bf16": + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), quantiles=quantiles + ) + else: + cfg = PROVIDER_CFGS[provider] + run_quant = build_int8_runner(cfg, a, b, dtype, device) + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: run_quant(), quantiles=quantiles + ) + + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) + + +def prepare_shapes(args): + KN_model_names = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size + KN.append(model) + KN_model_names.append(KN) + return KN_model_names 
+ + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.1-8B-Instruct"], + choices=list(WEIGHT_SHAPES.keys()), + help="List of models to benchmark", + ) + parser.add_argument( + "--tp-sizes", + nargs="+", + type=int, + default=[1], + help="List of tensor parallel sizes", + ) + args = parser.parse_args() + + for K, N, model in prepare_shapes(args): + print(f"{model}, N={N} K={K}, BF16 vs INT8 GEMMs TFLOP/s:") + benchmark.run( + print_data=True, + show_plots=True, + save_path=f"bench_int8_res_n{N}_k{K}", + N=N, + K=K, + ) + + print("Benchmark finished!") diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 6cb55b35993ef..cef53b183cef3 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -7,7 +7,6 @@ import time from contextlib import nullcontext from datetime import datetime from itertools import product -from types import SimpleNamespace from typing import Any, TypedDict import ray @@ -43,7 +42,7 @@ def benchmark_config( use_fp8_w8a8: bool, use_int8_w8a16: bool, num_iters: int = 100, - block_quant_shape: List[int] = None, + block_quant_shape: list[int] = None, use_deep_gemm: bool = False, ) -> float: init_dtype = torch.float16 if use_fp8_w8a8 else dtype @@ -400,7 +399,7 @@ class BenchmarkWorker: dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, - block_quant_shape: List[int] = None, + block_quant_shape: list[int] = None, use_deep_gemm: bool = False, ) -> tuple[dict[str, int], float]: current_platform.seed_everything(self.seed) @@ -532,7 +531,7 @@ def save_configs( dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, - block_quant_shape: List[int], + block_quant_shape: list[int], ) -> None: dtype_str = get_config_dtype_str( dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 @@ -563,7 +562,6 @@ def main(args: argparse.Namespace): config = 
get_config(model=args.model, trust_remote_code=args.trust_remote_code) if args.model_prefix: config = getattr(config, args.model_prefix) - config = SimpleNamespace(**config) if config.architectures[0] == "DbrxForCausalLM": E = config.ffn_config.moe_num_experts @@ -595,11 +593,7 @@ def main(args: argparse.Namespace): shard_intermediate_size = 2 * intermediate_size // args.tp_size hidden_size = config.hidden_size - dtype = ( - torch.float16 - if current_platform.is_rocm() - else getattr(torch, config.torch_dtype) - ) + dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" block_quant_shape = get_weight_block_size_safety(config) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index fb763db9fc359..5cd2c98f23438 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -75,6 +75,7 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") else() find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND) + find_isa(${CPUINFO} "Power11" POWER11_FOUND) find_isa(${CPUINFO} "POWER10" POWER10_FOUND) find_isa(${CPUINFO} "POWER9" POWER9_FOUND) find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support @@ -106,13 +107,19 @@ elseif (AVX2_FOUND) list(APPEND CXX_COMPILE_FLAGS "-mavx2") message(WARNING "vLLM CPU backend using AVX2 ISA") -elseif (POWER9_FOUND OR POWER10_FOUND) +elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) message(STATUS "PowerPC detected") - # Check for PowerPC VSX support - list(APPEND CXX_COMPILE_FLAGS - "-mvsx" - "-mcpu=native" - "-mtune=native") + if (POWER9_FOUND) + list(APPEND CXX_COMPILE_FLAGS + "-mvsx" + "-mcpu=power9" + "-mtune=power9") + elseif (POWER10_FOUND OR POWER11_FOUND) + list(APPEND CXX_COMPILE_FLAGS + "-mvsx" + "-mcpu=power10" + "-mtune=power10") + endif() elseif (ASIMD_FOUND) message(STATUS "ARMv8 or later architecture detected") diff --git 
a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index a4edd5b96fe29..dba5baa362b83 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491 + GIT_TAG 763ad155a1c826f71ff318f41edb1e4e5e376ddb GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu index 9b3a5c4b1014a..46108a32d719b 100644 --- a/csrc/attention/paged_attention_v1.cu +++ b/csrc/attention/paged_attention_v1.cu @@ -65,9 +65,6 @@ void paged_attention_v1_launcher( int kv_block_stride = key_cache.stride(0); int kv_head_stride = key_cache.stride(1); - [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - // NOTE: alibi_slopes is optional. const float* alibi_slopes_ptr = alibi_slopes @@ -193,4 +190,4 @@ void paged_attention_v1( #undef WARP_SIZE #undef MAX #undef MIN -#undef DIVIDE_ROUND_UP \ No newline at end of file +#undef DIVIDE_ROUND_UP diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu index 9935359e02fb1..9358c0d9f6a2a 100644 --- a/csrc/attention/paged_attention_v2.cu +++ b/csrc/attention/paged_attention_v2.cu @@ -66,9 +66,6 @@ void paged_attention_v2_launcher( int kv_block_stride = key_cache.stride(0); int kv_head_stride = key_cache.stride(1); - [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - // NOTE: alibi_slopes is optional. 
const float* alibi_slopes_ptr = alibi_slopes @@ -203,4 +200,4 @@ void paged_attention_v2( #undef WARP_SIZE #undef MAX #undef MIN -#undef DIVIDE_ROUND_UP \ No newline at end of file +#undef DIVIDE_ROUND_UP diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index 0257d8ff16baf..82862fea7f2be 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -137,8 +137,8 @@ FORCE_INLINE std::pair reduceSoftmaxAlibi(T* data, const int size, } template -FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data, - const int size) { +FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data, + const int size) { T max = max_data[0]; for (int i = 1; i < size; ++i) { max = max >= max_data[i] ? max : max_data[i]; @@ -634,7 +634,7 @@ struct paged_attention_v2_impl { if (partition_num == 1) continue; - reducePartitonSoftmax( + reducePartitionSoftmax( max_logits + seq_idx * num_heads * max_num_partitions + head_idx * max_num_partitions, exp_sums + seq_idx * num_heads * max_num_partitions + diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 9a613ba588ddf..3952c43cbc727 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -83,7 +83,7 @@ struct FP16Vec16 : public Vec { explicit FP16Vec16(const void* ptr) : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - // non-temproal load + // non-temporal load explicit FP16Vec16(bool, void* ptr) : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} @@ -120,7 +120,7 @@ struct BF16Vec16 : public Vec { explicit BF16Vec16(const void* ptr) : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - // non-temproal load + // non-temporal load explicit BF16Vec16(bool, void* ptr) : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} @@ -327,7 +327,7 @@ struct FP32Vec16 : public Vec { // normal load explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {} - // non-temproal load + // non-temporal load explicit FP32Vec16(bool, void* ptr) : 
reg((__m512)_mm512_stream_load_si512(ptr)) {} @@ -576,7 +576,7 @@ struct INT8Vec64 : public Vec { // normal load explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {} - // non-temproal load + // non-temporal load explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {} void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); } @@ -587,7 +587,7 @@ struct INT8Vec64 : public Vec { _mm512_mask_storeu_epi8(ptr, mask, reg); } - // non-temproal save + // non-temporal save void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); } }; #endif diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp index c17a8961629a6..02514edce8073 100644 --- a/csrc/cpu/utils.cpp +++ b/csrc/cpu/utils.cpp @@ -54,8 +54,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp); int page_num = numa_migrate_pages(pid, src_mask, mask); if (page_num == -1) { - TORCH_CHECK(false, - "numa_migrate_pages failed. errno: " + std::to_string(errno)); + TORCH_WARN("numa_migrate_pages failed. errno: " + std::to_string(errno)); } // restrict memory allocation node. 
@@ -105,4 +104,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { return ss.str(); } -#endif \ No newline at end of file +#endif diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu index 68f429fac18ab..a77471a7f2078 100644 --- a/csrc/moe/moe_permute_unpermute_op.cu +++ b/csrc/moe/moe_permute_unpermute_op.cu @@ -12,7 +12,7 @@ void moe_permute( const torch::Tensor& input, // [n_token, hidden] const torch::Tensor& topk_weights, //[n_token, topk] torch::Tensor& topk_ids, // [n_token, topk] - const torch::Tensor& token_expert_indicies, // [n_token, topk] + const torch::Tensor& token_expert_indices, // [n_token, topk] const std::optional& expert_map, // [n_expert] int64_t n_expert, int64_t n_local_expert, int64_t topk, const std::optional& align_block_size, @@ -27,15 +27,15 @@ void moe_permute( "expert_first_token_offset must be int64"); TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int, "topk_ids must be int32"); - TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int, - "token_expert_indicies must be int32"); + TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int, + "token_expert_indices must be int32"); TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int, "src_row_id2dst_row_id_map must be int32"); TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1, "expert_first_token_offset shape != n_local_expert+1") TORCH_CHECK( - src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(), - "token_expert_indicies shape must be same as src_row_id2dst_row_id_map"); + src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(), + "token_expert_indices shape must be same as src_row_id2dst_row_id_map"); auto n_token = input.sizes()[0]; auto n_hidden = input.sizes()[1]; auto align_block_size_value = @@ -71,7 +71,7 @@ void moe_permute( expert_map_ptr, n_expert, stream); } // expert sort topk expert id and scan expert id get 
expert_first_token_offset - sortAndScanExpert(get_ptr(topk_ids), get_ptr(token_expert_indicies), + sortAndScanExpert(get_ptr(topk_ids), get_ptr(token_expert_indices), get_ptr(permuted_experts_id), get_ptr(dst_row_id2src_row_id_map), get_ptr(expert_first_token_offset), n_token, @@ -190,7 +190,7 @@ void shuffle_rows(const torch::Tensor& input_tensor, void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights, torch::Tensor& topk_ids, - const torch::Tensor& token_expert_indicies, + const torch::Tensor& token_expert_indices, const std::optional& expert_map, int64_t n_expert, int64_t n_local_expert, int64_t topk, const std::optional& align_block_size, @@ -203,7 +203,7 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights, void moe_unpermute(const torch::Tensor& input, const torch::Tensor& topk_weights, torch::Tensor& topk_ids, - const torch::Tensor& token_expert_indicies, + const torch::Tensor& token_expert_indices, const std::optional& expert_map, int64_t n_expert, int64_t n_local_expert, int64_t topk, const std::optional& align_block_size, diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl index 42441800fb110..ad0d390665a00 100644 --- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl @@ -20,7 +20,6 @@ __global__ void expandInputRowsKernel( int expert_id = sorted_experts[expanded_dest_row]; extern __shared__ int64_t smem_expert_first_token_offset[]; - int64_t align_expanded_row_accumulate = 0; if constexpr (ALIGN_BLOCK_SIZE) { // load g2s for (int idx = threadIdx.x; idx < num_local_experts + 1; @@ -63,7 +62,6 @@ __global__ void expandInputRowsKernel( using DataElem = cutlass::Array; // Duplicate and permute rows - int64_t const source_k_rank = expanded_source_row / num_rows; int64_t const source_row = expanded_source_row % 
num_rows; auto const* source_row_ptr = @@ -160,7 +158,6 @@ __global__ void finalizeMoeRoutingKernel( elem_index += stride) { ComputeElem thread_output; thread_output.fill(0); - float row_rescale{0.f}; for (int k_idx = 0; k_idx < k; ++k_idx) { int64_t const expanded_original_row = original_row + k_idx * num_rows; int64_t const expanded_permuted_row = @@ -177,8 +174,6 @@ __global__ void finalizeMoeRoutingKernel( auto const* expanded_permuted_rows_row_ptr = expanded_permuted_rows_v + expanded_permuted_row * num_elems_in_col; - int64_t const expert_idx = expert_for_source_row[k_offset]; - ComputeElem expert_result = arrayConvert( expanded_permuted_rows_row_ptr[elem_index]); thread_output = thread_output + row_scale * (expert_result); diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index 10be47966f611..dea5b1f21ec27 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -425,7 +425,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \ topkGatingSoftmaxLauncherHelper( \ - gating_output, nullptr, topk_weights, topk_indicies, \ + gating_output, nullptr, topk_weights, topk_indices, \ token_expert_indices, num_tokens, topk, 0, num_experts, \ stream); @@ -433,7 +433,7 @@ template void topkGatingSoftmaxKernelLauncher( const float* gating_output, float* topk_weights, - IndType* topk_indicies, + IndType* topk_indices, int* token_expert_indices, float* softmax_workspace, const int num_tokens, @@ -476,7 +476,7 @@ void topkGatingSoftmaxKernelLauncher( moeSoftmax<<>>( gating_output, nullptr, softmax_workspace, num_experts); moeTopK<<>>( - softmax_workspace, nullptr, topk_weights, topk_indicies, token_expert_indices, + softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices, num_experts, topk, 0, num_experts); } } diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index 
a74eb3720cf1c..d6ef4940b6c31 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -66,7 +66,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { m.def( "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids," - "Tensor token_expert_indicies, Tensor? expert_map, int n_expert," + "Tensor token_expert_indices, Tensor? expert_map, int n_expert," "int n_local_expert," "int topk, int? align_block_size,Tensor! permuted_input, Tensor! " "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! " diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu index fea4bc2ca0d8f..3d5077d9de461 100644 --- a/csrc/prepare_inputs/advance_step.cu +++ b/csrc/prepare_inputs/advance_step.cu @@ -274,7 +274,6 @@ void advance_step_flashinfer( cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); - [[maybe_unused]] int block_tables_stride = block_tables.stride(0); TORCH_CHECK((blocks * threads > num_queries), "multi-step: not enough threads to map to num_queries = ", num_queries, " block_tables.stride(0) = ", block_tables.stride(0), diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu index bf46cce60a233..87117a165fe92 100644 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -1,15 +1,17 @@ #include #include + #include #include "../../dispatch_utils.h" +#include "../vectorization_utils.cuh" #ifndef USE_ROCM - #include #include + #include #else - #include #include + #include #endif static inline __device__ int8_t float_to_int8_rn(float x) { @@ -103,134 +105,170 @@ static inline __device__ int8_t int32_to_int8(int32_t x) { namespace vllm { -template +template __global__ void static_scaled_int8_quant_kernel( - scalar_t const* __restrict__ input, int8_t* __restrict__ out, - 
scale_type const* scale_ptr, const int hidden_size) { - int const tid = threadIdx.x; - int64_t const token_idx = blockIdx.x; - scale_type const scale = *scale_ptr; + const scalar_t* __restrict__ input, int8_t* __restrict__ output, + const scale_t* scale_ptr, const int hidden_size) { + const int tid = threadIdx.x; + const int stride = blockDim.x; + const int64_t token_idx = blockIdx.x; + const float scale = *scale_ptr; // Must be performed using 64-bit math to avoid integer overflow. - out += token_idx * hidden_size; - input += token_idx * hidden_size; + const scalar_t* row_in = input + token_idx * hidden_size; + int8_t* row_out = output + token_idx * hidden_size; - for (int i = tid; i < hidden_size; i += blockDim.x) { - out[i] = float_to_int8_rn(static_cast(input[i]) / scale); - } + vectorize_with_alignment<16>( + row_in, row_out, hidden_size, tid, stride, + [=] __device__(int8_t& dst, const scalar_t& src) { + dst = float_to_int8_rn(static_cast(src) / scale); + }); } -template +template __global__ void static_scaled_int8_azp_quant_kernel( - scalar_t const* __restrict__ input, int8_t* __restrict__ out, - scale_type const* scale_ptr, azp_type const* azp_ptr, - const int hidden_size) { - int const tid = threadIdx.x; - int64_t const token_idx = blockIdx.x; - scale_type const scale = *scale_ptr; - azp_type const azp = *azp_ptr; + const scalar_t* __restrict__ input, int8_t* __restrict__ output, + const scale_t* scale_ptr, const azp_t* azp_ptr, const int hidden_size) { + const int tid = threadIdx.x; + const int stride = blockDim.x; + const int64_t token_idx = blockIdx.x; + const float scale = *scale_ptr; + const azp_t azp = *azp_ptr; + const float inv_s = 1.0f / scale; // Must be performed using 64-bit math to avoid integer overflow. 
- out += token_idx * hidden_size; - input += token_idx * hidden_size; + const scalar_t* row_in = input + token_idx * hidden_size; + int8_t* row_out = output + token_idx * hidden_size; - for (int i = tid; i < hidden_size; i += blockDim.x) { - auto const val = static_cast(input[i]); - auto const quant_val = int32_to_int8(float_to_int32_rn(val / scale) + azp); - out[i] = quant_val; - } + vectorize_with_alignment<16>( + row_in, row_out, hidden_size, tid, stride, + [=] __device__(int8_t& dst, const scalar_t& src) { + const auto v = static_cast(src) * inv_s; + dst = int32_to_int8(float_to_int32_rn(v) + azp); + }); } -template +template __global__ void dynamic_scaled_int8_quant_kernel( - scalar_t const* __restrict__ input, int8_t* __restrict__ out, - scale_type* scale, const int hidden_size) { - int const tid = threadIdx.x; - int64_t const token_idx = blockIdx.x; - float absmax_val = 0.0f; - float const zero = 0.0f; + const scalar_t* __restrict__ input, int8_t* __restrict__ output, + scale_t* scale_out, const int hidden_size) { + const int tid = threadIdx.x; + const int stride = blockDim.x; + const int64_t token_idx = blockIdx.x; // Must be performed using 64-bit math to avoid integer overflow. - out += token_idx * hidden_size; - input += token_idx * hidden_size; + const scalar_t* row_in = input + token_idx * hidden_size; + int8_t* row_out = output + token_idx * hidden_size; - for (int i = tid; i < hidden_size; i += blockDim.x) { - float val = static_cast(input[i]); - val = val > zero ? val : -val; - absmax_val = val > absmax_val ? 
val : absmax_val; + // calculate for absmax + float thread_max = 0.f; + for (int i = tid; i < hidden_size; i += stride) { + const auto v = fabsf(static_cast(row_in[i])); + thread_max = fmaxf(thread_max, v); } - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage reduceStorage; - float const block_absmax_val_maybe = - BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x); - __shared__ float block_absmax_val; + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage tmp; + float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x); + __shared__ float absmax; if (tid == 0) { - block_absmax_val = block_absmax_val_maybe; - scale[token_idx] = block_absmax_val / 127.0f; + absmax = block_max; + scale_out[blockIdx.x] = absmax / 127.f; } __syncthreads(); - float const tmp_scale = 127.0f / block_absmax_val; - for (int i = tid; i < hidden_size; i += blockDim.x) { - out[i] = float_to_int8_rn(static_cast(input[i]) * tmp_scale); - } + float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax; + + // 2. 
quantize + vectorize_with_alignment<16>( + row_in, row_out, hidden_size, tid, stride, + [=] __device__(int8_t& dst, const scalar_t& src) { + dst = float_to_int8_rn(static_cast(src) * inv_s); + }); } -template +// MinMax structure to hold min and max values in one go +struct MinMax { + float min, max; + + __host__ __device__ MinMax() + : min(std::numeric_limits::max()), + max(std::numeric_limits::lowest()) {} + + __host__ __device__ explicit MinMax(float v) : min(v), max(v) {} + + // add a value to the MinMax + __host__ __device__ MinMax& operator+=(float v) { + min = fminf(min, v); + max = fmaxf(max, v); + return *this; + } + + // merge two MinMax objects + __host__ __device__ MinMax& operator&=(const MinMax& other) { + min = fminf(min, other.min); + max = fmaxf(max, other.max); + return *this; + } +}; + +__host__ __device__ inline MinMax operator+(MinMax a, float v) { + return a += v; +} +__host__ __device__ inline MinMax operator&(MinMax a, const MinMax& b) { + return a &= b; +} + +template __global__ void dynamic_scaled_int8_azp_quant_kernel( - scalar_t const* __restrict__ input, int8_t* __restrict__ out, - scale_type* scale, azp_type* azp, const int hidden_size) { - int64_t const token_idx = blockIdx.x; + const scalar_t* __restrict__ input, int8_t* __restrict__ output, + scale_t* scale_out, azp_t* azp_out, const int hidden_size) { + const int tid = threadIdx.x; + const int stride = blockDim.x; + const int64_t token_idx = blockIdx.x; // Must be performed using 64-bit math to avoid integer overflow. 
- out += token_idx * hidden_size; - input += token_idx * hidden_size; + const scalar_t* row_in = input + token_idx * hidden_size; + int8_t* row_out = output + token_idx * hidden_size; - // Scan for the min and max value for this token - float max_val = std::numeric_limits::min(); - float min_val = std::numeric_limits::max(); - for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) { - auto val = static_cast(input[i]); - max_val = std::max(max_val, val); - min_val = std::min(min_val, val); + // 1. calculate min & max + MinMax thread_mm; + for (int i = tid; i < hidden_size; i += stride) { + thread_mm += static_cast(row_in[i]); } - // Reduce the max and min values across the block - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage reduceStorage; - max_val = BlockReduce(reduceStorage).Reduce(max_val, cub::Max{}, blockDim.x); - __syncthreads(); // Make sure min doesn't mess with max shared memory - min_val = BlockReduce(reduceStorage).Reduce(min_val, cub::Min{}, blockDim.x); + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage tmp; - __shared__ scale_type scale_sh; - __shared__ azp_type azp_sh; + MinMax mm = BlockReduce(tmp).Reduce( + thread_mm, + [] __device__(MinMax a, const MinMax& b) { + a &= b; + return a; + }, + blockDim.x); - // Compute the scale and zero point and store them, only on the first thread - if (threadIdx.x == 0) { - float const scale_val = (max_val - min_val) / 255.0f; - // Use rounding to even (same as torch.round) - auto const azp_float = std::nearbyint(-128.0f - min_val / scale_val); - auto const azp_val = static_cast(azp_float); - - // Store the scale and azp into shared and global - scale[token_idx] = scale_sh = scale_val; - azp[token_idx] = azp_sh = azp_val; + __shared__ float scale_sh; + __shared__ azp_t azp_sh; + if (tid == 0) { + float s = (mm.max - mm.min) / 255.f; + float zp = nearbyintf(-128.f - mm.min / s); // round-to-even + scale_sh = s; + azp_sh = azp_t(zp); + 
scale_out[blockIdx.x] = s; + azp_out[blockIdx.x] = azp_sh; } - - // Wait for the scale and azp to be computed __syncthreads(); - float const scale_val = scale_sh; - azp_type const azp_val = azp_sh; + const float inv_s = 1.f / scale_sh; + const azp_t azp = azp_sh; - // Quantize the values - for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) { - auto const val = static_cast(input[i]); - auto const quant_val = - int32_to_int8(float_to_int32_rn(val / scale_val) + azp_val); - out[i] = quant_val; - } + // 2. quantize + vectorize_with_alignment<16>( + row_in, row_out, hidden_size, tid, stride, + [=] __device__(int8_t& dst, const scalar_t& src) { + const auto v = static_cast(src) * inv_s; + dst = int32_to_int8(float_to_int32_rn(v) + azp); + }); } } // namespace vllm @@ -247,7 +285,7 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] int const hidden_size = input.size(-1); int const num_tokens = input.numel() / hidden_size; dim3 const grid(num_tokens); - dim3 const block(std::min(hidden_size, 1024)); + dim3 const block(std::min(hidden_size, 256)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); VLLM_DISPATCH_FLOATING_TYPES( input.scalar_type(), "static_scaled_int8_quant_kernel", [&] { @@ -278,7 +316,7 @@ void dynamic_scaled_int8_quant( int const hidden_size = input.size(-1); int const num_tokens = input.numel() / hidden_size; dim3 const grid(num_tokens); - dim3 const block(std::min(hidden_size, 1024)); + dim3 const block(std::min(hidden_size, 256)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); VLLM_DISPATCH_FLOATING_TYPES( input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] { diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh index 6da2da6340759..1549ed96aa2be 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh +++ 
b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh @@ -15,11 +15,25 @@ using c3x::cutlass_gemm_caller; template typename Epilogue> struct sm100_fp8_config_default { - // M in (128, inf) + // M in (256, inf) static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; - using TileShape = Shape<_256, _128, _64>; + using TileShape = Shape<_256, _128, _128>; + using ClusterShape = Shape<_2, _2, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm_sm100; +}; + +template typename Epilogue> +struct sm100_fp8_config_M256 { + // M in (128, 256] + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_128, _128, _128>; using ClusterShape = Shape<_2, _2, _1>; using Cutlass3xGemm = cutlass_3x_gemm_sm100()); using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; - using TileShape = Shape<_128, _128, _64>; - using ClusterShape = Shape<_2, _2, _1>; + using TileShape = Shape<_128, _128, _256>; + using ClusterShape = Shape<_2, _4, _1>; using Cutlass3xGemm = cutlass_3x_gemm_sm100; @@ -72,6 +86,8 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out, typename sm100_fp8_config_M64::Cutlass3xGemm; using Cutlass3xGemmM128 = typename sm100_fp8_config_M128::Cutlass3xGemm; + using Cutlass3xGemmM256 = + typename sm100_fp8_config_M256::Cutlass3xGemm; uint32_t const m = a.size(0); uint32_t const mp2 = @@ -85,8 +101,12 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out, // m in (64, 128] return cutlass_gemm_caller( out, a, b, std::forward(args)...); + } else if (mp2 <= 256) { + // m in (128, 256] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); } else { - // m in (128, inf) + 
// m in (256, inf) return cutlass_gemm_caller( out, a, b, std::forward(args)...); } diff --git a/csrc/quantization/fp4/nvfp4_experts_quant.cu b/csrc/quantization/fp4/nvfp4_experts_quant.cu index 076c4a085337b..b51033c9b72c9 100644 --- a/csrc/quantization/fp4/nvfp4_experts_quant.cu +++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu @@ -231,12 +231,115 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec& vec, float SFScaleVal, } // Use UE4M3 by default. -template +template __global__ void #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) __launch_bounds__(512, 4) cvt_fp16_to_fp4( #else cvt_fp16_to_fp4( +#endif + int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, + uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts, + uint32_t* output_scale_offset_by_experts, int n_experts, bool low_latency) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + using PackedVec = PackedVec; + static constexpr int CVT_FP4_NUM_THREADS_PER_SF = + (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); + static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, + "Vec size is not matched."); + + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD; + + // Each global thread processes one element + for (int globalIdx = tid; globalIdx < numRows * colsPerRow; + globalIdx += gridDim.x * blockDim.x) { + // Calculate which row and column this global thread should process + int rowIdx = globalIdx / colsPerRow; + int colIdx = globalIdx % colsPerRow; + + int64_t inOffset = rowIdx * colsPerRow + colIdx; + PackedVec in_vec = reinterpret_cast(in)[inOffset]; + // Get the output tensor offset. + // Same as inOffset because 8 elements are packed into one uint32_t. 
+ int64_t outOffset = inOffset; + auto& out_pos = out[outOffset]; + + // Find index within the experts using different strategies based on expert + // count + int rowIdx_in_expert = 0; + int expert_idx = 0; + + if constexpr (SMALL_NUM_EXPERTS) { + for (int i = 0; i < n_experts; i++) { + uint32_t current_offset = __ldca(&input_offset_by_experts[i]); + uint32_t next_offset = __ldca(&input_offset_by_experts[i + 1]); + if (rowIdx >= current_offset && rowIdx < next_offset) { + rowIdx_in_expert = rowIdx - current_offset; + expert_idx = i; + break; + } + } + } else { + // Load input offsets into registers first, then do the computation. + // Local array size set to 17 because of register limit. + uint32_t local_offsets[17]; + for (int chunk_start = 0; chunk_start < n_experts; chunk_start += 16) { + *reinterpret_cast(local_offsets) = + __ldca(reinterpret_cast( + &input_offset_by_experts[chunk_start])); + *reinterpret_cast(local_offsets + 4) = + __ldca(reinterpret_cast( + &input_offset_by_experts[chunk_start + 4])); + *reinterpret_cast(local_offsets + 8) = + __ldca(reinterpret_cast( + &input_offset_by_experts[chunk_start + 8])); + *reinterpret_cast(local_offsets + 12) = + __ldca(reinterpret_cast( + &input_offset_by_experts[chunk_start + 12])); + local_offsets[16] = __ldca(&input_offset_by_experts[chunk_start + 16]); + + // Check against the 16 loaded offsets + #pragma unroll + for (int i = 0; i < 16; i++) { + if (rowIdx >= local_offsets[i] && rowIdx < local_offsets[i + 1]) { + rowIdx_in_expert = rowIdx - local_offsets[i]; + expert_idx = chunk_start + i; + break; + } + } + } + } + + // Get the global scaling factor, which will be applied to the SF. + // Note SFScale is the same as next GEMM's alpha, which is + // (448.f / (Alpha_A / 6.f)). + float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx]; + + int factor = CVT_FP4_SF_VEC_SIZE * 4; + // The actual output_scales dim is computed from the padded numCols. 
+ int32_t numCols_padded = (numCols + factor - 1) / factor * factor; + int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4; + uint32_t* SFout_in_expert = + SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout; + + auto sf_out = + cvt_quant_to_fp4_get_sf_out_offset( + rowIdx_in_expert, colIdx, numCols, SFout_in_expert); + + out_pos = cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); + } +#endif +} + +// Kernel for LARGE_M_TOPK = true (large m_topk optimized version) +template +__global__ void +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) +__launch_bounds__(1024, 4) cvt_fp16_to_fp4( +#else +cvt_fp16_to_fp4( #endif int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts, @@ -247,50 +350,80 @@ cvt_fp16_to_fp4( (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched."); + extern __shared__ uint32_t shared_input_offsets[]; - // Input tensor row/col loops. - for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) { - for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD; - colIdx += blockDim.x) { - int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx; - PackedVec in_vec = reinterpret_cast(in)[inOffset]; - // Get the output tensor offset. - // Same as inOffset because 8 elements are packed into one uint32_t. - int64_t outOffset = inOffset; - auto& out_pos = out[outOffset]; - - // Find index within the experts. - int rowIdx_in_expert = 0; - int expert_idx = 0; - for (int i = 0; i < n_experts; i++) { - if (rowIdx >= input_offset_by_experts[i] && - rowIdx < input_offset_by_experts[i + 1]) { - rowIdx_in_expert = rowIdx - input_offset_by_experts[i]; - expert_idx = i; - break; - } - } - - // Get the global scaling factor, which will be applied to the SF. 
- // Note SFScale is the same as next GEMM's alpha, which is - // (448.f / (Alpha_A / 6.f)). - float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx]; - - int factor = CVT_FP4_SF_VEC_SIZE * 4; - // The actual output_scales dim is computed from the padded numCols. - int32_t numCols_padded = (numCols + factor - 1) / factor * factor; - int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4; - uint32_t* SFout_in_expert = - SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout; - - auto sf_out = - cvt_quant_to_fp4_get_sf_out_offset( - rowIdx_in_expert, colIdx, numCols, SFout_in_expert); - - out_pos = - cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); + // Load input offsets into shared memory. + // If n_experts is larger than 4, use vectorized int4 to save instructions. + // If n_experts is smaller than 4, read directly. + if constexpr (SMALL_NUM_EXPERTS) { + for (int i = threadIdx.x; i < n_experts + 1; i += blockDim.x) { + shared_input_offsets[i] = input_offset_by_experts[i]; } + } else { + for (int i = threadIdx.x * 4; i < n_experts; i += blockDim.x * 4) { + *reinterpret_cast(&shared_input_offsets[i]) = + *reinterpret_cast(&input_offset_by_experts[i]); + } + if (threadIdx.x == 0) { + shared_input_offsets[n_experts] = input_offset_by_experts[n_experts]; + } + } + + __syncthreads(); + + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD; + + // Each global thread processes one element + for (int globalIdx = tid; globalIdx < numRows * colsPerRow; + globalIdx += gridDim.x * blockDim.x) { + // Calculate which row and column this global thread should process + int rowIdx = globalIdx / colsPerRow; + int colIdx = globalIdx % colsPerRow; + + int64_t inOffset = rowIdx * colsPerRow + colIdx; + PackedVec in_vec = reinterpret_cast(in)[inOffset]; + int64_t outOffset = inOffset; + auto& out_pos = out[outOffset]; + + // Find expert using binary search for better performance with large m_topk + 
int rowIdx_in_expert = 0; + int expert_idx = 0; + + // Binary search through experts using shared memory + int left = 0, right = n_experts - 1; + while (left <= right) { + int mid = (left + right) / 2; + // Get offsets: shared_input_offsets[i] corresponds to + // input_offset_by_experts[i] + uint32_t mid_offset = shared_input_offsets[mid]; + uint32_t next_offset = shared_input_offsets[mid + 1]; + + if (rowIdx >= mid_offset && rowIdx < next_offset) { + rowIdx_in_expert = rowIdx - mid_offset; + expert_idx = mid; + break; + } else if (rowIdx < mid_offset) { + right = mid - 1; + } else { + left = mid + 1; + } + } + + float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx]; + + int factor = CVT_FP4_SF_VEC_SIZE * 4; + int32_t numCols_padded = (numCols + factor - 1) / factor * factor; + int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4; + uint32_t* SFout_in_expert = + SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout; + + auto sf_out = + cvt_quant_to_fp4_get_sf_out_offset( + rowIdx_in_expert, colIdx, numCols, SFout_in_expert); + + out_pos = cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); } #endif } @@ -309,18 +442,63 @@ void quant_impl(void* output, void* output_scale, void* input, // Grid, Block size. // Each thread converts 8 values. - dim3 block(std::min(int(k / ELTS_PER_THREAD), 512)); + int const workSizePerRow = k / ELTS_PER_THREAD; + int const totalWorkSize = m_topk * workSizePerRow; + dim3 block(std::min(workSizePerRow, 512)); // Get number of blocks per SM (assume we can fully utilize the SM). 
int const numBlocksPerSM = 2048 / block.x; - dim3 grid(std::min(int(m_topk), multiProcessorCount * numBlocksPerSM)); + dim3 grid(std::min(static_cast((totalWorkSize + block.x - 1) / block.x), + multiProcessorCount * numBlocksPerSM)); + while (grid.x <= multiProcessorCount && block.x > 64) { + grid.x *= 2; + block.x = (block.x + 1) / 2; + } - cvt_fp16_to_fp4<<>>( - m_topk, k, reinterpret_cast(input), - reinterpret_cast(input_global_scale), - reinterpret_cast(output), - reinterpret_cast(output_scale), - reinterpret_cast(input_offset_by_experts), - reinterpret_cast(output_scale_offset_by_experts), n_experts); + int const blockRepeat = + (totalWorkSize + block.x * grid.x - 1) / (block.x * grid.x); + if (blockRepeat > 1) { + size_t shared_mem_size = (n_experts + 1) * sizeof(uint32_t); + if (n_experts >= 4) { + cvt_fp16_to_fp4 + <<>>( + m_topk, k, reinterpret_cast(input), + reinterpret_cast(input_global_scale), + reinterpret_cast(output), + reinterpret_cast(output_scale), + reinterpret_cast(input_offset_by_experts), + reinterpret_cast(output_scale_offset_by_experts), + n_experts); + } else { + cvt_fp16_to_fp4<<>>( + m_topk, k, reinterpret_cast(input), + reinterpret_cast(input_global_scale), + reinterpret_cast(output), + reinterpret_cast(output_scale), + reinterpret_cast(input_offset_by_experts), + reinterpret_cast(output_scale_offset_by_experts), + n_experts); + } + } else { + if (n_experts >= 16) { + cvt_fp16_to_fp4<<>>( + m_topk, k, reinterpret_cast(input), + reinterpret_cast(input_global_scale), + reinterpret_cast(output), + reinterpret_cast(output_scale), + reinterpret_cast(input_offset_by_experts), + reinterpret_cast(output_scale_offset_by_experts), + n_experts, /* bool low_latency */ true); + } else { + cvt_fp16_to_fp4<<>>( + m_topk, k, reinterpret_cast(input), + reinterpret_cast(input_global_scale), + reinterpret_cast(output), + reinterpret_cast(output_scale), + reinterpret_cast(input_offset_by_experts), + reinterpret_cast(output_scale_offset_by_experts), + 
n_experts, /* bool low_latency */ true); + } + } } /*Quantization entry for fp4 experts quantization*/ diff --git a/csrc/quantization/fp8/amd/quant_utils.cuh b/csrc/quantization/fp8/amd/quant_utils.cuh index c4ed1b4757928..e51a4e14e518f 100644 --- a/csrc/quantization/fp8/amd/quant_utils.cuh +++ b/csrc/quantization/fp8/amd/quant_utils.cuh @@ -446,8 +446,6 @@ scaled_vec_conversion(const uint8_t& a, float scale) { template <> __inline__ __device__ uint32_t scaled_vec_conversion(const uint16_t& a, float scale) { - [[maybe_unused]] __half2_raw h2r = - __hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret); union { __half2_raw h2r; uint32_t ui32; diff --git a/csrc/quantization/gguf/gguf_kernel.cu b/csrc/quantization/gguf/gguf_kernel.cu index 6c146c3fb6fde..3b5180b516239 100644 --- a/csrc/quantization/gguf/gguf_kernel.cu +++ b/csrc/quantization/gguf/gguf_kernel.cu @@ -92,111 +92,112 @@ torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, // quant weight torch::Tensor X, // input int64_t type, int64_t row) { int col = X.sizes()[1]; + int vecs = X.sizes()[0]; const int padded = (col + 512 - 1) / 512 * 512; const at::cuda::OptionalCUDAGuard device_guard(device_of(X)); auto options = torch::TensorOptions().dtype(X.dtype()).device(W.device()); - at::Tensor Y = torch::empty({1, row}, options); + at::Tensor Y = torch::empty({vecs, row}, options); cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); options = torch::TensorOptions().dtype(torch::kInt32).device(W.device()); - at::Tensor quant_X = torch::empty({1, padded / 32 * 9}, options); + at::Tensor quant_X = torch::empty({vecs, padded / 32 * 9}, options); VLLM_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_mul_mat_vec_a8", [&] { - quantize_row_q8_1_cuda((scalar_t*)X.data_ptr(), - (void*)quant_X.data_ptr(), col, 1, stream); + quantize_row_q8_1_cuda( + (scalar_t*)X.data_ptr(), (void*)quant_X.data_ptr(), col, vecs, stream); switch (type) { case 2: mul_mat_vec_q4_0_q8_1_cuda( (void*)W.data_ptr(), 
(void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; case 3: mul_mat_vec_q4_1_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; case 6: mul_mat_vec_q5_0_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; case 7: mul_mat_vec_q5_1_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; case 8: mul_mat_vec_q8_0_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; case 10: mul_mat_vec_q2_K_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; case 11: mul_mat_vec_q3_K_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; case 12: mul_mat_vec_q4_K_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; case 13: mul_mat_vec_q5_K_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; case 14: mul_mat_vec_q6_K_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; case 16: mul_mat_vec_iq2_xxs_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), 
col, row, vecs, stream); break; case 17: mul_mat_vec_iq2_xs_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; case 18: mul_mat_vec_iq3_xxs_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; case 19: mul_mat_vec_iq1_s_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; case 20: mul_mat_vec_iq4_nl_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; case 21: mul_mat_vec_iq3_s_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; case 22: mul_mat_vec_iq2_s_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; case 23: mul_mat_vec_iq4_xs_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; case 29: mul_mat_vec_iq1_m_q8_1_cuda( (void*)W.data_ptr(), (void*)quant_X.data_ptr(), - (scalar_t*)Y.data_ptr(), col, row, stream); + (scalar_t*)Y.data_ptr(), col, row, vecs, stream); break; } }); diff --git a/csrc/quantization/gguf/mmvq.cuh b/csrc/quantization/gguf/mmvq.cuh index 687cb0a374105..e27bec7af5b7d 100644 --- a/csrc/quantization/gguf/mmvq.cuh +++ b/csrc/quantization/gguf/mmvq.cuh @@ -1,16 +1,19 @@ // copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu template -static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * 
__restrict__ dst, const int ncols, const int nrows) { +static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, const int ncols, const int nrows, const int nvecs) { const auto row = blockIdx.x*blockDim.y + threadIdx.y; + const auto vec = blockIdx.y; - if (row >= nrows) { + if (row >= nrows || vec >= nvecs) { return; } const int blocks_per_row = ncols / qk; const int blocks_per_warp = vdr * WARP_SIZE / qi; + const int nrows_y = (ncols + 512 - 1) / 512 * 512; -// partial sum for each thread + + // partial sum for each thread float tmp = 0.0f; const block_q_t * x = (const block_q_t *) vx; @@ -19,7 +22,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * for (auto i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) { const int ibx = row*blocks_per_row + i; // x block index - const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + const int iby = vec*(nrows_y/QK8_1) + i * (qk/QK8_1); // y block index that aligns with ibx const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int @@ -33,177 +36,177 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * } if (threadIdx.x == 0) { - dst[row] = tmp; + dst[vec*nrows + row] = tmp; } } template -static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static 
void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, 
const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const 
void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t 
stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static void mul_mat_vec_iq2_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_iq2_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static void mul_mat_vec_iq3_xxs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_iq3_xxs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const 
dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static void mul_mat_vec_iq1_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_iq1_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static void mul_mat_vec_iq1_m_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_iq1_m_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static void mul_mat_vec_iq4_nl_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_iq4_nl_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 
block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static void mul_mat_vec_iq4_xs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_iq4_xs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } template -static void mul_mat_vec_iq3_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_iq3_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) { const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_nums(block_num_y, nvecs, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, nvecs); } diff --git a/csrc/quantization/gptq/q_gemm.cu b/csrc/quantization/gptq/q_gemm.cu index 6fad16e196bbc..43b245530e950 100644 --- a/csrc/quantization/gptq/q_gemm.cu +++ b/csrc/quantization/gptq/q_gemm.cu @@ -206,8 +206,6 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel( auto offset_m = blockIdx.y * m_count; auto offset_k = blockIdx.z * BLOCK_KN_SIZE; - [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int n = offset_n + t * 4; @@ -344,8 +342,6 @@ 
__global__ void gemm_half_q_half_gptq_2bit_kernel( auto offset_m = blockIdx.y * m_count; auto offset_k = blockIdx.z * BLOCK_KN_SIZE; - [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int n = offset_n + t * 4; @@ -465,8 +461,6 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel( auto offset_m = blockIdx.y * m_count; auto offset_k = blockIdx.z * BLOCK_KN_SIZE; - [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int n = offset_n + t * 4; @@ -593,8 +587,6 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel( auto offset_m = blockIdx.y * m_count; auto offset_k = blockIdx.z * BLOCK_KN_SIZE; - [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int n = offset_n + t * 4; diff --git a/csrc/quantization/machete/machete_mainloop.cuh b/csrc/quantization/machete/machete_mainloop.cuh index 572894064dc59..eca5d328b00c9 100644 --- a/csrc/quantization/machete/machete_mainloop.cuh +++ b/csrc/quantization/machete/machete_mainloop.cuh @@ -1003,7 +1003,7 @@ struct MacheteCollectiveMma { static constexpr int A_CPY_VEC = decltype(max_common_vector(tCsA, tCrA_load)){}; - static constexpr int COVERSION_WIDTH = + static constexpr int CONVERSION_WIDTH = std::min(A_CPY_VEC, int(size<0>(tCrA_mma))); auto load_A_to_registers = [&](int read_stage) { @@ -1026,8 +1026,8 @@ struct MacheteCollectiveMma { // PIPELINED MAIN LOOP // - auto convert_A = [&, a_vec = Int{}](int k_block, - int read_stage) { + auto convert_A = [&, a_vec = Int{}](int k_block, + int read_stage) { load_extra_info_to_registers(partitioned_extra_info, copy_partitions_extra_info, k_block, read_stage); diff --git 
a/csrc/quantization/vectorization_utils.cuh b/csrc/quantization/vectorization_utils.cuh new file mode 100644 index 0000000000000..8d3c1d6d3b9fb --- /dev/null +++ b/csrc/quantization/vectorization_utils.cuh @@ -0,0 +1,75 @@ +#pragma once +#include "vectorization.cuh" + +namespace vllm { + +template +struct DefaultVecOp { + ScaOp scalar_op; + + __device__ __forceinline__ void operator()( + vec_n_t& dst, const vec_n_t& src) const { +#pragma unroll + for (int i = 0; i < VEC_SIZE; ++i) { + scalar_op(dst.val[i], src.val[i]); + } + } +}; + +template +__device__ inline void vectorize_with_alignment( + const InT* in, OutT* out, int len, int tid, int stride, + VecOp&& vec_op, // vec_n_t -> vec_n_t + ScaOp&& scalar_op) { // InT -> OutT + static_assert(VEC_SIZE > 0 && (VEC_SIZE & (VEC_SIZE - 1)) == 0, + "VEC_SIZE must be a positive power-of-two"); + constexpr int WIDTH = VEC_SIZE * sizeof(InT); // eg: 64 B + uintptr_t addr = reinterpret_cast(in); + + int misalignment_offset = addr & (WIDTH - 1); // addr % 64 + int alignment_bytes = WIDTH - misalignment_offset; // 64 - (addr % 64) + int prefix_elems = alignment_bytes & (WIDTH - 1); // handle 64 + prefix_elems /= sizeof(InT); + prefix_elems = min(prefix_elems, len); // 0 ≤ prefix < 16 + + // 1. process the unaligned prefix scalar-by-scalar, where it is unsafe to vectorize + for (int i = tid; i < prefix_elems; i += stride) { + scalar_op(out[i], in[i]); + } + + in += prefix_elems; + out += prefix_elems; + len -= prefix_elems; + + int num_vec = len / VEC_SIZE; + using vin_t = vec_n_t; + using vout_t = vec_n_t; + auto* v_in = reinterpret_cast(in); + auto* v_out = reinterpret_cast(out); + + // 2. vectorize the main part + for (int i = tid; i < num_vec; i += stride) { + vout_t tmp; + vec_op(tmp, v_in[i]); + v_out[i] = tmp; + } + + // 3. 
handle the tail + int tail_start = num_vec * VEC_SIZE; + for (int i = tid + tail_start; i < len; i += stride) { + scalar_op(out[i], in[i]); + } +} + +template +__device__ __forceinline__ void vectorize_with_alignment(const InT* in, + OutT* out, int len, + int tid, int stride, + ScaOp&& scalar_op) { + using Vec = DefaultVecOp>; + vectorize_with_alignment(in, out, len, tid, stride, Vec{scalar_op}, + std::forward(scalar_op)); +} + +} // namespace vllm diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index f1e7da1641998..39997030751b8 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -136,11 +136,6 @@ __device__ __forceinline__ T from_float(const float& inp) { template __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) { - [[maybe_unused]] union tmpcvt { - uint16_t u; - _Float16 f; - __hip_bfloat16 b; - } t16; _B16x4 ret; if constexpr (std::is_same::value) { union h2cvt { @@ -169,11 +164,6 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) { template __device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1, const _B16x4& inp2) { - [[maybe_unused]] union tmpcvt { - uint16_t u; - _Float16 f; - __hip_bfloat16 b; - } t1, t2, res; _B16x4 ret; if constexpr (std::is_same::value) { union h2cvt { @@ -325,8 +315,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( constexpr int GQA_RATIO4 = DIVIDE_ROUND_UP(GQA_RATIO, 4); - [[maybe_unused]] __shared__ float shared_qk_max[NWARPS][16 + 1]; - [[maybe_unused]] __shared__ float shared_exp_sum[NWARPS][16 + 1]; // shared_logits is used for multiple purposes __shared__ _B16x4 shared_logits[NWARPS][4][16][4]; @@ -444,8 +432,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride; const int klocal_token_idx = TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; - [[maybe_unused]] const int kglobal_token_idx = - partition_start_token_idx + 
klocal_token_idx; const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE; const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX; @@ -1309,9 +1295,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const int context_len = context_lens[seq_idx]; const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); - [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; const auto warpid = threadIdx.x / WARP_SIZE; - [[maybe_unused]] const auto laneid = threadIdx.x % WARP_SIZE; __shared__ float shared_global_exp_sum; // max num partitions supported is warp_size * NPAR_LOOPS @@ -2080,9 +2064,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const int context_len = context_lens[seq_idx]; const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); - [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; const int warpid = threadIdx.x / WARP_SIZE; - [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE; __shared__ float shared_global_exp_sum; // max num partitions supported is warp_size * NPAR_LOOPS @@ -2816,9 +2798,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const int context_len = context_lens[seq_idx]; const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); - [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; const int warpid = threadIdx.x / WARP_SIZE; - [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE; __shared__ float shared_global_exp_sum; // max num partitions supported is warp_size * NPAR_LOOPS diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu index e31aa0162628f..6212570c79d1f 100644 --- a/csrc/rocm/skinny_gemms.cu +++ b/csrc/rocm/skinny_gemms.cu @@ -320,7 +320,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) // Goal is to bring the activation matrix A to the LDS // and use it across the lifetime of the work group // TODO: When activation matrix 
is larger than 64 KB - // then this is not goint to work! + // then this is not going to work! //---------------------------------------------------- __shared__ scalar_t s[max_lds_len]; @@ -581,7 +581,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) // Goal is to bring the activation matrix A to the LDS // and use it across the lifetime of the work group // TODO: When activation matrix is larger than 64 KB - // then this is not goint to work! + // then this is not going to work! //---------------------------------------------------- __shared__ scalar_t s[max_lds_len]; @@ -601,7 +601,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) // int _WvPrGrp = mindiv(N, CuCount * YTILE, WvPrGrp); uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE; - // Check whether there will be fragmenation! + // Check whether there will be fragmentation! // This will happen only for the last wave! if (m < M && (m + YTILE) >= M) { uint32_t startColumn = M - YTILE; @@ -827,7 +827,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) m += CuCount * _WvPrGrp * YTILE; - // Check whether there will be fragmenation! + // Check whether there will be fragmentation! // This will happen only for the last wave! if (m < M && (m + YTILE) >= M) { uint32_t startColumn = M - YTILE; @@ -882,7 +882,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) // Goal is to bring the activation matrix A to the LDS // and use it across the lifetime of the work group // TODO: When activation matrix is larger than 64 KB - // then this is not goint to work! + // then this is not going to work! //---------------------------------------------------- __shared__ scalar_t s[max_lds_len]; @@ -904,7 +904,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) //---------------------------------------------------- uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE; - // Check whether there will be fragmenation! + // Check whether there will be fragmentation! // This will happen only for the last wave! 
if (m < M && (m + YTILE) >= M) { uint32_t startColumn = M - YTILE; @@ -1176,7 +1176,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) m += CuCount * _WvPrGrp * YTILE; kBase = 0; - // Check whether there will be fragmenation! + // Check whether there will be fragmentation! // This will happen only for the last wave! if (m < M && (m + YTILE) >= M) { uint32_t startColumn = M - YTILE; diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu index 3dcaa6373f118..d053ecc8dd70d 100644 --- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu +++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu @@ -277,7 +277,7 @@ CompressorResult cutlass_sparse_compress_sm90(torch::Tensor const& a) { uint32_t const m = 1; // Set M to 1 for compression uint32_t const n = a.size(1); - // Note: For correctess, the compressed format must be invariant in: + // Note: For correctness, the compressed format must be invariant in: // - M, the flattened number of tokens // - Whether output dtype is fp16 or bf16 // - CUTLASS epilogues diff --git a/docker/Dockerfile b/docker/Dockerfile index 24986a1b73b1b..cf9c245a95174 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -243,30 +243,32 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. 
-f1,2 | tr -d '.') # If we need to build FlashInfer wheel before its release: -# $ export FLASHINFER_ENABLE_AOT=1 # $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ -# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' +# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a' # $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive # $ cd flashinfer -# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4 -# $ rm -rf build -# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose -# $ ls dist -# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl +# $ git checkout v0.2.6.post1 +# $ python -m flashinfer.aot +# $ python -m build --no-isolation --wheel +# $ ls -la dist +# -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl +# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl RUN --mount=type=cache,target=/root/.cache/uv \ . /etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ - # FlashInfer alreary has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use + # FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8. 
This is enough for CI use if [[ "$CUDA_VERSION" == 12.8* ]]; then \ - uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl; \ + uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl; \ else \ - export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \ - CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ - if [ "$CUDA_MAJOR" -lt 12 ]; then \ - export FLASHINFER_ENABLE_SM90=0; \ - fi; \ - uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \ + export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a' && \ + git clone https://github.com/flashinfer-ai/flashinfer.git --single-branch --branch v0.2.6.post1 --recursive && \ + # Needed to build AOT kernels + (cd flashinfer && \ + python3 -m flashinfer.aot && \ + uv pip install --system --no-build-isolation . 
\ + ) && \ + rm -rf flashinfer; \ fi \ fi COPY examples examples diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 6db2f307a3800..3e9fa0e7af2dc 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -98,6 +98,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \ VLLM_TARGET_DEVICE=cpu python3 setup.py develop RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ + cp requirements/test.in requirements/test-cpu.in && \ + sed -i '/mamba_ssm/d' requirements/test-cpu.in && \ + uv pip compile requirements/test-cpu.in -o requirements/test.txt && \ uv pip install -r requirements/dev.txt && \ pre-commit install --hook-type pre-commit --hook-type commit-msg diff --git a/docs/ci/update_pytorch_version.md b/docs/ci/update_pytorch_version.md new file mode 100644 index 0000000000000..2ad3430a4de85 --- /dev/null +++ b/docs/ci/update_pytorch_version.md @@ -0,0 +1,134 @@ +--- +title: Update PyTorch version on vLLM OSS CI/CD +--- + +vLLM's current policy is to always use the latest PyTorch stable +release in CI/CD. It is standard practice to submit a PR to update the +PyTorch version as early as possible when a new [PyTorch stable +release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available. +This process is non-trivial due to the gap between PyTorch +releases. Using [#16859](https://github.com/vllm-project/vllm/pull/16859) as +an example, this document outlines common steps to achieve this update along with +a list of potential issues and how to address them. + +## Test PyTorch release candidates (RCs) + +Updating PyTorch in vLLM after the official release is not +ideal because any issues discovered at that point can only be resolved +by waiting for the next release or by implementing hacky workarounds in vLLM. +The better solution is to test vLLM with PyTorch release candidates (RC) to ensure +compatibility before each release. 
+ +PyTorch release candidates can be downloaded from PyTorch test index at https://download.pytorch.org/whl/test. +For example, torch2.7.0+cu12.8 RC can be installed using the following command: + +``` +uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu128 +``` + +When the final RC is ready for testing, it will be announced to the community +on the [PyTorch dev-discuss forum](https://dev-discuss.pytorch.org/c/release-announcements). +After this announcement, we can begin testing vLLM integration by drafting a pull request +following this 3-step process: + +1. Update requirements files in https://github.com/vllm-project/vllm/tree/main/requirements +to point to the new releases for torch, torchvision, and torchaudio. +2. Use `--extra-index-url https://download.pytorch.org/whl/test/` to +get the final release candidates' wheels. Some common platforms are `cpu`, `cu128`, +and `rocm6.2.4`. +3. As vLLM uses uv, make sure that `unsafe-best-match` strategy is set either +via `UV_INDEX_STRATEGY` env variable or via `--index-strategy unsafe-best-match`. + +If failures are found in the pull request, raise them as issues on vLLM and +cc the PyTorch release team to initiate discussion on how to address them. + +## Update CUDA version + +The PyTorch release matrix includes both stable and experimental [CUDA versions](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix). Due to limitations, only the latest stable CUDA version (for example, +torch2.7.0+cu12.6) is uploaded to PyPI. However, vLLM may require a different CUDA version, +such as 12.8 for Blackwell support. +This complicates the process as we cannot use the out-of-the-box +`pip install torch torchvision torchaudio` command. The solution is to use +`--extra-index-url` in vLLM's Dockerfiles. + +1. Use `--extra-index-url https://download.pytorch.org/whl/cu128` to install torch+cu128. +2. Other important indexes at the moment include: + 1. 
CPU ‒ https://download.pytorch.org/whl/cpu + 2. ROCm ‒ https://download.pytorch.org/whl/rocm6.2.4 and https://download.pytorch.org/whl/rocm6.3 + 3. XPU ‒ https://download.pytorch.org/whl/xpu +3. Update .buildkite/release-pipeline.yaml and .buildkite/scripts/upload-wheels.sh to +match the CUDA version from step 1. This makes sure that the release vLLM wheel is tested +on CI. + +## Address long vLLM build time + +When building vLLM with a new PyTorch/CUDA version, no cache will exist +in the vLLM sccache S3 bucket, causing the build job on CI to potentially take more than 5 hours +and timeout. Additionally, since vLLM's fastcheck pipeline runs in read-only mode, +it doesn't populate the cache, so re-running it to warm up the cache +is ineffective. + +While ongoing efforts like [#17419](https://github.com/vllm-project/vllm/issues/17419) +address the long build time at its source, the current workaround is to set VLLM_CI_BRANCH +to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`) +when manually triggering a build on Buildkite. This branch accomplishes two things: + +1. Increase the timeout limit to 10 hours so that the build doesn't timeout. +2. Allow the compiled artifacts to be written to the vLLM sccache S3 bucket +to warm it up so that future builds are faster. + +

+ +

+ +## Update dependencies + +Several vLLM dependencies, such as FlashInfer, also depend on PyTorch and need +to be updated accordingly. Rather than waiting for all of them to publish new +releases (which would take too much time), they can be built from +source to unblock the update process. + +### FlashInfer +Here is how to build and install it from source with torch2.7.0+cu128 in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271): + +``` +export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX' +export FLASHINFER_ENABLE_SM90=1 +uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1" +``` + +One caveat is that building FlashInfer from source adds approximately 30 +minutes to the vLLM build time. Therefore, it's preferable to cache the wheel in a +public location for immediate installation, such as https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl. For future releases, contact the PyTorch release +team if you want to get the package published there. + +### xFormers +Similar to FlashInfer, here is how to build and install xFormers from source: + +``` +export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX' +MAX_JOBS=16 uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30" +``` + +### Mamba + +``` +uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" +``` + +### causal-conv1d + +``` +uv pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' +``` + +## Update all the different vLLM platforms + +Rather than attempting to update all vLLM platforms in a single pull request, it's more manageable +to handle some platforms separately. 
The separation of requirements and Dockerfiles +for different platforms in vLLM CI/CD allows us to selectively choose +which platforms to update. For instance, updating XPU requires the corresponding +release from https://github.com/intel/intel-extension-for-pytorch by Intel. +While https://github.com/vllm-project/vllm/pull/16859 updated vLLM to PyTorch +2.7.0 on CPU, CUDA, and ROCm, https://github.com/vllm-project/vllm/pull/17444 +completed the update for XPU. diff --git a/docs/contributing/README.md b/docs/contributing/README.md index 65ae9cc963676..10c50e0072434 100644 --- a/docs/contributing/README.md +++ b/docs/contributing/README.md @@ -130,7 +130,7 @@ pytest -s -v tests/test_logger.py If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. -!!! warning +!!! important If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability). ## Pull Requests & Code Reviews diff --git a/docs/contributing/ci-failures.md b/docs/contributing/ci-failures.md index 4d8f78197f336..7caaf10ceb5c8 100644 --- a/docs/contributing/ci-failures.md +++ b/docs/contributing/ci-failures.md @@ -64,15 +64,13 @@ Download the full log file from Buildkite locally. 
Strip timestamps and colorization: -```bash -# Strip timestamps -sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' ci.log + -# Strip colorization -sed -i -r 's/\x1B\[[0-9;]*[mK]//g' ci.log +```bash +./ci-clean-log.sh ci.log ``` -Use a tool for quick copy-pasting: +Use a tool [wl-clipboard](https://github.com/bugaevc/wl-clipboard) for quick copy-pasting: ```bash tail -525 ci_build.log | wl-copy @@ -89,10 +87,10 @@ tail -525 ci_build.log | wl-copy CI test failures may be flaky. Use a bash loop to run repeatedly: + + ```bash -COUNT=1; while pytest -sv tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]; do - COUNT=$[$COUNT + 1]; echo "RUN NUMBER ${COUNT}"; -done +./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp] ``` ## Submitting a PR diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 892ab9098407c..bed6d4e653d65 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -48,8 +48,8 @@ Further update the model as follows: return vision_embeddings ``` -!!! warning - The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request. +!!! important + The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request. 
- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings. @@ -100,8 +100,8 @@ Further update the model as follows: ``` !!! note - The model class does not have to be named `*ForCausalLM`. - Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples. + The model class does not have to be named `*ForCausalLM`. + Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples. ## 2. Specify processing information diff --git a/docs/contributing/model/registration.md b/docs/contributing/model/registration.md index 7a7bd79140585..a6dc1e32dfb95 100644 --- a/docs/contributing/model/registration.md +++ b/docs/contributing/model/registration.md @@ -18,7 +18,7 @@ After you have implemented your model (see [tutorial][new-model-basic]), put it Then, add your model class to `_VLLM_MODELS` in so that it is automatically registered upon importing vLLM. Finally, update our [list of supported models][supported-models] to promote your model! -!!! warning +!!! important The list of models in each section should be maintained in alphabetical order. ## Out-of-tree models @@ -49,6 +49,6 @@ def register(): ) ``` -!!! warning +!!! important If your model is a multimodal model, ensure the model class implements the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface. Read more about that [here][supports-multimodal]. 
diff --git a/docs/contributing/model/tests.md b/docs/contributing/model/tests.md index 67f8eda61dc54..a8cb457453b91 100644 --- a/docs/contributing/model/tests.md +++ b/docs/contributing/model/tests.md @@ -15,7 +15,7 @@ Without them, the CI for your PR will fail. Include an example HuggingFace repository for your model in . This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM. -!!! warning +!!! important The list of models in each section should be maintained in alphabetical order. !!! tip diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md index 6b08c4960d028..7430f99a5396c 100644 --- a/docs/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -5,19 +5,22 @@ title: Using Kubernetes Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes. -* [Deployment with CPUs](#deployment-with-cpus) -* [Deployment with GPUs](#deployment-with-gpus) +- [Deployment with CPUs](#deployment-with-cpus) +- [Deployment with GPUs](#deployment-with-gpus) +- [Troubleshooting](#troubleshooting) + - [Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated"](#startup-probe-or-readiness-probe-failure-container-log-contains-keyboardinterrupt-terminated) +- [Conclusion](#conclusion) Alternatively, you can deploy vLLM to Kubernetes using any of the following: -* [Helm](frameworks/helm.md) -* [InftyAI/llmaz](integrations/llmaz.md) -* [KServe](integrations/kserve.md) -* [kubernetes-sigs/lws](frameworks/lws.md) -* [meta-llama/llama-stack](integrations/llamastack.md) -* [substratusai/kubeai](integrations/kubeai.md) -* [vllm-project/aibrix](https://github.com/vllm-project/aibrix) -* [vllm-project/production-stack](integrations/production-stack.md) +- [Helm](frameworks/helm.md) +- [InftyAI/llmaz](integrations/llmaz.md) +- [KServe](integrations/kserve.md) +- [kubernetes-sigs/lws](frameworks/lws.md) +- 
[meta-llama/llama-stack](integrations/llamastack.md) +- [substratusai/kubeai](integrations/kubeai.md) +- [vllm-project/aibrix](https://github.com/vllm-project/aibrix) +- [vllm-project/production-stack](integrations/production-stack.md) ## Deployment with CPUs @@ -351,6 +354,17 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) If the service is correctly deployed, you should receive a response from the vLLM model. +## Troubleshooting + +### Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated" + +If the startup or readiness probe failureThreshold is too low for the time needed to startup the server, Kubernetes scheduler will kill the container. A couple of indications that this has happened: + +1. container log contains "KeyboardInterrupt: terminated" +2. `kubectl get events` shows message `Container $NAME failed startup probe, will be restarted` + +To mitigate, increase the failureThreshold to allow more time for the model server to start serving. You can identify an ideal failureThreshold by removing the probes from the manifest and measuring how much time it takes for the model server to show it's ready to serve. + ## Conclusion Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation. diff --git a/docs/design/multiprocessing.md b/docs/design/v1/multiprocessing.md similarity index 98% rename from docs/design/multiprocessing.md rename to docs/design/v1/multiprocessing.md index 4d58fae20f06c..06ebd77258582 100644 --- a/docs/design/multiprocessing.md +++ b/docs/design/v1/multiprocessing.md @@ -7,7 +7,7 @@ page for information on known issues and how to solve them. ## Introduction -!!! warning +!!! 
important The source code references are to the state of the code at the time of writing in December, 2024. The use of Python multiprocessing in vLLM is complicated by: @@ -123,7 +123,7 @@ what is happening. First, a log message from vLLM: WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously initialized. We must use the `spawn` multiprocessing start method. Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See - https://docs.vllm.ai/en/latest/usage/debugging.html#python-multiprocessing + https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. ``` diff --git a/docs/design/v1/prefix_caching.md b/docs/design/v1/prefix_caching.md index bbdfb255214dd..e87e4c6a48b73 100644 --- a/docs/design/v1/prefix_caching.md +++ b/docs/design/v1/prefix_caching.md @@ -144,7 +144,7 @@ As a result, we will have the following components when the KV cache manager is **Running request:** Workflow for the scheduler to schedule a running request with KV cache block allocation: -1. The scheduler calls `kv_cache_manager.append_slots()`. It does the following steps: +1. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps: 1. Compute the number of new required blocks, and return if there are no sufficient blocks to allocate. 2. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on. 3. Append token IDs to the slots in existing blocks as well as the new blocks. If a block is full, we add it to the Cache Block to cache it. 
diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 19b6681729028..afb9a6d4df9ae 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -211,7 +211,7 @@ for o in outputs: Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). -!!! warning +!!! important A chat template is **required** to use Chat Completions API. For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`. diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md index 71f62065f63d2..614b43dd00444 100644 --- a/docs/features/quantization/README.md +++ b/docs/features/quantization/README.md @@ -7,16 +7,16 @@ Quantization trades off model precision for smaller memory footprint, allowing l Contents: -- [Supported_Hardware](supported_hardware.md) -- [Auto_Awq](auto_awq.md) -- [Bnb](bnb.md) -- [Bitblas](bitblas.md) -- [Gguf](gguf.md) -- [Gptqmodel](gptqmodel.md) -- [Int4](int4.md) -- [Int8](int8.md) -- [Fp8](fp8.md) -- [Modelopt](modelopt.md) -- [Quark](quark.md) -- [Quantized_Kvcache](quantized_kvcache.md) -- [Torchao](torchao.md) +- [Supported Hardware](supported_hardware.md) +- [AutoAWQ](auto_awq.md) +- [BitsAndBytes](bnb.md) +- [BitBLAS](bitblas.md) +- [GGUF](gguf.md) +- [GPTQModel](gptqmodel.md) +- [INT4 W4A16](int4.md) +- [INT8 W8A8](int8.md) +- [FP8 W8A8](fp8.md) +- [NVIDIA TensorRT Model Optimizer](modelopt.md) +- [AMD Quark](quark.md) +- [Quantized KV Cache](quantized_kvcache.md) +- [TorchAO](torchao.md) diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md index 51da98cc09d3f..35e9dbe2609be 100644 --- a/docs/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -1,5 +1,5 @@ --- -title: AMD QUARK +title: AMD Quark --- [](){ #quark } diff --git a/docs/features/reasoning_outputs.md 
b/docs/features/reasoning_outputs.md index cbcb246912f4c..59ef10d9c963b 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -142,51 +142,6 @@ for chunk in stream: Remember to check whether the `reasoning_content` exists in the response before accessing it. You could checkout the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py). -## Structured output - -The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. It is only supported in v0 engine now. - -```bash -vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1 -``` - -The following is an example client: - -```python -from openai import OpenAI -from pydantic import BaseModel - -# Modify OpenAI's API key and API base to use vLLM's API server. -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" - -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - -class People(BaseModel): - name: str - age: int - -json_schema = People.model_json_schema() - -prompt = ("Generate a JSON with the name and age of one random person.") -completion = client.chat.completions.create( - model=model, - messages=[{ - "role": "user", - "content": prompt, - }], - extra_body={"guided_json": json_schema}, -) -print("reasoning_content: ", completion.choices[0].message.reasoning_content) -print("content: ", completion.choices[0].message.content) -``` - ## Tool Calling The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`. 
diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index f96b598cff98d..044c796609923 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -39,9 +39,10 @@ client = OpenAI( base_url="http://localhost:8000/v1", api_key="-", ) +model = client.models.list().data[0].id completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", + model=model, messages=[ {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} ], @@ -54,7 +55,7 @@ The next example shows how to use the `guided_regex`. The idea is to generate an ```python completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", + model=model, messages=[ { "role": "user", @@ -92,26 +93,32 @@ class CarDescription(BaseModel): json_schema = CarDescription.model_json_schema() completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", + model=model, messages=[ { "role": "user", "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's", } ], - extra_body={"guided_json": json_schema}, + "response_format": { + "type": "json_schema", + "json_schema": { + "name": "car-description", + "schema": CarDescription.model_json_schema() + }, + }, ) print(completion.choices[0].message.content) ``` !!! tip While not strictly necessary, normally it´s better to indicate in the prompt the - JSON schema and how the fields should be populated. This can improve the + JSON schema and how the fields should be populated. This can improve the results notably in most cases. Finally we have the `guided_grammar` option, which is probably the most difficult to use, but it´s really powerful. It allows us to define complete -languages like SQL queries. It works by using a context free EBNF grammar. +languages like SQL queries. It works by using a context free EBNF grammar. 
As an example, we can use to define a specific format of simplified SQL queries: ```python @@ -130,7 +137,7 @@ simplified_sql_grammar = """ """ completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", + model=model, messages=[ { "role": "user", @@ -142,7 +149,48 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -Full example: +See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html) + +## Reasoning Outputs + +You can also use structured outputs with for reasoning models. + +```bash +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1 +``` + +Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema: + +```python +from pydantic import BaseModel + + +class People(BaseModel): + name: str + age: int + + +completion = client.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": "Generate a JSON with the name and age of one random person.", + } + ], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "people", + "schema": People.model_json_schema() + } + }, +) +print("reasoning_content: ", completion.choices[0].message.reasoning_content) +print("content: ", completion.choices[0].message.content) +``` + +See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html) ## Experimental Automatic Parsing (OpenAI API) @@ -163,14 +211,14 @@ class Info(BaseModel): age: int client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") +model = client.models.list().data[0].id completion = client.beta.chat.completions.parse( - model="meta-llama/Llama-3.1-8B-Instruct", + model=model, messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "My name is Cameron, I'm 28. 
What's my name and age?"}, ], response_format=Info, - extra_body=dict(guided_decoding_backend="outlines"), ) message = completion.choices[0].message @@ -203,15 +251,13 @@ class MathResponse(BaseModel): steps: list[Step] final_answer: str -client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") completion = client.beta.chat.completions.parse( - model="meta-llama/Llama-3.1-8B-Instruct", + model=model, messages=[ {"role": "system", "content": "You are a helpful expert math tutor."}, {"role": "user", "content": "Solve 8x + 31 = 2."}, ], response_format=MathResponse, - extra_body=dict(guided_decoding_backend="outlines"), ) message = completion.choices[0].message @@ -232,11 +278,11 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa Answer: x = -29/8 ``` -An example of using `structural_tag` can be found here: +An example of using `structural_tag` can be found here: ## Offline Inference -Offline inference allows for the same types of guided decoding. +Offline inference allows for the same types of structured outputs. To use it, we´ll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`. The main available options inside `GuidedDecodingParams` are: @@ -247,7 +293,7 @@ The main available options inside `GuidedDecodingParams` are: - `structural_tag` These parameters can be used in the same way as the parameters from the Online -Serving examples above. One example for the usage of the `choice` parameter is +Serving examples above. 
One example for the usage of the `choice` parameter is shown below: ```python @@ -265,4 +311,4 @@ outputs = llm.generate( print(outputs[0].outputs[0].text) ``` -Full example: +See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html) diff --git a/docs/getting_started/installation/.nav.yml b/docs/getting_started/installation/.nav.yml index 7acfc015ff508..d4a727c926406 100644 --- a/docs/getting_started/installation/.nav.yml +++ b/docs/getting_started/installation/.nav.yml @@ -2,4 +2,6 @@ nav: - README.md - gpu.md - cpu.md - - ai_accelerator.md \ No newline at end of file + - google_tpu.md + - intel_gaudi.md + - aws_neuron.md diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md index 36bb16cc02249..c5348adfa5283 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -14,7 +14,6 @@ vLLM supports the following hardware platforms: - [ARM AArch64](cpu.md#arm-aarch64) - [Apple silicon](cpu.md#apple-silicon) - [IBM Z (S390X)](cpu.md#ibm-z-s390x) -- [Other AI accelerators](ai_accelerator.md) - - [Google TPU](ai_accelerator.md#google-tpu) - - [Intel Gaudi](ai_accelerator.md#intel-gaudi) - - [AWS Neuron](ai_accelerator.md#aws-neuron) +- [Google TPU](google_tpu.md) +- [Intel Gaudi](intel_gaudi.md) +- [AWS Neuron](aws_neuron.md) diff --git a/docs/getting_started/installation/ai_accelerator.md b/docs/getting_started/installation/ai_accelerator.md deleted file mode 100644 index a4f136a172fed..0000000000000 --- a/docs/getting_started/installation/ai_accelerator.md +++ /dev/null @@ -1,117 +0,0 @@ -# Other AI accelerators - -vLLM is a Python library that supports the following AI accelerators. 
Select your AI accelerator type to see vendor specific instructions: - -=== "Google TPU" - - --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:installation" - -=== "Intel Gaudi" - - --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:installation" - -=== "AWS Neuron" - - --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:installation" - -## Requirements - -=== "Google TPU" - - --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:requirements" - -=== "Intel Gaudi" - - --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:requirements" - -=== "AWS Neuron" - - --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:requirements" - -## Configure a new environment - -=== "Google TPU" - - --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:configure-a-new-environment" - -=== "Intel Gaudi" - - --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:configure-a-new-environment" - -=== "AWS Neuron" - - --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:configure-a-new-environment" - -## Set up using Python - -### Pre-built wheels - -=== "Google TPU" - - --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:pre-built-wheels" - -=== "Intel Gaudi" - - --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:pre-built-wheels" - -=== "AWS Neuron" - - --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:pre-built-wheels" - -### Build wheel from source - -=== "Google TPU" - - --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:build-wheel-from-source" - -=== "Intel Gaudi" - - --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:build-wheel-from-source" - -=== "AWS Neuron" - - --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:build-wheel-from-source" - -## Set up using Docker - -### Pre-built images - -=== "Google 
TPU" - - --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:pre-built-images" - -=== "Intel Gaudi" - - --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:pre-built-images" - -=== "AWS Neuron" - - --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:pre-built-images" - -### Build image from source - -=== "Google TPU" - - --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:build-image-from-source" - -=== "Intel Gaudi" - - --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:build-image-from-source" - -=== "AWS Neuron" - - --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:build-image-from-source" - -## Extra information - -=== "Google TPU" - - --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:extra-information" - -=== "Intel Gaudi" - - --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:extra-information" - -=== "AWS Neuron" - - --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:extra-information" diff --git a/docs/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/getting_started/installation/aws_neuron.md similarity index 53% rename from docs/getting_started/installation/ai_accelerator/neuron.inc.md rename to docs/getting_started/installation/aws_neuron.md index 86c12472fb360..6b2efd85f06b1 100644 --- a/docs/getting_started/installation/ai_accelerator/neuron.inc.md +++ b/docs/getting_started/installation/aws_neuron.md @@ -1,15 +1,14 @@ -# --8<-- [start:installation] +# AWS Neuron -[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) is the software development kit (SDK) used to run deep learning and - generative AI workloads on AWS Inferentia and AWS Trainium powered Amazon EC2 instances and UltraServers (Inf1, Inf2, Trn1, Trn2, - and Trn2 UltraServer). Both Trainium and Inferentia are powered by fully-independent heterogeneous compute-units called NeuronCores. 
- This tab describes how to set up your environment to run vLLM on Neuron. +[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) is the software development kit (SDK) used to run deep learning and +generative AI workloads on AWS Inferentia and AWS Trainium powered Amazon EC2 instances and UltraServers (Inf1, Inf2, Trn1, Trn2, +and Trn2 UltraServer). Both Trainium and Inferentia are powered by fully-independent heterogeneous compute-units called NeuronCores. +This describes how to set up your environment to run vLLM on Neuron. !!! warning There are no pre-built wheels or images for this device, so you must build vLLM from source. -# --8<-- [end:installation] -# --8<-- [start:requirements] +## Requirements - OS: Linux - Python: 3.9 or newer @@ -21,36 +20,32 @@ ### Launch a Trn1/Trn2/Inf2 instance and verify Neuron dependencies -The easiest way to launch a Trainium or Inferentia instance with pre-installed Neuron dependencies is to follow this +The easiest way to launch a Trainium or Inferentia instance with pre-installed Neuron dependencies is to follow this [quick start guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/multiframework/multi-framework-ubuntu22-neuron-dlami.html#setup-ubuntu22-multi-framework-dlami) using the Neuron Deep Learning AMI (Amazon machine image). 
- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance - Once inside your instance, activate the pre-installed virtual environment for inference by running + ```console source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate ``` -Refer to the [NxD Inference Setup Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/nxdi-setup.html) +Refer to the [NxD Inference Setup Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/nxdi-setup.html) for alternative setup instructions including using Docker and manually installing dependencies. !!! note - NxD Inference is the default recommended backend to run inference on Neuron. If you are looking to use the legacy [transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) - library, refer to [Transformers NeuronX Setup](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/setup/index.html). + NxD Inference is the default recommended backend to run inference on Neuron. If you are looking to use the legacy [transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) + library, refer to [Transformers NeuronX Setup](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/setup/index.html). -# --8<-- [end:requirements] -# --8<-- [start:set-up-using-python] +## Set up using Python -# --8<-- [end:set-up-using-python] -# --8<-- [start:pre-built-wheels] +### Pre-built wheels Currently, there are no pre-built Neuron wheels. 
-# --8<-- [end:pre-built-wheels] -# --8<-- [start:build-wheel-from-source] +### Build wheel from source -#### Install vLLM from source - -Install vllm as follows: +To build and install vLLM from source, run: ```console git clone https://github.com/vllm-project/vllm.git @@ -59,14 +54,14 @@ pip install -U -r requirements/neuron.txt VLLM_TARGET_DEVICE="neuron" pip install -e . ``` -AWS Neuron maintains a [Github fork of vLLM](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2) at - [https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2), which contains several features in addition to what's - available on vLLM V0. Please utilize the AWS Fork for the following features: +AWS Neuron maintains a [Github fork of vLLM](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2) at +, which contains several features in addition to what's +available on vLLM V0. Please utilize the AWS Fork for the following features: - Llama-3.2 multi-modal support -- Multi-node distributed inference +- Multi-node distributed inference -Refer to [vLLM User Guide for NxD Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/vllm-user-guide.html) +Refer to [vLLM User Guide for NxD Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/vllm-user-guide.html) for more details and usage examples. To install the AWS Neuron fork, run the following: @@ -80,75 +75,73 @@ VLLM_TARGET_DEVICE="neuron" pip install -e . Note that the AWS Neuron fork is only intended to support Neuron hardware; compatibility with other hardwares is not tested. 
-# --8<-- [end:build-wheel-from-source] -# --8<-- [start:set-up-using-docker] +## Set up using Docker -# --8<-- [end:set-up-using-docker] -# --8<-- [start:pre-built-images] +### Pre-built images Currently, there are no pre-built Neuron images. -# --8<-- [end:pre-built-images] -# --8<-- [start:build-image-from-source] +### Build image from source See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image. Make sure to use in place of the default Dockerfile. -# --8<-- [end:build-image-from-source] -# --8<-- [start:extra-information] +## Extra information [](){ #feature-support-through-nxd-inference-backend } + ### Feature support through NxD Inference backend -The current vLLM and Neuron integration relies on either the `neuronx-distributed-inference` (preferred) or `transformers-neuronx` backend - to perform most of the heavy lifting which includes PyTorch model initialization, compilation, and runtime execution. Therefore, most - [features supported on Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html) are also available via the vLLM integration. +The current vLLM and Neuron integration relies on either the `neuronx-distributed-inference` (preferred) or `transformers-neuronx` backend +to perform most of the heavy lifting which includes PyTorch model initialization, compilation, and runtime execution. Therefore, most +[features supported on Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html) are also available via the vLLM integration. -To configure NxD Inference features through the vLLM entrypoint, use the `override_neuron_config` setting. Provide the configs you want to override +To configure NxD Inference features through the vLLM entrypoint, use the `override_neuron_config` setting. 
Provide the configs you want to override as a dictionary (or JSON object when starting vLLM from the CLI). For example, to disable auto bucketing, include + ```console override_neuron_config={ "enable_bucketing":False, } ``` + or when launching vLLM from the CLI, pass + ```console --override-neuron-config "{\"enable_bucketing\":false}" ``` -Alternatively, users can directly call the NxDI library to trace and compile your model, then load the pre-compiled artifacts -(via `NEURON_COMPILED_ARTIFACTS` environment variable) in vLLM to run inference workloads. +Alternatively, users can directly call the NxDI library to trace and compile your model, then load the pre-compiled artifacts +(via `NEURON_COMPILED_ARTIFACTS` environment variable) in vLLM to run inference workloads. ### Known limitations - EAGLE speculative decoding: NxD Inference requires the EAGLE draft checkpoint to include the LM head weights from the target model. Refer to this - [guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html#eagle-checkpoint-compatibility) - for how to convert pretrained EAGLE model checkpoints to be compatible for NxDI. -- Quantization: the native quantization flow in vLLM is not well supported on NxD Inference. It is recommended to follow this - [Neuron quantization guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/custom-quantization.html) - to quantize and compile your model using NxD Inference, and then load the compiled artifacts into vLLM. -- Multi-LoRA serving: NxD Inference only supports loading of LoRA adapters at server startup. Dynamic loading of LoRA adapters at - runtime is not currently supported. 
Refer to [multi-lora example](https://github.com/aws-neuron/upstreaming-to-vllm/blob/neuron-2.23-vllm-v0.7.2/examples/offline_inference/neuron_multi_lora.py) + [guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html#eagle-checkpoint-compatibility) + for how to convert pretrained EAGLE model checkpoints to be compatible for NxDI. +- Quantization: the native quantization flow in vLLM is not well supported on NxD Inference. It is recommended to follow this + [Neuron quantization guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/custom-quantization.html) + to quantize and compile your model using NxD Inference, and then load the compiled artifacts into vLLM. +- Multi-LoRA serving: NxD Inference only supports loading of LoRA adapters at server startup. Dynamic loading of LoRA adapters at + runtime is not currently supported. Refer to [multi-lora example](https://github.com/aws-neuron/upstreaming-to-vllm/blob/neuron-2.23-vllm-v0.7.2/examples/offline_inference/neuron_multi_lora.py) - Multi-modal support: multi-modal support is only available through the AWS Neuron fork. This feature has not been upstreamed - to vLLM main because NxD Inference currently relies on certain adaptations to the core vLLM logic to support this feature. + to vLLM main because NxD Inference currently relies on certain adaptations to the core vLLM logic to support this feature. - Multi-node support: distributed inference across multiple Trainium/Inferentia instances is only supported on the AWS Neuron fork. Refer - to this [multi-node example](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2/examples/neuron/multi_node) - to run. Note that tensor parallelism (distributed inference across NeuronCores) is available in vLLM main. 
-- Known edge case bug in speculative decoding: An edge case failure may occur in speculative decoding when sequence length approaches - max model length (e.g. when requesting max tokens up to the max model length and ignoring eos). In this scenario, vLLM may attempt - to allocate an additional block to ensure there is enough memory for number of lookahead slots, but since we do not have good support - for paged attention, there isn't another Neuron block for vLLM to allocate. A workaround fix (to terminate 1 iteration early) is - implemented in the AWS Neuron fork but is not upstreamed to vLLM main as it modifies core vLLM logic. - + to this [multi-node example](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2/examples/neuron/multi_node) + to run. Note that tensor parallelism (distributed inference across NeuronCores) is available in vLLM main. +- Known edge case bug in speculative decoding: An edge case failure may occur in speculative decoding when sequence length approaches + max model length (e.g. when requesting max tokens up to the max model length and ignoring eos). In this scenario, vLLM may attempt + to allocate an additional block to ensure there is enough memory for number of lookahead slots, but since we do not have good support + for paged attention, there isn't another Neuron block for vLLM to allocate. A workaround fix (to terminate 1 iteration early) is + implemented in the AWS Neuron fork but is not upstreamed to vLLM main as it modifies core vLLM logic. ### Environment variables -- `NEURON_COMPILED_ARTIFACTS`: set this environment variable to point to your pre-compiled model artifacts directory to avoid - compilation time upon server initialization. If this variable is not set, the Neuron module will perform compilation and save the - artifacts under `neuron-compiled-artifacts/{unique_hash}/` sub-directory in the model path. 
If this environment variable is set, - but the directory does not exist, or the contents are invalid, Neuron will also fallback to a new compilation and store the artifacts - under this specified path. + +- `NEURON_COMPILED_ARTIFACTS`: set this environment variable to point to your pre-compiled model artifacts directory to avoid + compilation time upon server initialization. If this variable is not set, the Neuron module will perform compilation and save the + artifacts under `neuron-compiled-artifacts/{unique_hash}/` sub-directory in the model path. If this environment variable is set, + but the directory does not exist, or the contents are invalid, Neuron will also fallback to a new compilation and store the artifacts + under this specified path. - `NEURON_CONTEXT_LENGTH_BUCKETS`: Bucket sizes for context encoding. (Only applicable to `transformers-neuronx` backend). - `NEURON_TOKEN_GEN_BUCKETS`: Bucket sizes for token generation. (Only applicable to `transformers-neuronx` backend). - -# --8<-- [end:extra-information] diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index 18c96b264ad82..00bb5cae43f00 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -110,8 +110,9 @@ vLLM CPU backend supports the following vLLM features: ## Related runtime environment variables -- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. -- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. 
`VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. +- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g., `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM to run more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`. +- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node. By setting to `all`, the OpenMP threads of each rank use all CPU cores available on the system. Default value is `auto`. +- `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when `VLLM_CPU_OMP_THREADS_BIND` is set to `auto`. Default value is `0`. +- `VLLM_CPU_MOE_PREPACK`: whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False). ## Performance tips @@ -133,7 +134,15 @@ export VLLM_CPU_OMP_THREADS_BIND=0-29 vllm serve facebook/opt-125m ``` -- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`. 
On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: + or using default auto thread binding: + +```console +export VLLM_CPU_KVCACHE_SPACE=40 +export VLLM_CPU_NUM_OF_RESERVED_CPU=2 +vllm serve facebook/opt-125m +``` + +- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND` or using auto thread binding feature by default. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: ```console $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores @@ -178,6 +187,12 @@ $ python examples/offline_inference/basic/basic.py VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp ``` + or using default auto thread binding: + + ```console + VLLM_CPU_KVCACHE_SPACE=40 vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp + ``` + - For each thread id list in `VLLM_CPU_OMP_THREADS_BIND`, users should guarantee threads in the list belong to a same NUMA node. - Meanwhile, users should also take care of memory capacity of each NUMA node. The memory usage of each TP rank is the sum of `weight shard size` and `VLLM_CPU_KVCACHE_SPACE`, if it exceeds the capacity of a single NUMA node, TP worker will be killed due to out-of-memory. 
diff --git a/docs/getting_started/installation/ai_accelerator/tpu.inc.md b/docs/getting_started/installation/google_tpu.md similarity index 81% rename from docs/getting_started/installation/ai_accelerator/tpu.inc.md rename to docs/getting_started/installation/google_tpu.md index d0b1681201376..0cb10b8de835e 100644 --- a/docs/getting_started/installation/ai_accelerator/tpu.inc.md +++ b/docs/getting_started/installation/google_tpu.md @@ -1,4 +1,4 @@ -# --8<-- [start:installation] +# Google TPU Tensor Processing Units (TPUs) are Google's custom-developed application-specific integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs @@ -33,8 +33,7 @@ information, see [Storage options for Cloud TPU data](https://cloud.devsite.corp !!! warning There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source. -# --8<-- [end:installation] -# --8<-- [start:requirements] +## Requirements - Google Cloud TPU VM - TPU versions: v6e, v5e, v5p, v4 @@ -58,6 +57,7 @@ assigned to your Google Cloud project for your immediate exclusive use. 
### Provision Cloud TPUs with GKE For more information about using TPUs with GKE, see: + - - - @@ -70,40 +70,41 @@ Create a TPU v5e with 4 TPU chips: ```console gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ ---node-id TPU_NAME \ ---project PROJECT_ID \ ---zone ZONE \ ---accelerator-type ACCELERATOR_TYPE \ ---runtime-version RUNTIME_VERSION \ ---service-account SERVICE_ACCOUNT + --node-id TPU_NAME \ + --project PROJECT_ID \ + --zone ZONE \ + --accelerator-type ACCELERATOR_TYPE \ + --runtime-version RUNTIME_VERSION \ + --service-account SERVICE_ACCOUNT ``` | Parameter name | Description | |--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | QUEUED_RESOURCE_ID | The user-assigned ID of the queued resource request. | -| TPU_NAME | The user-assigned name of the TPU which is created when the queued | +| TPU_NAME | The user-assigned name of the TPU which is created when the queued resource request is allocated. | | PROJECT_ID | Your Google Cloud project | -| ZONE | The GCP zone where you want to create your Cloud TPU. The value you use | -| ACCELERATOR_TYPE | The TPU version you want to use. Specify the TPU version, for example | -| RUNTIME_VERSION | The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). For more information see [TPU VM images](https://cloud.google.com/tpu/docs/runtimes). | -
Parameter descriptions
+| ZONE | The GCP zone where you want to create your Cloud TPU. The value you use depends on the version of TPUs you are using. For more information, see [TPU regions and zones] | +| ACCELERATOR_TYPE | The TPU version you want to use. Specify the TPU version, for example `v5litepod-4` specifies a v5e TPU with 4 cores, `v6e-1` specifies a v6e TPU with 1 core. For more information, see [TPU versions]. | +| RUNTIME_VERSION | The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). For more information see [TPU VM images]. | +| SERVICE_ACCOUNT | The email address for your service account. You can find it in the IAM Cloud Console under *Service Accounts*. For example: `tpu-service-account@.iam.gserviceaccount.com` | -Connect to your TPU using SSH: +Connect to your TPU VM using SSH: ```bash -gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE +gcloud compute tpus tpu-vm ssh TPU_NAME --project PROJECT_ID --zone ZONE ``` -# --8<-- [end:requirements] -# --8<-- [start:set-up-using-python] +[TPU versions]: https://cloud.google.com/tpu/docs/runtimes +[TPU VM images]: https://cloud.google.com/tpu/docs/runtimes +[TPU regions and zones]: https://cloud.google.com/tpu/docs/regions-zones -# --8<-- [end:set-up-using-python] -# --8<-- [start:pre-built-wheels] +## Set up using Python + +### Pre-built wheels Currently, there are no pre-built TPU wheels. -# --8<-- [end:pre-built-wheels] -# --8<-- [start:build-wheel-from-source] +### Build wheel from source Install Miniconda: @@ -136,7 +137,7 @@ Install build dependencies: ```bash pip install -r requirements/tpu.txt -sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev +sudo apt-get install --no-install-recommends --yes libopenblas-base libopenmpi-dev libomp-dev ``` Run the setup script: @@ -145,16 +146,13 @@ Run the setup script: VLLM_TARGET_DEVICE="tpu" python -m pip install -e . 
``` -# --8<-- [end:build-wheel-from-source] -# --8<-- [start:set-up-using-docker] +## Set up using Docker -# --8<-- [end:set-up-using-docker] -# --8<-- [start:pre-built-images] +### Pre-built images See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`. -# --8<-- [end:pre-built-images] -# --8<-- [start:build-image-from-source] +### Build image from source You can use to build a Docker image with TPU support. @@ -188,11 +186,5 @@ docker run --privileged --net host --shm-size=16G -it vllm-tpu Install OpenBLAS with the following command: ```console - sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev + sudo apt-get install --no-install-recommends --yes libopenblas-base libopenmpi-dev libomp-dev ``` - -# --8<-- [end:build-image-from-source] -# --8<-- [start:extra-information] - -There is no extra information for this device. -# --8<-- [end:extra-information] diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md index 64dccef63d73d..409efece30888 100644 --- a/docs/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -254,7 +254,10 @@ The latest code can contain bugs and may not be stable. Please use it with cauti See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image. -## Supported features +# --8<-- [end:build-image-from-source] +# --8<-- [start:supported-features] See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information. 
+ +# --8<-- [end:supported-features] # --8<-- [end:extra-information] diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md index 8b7dc6dd09d34..8019fb50f4ddd 100644 --- a/docs/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -217,7 +217,10 @@ docker run -it \ Where the `` is the location where the model is stored, for example, the weights for llama2 or llama3 models. -## Supported features +# --8<-- [end:build-image-from-source] +# --8<-- [start:supported-features] See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information. + +# --8<-- [end:supported-features] # --8<-- [end:extra-information] diff --git a/docs/getting_started/installation/gpu/xpu.inc.md b/docs/getting_started/installation/gpu/xpu.inc.md index bee9a7ebb717b..128fff164c3aa 100644 --- a/docs/getting_started/installation/gpu/xpu.inc.md +++ b/docs/getting_started/installation/gpu/xpu.inc.md @@ -63,7 +63,8 @@ $ docker run -it \ vllm-xpu-env ``` -## Supported features +# --8<-- [end:build-image-from-source] +# --8<-- [start:supported-features] XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution like following: @@ -78,4 +79,6 @@ python -m vllm.entrypoints.openai.api_server \ ``` By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. 
+ +# --8<-- [end:supported-features] # --8<-- [end:extra-information] diff --git a/docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/getting_started/installation/intel_gaudi.md similarity index 97% rename from docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md rename to docs/getting_started/installation/intel_gaudi.md index 00935a37417e5..f5970850aae71 100644 --- a/docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md +++ b/docs/getting_started/installation/intel_gaudi.md @@ -1,12 +1,11 @@ -# --8<-- [start:installation] +# Intel Gaudi -This tab provides instructions on running vLLM with Intel Gaudi devices. +This page provides instructions on running vLLM with Intel Gaudi devices. !!! warning There are no pre-built wheels or images for this device, so you must build vLLM from source. -# --8<-- [end:installation] -# --8<-- [start:requirements] +## Requirements - OS: Ubuntu 22.04 LTS - Python: 3.10 @@ -56,16 +55,13 @@ docker run \ vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ``` -# --8<-- [end:requirements] -# --8<-- [start:set-up-using-python] +## Set up using Python -# --8<-- [end:set-up-using-python] -# --8<-- [start:pre-built-wheels] +### Pre-built wheels Currently, there are no pre-built Intel Gaudi wheels. -# --8<-- [end:pre-built-wheels] -# --8<-- [start:build-wheel-from-source] +### Build wheel from source To build and install vLLM from source, run: @@ -86,16 +82,13 @@ pip install -r requirements/hpu.txt python setup.py develop ``` -# --8<-- [end:build-wheel-from-source] -# --8<-- [start:set-up-using-docker] +## Set up using Docker -# --8<-- [end:set-up-using-docker] -# --8<-- [start:pre-built-images] +### Pre-built images Currently, there are no pre-built Intel Gaudi images. -# --8<-- [end:pre-built-images] -# --8<-- [start:build-image-from-source] +### Build image from source ```console docker build -f docker/Dockerfile.hpu -t vllm-hpu-env . 
@@ -112,10 +105,9 @@ docker run \ !!! tip If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. -# --8<-- [end:build-image-from-source] -# --8<-- [start:extra-information] +## Extra information -## Supported features +### Supported features - [Offline inference][offline-inference] - Online serving via [OpenAI-Compatible Server][openai-compatible-server] @@ -129,14 +121,14 @@ docker run \ for accelerating low-batch latency and throughput - Attention with Linear Biases (ALiBi) -## Unsupported features +### Unsupported features - Beam search - LoRA adapters - Quantization - Prefill chunking (mixed-batch inferencing) -## Supported configurations +### Supported configurations The following configurations have been validated to function with Gaudi2 devices. Configurations that are not listed may or may not work. @@ -183,7 +175,6 @@ Currently in vLLM for HPU we support four execution modes, depending on selected | 0 | 0 | torch.compile | | 0 | 1 | PyTorch eager mode | | 1 | 0 | HPU Graphs | -
vLLM execution modes
!!! warning In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. @@ -401,4 +392,3 @@ the below: higher batches. You can do that by adding `--enforce-eager` flag to server (for online serving), or by passing `enforce_eager=True` argument to LLM constructor (for offline inference). -# --8<-- [end:extra-information] diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index d24e75e8141d8..38fc9925eb51c 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -61,7 +61,8 @@ from vllm import LLM, SamplingParams ``` The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here][sampling-params]. -!!! warning + +!!! important By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified. However, if vLLM's default sampling parameters are preferred, please set `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance. @@ -116,7 +117,7 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct !!! note By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it [here][chat-template]. -!!! warning +!!! important By default, the server applies `generation_config.json` from the huggingface model repository if it exists. 
This means the default values of certain sampling parameters can be overridden by those recommended by the model creator. To disable this behavior, please pass `--generation-config vllm` when launching the server. diff --git a/docs/mkdocs/hooks/remove_announcement.py b/docs/mkdocs/hooks/remove_announcement.py index f67941d2ad1b5..1a84039abc14f 100644 --- a/docs/mkdocs/hooks/remove_announcement.py +++ b/docs/mkdocs/hooks/remove_announcement.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os +from pathlib import Path from typing import Literal @@ -8,10 +9,9 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa if os.getenv('READTHEDOCS_VERSION_TYPE') == "tag": # remove the warning banner if the version is a tagged release - docs_dir = os.path.dirname(__file__) - announcement_path = os.path.join(docs_dir, - "mkdocs/overrides/main.html") + mkdocs_dir = Path(__file__).parent.parent + announcement_path = mkdocs_dir / "overrides/main.html" # The file might be removed already if the build is triggered multiple # times (readthedocs build both HTML and PDF versions separately) - if os.path.exists(announcement_path): + if announcement_path.exists(): os.remove(announcement_path) diff --git a/docs/mkdocs/javascript/edit_and_feedback.js b/docs/mkdocs/javascript/edit_and_feedback.js new file mode 100644 index 0000000000000..68dec725f530c --- /dev/null +++ b/docs/mkdocs/javascript/edit_and_feedback.js @@ -0,0 +1,47 @@ +/** + * edit_and_feedback.js + * + * Enhances MkDocs Material docs pages by: + * + * 1. Adding a "Question? Give us feedback" link + * below the "Edit" button. + * + * - The link opens a GitHub issue with a template, + * auto-filled with the current page URL and path. + * + * 2. Ensuring the edit button opens in a new tab + * with target="_blank" and rel="noopener". 
+ */ +document.addEventListener("DOMContentLoaded", function () { + const url = window.location.href; + const page = document.body.dataset.mdUrl || location.pathname; + + const feedbackLink = document.createElement("a"); + feedbackLink.href = `https://github.com/vllm-project/vllm/issues/new?template=100-documentation.yml&title=${encodeURIComponent( + `[Docs] Feedback for \`${page}\`` + )}&body=${encodeURIComponent(`📄 **Reference:**\n${url}\n\n📝 **Feedback:**\n_Your response_`)}`; + feedbackLink.target = "_blank"; + feedbackLink.rel = "noopener"; + feedbackLink.title = "Provide feedback"; + feedbackLink.className = "md-content__button"; + feedbackLink.innerHTML = ` + + + +`; + + const editButton = document.querySelector('.md-content__button[href*="edit"]'); + + if (editButton && editButton.parentNode) { + editButton.insertAdjacentElement("beforebegin", feedbackLink); + + editButton.setAttribute("target", "_blank"); + editButton.setAttribute("rel", "noopener"); + } +}); diff --git a/docs/mkdocs/stylesheets/extra.css b/docs/mkdocs/stylesheets/extra.css index 088143ed59563..220657f83d5fc 100644 --- a/docs/mkdocs/stylesheets/extra.css +++ b/docs/mkdocs/stylesheets/extra.css @@ -34,3 +34,77 @@ body[data-md-color-scheme="slate"] .md-nav__item--section > label.md-nav__link . 
color: rgba(255, 255, 255, 0.75) !important; font-weight: 700; } + +/* Custom admonitions */ +:root { + --md-admonition-icon--announcement: url('data:image/svg+xml;charset=utf-8,'); + --md-admonition-icon--important: url('data:image/svg+xml;charset=utf-8,'); +} + +.md-typeset .admonition.announcement, +.md-typeset details.announcement { + border-color: rgb(255, 110, 66); +} +.md-typeset .admonition.important, +.md-typeset details.important { + border-color: rgb(239, 85, 82); +} + +.md-typeset .announcement > .admonition-title, +.md-typeset .announcement > summary { + background-color: rgb(255, 110, 66, 0.1); +} +.md-typeset .important > .admonition-title, +.md-typeset .important > summary { + background-color: rgb(239, 85, 82, 0.1); +} + +.md-typeset .announcement > .admonition-title::before, +.md-typeset .announcement > summary::before { + background-color: rgb(239, 85, 82); + -webkit-mask-image: var(--md-admonition-icon--announcement); + mask-image: var(--md-admonition-icon--announcement); +} +.md-typeset .important > .admonition-title::before, +.md-typeset .important > summary::before { + background-color: rgb(239, 85, 82); + -webkit-mask-image: var(--md-admonition-icon--important); + mask-image: var(--md-admonition-icon--important); +} + +/* Make label fully visible on hover */ +.md-content__button[href*="edit"]:hover::after { + opacity: 1; +} + +/* Hide edit button on generated docs/examples pages */ +@media (min-width: 960px) { + .md-content__button[href*="docs/examples/"] { + display: none !important; + } +} + +.md-content__button-wrapper { + position: absolute; + top: 0.6rem; + right: 0.8rem; + display: flex; + flex-direction: row; + align-items: center; + gap: 0.4rem; + z-index: 1; +} + +.md-content__button-wrapper a { + display: inline-flex; + align-items: center; + justify-content: center; + height: 24px; + width: 24px; + color: var(--md-default-fg-color); + text-decoration: none; +} + +.md-content__button-wrapper a:hover { + color: 
var(--md-accent-fg-color); +} diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md index 566b1c29fca9f..e52c5ae01cb8a 100644 --- a/docs/models/generative_models.md +++ b/docs/models/generative_models.md @@ -51,7 +51,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -!!! warning +!!! important By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified. However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance. @@ -81,7 +81,7 @@ The [chat][vllm.LLM.chat] method implements chat functionality on top of [genera In particular, it accepts input similar to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat) and automatically applies the model's [chat template](https://huggingface.co/docs/transformers/en/chat_templating) to format the prompt. -!!! warning +!!! important In general, only instruction-tuned models have a chat template. Base models may perform poorly as they are not trained to respond to the chat conversation. diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index a8a6f3417e546..60f7dacebfa21 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -299,78 +299,80 @@ See [this page][generative-models] for more information on how to use generative Specified using `--task generate`. 
-| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | -|---------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------| -| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | -| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | -| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | -| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | -| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | -| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | -| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | -| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | -| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | -| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | -| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. | | ✅︎ | -| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. | | ✅︎ | -| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. 
| | ✅︎ | -| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | -| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ | -| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ | -| `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ | -| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | -| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | -| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ | -| `GlmForCausalLM` | GLM-4 | `THUDM/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | -| `Glm4ForCausalLM` | GLM-4-0414 | `THUDM/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | -| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | -| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | -| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | -| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ | -| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ | -| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | -| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. 
| ✅︎ | ✅︎ | -| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | -| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | -| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | -| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | -| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | -| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | -| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | -| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | -| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | -| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | -| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | -| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | -| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | -| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | -| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. 
| | ✅︎ | -| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | -| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | -| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | | ✅︎ | -| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | | ✅︎ | -| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | -| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | ✅︎ | -| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | -| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | -| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | -| `Phi3SmallForCausalLM` | Phi-3-Small | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. | | ✅︎ | -| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | -| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | -| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | | -| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | -| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | -| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | | ✅︎ | -| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | -| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. 
| | ✅︎ | -| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | -| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | -| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | -| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | -| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ | -| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ | -| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | -| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | +| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | +|---------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------| +| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | ✅︎ | +| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | | +| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. 
| | ✅︎ | | +| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | | +| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ | +| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. | | ✅︎ | ✅︎ | +| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. | | ✅︎ | ✅︎ | +| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. | | ✅︎ | ✅︎ | +| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ | ✅︎ | +| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ | ✅︎ | +| `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ | | +| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `GlmForCausalLM` | GLM-4 | `THUDM/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Glm4ForCausalLM` | GLM-4-0414 | `THUDM/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. 
| | ✅︎ | ✅︎ | +| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ | +| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ | ✅︎ | +| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | | +| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | +| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ | +| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | ✅︎ | +| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. 
| ✅︎ | ✅︎ | | +| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | | +| `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ | | +| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | ✅︎ | +| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | | +| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | | ✅︎ | ✅︎ | +| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | | ✅︎ | ✅︎ | +| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ | +| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. 
| | ✅︎ | ✅︎ | +| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ | +| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Phi3SmallForCausalLM` | Phi-3-Small | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. | | ✅︎ | ✅︎ | +| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | ✅︎ | +| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | | | +| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | | ✅︎ | ✅︎ | +| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | | ✅︎ | ✅︎ | +| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | ✅︎ | +| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | ✅︎ | +| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. 
+| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | |
| ︎ | ︎ | -| `NomicBertModel` | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | ︎ | ︎ | -| `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | -| `Qwen2Model`, `Qwen2ForCausalLM` | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | -| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | +| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | +|--------------------------------------------------------|---------------------|---------------------------------------------------------------------------------------------------------------------|----------------------|---------------------------|-----------------------| +| `BertModel` | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | | +| `Gemma2Model` | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | | | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | +| `GteModel` | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | ︎ | | | +| `GteNewModel` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | ︎ | ︎ | | +| `ModernBertModel` | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | ︎ | ︎ | | +| `NomicBertModel` | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | ︎ | ︎ | | +| `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | | +| `Qwen2Model`, `Qwen2ForCausalLM` | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. 
| ✅︎ | ✅︎ | | +| `Qwen3Model`, `Qwen3ForCausalLM` | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | | +| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | | !!! note `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. @@ -422,16 +425,16 @@ of the whole prompt are extracted from the normalized hidden state corresponding Specified using `--task reward`. -| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | -|---------------------------|-----------------|------------------------------------------------------------------------|------------------------|-----------------------------| -| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | -| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | -| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | +| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | +|---------------------------|-----------------|------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------| +| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | | +| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | | +| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | | If your model is not in the above list, we will try to automatically convert the model using [as_reward_model][vllm.model_executor.models.adapters.as_reward_model]. By default, we return the hidden states of each token directly. -!!! warning +!!! 
important For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. @@ -439,9 +442,9 @@ If your model is not in the above list, we will try to automatically convert the Specified using `--task classify`. -| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | -|----------------------------------|----------|----------------------------------------|------------------------|-----------------------------| -| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | +| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | +|----------------------------------|----------|----------------------------------------|------------------------|-----------------------------|-----------------------| +| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | | If your model is not in the above list, we will try to automatically convert the model using [as_classification_model][vllm.model_executor.models.adapters.as_classification_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. @@ -450,12 +453,19 @@ If your model is not in the above list, we will try to automatically convert the Specified using `--task score`. -| Architecture | Models | Example HF Models | -|---------------------------------------|-------------------|----------------------------------------------| -| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | -| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | -| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. 
| +| Architecture | Models | Example HF Models | [V1](gh-issue:8779) | +|---------------------------------------|-------------------|--------------------------------------------------------------------------------------|-----------------------| +| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | +| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | | +| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | +| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | +!!! note + Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: . + + ```bash + vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' + ``` [](){ #supported-mm-models } ## List of Multimodal Language Models @@ -477,7 +487,7 @@ On the other hand, modalities separated by `/` are mutually exclusive. See [this page][multimodal-inputs] on how to pass multi-modal inputs to the model. -!!! warning +!!! important **To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference) or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt: @@ -513,45 +523,45 @@ Specified using `--task generate`. 
| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | |----------------------------------------------|--------------------------------------------------------------------------|-----------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------| -| `AriaForConditionalGeneration` | Aria | T + I+ | `rhymes-ai/Aria` | | | ✅︎ | -| `AyaVisionForConditionalGeneration` | Aya Vision | T + I+ | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ | -| `Blip2ForConditionalGeneration` | BLIP-2 | T + IE | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ | -| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b` etc. | | ✅︎ | ✅︎ | -| `DeepseekVLV2ForCausalLM`^ | DeepSeek-VL2 | T + I+ | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. | | ✅︎ | ✅︎ | +| `AriaForConditionalGeneration` | Aria | T + I+ | `rhymes-ai/Aria` | | | ✅︎ | +| `AyaVisionForConditionalGeneration` | Aya Vision | T + I+ | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ | +| `Blip2ForConditionalGeneration` | BLIP-2 | T + IE | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ | +| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b` etc. | | ✅︎ | ✅︎ | +| `DeepseekVLV2ForCausalLM`^ | DeepSeek-VL2 | T + I+ | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. | | ✅︎ | ✅︎ | | `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large` etc. | | | | -| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b` etc. 
| | ✅︎ | ✅︎ | +| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b` etc. | | ✅︎ | ✅︎ | | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ | | `GLM4VForCausalLM`^ | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | -| `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎\* | -| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | | ✅︎ | -| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ | -| `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ | -| `LlavaForConditionalGeneration` | LLaVA-1.5 | T + IE+ | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | | ✅︎ | ✅︎ | -| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + IE+ | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ | -| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ | -| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. 
| | ✅︎ | ✅︎ | +| `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎\* | +| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | | ✅︎ | +| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ | +| `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ | +| `LlavaForConditionalGeneration` | LLaVA-1.5 | T + IE+ | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | | ✅︎ | ✅︎ | +| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + IE+ | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ | +| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ | +| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ | | `MiniCPMO` | MiniCPM-O | T + IE+ + VE+ + AE+ | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | | ✅︎ | -| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. 
| | ✅︎ | | +| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | | ✅︎ | +| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ | | `Mistral3ForConditionalGeneration` | Mistral3 | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MllamaForConditionalGeneration` | Llama 3.2 | T + I+ | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | | +| `MllamaForConditionalGeneration` | Llama 3.2 | T + I+ | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | | | `MolmoForCausalLM` | Molmo | T + I+ | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `NVLM_D_Model` | NVLM-D 1.0 | T + I+ | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ | -| `Ovis` | Ovis2, Ovis1.6 | T + I+ | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ | -| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + IE | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ | -| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + IE+ | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ | -| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `PixtralForConditionalGeneration` | Pixtral | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ | +| `NVLM_D_Model` | NVLM-D 1.0 | T + I+ | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ | +| `Ovis` | Ovis2, Ovis1.6 | T + I+ | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. 
| | ✅︎ | ✅︎ | +| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + IE | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ | +| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + IE+ | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ | +| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `PixtralForConditionalGeneration` | Pixtral | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ | | `QwenVLForConditionalGeneration`^ | Qwen-VL | T + IE+ | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A+ | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ | +| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A+ | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ | | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + IE+ + VE+ | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. 
| ✅︎ | ✅︎ | ✅︎ | -| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎\* | -| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ | -| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ | -| `TarsierForConditionalGeneration` | Tarsier | T + IE+ | `omni-search/Tarsier-7b`,`omni-search/Tarsier-34b` | | ✅︎ | ✅︎ | +| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎\* | +| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ | +| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ | +| `TarsierForConditionalGeneration` | Tarsier | T + IE+ | `omni-search/Tarsier-7b`,`omni-search/Tarsier-34b` | | ✅︎ | ✅︎ | ^ You need to set the architecture name via `--hf-overrides` to match the one in vLLM.     • For example, to use DeepSeek-VL2 series models: @@ -628,11 +638,21 @@ Specified using `--task generate`. Read audio from video pre-processing is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1. `--mm-processor-kwargs '{"use_audio_in_video": true}'`. +#### Transcription + +Specified using `--task transcription`. + +Speech2Text models trained specifically for Automatic Speech Recognition. + +| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | +|----------------------------------------------|------------------|------------------------------------------------------------------|------------------------|-----------------------------|-----------------------| +| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | | + ### Pooling Models See [this page](./pooling_models.md) for more information on how to use pooling models. -!!! 
warning +!!! important Since some model architectures support both generative and pooling tasks, you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. @@ -647,19 +667,10 @@ Any text generation model can be converted into an embedding model by passing `- The following table lists those that are tested in vLLM. -| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | -|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------| -| `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | | -| `Phi3VForCausalLM` | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ | - -#### Transcription - -Specified using `--task transcription`. - -Speech2Text models trained specifically for Automatic Speech Recognition. - -| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | -|----------------|----------|---------------------|------------------------|-----------------------------| +| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | +|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------| +| `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | | | +| `Phi3VForCausalLM` | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ | | --- diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index c2e39d029dd5a..3002b2f92e4d5 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -36,7 +36,7 @@ print(completion.choices[0].message) vLLM supports some parameters that are not 
supported by OpenAI, `top_k` for example. You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`. -!!! warning +!!! important By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator. To disable this behavior, please pass `--generation-config vllm` when launching the server. @@ -250,7 +250,7 @@ and passing a list of `messages` in the request. Refer to the examples below for --chat-template examples/template_vlm2vec.jinja ``` - !!! warning + !!! important Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed` to run this model in embedding mode instead of text generation mode. @@ -294,13 +294,13 @@ and passing a list of `messages` in the request. Refer to the examples below for --chat-template examples/template_dse_qwen2_vl.jinja ``` - !!! warning + !!! important Like with VLM2Vec, we have to explicitly pass `--task embed`. Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled by a custom chat template: - !!! warning + !!! important `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code example below for details. diff --git a/docs/usage/security.md b/docs/usage/security.md index 1209cc8dd4572..76140434dcb36 100644 --- a/docs/usage/security.md +++ b/docs/usage/security.md @@ -31,6 +31,7 @@ refer to the [PyTorch Security Guide](https://github.com/pytorch/pytorch/security/policy#using-distributed-features). 
Key points from the PyTorch security guide: + - PyTorch Distributed features are intended for internal communication only - They are not built for use in untrusted environments or networks - No authorization protocol is included for performance reasons diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md index 889cfccdacac6..e9ab425a1d063 100644 --- a/docs/usage/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -40,7 +40,7 @@ If other strategies don't solve the problem, it's likely that the vLLM instance - `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging. - `export CUDA_LAUNCH_BLOCKING=1` to identify which CUDA kernel is causing the problem. - `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL. -- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. +- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. Do not use this flag unless absolutely needed for debugging, it will cause significant delays in startup time. ## Incorrect network setup diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index baeb5411bcfdf..28c501439325e 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -1,6 +1,8 @@ # vLLM V1 -**We have started the process of deprecating V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details.** +!!! announcement + + We have started the process of deprecating V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details. V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack). 
@@ -32,36 +34,9 @@ Upgrade to vLLM’s Core Architecture](https://blog.vllm.ai/2025/01/27/v1-alpha- This living user guide outlines a few known **important changes and limitations** introduced by vLLM V1. The team has been working actively to bring V1 as the default engine, therefore this guide will be updated constantly as more features get supported on vLLM V1. -### Supports Overview -#### Hardware +## Current Status -| Hardware | Status | -|----------|------------------------------------------| -| **NVIDIA** | 🚀 Natively Supported | -| **AMD** | 🚧 WIP | -| **TPU** | 🚧 WIP | -| **CPU** | 🚧 WIP | - -#### Feature / Model - -| Feature / Model | Status | -|-----------------|-----------------------------------------------------------------------------------| -| **Prefix Caching** | 🚀 Optimized | -| **Chunked Prefill** | 🚀 Optimized | -| **LoRA** | 🚀 Optimized | -| **Logprobs Calculation** | 🟢 Functional | -| **Multimodal Models** | 🟢 Functional | -| **FP8 KV Cache** | 🟢 Functional on Hopper devices ([PR #15191](https://github.com/vllm-project/vllm/pull/15191))| -| **Spec Decode** | 🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))| -| **Prompt Logprobs with Prefix Caching** | 🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))| -| **Structured Output Alternative Backends** | 🟡 Planned | -| **Embedding Models** | 🚧 WIP ([PR #16188](https://github.com/vllm-project/vllm/pull/16188)) | -| **Mamba Models** | 🟡 Planned | -| **Encoder-Decoder Models** | 🟠 Delayed | -| **Request-level Structured Output Backend** | 🔴 Deprecated | -| **best_of** | 🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))| -| **Per-Request Logits Processors** | 🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360)) | -| **GPU <> CPU KV Cache Swapping** | 🔴 Deprecated | +For each item, our progress towards V1 support falls into one of the following states: - **🚀 Optimized**: Nearly fully optimized, with no further 
work currently planned. - **🟢 Functional**: Fully operational, with ongoing optimizations. @@ -70,15 +45,87 @@ This living user guide outlines a few known **important changes and limitations* - **🟠 Delayed**: Temporarily dropped in V1 but planned to be re-introduced later. - **🔴 Deprecated**: Not planned for V1 unless there is strong demand. -**Note**: vLLM V1’s unified scheduler treats both prompt and output tokens the same -way by using a simple dictionary (e.g., `{request_id: num_tokens}`) to dynamically -allocate a fixed token budget per request, enabling features like chunked prefills, -prefix caching, and speculative decoding without a strict separation between prefill -and decode phases. +### Hardware -### Semantic Changes and Deprecated Features +| Hardware | Status | +|------------|------------------------------------| +| **NVIDIA** | 🚀 | +| **AMD** | 🟢 | +| **TPU** | 🟢 | +| **CPU** | 🟢 (x86) 🟡 (MacOS) | -#### Logprobs +!!! note + + More hardware platforms may be supported via plugins, e.g.: + + - [vllm-ascend](https://github.com/vllm-project/vllm-ascend) + - [vllm-spyre](https://github.com/vllm-project/vllm-spyre) + - [vllm-openvino](https://github.com/vllm-project/vllm-openvino) + + Please check their corresponding repositories for more details. + +### Models + +| Model Type | Status | +|-----------------------------|------------------------------------------------------------------------------------| +| **Decoder-only Models** | 🚀 Optimized | +| **Encoder-Decoder Models** | 🟠 Delayed | +| **Embedding Models** | 🚧 WIP ([PR #16188](https://github.com/vllm-project/vllm/pull/16188)) | +| **Mamba Models** | 🚧 WIP ([PR #19327](https://github.com/vllm-project/vllm/pull/19327)) | +| **Multimodal Models** | 🟢 Functional | + +vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol. + +!!! tip + + This corresponds to the V1 column in our [list of supported models][supported-models]. 
+ +See below for the status of models that are still not yet supported in V1. + +#### Embedding Models + +The initial support will be provided by [PR #16188](https://github.com/vllm-project/vllm/pull/16188). + +Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249), +which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360) +to enable simultaneous generation and embedding using the same engine instance in V1. + +#### Mamba Models + +Models using selective state-space mechanisms instead of standard transformer attention (e.g., `MambaForCausalLM`, `JambaForCausalLM`) +will be supported via [PR #19327](https://github.com/vllm-project/vllm/pull/19327). + +#### Encoder-Decoder Models + +Models requiring cross-attention between separate encoder and decoder (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`) +are not yet supported. + +### Features + +| Feature | Status | +|---------------------------------------------|-----------------------------------------------------------------------------------| +| **Prefix Caching** | 🚀 Optimized | +| **Chunked Prefill** | 🚀 Optimized | +| **LoRA** | 🚀 Optimized | +| **Logprobs Calculation** | 🟢 Functional | +| **FP8 KV Cache** | 🟢 Functional on Hopper devices ([PR #15191](https://github.com/vllm-project/vllm/pull/15191))| +| **Spec Decode** | 🚀 Optimized | +| **Prompt Logprobs with Prefix Caching** | 🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))| +| **Structured Output Alternative Backends** | 🟢 Functional | +| **Request-level Structured Output Backend** | 🔴 Deprecated | +| **best_of** | 🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))| +| **Per-Request Logits Processors** | 🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360)) | +| **GPU <> CPU KV Cache Swapping** | 🔴 Deprecated | + +!!! 
note + + vLLM V1’s unified scheduler treats both prompt and output tokens the same + way by using a simple dictionary (e.g., `{request_id: num_tokens}`) to dynamically + allocate a fixed token budget per request, enabling features like chunked prefills, + prefix caching, and speculative decoding without a strict separation between prefill + and decode phases. + +#### Semantic Changes to Logprobs vLLM V1 supports logprobs and prompt logprobs. However, there are some important semantic differences compared to V0: @@ -115,46 +162,4 @@ to handle request preemptions. **Structured Output features** -- **Request-level Structured Output Backend**: Deprecated, alternative backends - (outlines, guidance) with fallbacks is WIP. -### Feature & Model Support in Progress - -Although we have re-implemented and partially optimized many features and models from V0 in vLLM V1, optimization work is still ongoing for some, and others remain unsupported. - -#### Features to Be Optimized - -These features are already supported in vLLM V1, but their optimization is still -in progress. - -- **Spec Decode**: Currently, only ngram-based spec decode is supported in V1. There - will be follow-up work to support other types of spec decode (e.g., see [PR #13933](https://github.com/vllm-project/vllm/pull/13933)). We will prioritize the support for Eagle, MTP compared to draft model based spec decode. - -- **Multimodal Models**: V1 is almost fully compatible with V0 except that interleaved modality input is not supported yet. - See [here](https://github.com/orgs/vllm-project/projects/8) for the status of upcoming features and optimizations. - -#### Features to Be Supported - -- **Structured Output Alternative Backends**: Structured output alternative backends (outlines, guidance) support is planned. V1 currently - supports only the `xgrammar:no_fallback` mode, meaning that it will error out if the output schema is unsupported by xgrammar. 
- Details about the structured outputs can be found - [here](https://docs.vllm.ai/en/latest/features/structured_outputs.html). - -#### Models to Be Supported - -vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol, -and the majority fall into the following categories. V1 support for these models will be added eventually. - -**Embedding Models** -The initial support will be provided by [PR #16188](https://github.com/vllm-project/vllm/pull/16188). - -Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249), which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360) to enable simultaneous generation and embedding using the same engine instance in V1. - -**Mamba Models** -Models using selective state-space mechanisms (instead of standard transformer attention) -are not yet supported (e.g., `MambaForCausalLM`, `JambaForCausalLM`). - -**Encoder-Decoder Models** -vLLM V1 is currently optimized for decoder-only transformers. Models requiring - cross-attention between separate encoder and decoder are not yet supported (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`). - -For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html). +- **Request-level Structured Output Backend**: Deprecated, alternative backends (outlines, guidance) with fallbacks is supported now. diff --git a/examples/offline_inference/basic/README.md b/examples/offline_inference/basic/README.md index 5cb0177b355df..0a2bd6e2b70b3 100644 --- a/examples/offline_inference/basic/README.md +++ b/examples/offline_inference/basic/README.md @@ -70,7 +70,7 @@ Try one yourself by passing one of the following models to the `--model` argumen vLLM supports models that are quantized using GGUF. 
-Try one yourself by downloading a GUFF quantised model and using the following arguments: +Try one yourself by downloading a quantized GGUF model and using the following arguments: ```python from huggingface_hub import hf_hub_download diff --git a/examples/offline_inference/eagle.py b/examples/offline_inference/eagle.py index ce977ee99bb8f..f4193fdb8bd38 100644 --- a/examples/offline_inference/eagle.py +++ b/examples/offline_inference/eagle.py @@ -137,4 +137,8 @@ def main(): if __name__ == "__main__": + print( + "[WARNING] Use examples/offline_inference/spec_decode.py" + " instead of this script." + ) main() diff --git a/examples/offline_inference/neuron_multimodal.py b/examples/offline_inference/neuron_multimodal.py index 6ff8faabd748b..26f7505f2fa53 100644 --- a/examples/offline_inference/neuron_multimodal.py +++ b/examples/offline_inference/neuron_multimodal.py @@ -64,7 +64,7 @@ def print_outputs(outputs): print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -if __name__ == "__main__": +def main(): assert ( len(PROMPTS) == len(IMAGES) == len(SAMPLING_PARAMS) ), f"""Text, image prompts and sampling parameters should have the @@ -104,3 +104,7 @@ if __name__ == "__main__": # test batch-size = 4 outputs = llm.generate(batched_inputs, batched_sample_params) print_outputs(outputs) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py index 5200be82694ab..dfcbd8c8d3605 100644 --- a/examples/offline_inference/profiling_tpu/profiling.py +++ b/examples/offline_inference/profiling_tpu/profiling.py @@ -70,7 +70,7 @@ def main(args: argparse.Namespace): return -if __name__ == "__main__": +def parse_args(): parser = FlexibleArgumentParser( description="Benchmark the latency of processing a single batch of " "requests till completion." 
@@ -102,5 +102,9 @@ if __name__ == "__main__": ) parser = EngineArgs.add_cli_args(parser) - args = parser.parse_args() + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/offline_inference/qwen3_reranker.py b/examples/offline_inference/qwen3_reranker.py new file mode 100644 index 0000000000000..27c4071bf094e --- /dev/null +++ b/examples/offline_inference/qwen3_reranker.py @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 + +from vllm import LLM + +model_name = "Qwen/Qwen3-Reranker-0.6B" + +# What is the difference between the official original version and one +# that has been converted into a sequence classification model? +# Qwen3-Reranker is a language model that does reranking by using the +# logits of "no" and "yes" tokens. +# It needs to compute the logits of 151669 tokens, making this method extremely +# inefficient, not to mention incompatible with the vllm score API. +# A method for converting the original model into a sequence classification +# model was proposed. See: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3 +# Models converted offline using this method can not only be more efficient +# and support the vllm score API, but also make the init parameters more +# concise, for example: +# model = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score") + +# If you want to load the official original version, the init parameters are +# as follows. + +model = LLM( + model=model_name, + task="score", + hf_overrides={ + "architectures": ["Qwen3ForSequenceClassification"], + "classifier_from_token": ["no", "yes"], + "is_original_qwen3_reranker": True, + }, +) + +# Why do we need hf_overrides for the official original version: +# vllm converts it to Qwen3ForSequenceClassification when loaded for +# better performance.
+# - First, we need to use `"architectures": ["Qwen3ForSequenceClassification"],` +# to manually route to Qwen3ForSequenceClassification. +# - Then, we will extract the vector corresponding to classifier_from_token +# from lm_head using `"classifier_from_token": ["no", "yes"]`. +# - Third, we will convert these two vectors into one vector. The use of +# the conversion logic is controlled by `"is_original_qwen3_reranker": True`. + +# Please use the query_template and document_template to format the query and +# document for better reranker results. + +prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n' +suffix = "<|im_end|>\n<|im_start|>assistant\n\n\n\n\n" + +query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n" +document_template = "<Document>: {doc}{suffix}" + +if __name__ == "__main__": + instruction = ( + "Given a web search query, retrieve relevant passages that answer the query" + ) + + queries = [ + "What is the capital of China?", + "Explain gravity", + ] + + documents = [ + "The capital of China is Beijing.", + "Gravity is a force that attracts two bodies towards each other.
It gives weight to physical objects and is responsible for the movement of planets around the sun.", + ] + + queries = [ + query_template.format(prefix=prefix, instruction=instruction, query=query) + for query in queries + ] + documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents] + + outputs = model.score(queries, documents) + + print([output.outputs.score for output in outputs]) diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py new file mode 100644 index 0000000000000..eece8beced510 --- /dev/null +++ b/examples/offline_inference/spec_decode.py @@ -0,0 +1,137 @@ +# SPDX-License-Identifier: Apache-2.0 + +from transformers import AutoTokenizer + +from vllm import LLM, SamplingParams +from vllm.benchmarks.datasets import add_dataset_parser, get_samples +from vllm.v1.metrics.reader import Counter, Vector + +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + + +def parse_args(): + parser = FlexibleArgumentParser() + add_dataset_parser(parser) + parser.add_argument( + "--dataset", + type=str, + default="./examples/data/gsm8k.jsonl", + help="downloaded from the eagle repo " + "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/", + ) + parser.add_argument( + "--method", type=str, default="eagle", choices=["ngram", "eagle", "eagle3"] + ) + parser.add_argument("--max-num-seqs", type=int, default=8) + parser.add_argument("--num-spec-tokens", type=int, default=2) + parser.add_argument("--prompt-lookup-max", type=int, default=5) + parser.add_argument("--prompt-lookup-min", type=int, default=2) + parser.add_argument("--tp", type=int, default=1) + parser.add_argument("--draft-tp", type=int, default=1) + parser.add_argument("--enforce-eager", action="store_true") + parser.add_argument("--enable-chunked-prefill", action="store_true") + parser.add_argument("--max-num-batched-tokens", type=int, default=2048) + 
parser.add_argument("--temp", type=float, default=0) + parser.add_argument("--top-p", type=float, default=1.0) + parser.add_argument("--top-k", type=int, default=-1) + parser.add_argument("--print-output", action="store_true") + parser.add_argument("--output-len", type=int, default=256) + return parser.parse_args() + + +def main(): + args = parse_args() + args.endpoint_type = "openai-chat" + + model_dir = "meta-llama/Llama-3.1-8B-Instruct" + tokenizer = AutoTokenizer.from_pretrained(model_dir) + max_model_len = 2048 + + prompts = get_samples(args, tokenizer) + # add_special_tokens is False to avoid adding bos twice when using chat templates + prompt_ids = [ + tokenizer.encode(prompt.prompt, add_special_tokens=False) for prompt in prompts + ] + + if args.method == "eagle" or args.method == "eagle3": + if args.method == "eagle": + eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" + elif args.method == "eagle3": + eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" + speculative_config = { + "method": args.method, + "model": eagle_dir, + "num_speculative_tokens": args.num_spec_tokens, + "draft_tensor_parallel_size": args.draft_tp, + "max_model_len": max_model_len, + } + elif args.method == "ngram": + speculative_config = { + "method": "ngram", + "num_speculative_tokens": args.num_spec_tokens, + "prompt_lookup_max": args.prompt_lookup_max, + "prompt_lookup_min": args.prompt_lookup_min, + "max_model_len": max_model_len, + } + else: + raise ValueError(f"unknown method: {args.method}") + + llm = LLM( + model=model_dir, + trust_remote_code=True, + tensor_parallel_size=args.tp, + enable_chunked_prefill=args.enable_chunked_prefill, + max_num_batched_tokens=args.max_num_batched_tokens, + enforce_eager=args.enforce_eager, + max_model_len=max_model_len, + max_num_seqs=args.max_num_seqs, + gpu_memory_utilization=0.8, + speculative_config=speculative_config, + disable_log_stats=False, + ) + + sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len) + 
outputs = llm.generate(prompt_token_ids=prompt_ids, sampling_params=sampling_params) + + # print the generated text + if args.print_output: + for output in outputs: + print("-" * 50) + print(f"prompt: {output.prompt}") + print(f"generated text: {output.outputs[0].text}") + print("-" * 50) + + try: + metrics = llm.get_metrics() + except AssertionError: + print("Metrics are not supported in the V0 engine.") + return + + num_drafts = num_accepted = 0 + acceptance_counts = [0] * args.num_spec_tokens + for metric in metrics: + if metric.name == "vllm:spec_decode_num_drafts": + assert isinstance(metric, Counter) + num_drafts += metric.value + elif metric.name == "vllm:spec_decode_num_accepted_tokens": + assert isinstance(metric, Counter) + num_accepted += metric.value + elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos": + assert isinstance(metric, Vector) + for pos in range(len(metric.values)): + acceptance_counts[pos] += metric.values[pos] + + print("-" * 50) + print(f"mean acceptance length: {1 + (num_accepted / num_drafts):.2f}") + print("-" * 50) + + # print acceptance at each token position + for i in range(len(acceptance_counts)): + print(f"acceptance at token {i}:{acceptance_counts[i] / num_drafts:.2f}") + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index ea7a793d026b4..e55181e4f490f 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -289,6 +289,106 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_llava(question: str, image_urls: list[str]) -> ModelRequestData: + # NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs, + # it will generate poor response for multi-image inputs! 
+ model_name = "llava-hf/llava-1.5-7b-hf" + engine_args = EngineArgs( + model=model_name, + max_num_seqs=16, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] + + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + +def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "llava-hf/llava-v1.6-mistral-7b-hf" + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=16, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] + + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + +def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf" + engine_args = EngineArgs( + model=model_name, + max_model_len=16384, + max_num_seqs=16, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] + + processor = AutoProcessor.from_pretrained(model_name) + + prompt = 
processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" @@ -737,6 +837,9 @@ model_example_map = { "idefics3": load_idefics3, "internvl_chat": load_internvl, "kimi_vl": load_kimi_vl, + "llava": load_llava, + "llava-next": load_llava_next, + "llava-onevision": load_llava_onevision, "llama4": load_llama4, "mistral3": load_mistral3, "mllama": load_mllama, diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py deleted file mode 100644 index 5c55d53138a8f..0000000000000 --- a/examples/online_serving/openai_chat_completion_structured_outputs.py +++ /dev/null @@ -1,175 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -To run this example, you need to start the vLLM server: - -```bash -vllm serve Qwen/Qwen2.5-3B-Instruct -``` -""" - -from enum import Enum - -from openai import BadRequestError, OpenAI -from pydantic import BaseModel - -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" - - -# Guided decoding by Choice (list of possible options) -def guided_choice_completion(client: OpenAI, model: str): - completion = client.chat.completions.create( - model=model, - messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} - ], - extra_body={"guided_choice": ["positive", "negative"]}, - ) - return completion.choices[0].message.content - - -# Guided decoding by Regex -def guided_regex_completion(client: OpenAI, model: str): - prompt = ( - "Generate an email address for Alan Turing, who works in Enigma." - "End in .com and new line. 
Example result:" - "alan.turing@enigma.com\n" - ) - - completion = client.chat.completions.create( - model=model, - messages=[ - { - "role": "user", - "content": prompt, - } - ], - extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]}, - ) - return completion.choices[0].message.content - - -# Guided decoding by JSON using Pydantic schema -class CarType(str, Enum): - sedan = "sedan" - suv = "SUV" - truck = "Truck" - coupe = "Coupe" - - -class CarDescription(BaseModel): - brand: str - model: str - car_type: CarType - - -def guided_json_completion(client: OpenAI, model: str): - json_schema = CarDescription.model_json_schema() - - prompt = ( - "Generate a JSON with the brand, model and car_type of" - "the most iconic car from the 90's" - ) - completion = client.chat.completions.create( - model=model, - messages=[ - { - "role": "user", - "content": prompt, - } - ], - extra_body={"guided_json": json_schema}, - ) - return completion.choices[0].message.content - - -# Guided decoding by Grammar -def guided_grammar_completion(client: OpenAI, model: str): - simplified_sql_grammar = """ - root ::= select_statement - - select_statement ::= "SELECT " column " from " table " where " condition - - column ::= "col_1 " | "col_2 " - - table ::= "table_1 " | "table_2 " - - condition ::= column "= " number - - number ::= "1 " | "2 " - """ - - prompt = ( - "Generate an SQL query to show the 'username' and 'email'" - "from the 'users' table." - ) - completion = client.chat.completions.create( - model=model, - messages=[ - { - "role": "user", - "content": prompt, - } - ], - extra_body={"guided_grammar": simplified_sql_grammar}, - ) - return completion.choices[0].message.content - - -# Extra backend options -def extra_backend_options_completion(client: OpenAI, model: str): - prompt = ( - "Generate an email address for Alan Turing, who works in Enigma." - "End in .com and new line. 
Example result:" - "alan.turing@enigma.com\n" - ) - - try: - # The guided_decoding_disable_fallback option forces vLLM to use - # xgrammar, so when it fails you get a 400 with the reason why - completion = client.chat.completions.create( - model=model, - messages=[ - { - "role": "user", - "content": prompt, - } - ], - extra_body={ - "guided_regex": r"\w+@\w+\.com\n", - "stop": ["\n"], - "guided_decoding_disable_fallback": True, - }, - ) - return completion.choices[0].message.content - except BadRequestError as e: - print("This error is expected:", e) - - -def main(): - client: OpenAI = OpenAI( - base_url=openai_api_base, - api_key=openai_api_key, - ) - - model = client.models.list().data[0].id - - print("Guided Choice Completion:") - print(guided_choice_completion(client, model)) - - print("\nGuided Regex Completion:") - print(guided_regex_completion(client, model)) - - print("\nGuided JSON Completion:") - print(guided_json_completion(client, model)) - - print("\nGuided Grammar Completion:") - print(guided_grammar_completion(client, model)) - - print("\nExtra Backend Options Completion:") - print(extra_backend_options_completion(client, model)) - - -if __name__ == "__main__": - main() diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py deleted file mode 100644 index ec7d8b95472e6..0000000000000 --- a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py +++ /dev/null @@ -1,87 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from openai import OpenAI - -# This example demonstrates the `structural_tag` response format. -# It can be used to specify a structured output format that occurs between -# specific tags in the response. 
This example shows how it could be used -# to enforce the format of a tool call response, but it could be used for -# any structured output within a subset of the response. - -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" - - -def main(): - client = OpenAI( - base_url=openai_api_base, - api_key=openai_api_key, - ) - - messages = [ - { - "role": "user", - "content": """ -You have access to the following function to retrieve the weather in a city: - - { - "name": "get_weather", - "parameters": { - "city": { - "param_type": "string", - "description": "The city to get the weather for", - "required": True - } - } - } - -If a you choose to call a function ONLY reply in the following format: -<{start_tag}={function_name}>{parameters}{end_tag} -where - -start_tag => ` a JSON dict with the function argument name as key and function - argument value as value. -end_tag => `` - -Here is an example, -{"example_name": "example_value"} - -Reminder: -- Function calls MUST follow the specified format -- Required parameters MUST be specified -- Only call one function at a time -- Put the entire function call reply on one line -- Always add your sources when using search results to answer the user query - -You are a helpful assistant. - -Given the previous instructions, what is the weather in New York City, Boston, -and San Francisco? 
-""", - } - ] - - response = client.chat.completions.create( - model=client.models.list().data[0].id, - messages=messages, - response_format={ - "type": "structural_tag", - "structures": [ - { - "begin": "", - "schema": { - "type": "object", - "properties": {"city": {"type": "string"}}, - }, - "end": "", - } - ], - "triggers": ["Reasoning unavailable for this model.", + unsafe_allow_html=True, + ) + # reason remains False + +# Update the input handling section if prompt := st.chat_input("Type your message here..."): - # Save user message to session + # Save and display user message st.session_state.messages.append({"role": "user", "content": prompt}) st.session_state.sessions[st.session_state.current_session] = ( st.session_state.messages ) - - # Display user message with st.chat_message("user"): st.write(prompt) - # Prepare messages for llm - messages_for_llm = [ + # Prepare LLM messages + msgs = [ {"role": m["role"], "content": m["content"]} for m in st.session_state.messages ] - # Generate and display llm response + # Stream assistant response with st.chat_message("assistant"): - message_placeholder = st.empty() - full_response = "" - - # Get streaming response from llm - response = get_llm_response(messages_for_llm, model) - if isinstance(response, str): - message_placeholder.markdown(response) - full_response = response - else: - for chunk in response: - if hasattr(chunk.choices[0].delta, "content"): - content = chunk.choices[0].delta.content - if content: - full_response += content - message_placeholder.markdown(full_response + "▌") - - message_placeholder.markdown(full_response) - - # Save llm response to session history - st.session_state.messages.append({"role": "assistant", "content": full_response}) + # Placeholders: reasoning above, content below + reason_ph = st.empty() + content_ph = st.empty() + full, think = get_llm_response(msgs, model, reason, content_ph, reason_ph) + # Determine index for this new assistant message + message_index = 
len(st.session_state.messages) + # Save assistant reply + st.session_state.messages.append({"role": "assistant", "content": full}) + # Persist reasoning in session state if any + if reason and think: + st.session_state.show_reasoning[message_index] = think diff --git a/examples/online_serving/structured_outputs/README.md b/examples/online_serving/structured_outputs/README.md new file mode 100644 index 0000000000000..c9b97f11eefd7 --- /dev/null +++ b/examples/online_serving/structured_outputs/README.md @@ -0,0 +1,54 @@ +# Structured Outputs + +This script demonstrates various structured output capabilities of vLLM's OpenAI-compatible server. +It can run individual constraint types or all of them. +It supports both streaming responses and concurrent non-streaming requests. + +To use this example, you must start a vLLM server with any model of your choice. + +```bash +vllm serve Qwen/Qwen2.5-3B-Instruct +``` + +To serve a reasoning model, you can use the following command: + +```bash +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1 +``` + +If you want to run this script standalone with `uv`, you can use the following: + +```bash +uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs structured-outputs +``` + +See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.

!!! tip
    If vLLM is running remotely, then set `OPENAI_BASE_URL=` before running the script.
+ +## Usage + +Run all constraints, non-streaming: + +```bash +uv run structured_outputs.py +``` + +Run all constraints, streaming: + +```bash +uv run structured_outputs.py --stream +``` + +Run certain constraints, for example `structural_tag` and `regex`, streaming: + +```bash +uv run structured_outputs.py --constraint structural_tag regex --stream +``` + +Run all constraints, with reasoning models and streaming: + +```bash +uv run structured_outputs.py --reasoning --stream +``` diff --git a/examples/online_serving/structured_outputs/pyproject.toml b/examples/online_serving/structured_outputs/pyproject.toml new file mode 100644 index 0000000000000..8f31405ff584a --- /dev/null +++ b/examples/online_serving/structured_outputs/pyproject.toml @@ -0,0 +1,8 @@ +[project] +name = "examples-online-structured-outputs" +requires-python = ">=3.9, <3.13" +dependencies = ["openai==1.78.1", "pydantic==2.11.4"] +version = "0.0.0" + +[project.scripts] +structured-outputs = "structured_outputs:main" diff --git a/examples/online_serving/structured_outputs/structured_outputs.py b/examples/online_serving/structured_outputs/structured_outputs.py new file mode 100644 index 0000000000000..2a8f4637260c2 --- /dev/null +++ b/examples/online_serving/structured_outputs/structured_outputs.py @@ -0,0 +1,272 @@ +# ruff: noqa: E501 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +import argparse +import asyncio +import enum +import os +from typing import TYPE_CHECKING, Any, Literal + +import openai +import pydantic + +if TYPE_CHECKING: + from openai.types.chat import ChatCompletionChunk + + +ConstraintsFormat = Literal[ + "choice", + "regex", + "json", + "grammar", + "structural_tag", +] + + +async def print_stream_response( + stream_response: openai.AsyncStream[ChatCompletionChunk], + title: str, + args: argparse.Namespace, +): + print(f"\n\n{title} (Streaming):") + + 
local_reasoning_header_printed = False + local_content_header_printed = False + + async for chunk in stream_response: + delta = chunk.choices[0].delta + + reasoning_chunk_text: str | None = getattr(delta, "reasoning_content", None) + content_chunk_text = delta.content + + if args.reasoning: + if reasoning_chunk_text: + if not local_reasoning_header_printed: + print(" Reasoning: ", end="") + local_reasoning_header_printed = True + print(reasoning_chunk_text, end="", flush=True) + + if content_chunk_text: + if not local_content_header_printed: + if local_reasoning_header_printed: + print() + print(" Content: ", end="") + local_content_header_printed = True + print(content_chunk_text, end="", flush=True) + else: + if content_chunk_text: + if not local_content_header_printed: + print(" Content: ", end="") + local_content_header_printed = True + print(content_chunk_text, end="", flush=True) + print() + + +class CarType(str, enum.Enum): + SEDAN = "SEDAN" + SUV = "SUV" + TRUCK = "TRUCK" + COUPE = "COUPE" + + +class CarDescription(pydantic.BaseModel): + brand: str + model: str + car_type: CarType + + +PARAMS: dict[ConstraintsFormat, dict[str, Any]] = { + "choice": { + "messages": [ + { + "role": "user", + "content": "Classify this sentiment: vLLM is wonderful!", + } + ], + "extra_body": {"guided_choice": ["positive", "negative"]}, + }, + "regex": { + "messages": [ + { + "role": "user", + "content": "Generate an email address for Alan Turing, who works in Enigma. End in .com and new line. 
Example result: 'alan.turing@enigma.com\n'", + } + ], + "extra_body": { + "guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n", + }, + }, + "json": { + "messages": [ + { + "role": "user", + "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's", + } + ], + "response_format": { + "type": "json_schema", + "json_schema": { + "name": "car-description", + "schema": CarDescription.model_json_schema(), + }, + }, + }, + "grammar": { + "messages": [ + { + "role": "user", + "content": "Generate an SQL query to show the 'username' and 'email'from the 'users' table.", + } + ], + "extra_body": { + "guided_grammar": """ +root ::= select_statement + +select_statement ::= "SELECT " column " from " table " where " condition + +column ::= "col_1 " | "col_2 " + +table ::= "table_1 " | "table_2 " + +condition ::= column "= " number + +number ::= "1 " | "2 " +""", + }, + }, + "structural_tag": { + "messages": [ + { + "role": "user", + "content": """ +You have access to the following function to retrieve the weather in a city: + +{ + "name": "get_weather", + "parameters": { + "city": { + "param_type": "string", + "description": "The city to get the weather for", + "required": True + } + } +} + +If a you choose to call a function ONLY reply in the following format: +<{start_tag}={function_name}>{parameters}{end_tag} +where + +start_tag => ` a JSON dict with the function argument name as key and function + argument value as value. +end_tag => `` + +Here is an example, +{"example_name": "example_value"} + +Reminder: +- Function calls MUST follow the specified format +- Required parameters MUST be specified +- Only call one function at a time +- Put the entire function call reply on one line +- Always add your sources when using search results to answer the user query + +You are a helpful assistant. 
+ +Given the previous instructions, what is the weather in New York City, Boston, +and San Francisco?""", + }, + ], + "response_format": { + "type": "structural_tag", + "structures": [ + { + "begin": "", + "schema": { + "type": "object", + "properties": {"city": {"type": "string"}}, + "required": ["city"], + }, + "end": "", + } + ], + "triggers": ["= 4.51.1 -huggingface-hub[hf_xet] >= 0.32.0 # Required for Xet downloads. +huggingface-hub[hf_xet] >= 0.33.0 # Required for Xet downloads. tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. @@ -31,20 +31,16 @@ pyzmq >= 25.0.0 msgspec gguf >= 0.13.0 importlib_metadata; python_version < '3.10' -mistral_common[opencv] >= 1.5.4 +mistral_common[opencv] >= 1.6.2 opencv-python-headless >= 4.11.0 # required for video IO pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
-compressed-tensors == 0.10.0 # required for compressed-tensors +compressed-tensors == 0.10.1 # required for compressed-tensors depyf==0.18.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py watchfiles # required for http server to monitor the updates of TLS files python-json-logger # Used by logging as per examples/others/logging_configuration.md scipy # Required for phi-4-multimodal-instruct ninja # Required for xgrammar, rocm, tpu, xpu -opentelemetry-sdk>=1.26.0 # vllm.tracing -opentelemetry-api>=1.26.0 # vllm.tracing -opentelemetry-exporter-otlp>=1.26.0 # vllm.tracing -opentelemetry-semantic-conventions-ai>=0.4.1 # vllm.tracing diff --git a/requirements/cpu.txt b/requirements/cpu.txt index e43b443977524..d7b0fc6d80a74 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -27,3 +27,5 @@ triton==3.2.0; platform_machine == "x86_64" # Intel Extension for PyTorch, only for x86_64 CPUs intel-openmp==2024.2.1; platform_machine == "x86_64" intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64" +py-libnuma; platform_system != "Darwin" +psutil; platform_system != "Darwin" diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index 3475ada9f4c96..00acda3662608 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -10,7 +10,7 @@ pytest-timeout librosa # required by audio tests in entrypoints/openai sentence-transformers # required for embedding tests -transformers==4.51.3 +transformers==4.52.4 transformers_stream_generator # required for qwen-vl test numba == 0.61.2; python_version > '3.9' # testing utils @@ -41,6 +41,7 @@ matplotlib # required for qwen-vl test num2words # required for smolvlm test pqdm timm # required for internvl test +mistral-common==1.6.2 -schemathesis>=3.39.15 # Required for openai schema test. +schemathesis==3.39.15 # Required for openai schema test. 
mteb>=1.38.11, <2 # required for mteb test diff --git a/requirements/test.in b/requirements/test.in index bbbd41e168a60..e8f44059fcf87 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -28,15 +28,15 @@ torchvision==0.22.0 transformers_stream_generator # required for qwen-vl test mamba_ssm # required for plamo2 test matplotlib # required for qwen-vl test -mistral_common[opencv] >= 1.5.4 # required for pixtral test +mistral_common[opencv] >= 1.6.2 # required for pixtral test num2words # required for smolvlm test opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.8 # required for model evaluation test -mteb>=1.38.11, <2 # required for mteb test +mteb[bm25s]>=1.38.11, <2 # required for mteb test transformers==4.52.4 tokenizers==0.21.1 -huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads. +huggingface-hub[hf_xet]>=0.33.0 # Required for Xet downloads. schemathesis>=3.39.15 # Required for openai schema test. 
# quantization bitsandbytes>=0.45.3 diff --git a/requirements/test.txt b/requirements/test.txt index fb0eede080ff1..16d8ee54adcff 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -51,6 +51,8 @@ black==24.10.0 # via datamodel-code-generator blobfile==3.0.0 # via -r requirements/test.in +bm25s==0.2.13 + # via mteb boto3==1.35.57 # via tensorizer botocore==1.35.57 @@ -190,7 +192,7 @@ h11==0.14.0 # via httpcore harfile==0.3.0 # via schemathesis -hf-xet==0.1.4 +hf-xet==1.1.3 # via huggingface-hub hiredis==3.0.0 # via tensorizer @@ -200,7 +202,7 @@ httpx==0.27.2 # via # -r requirements/test.in # schemathesis -huggingface-hub==0.30.1 +huggingface-hub==0.33.0 # via # -r requirements/test.in # accelerate @@ -303,7 +305,7 @@ mbstrdecoder==1.1.3 # typepy mdurl==0.1.2 # via markdown-it-py -mistral-common==1.5.4 +mistral-common==1.6.2 # via -r requirements/test.in more-itertools==10.5.0 # via lm-eval @@ -344,6 +346,7 @@ numpy==1.26.4 # -r requirements/test.in # accelerate # bitsandbytes + # bm25s # contourpy # cupy-cuda12x # datasets @@ -534,6 +537,8 @@ pyparsing==3.2.0 # via matplotlib pyrate-limiter==3.7.0 # via schemathesis +pystemmer==3.0.0 + # via mteb pytablewriter==1.2.0 # via lm-eval pytest==8.3.3 @@ -668,6 +673,7 @@ scikit-learn==1.5.2 # sentence-transformers scipy==1.13.1 # via + # bm25s # librosa # mteb # scikit-learn diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py index 163185b90be91..ec6b20f5e04b9 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -8,6 +8,7 @@ import uvicorn from fastapi.responses import JSONResponse, Response import vllm.entrypoints.api_server +import vllm.envs as envs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.utils import FlexibleArgumentParser @@ -46,9 +47,8 @@ if __name__ == "__main__": engine_args = AsyncEngineArgs.from_cli_args(args) 
engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) vllm.entrypoints.api_server.engine = engine - uvicorn.run( - app, - host=args.host, - port=args.port, - log_level="debug", - timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE) + uvicorn.run(app, + host=args.host, + port=args.port, + log_level="debug", + timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE) diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py index a3181952677fd..bfcf274727e27 100644 --- a/tests/benchmarks/test_serve_cli.py +++ b/tests/benchmarks/test_serve_cli.py @@ -31,6 +31,8 @@ def test_bench_serve(server): server.host, "--port", str(server.port), + "--dataset-name", + "random", "--random-input-len", "32", "--random-output-len", diff --git a/tests/compile/backend.py b/tests/compile/backend.py index 60334f5e4f683..ace4d25534cdd 100644 --- a/tests/compile/backend.py +++ b/tests/compile/backend.py @@ -1,13 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Sequence from copy import deepcopy from typing import Callable, Union from torch import fx +from torch._ops import OpOverload -from vllm.compilation.fx_utils import (find_specified_fn, - find_specified_fn_maybe) +from vllm.compilation.fx_utils import find_op_nodes from vllm.compilation.inductor_pass import InductorPass from vllm.config import get_current_vllm_config @@ -48,18 +49,19 @@ class TestBackend: # assign by reference, will reflect the final state of the graph self.final_graph = graph - def check_before_ops(self, ops, - find_fn=find_specified_fn, \ - find_fn_maybe=find_specified_fn_maybe, \ - ops_fully_replaced=True): + def check_before_ops(self, ops: Sequence[OpOverload], fully_replaced=True): for op in ops: - find_fn(self.graph_pre_pass.nodes, op) - if ops_fully_replaced: - assert find_fn_maybe(self.graph_post_pass.nodes, op) is None + num_pre = len(list(find_op_nodes(op, 
self.graph_pre_pass))) + num_post = len(list(find_op_nodes(op, self.graph_post_pass))) + assert num_pre > 0, f"Op {op.name()} not found in pre-pass graph" + assert num_pre > num_post, f"All nodes remain for op {op.name()}" + if fully_replaced: + assert num_post == 0, \ + f"Unexpected op {op.name()} in post-pass graph" - def check_after_ops(self, ops, - find_fn=find_specified_fn, \ - find_fn_maybe=find_specified_fn_maybe): + def check_after_ops(self, ops: Sequence[OpOverload]): for op in ops: - find_fn(self.graph_post_pass.nodes, op) - assert find_fn_maybe(self.graph_pre_pass.nodes, op) is None + num_pre = len(list(find_op_nodes(op, self.graph_pre_pass))) + num_post = len(list(find_op_nodes(op, self.graph_post_pass))) + assert num_pre == 0, f"Unexpected op {op.name()} in pre-pass graph" + assert num_post > 0, f"Op {op.name()} not found in post-pass graph" \ No newline at end of file diff --git a/tests/compile/conftest.py b/tests/compile/conftest.py deleted file mode 100644 index d86ca37109237..0000000000000 --- a/tests/compile/conftest.py +++ /dev/null @@ -1,15 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -# TEST V1: this should be removed. Right now V1 overrides -# all the torch compile logic. We should re-enable this -# as we add torch compile support back to V1. -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py index 134bade486079..c1f5d9658af16 100644 --- a/tests/compile/piecewise/test_full_cudagraph.py +++ b/tests/compile/piecewise/test_full_cudagraph.py @@ -2,15 +2,16 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import os +import weakref +from contextlib import ExitStack import pytest +from tests.utils import wait_for_gpu_memory_to_clear from vllm import LLM, SamplingParams from vllm.config import CompilationConfig from vllm.platforms import current_platform -MODEL = "Qwen/Qwen2-1.5B-Instruct" - @contextlib.contextmanager def temporary_environ(env_vars): @@ -31,64 +32,119 @@ def temporary_environ(env_vars): os.environ[k] = v -@pytest.fixture(scope="module") -def full_cudagraph_llm(): +@pytest.fixture(scope="class") +def llm_pair(request): + model = request.param + with temporary_environ({ "VLLM_USE_V1": "1", "VLLM_FLASH_ATTN_VERSION": "3" }): - return LLM(model=MODEL, - gpu_memory_utilization=0.3, - compilation_config=CompilationConfig(full_cuda_graph=True)) - - -@pytest.fixture(scope="module") -def piecewise_llm(): - with temporary_environ({ - "VLLM_USE_V1": "1", - "VLLM_FLASH_ATTN_VERSION": "3" - }): - return LLM(model=MODEL, - gpu_memory_utilization=0.6, - compilation_config=CompilationConfig()) - - -def generate_text(llm: LLM, batch_size: int, max_tokens: int): - prompts = ["Hi my name is"] * batch_size - sampling_params = SamplingParams(temperature=0.0, - max_tokens=max_tokens, - top_p=0.95) - - return llm.generate(prompts, sampling_params) + full = LLM( + model=model, + gpu_memory_utilization=0.45, + trust_remote_code=True, + max_model_len=1024, + compilation_config=CompilationConfig(full_cuda_graph=True), + ) + piecewise = LLM( + model=model, + gpu_memory_utilization=0.45, + trust_remote_code=True, + max_model_len=1024, + compilation_config=CompilationConfig(), + ) 
+ + # PyTest caches the fixture values so we use weakref.proxy to enable GC + yield weakref.proxy(full), weakref.proxy(piecewise) + del full + del piecewise + + wait_for_gpu_memory_to_clear( + devices=[0], + threshold_ratio=0.1, + ) +@pytest.mark.parametrize( + "llm_pair", + [ + # Model names for the llm_pair fixture + "deepseek-ai/DeepSeek-V2-Lite", + "Qwen/Qwen2-1.5B-Instruct" + ], + indirect=True) @pytest.mark.skipif(current_platform.get_device_capability() != (9, 0), - reason="Only Hopper GPUs support FlashAttention 3") -@pytest.mark.parametrize(("batch_size", "max_tokens"), [(1, 10), (7, 10), - (16, 10), (25, 10), - (32, 10), (45, 10), - (64, 10), (8, 5), - (8, 20), (8, 200)]) -def test_full_cudagraph(batch_size, max_tokens, full_cudagraph_llm, - piecewise_llm): + reason="Only Hopper GPUs support FA3 and FlashMLA") +class TestFullCUDAGraph: """ - Load full cudagraph model and piecewise model once, and at the same time to - reuse them across various test cases. + Use a class such that an llm pair is constructed once for all + batch_size/max_tokens combinations and released immediately after. - Test various batch sizes and max_tokens to ensure that the full cudagraph - compilation works for padded cases too. + Module-scope fixtures would stick around the whole time, + meaning there would be multiple LLM instances hogging memory simultaneously. 
""" - piecewise_responses = generate_text(piecewise_llm, - batch_size=batch_size, - max_tokens=max_tokens) - full_cudagraph_responses = generate_text(full_cudagraph_llm, - batch_size=batch_size, - max_tokens=max_tokens) - # Check that all responses are the same - for i in range(len(piecewise_responses)): - assert piecewise_responses[i].outputs[ - 0].text == full_cudagraph_responses[i].outputs[0].text + @pytest.mark.parametrize(("batch_size", "max_tokens"), [ + (1, 10), + (7, 10), + (16, 10), + (25, 10), + (32, 10), + (45, 10), + (64, 10), + (123, 10), + (8, 5), + (8, 30), + ]) + def test_full_cudagraph(self, batch_size, max_tokens, + llm_pair: tuple[LLM, LLM]): + """ + Test various batch sizes and max_tokens to ensure that the + full cudagraph compilation works for padded cases too. + """ + + piecewise_llm, full_cudagraph_llm = llm_pair + + prompts = ["Hello, my name is"] * batch_size + sampling_params = SamplingParams(temperature=0.0, + max_tokens=max_tokens, + top_p=0.95) + + piecewise_responses = piecewise_llm.generate(prompts, sampling_params) + full_responses = full_cudagraph_llm.generate(prompts, sampling_params) + + # Check that all responses are the same + for piecewise_res, full_res in zip(piecewise_responses, + full_responses): + assert piecewise_res.outputs[0].text == full_res.outputs[0].text + + +@pytest.mark.parametrize( + "model, supported", + [ + ("Qwen/Qwen2-1.5B-Instruct", True), + # MLA does not support capturing CUDA Graphs with size > max_num_seqs + ("deepseek-ai/DeepSeek-V2-Lite", False), + ]) +@pytest.mark.skipif(current_platform.get_device_capability() != (9, 0), + reason="Only Hopper GPUs support FA3 and FlashMLA") +def test_lower_max_num_seqs(model, supported): + with temporary_environ({ + "VLLM_USE_V1": "1", + "VLLM_FLASH_ATTN_VERSION": "3" + }), ExitStack() as stack: + if not supported: + stack.enter_context(pytest.raises(RuntimeError)) + + llm = LLM(model=model, + max_num_seqs=256, + trust_remote_code=True, + max_model_len=1024, + 
compilation_config=CompilationConfig( + full_cuda_graph=True, + cudagraph_capture_sizes=[64, 256, 512])) + llm.generate(["Hello, my name is"] * 10) def test_full_cudagraph_with_invalid_backend(): @@ -97,5 +153,5 @@ def test_full_cudagraph_with_invalid_backend(): "VLLM_FLASH_ATTN_VERSION": "2" #FA2 not supported with full_cuda_graph }), pytest.raises(RuntimeError): - LLM(model=MODEL, + LLM(model="Qwen/Qwen2-1.5B-Instruct", compilation_config=CompilationConfig(full_cuda_graph=True)) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 852aa44d47aa5..06ac3527e1fb8 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -4,7 +4,7 @@ Test the piecewise compilation with a simple model so that we can exactly calculate the expected output and side effects. """ - +import pytest import torch from torch import nn from torch.library import Library @@ -13,6 +13,8 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, set_current_vllm_config) +from vllm.envs import VLLM_USE_V1 +from vllm.forward_context import set_forward_context from vllm.utils import direct_register_custom_op global_counter = 0 @@ -75,7 +77,9 @@ class SillyModel(nn.Module): return x -def _test_simple_piecewise_compile(*, use_inductor): +@pytest.mark.parametrize("use_inductor", [True, False]) +def test_simple_piecewise_compile(use_inductor): + assert VLLM_USE_V1 vllm_config = VllmConfig(compilation_config=CompilationConfig( level=CompilationLevel.PIECEWISE, @@ -95,9 +99,9 @@ def _test_simple_piecewise_compile(*, use_inductor): num_piecewise_graphs_seen=5, # 2 * num_layers + 1 num_piecewise_capturable_graphs_seen=3, # 1 + num_layers num_backend_compilations=3, # num_piecewise_capturable_graphs_seen - num_cudagraph_caputured= + num_cudagraph_captured= 6, # num_cudagraph_sizes * 
num_piecewise_capturable_graphs_seen - ): + ), set_forward_context({}, vllm_config=vllm_config): model(inputs) @@ -110,11 +114,3 @@ def _test_simple_piecewise_compile(*, use_inductor): output = model(input) assert global_counter == 2 assert torch.allclose(output.cpu(), torch.tensor([3., 1.])) - - -def test_simple_piecewise_compile_inductor(): - _test_simple_piecewise_compile(use_inductor=True) - - -def test_simple_piecewise_compile_no_inductor(): - _test_simple_piecewise_compile(use_inductor=False) diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 2464d7889861f..b7ed8353b3cef 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -11,6 +11,7 @@ initialized randomly with a fixed seed. from dataclasses import dataclass from typing import Any, Optional +import pytest import torch from torch import nn from torch.library import Library @@ -19,6 +20,7 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, set_current_vllm_config) +from vllm.forward_context import set_forward_context from vllm.utils import direct_register_custom_op # create a library to hold the custom op @@ -285,29 +287,32 @@ def run_model(llama_config, vllm_config=vllm_config, prefix="").eval().cuda() - B = 16 # max batch size - input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() - positions = torch.arange(B).cuda() + with set_forward_context({}, vllm_config=vllm_config): + B = 16 # max batch size + input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() + positions = torch.arange(B).cuda() - model(input_ids, positions) - model(input_ids[:2], positions[:2]) - model(input_ids[:1], positions[:1]) + model(input_ids, positions) + model(input_ids[:2], positions[:2]) + model(input_ids[:1], positions[:1]) - input_ids[:2].zero_() - output = 
model(input_ids[:2], positions[:2]) + input_ids[:2].zero_() + output = model(input_ids[:2], positions[:2]) - output = output.cpu() + output = output.cpu() - if llama_config.tractable_init: - expected_output = tractable_computation(input_ids[:2], positions[:2], - llama_config).cpu() + if llama_config.tractable_init: + expected_output = tractable_computation(input_ids[:2], + positions[:2], + llama_config).cpu() - assert torch.allclose(output, expected_output) - else: - return output.cpu() + assert torch.allclose(output, expected_output) + else: + return output.cpu() -def _test_toy_llama(*, use_inductor): +@pytest.mark.parametrize("use_inductor", [True, False]) +def test_toy_llama(use_inductor: bool): # compare output with and without piecewise compilation llama_config = LlamaConfig(hidden_size=128, @@ -327,7 +332,7 @@ def _test_toy_llama(*, use_inductor): num_piecewise_graphs_seen=0, num_piecewise_capturable_graphs_seen=0, num_backend_compilations=0, - num_cudagraph_caputured=0, + num_cudagraph_captured=0, ): outputs.append( run_model(llama_config, use_inductor=False, use_compile=False)) @@ -343,7 +348,7 @@ def _test_toy_llama(*, use_inductor): num_piecewise_graphs_seen=1, num_piecewise_capturable_graphs_seen=1, num_backend_compilations=1, # num_piecewise_capturable_graphs_seen - num_cudagraph_caputured= + num_cudagraph_captured= 2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen **kwargs, ): @@ -361,7 +366,7 @@ def _test_toy_llama(*, use_inductor): llama_config.num_layers, # 1 + num_layers num_backend_compilations=1 + llama_config.num_layers, # num_piecewise_capturable_graphs_seen - num_cudagraph_caputured=2 * + num_cudagraph_captured=2 * (1 + llama_config.num_layers ), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): @@ -379,14 +384,6 @@ def _test_toy_llama(*, use_inductor): assert torch.allclose(outputs[0], outputs[i]) -def test_toy_llama_inductor(): - _test_toy_llama(use_inductor=True) - - -def test_toy_no_inductor(): - 
_test_toy_llama(use_inductor=False) - - @torch.inference_mode def benchmark(): from triton.testing import do_bench diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py index 1e4ee571f1af5..62804e721e3dc 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/test_async_tp.py @@ -169,8 +169,7 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int, # In pre-nodes, all gather or reduce scatter should exist, # fused_matmul_reduce_scatter or fused_all_gather_matmul should not - backend.check_before_ops(model.ops_in_model_before(), - ops_fully_replaced=False) + backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False) # In post-nodes, fused_matmul_reduce_scatter or \ # fused_all_gather_matmul should exist @@ -223,7 +222,7 @@ def test_async_tp_pass_correctness( "VLLM_USE_V1": "1", } - aysnc_tp_args = [ + async_tp_args = [ *common_args, "--tensor-parallel-size", str(tp_size), @@ -242,7 +241,7 @@ def test_async_tp_pass_correctness( ] compare_two_settings(model_id, - aysnc_tp_args, + async_tp_args, tp_args, async_tp_env, tp_env, diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py new file mode 100644 index 0000000000000..52e0fcc2881fb --- /dev/null +++ b/tests/compile/test_config.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch + +import vllm +from vllm.compilation.counter import compilation_counter +from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, + set_current_vllm_config) + +from .piecewise.test_simple import SillyModel + + +def test_use_cudagraphs_dynamic(monkeypatch): + assert vllm.envs.VLLM_USE_V1 + vllm_config = VllmConfig() + assert vllm_config.compilation_config.use_cudagraph + + monkeypatch.setenv('VLLM_USE_V1', '0') + vllm_config = VllmConfig() + assert not vllm_config.compilation_config.use_cudagraph + + +@pytest.mark.parametrize("enabled", [True, 
False]) +def test_use_cudagraphs(enabled): + assert vllm.envs.VLLM_USE_V1 + vllm_config = VllmConfig(compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=enabled, + cudagraph_capture_sizes=[100], + )) + with set_current_vllm_config(vllm_config): + model = SillyModel(vllm_config=vllm_config, prefix='') + + inputs = torch.randn(100, device="cuda") + + with compilation_counter.expect( + num_graphs_seen=1, # one graph for the model + num_cudagraph_captured=1 if enabled else 0, + ): + # first run is warmup + model(inputs) + # second run does CUDAGraphs recording (if enabled) + model(inputs) diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index 0c25aae52d465..040fd176fec12 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -7,8 +7,7 @@ import torch import vllm.envs as envs import vllm.plugins from vllm.compilation.fusion import (FUSED_OPS, QUANT_OPS, FusedRMSQuantKey, - FusionPass, QuantKey) -from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe + FusionPass, GroupShape, QuantKey) from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.config import (CompilationConfig, CompilationLevel, PassConfig, VllmConfig) @@ -30,9 +29,10 @@ class TestModel(torch.nn.Module): self.cutlass_fp8_enabled = cutlass_fp8_enabled self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)] self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)] + group_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN self.key = QuantKey(dtype=FP8_DTYPE, static=static, - per_tensor=static, + group_shape=group_shape, symmetric=True) if static: self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(2)] @@ -122,9 +122,7 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL) # In pre-nodes, fp8 quant should be there and fused kernels should not - 
backend.check_before_ops(model.ops_in_model_before(), find_auto_fn, - find_auto_fn_maybe) + backend.check_before_ops(model.ops_in_model_before()) # In post-nodes, fused kernels should be there and fp8 quant should not - backend.check_after_ops(model.ops_in_model_after(), find_auto_fn, - find_auto_fn_maybe) + backend.check_after_ops(model.ops_in_model_after()) diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py new file mode 100644 index 0000000000000..5e6679adfbdc9 --- /dev/null +++ b/tests/compile/test_fusion_attn.py @@ -0,0 +1,131 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import pytest +import torch._dynamo + +from tests.compile.backend import TestBackend +from tests.models.utils import check_outputs_equal +from vllm import LLM, SamplingParams +from vllm.compilation.fusion import QUANT_OPS, QuantKey, kFp8StaticTensorSym +from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass +from vllm.compilation.fx_utils import find_op_nodes +from vllm.compilation.noop_elimination import NoOpEliminationPass +from vllm.config import CompilationConfig, CompilationLevel, VllmConfig +from vllm.platforms import current_platform + +# globals needed for string-import custom Dynamo backend field +backend: Optional[TestBackend] = None +backend_unfused: Optional[TestBackend] = None + + +@pytest.mark.parametrize( + "model, quant_key", + [("amd/Llama-3.1-8B-Instruct-FP8-KV", kFp8StaticTensorSym)]) +@pytest.mark.parametrize( + "use_triton_fa", [True, False] if current_platform.is_rocm() else [False]) +@pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8") +@pytest.mark.skipif(not current_platform.is_cuda_alike(), + reason="Only test CUDA and ROCm") +def test_attention_fusion(example_prompts, monkeypatch, model: str, + quant_key: QuantKey, use_triton_fa: bool): + # Clean Dynamo cache to avoid reusing other test cases + # (for some reason the reset at the end is not enough) + torch._dynamo.reset() + 
+ # Use global backends + global backend, backend_unfused + + use_v1 = False # can be made a param once V1 support added + monkeypatch.setenv("VLLM_USE_V1", str(int(use_v1))) + monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", str(int(use_triton_fa))) + + # Prompt 4 seems too open-ended, differs between fused and unfused + # (both outputs look reasonable though) + prompts = example_prompts[:4] + example_prompts[5:] + + compile_config = CompilationConfig( + # DYNAMO_AS_IS triggers custom backend & does full Dynamo compilation + # DYNAMO_ONCE does not properly propagate shapes. + level=CompilationLevel.DYNAMO_AS_IS, + backend="tests.compile.test_fusion_attn.backend_unfused", + ) + vllm_config = VllmConfig(compilation_config=compile_config) + backend_unfused = TestBackend(NoOpEliminationPass(vllm_config)) + + llm = LLM(model, + enforce_eager=True, + compilation_config=compile_config, + gpu_memory_utilization=0.9, + max_model_len=2048) + + sampling_params = SamplingParams(temperature=0.0, + max_tokens=10, + top_p=0.95) + + unfused_output = llm.generate(prompts, sampling_params) + backend_unfused = None # Reset backend to make sure llm gets released + del llm + + compile_config = CompilationConfig( + # DYNAMO_AS_IS triggers custom backend & does full Dynamo compilation + # DYNAMO_ONCE does not properly propagate shapes. + level=CompilationLevel.DYNAMO_AS_IS, + backend="tests.compile.test_fusion_attn.backend", + ) + vllm_config = VllmConfig(compilation_config=compile_config) + + # AttnFusionPass needs attention layers to be registered in config upon init + # so we initialize it during compilation. 
+ attn_pass = lambda *args, **kw: AttnFusionPass(vllm_config)(*args, **kw) + backend = TestBackend(NoOpEliminationPass(vllm_config), attn_pass) + llm2 = LLM(model, + enforce_eager=True, + compilation_config=compile_config, + gpu_memory_utilization=0.9, + max_model_len=2048) + + # check support + attn_fusion_supported = [ + layer.impl.fused_output_quant_supported(quant_key.dtype, + quant_key.static, + quant_key.group_shape) + for key, layer in compile_config.static_forward_context.items() + ] + + print(f"{attn_fusion_supported=}") + if any(attn_fusion_supported): + # Check quant ops + backend.check_before_ops([QUANT_OPS[quant_key]], fully_replaced=False) + + # attention ops present in both, just output_scale param changes + attn_nodes_pre = list(find_op_nodes(ATTN_OP, backend.graph_pre_pass)) + attn_nodes_post = list(find_op_nodes(ATTN_OP, backend.graph_post_pass)) + assert len(attn_nodes_pre) == len(attn_nodes_post) + + for i in range(len(attn_nodes_pre)): + assert attn_nodes_pre[i].kwargs["output_scale"] is None + fused = attn_nodes_post[i].kwargs["output_scale"] is not None + assert fused == attn_fusion_supported[i], \ + f"Node {i} {'' if fused else 'not '} expected " \ + f"to have fused output quant" + + # check outputs + fused_output = llm2.generate(prompts, sampling_params) + + # transform outputs to format expected by check_outputs_equal + sample_outs = lambda s: (list(s.token_ids), s.text) + outs_lst = lambda ros: [sample_outs(ro.outputs[0]) for ro in ros] + + check_outputs_equal( + outputs_0_lst=outs_lst(unfused_output), + outputs_1_lst=outs_lst(fused_output), + name_0="unfused", + name_1="fused", + ) + + # Clean Dynamo cache to avoid polluting other case(s) + torch._dynamo.reset() + + # Reset backend to make sure llm2 gets released + backend = None diff --git a/tests/conftest.py b/tests/conftest.py index 5ec3926bd31f4..294805a8164f8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -727,8 +727,12 @@ class HfRunner: **kwargs) -> 
list[list[torch.Tensor]]: return self.model.encode(prompts, *args, **kwargs) - def predict(self, prompts: list[list[str]]) -> torch.Tensor: - return self.model.predict(prompts, convert_to_tensor=True) + def predict(self, prompts: list[list[str]], *args, + **kwargs) -> torch.Tensor: + return self.model.predict(prompts, + *args, + convert_to_tensor=True, + **kwargs) def __enter__(self): return self @@ -1037,8 +1041,10 @@ class VllmRunner: self, text_1: Union[str, list[str]], text_2: Union[str, list[str]], + *args, + **kwargs, ) -> list[float]: - req_outputs = self.model.score(text_1, text_2) + req_outputs = self.model.score(text_1, text_2, *args, **kwargs) return [req_output.outputs.score for req_output in req_outputs] def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index f296c81e17685..93222b564ebe7 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -437,8 +437,8 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator, "enable_prefix_caching": True, }]) @pytest.mark.parametrize("seed", [1]) -def test_auto_prefix_caching_after_evition_start(baseline_llm_generator, - test_llm_generator): +def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator, + test_llm_generator): """Verify block manager v2 with auto prefix caching could works normal even when eviction started. With APC enabled, all blocks are held by native block at the beginning. 
diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index 3429a858dda59..4d67eea2264b2 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -33,8 +33,8 @@ BLOCK_SIZE = 16 @pytest.mark.parametrize("batch_size", [5]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) -def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator, - batch_size, seed, backend, monkeypatch): +def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator, + batch_size, seed, backend, monkeypatch): """ The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then asks for value of one of them (which is outside the sliding window). @@ -100,7 +100,7 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator, def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed, backend, monkeypatch): """ - This is similar to test_sliding_window_retrival, however, it doesn't + This is similar to test_sliding_window_retrieval, however, it doesn't compare against the v1 block manager since v1 doesn't support chunked prefill with sliding window. diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index db78a9d556422..591e1780c11c6 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -594,8 +594,8 @@ def test_decode_schedule_preempted(): # should be preempted. 1 will also be preempted. 
budget = create_token_budget() output = scheduler._schedule_running(budget, curr_loras) - remainig_running = scheduler.running - assert len(remainig_running) == 0 + remaining_running = scheduler.running + assert len(remaining_running) == 0 assert len(output.decode_seq_groups) == 1 assert len(output.prefill_seq_groups) == 0 assert output.decode_seq_groups[0].seq_group.request_id == "0" @@ -1041,3 +1041,297 @@ def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds(): for seq in scheduled_seq_group.seq_group.seqs: seq.status = SequenceStatus.FINISHED_STOPPED scheduler.free_finished_seq_groups() + + +def test_remove_seq_from_computed_blocks_tracker(): + """ + Test that computed_blocks_tracker correctly removes stale sequences + during scheduling. + + The test covers 9 scheduling branches where stale seqs are removed: + - 1 in _schedule_swapped + - 1 in _schedule_priority_preemption + - 7 in _schedule_prefill + + Each branch is tested to ensure proper cleanup of + _seq_id_to_num_tokens_computed. 
+ """ + # Budget can not schedule in swapped + block_size = 2 + max_seq_group = 3 + seq_tokens_with_swapped: list[list[int]] = [] + blocks_to_swap_out: list[tuple[int, int]] = [] + curr_loras: set[int] = set() + + scheduler = initialize_scheduler( + block_size=block_size, + num_cpu_blocks=64, + num_gpu_blocks=16, + max_num_seqs=max_seq_group, + enable_prefix_caching=True, + ) + budget = create_token_budget(token_budget=15) + + seq_length = 16 + num_seqs = 3 + for i in range(num_seqs): + seq_tokens_with_swapped.append([i] * seq_length) + + seq_and_seq_groups = [ + create_dummy_prompt(f"{i}", + prompt_tokens=seq_tokens_with_swapped[i], + block_size=block_size) + for i in range(len(seq_tokens_with_swapped)) + ] + + for _, seq_group in seq_and_seq_groups: + scheduler._allocate_and_set_running(seq_group) + scheduler._swap_out(seq_group, blocks_to_swap_out) + scheduler._add_seq_group_to_swapped(seq_group) + + scheduler._schedule_swapped(budget, curr_loras) + seq_id_to_num_tokens_computed = ( + scheduler.block_manager._computed_blocks_tracker. + _seq_id_to_num_tokens_computed.get(1)) + assert seq_id_to_num_tokens_computed is None + + # Prefill schedule don't have a space for another LoRA, so + # we ignore this request for now. + block_size = 4 + lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) + scheduler = initialize_scheduler(lora_config=lora_config, + block_size=block_size, + num_cpu_blocks=64, + num_gpu_blocks=64, + enable_prefix_caching=True) + budget = create_token_budget(token_budget=120) + num_seqs = 2 + for i in range(num_seqs): + _, seq_group = create_dummy_prompt(str(i), + prompt_length=seq_length, + block_size=block_size, + lora_request=LoRARequest( + lora_name=str(i), + lora_int_id=i + 1, + lora_path="abc")) + scheduler.add_seq_group(seq_group) + + scheduler._schedule_prefills(budget, curr_loras) + seq_id_to_num_tokens_computed = ( + scheduler.block_manager._computed_blocks_tracker. 
+ _seq_id_to_num_tokens_computed.get(1)) + assert seq_id_to_num_tokens_computed is None + + # Priority preemption schedule + scheduler._schedule_priority_preemption(budget) + seq_id_to_num_tokens_computed = ( + scheduler.block_manager._computed_blocks_tracker. + _seq_id_to_num_tokens_computed.get(1)) + assert seq_id_to_num_tokens_computed is None + + # Prefill scheduler does not schedule batches with prompt tokens and + # prompt embeddings co-mingled. + block_size = 2 + max_seq_group = 3 + scheduler = initialize_scheduler( + block_size=block_size, + num_cpu_blocks=16, + num_gpu_blocks=16, + max_num_seqs=max_seq_group, + max_model_len=100, + enable_prefix_caching=True, + ) + seq_length = 7 + embedding_size = 5 + seq_tokens_with_embedding: list[list[int]] = [] + seq_embeds: list[Optional[torch.Tensor]] = [] + + seq_tokens_with_embedding.append(list(range(seq_length))) + seq_embeds.append(None) + seq_tokens_with_embedding.append([0] * seq_length) + seq_embeds.append(torch.rand(embedding_size)) + + seq_and_seq_groups = [ + create_dummy_prompt(f"{i}", + prompt_tokens=seq_tokens_with_embedding[i], + prompt_embeds=seq_embeds[i], + block_size=block_size) + for i in range(len(seq_tokens_with_embedding)) + ] + + for _, seq_group in seq_and_seq_groups: + scheduler.add_seq_group(seq_group) + + scheduler._schedule_default() + seq_id_to_num_tokens_computed = ( + scheduler.block_manager._computed_blocks_tracker. 
+ _seq_id_to_num_tokens_computed.get(1)) + assert seq_id_to_num_tokens_computed is None + + # Prefill scheduler budget num_batched_tokens + # >= scheduler_config max_num_batched_tokens + block_size = 2 + max_seq_group = 3 + seq_tokens_prefill_budget: list[list[int]] = [] + + scheduler = initialize_scheduler( + block_size=block_size, + max_token_budget=8, + num_cpu_blocks=16, + num_gpu_blocks=16, + max_num_seqs=max_seq_group, + max_model_len=5, + enable_prefix_caching=True, + ) + seq_length = 4 + num_seqs = 3 + for i in range(num_seqs): + seq_tokens_prefill_budget.append([i] * seq_length) + + seq_and_seq_groups = [ + create_dummy_prompt(f"{i}", + prompt_tokens=seq_tokens_prefill_budget[i], + block_size=block_size) + for i in range(len(seq_tokens_prefill_budget)) + ] + + for _, seq_group in seq_and_seq_groups: + scheduler.add_seq_group(seq_group) + + scheduler._schedule_default() + seq_id_to_num_tokens_computed = ( + scheduler.block_manager._computed_blocks_tracker. + _seq_id_to_num_tokens_computed.get(2)) + assert seq_id_to_num_tokens_computed is None + + # Budget can not schedule in waiting + block_size = 2 + max_seq_group = 3 + + scheduler = initialize_scheduler( + block_size=block_size, + max_token_budget=30, + num_cpu_blocks=16, + num_gpu_blocks=16, + max_num_seqs=max_seq_group, + max_model_len=30, + enable_prefix_caching=True, + ) + seq_length = 16 + num_seqs = 3 + seq_tokens_prefill_budget_waiting: list[list[int]] = [] + + for i in range(num_seqs): + seq_tokens_prefill_budget_waiting.append(list(range(seq_length))) + + seq_and_seq_groups = [ + create_dummy_prompt(f"{i}", + prompt_tokens=seq_tokens_prefill_budget_waiting[i], + block_size=block_size) + for i in range(len(seq_tokens_prefill_budget_waiting)) + ] + + for _, seq_group in seq_and_seq_groups: + scheduler.add_seq_group(seq_group) + + scheduler._schedule_default() + seq_id_to_num_tokens_computed = ( + scheduler.block_manager._computed_blocks_tracker. 
+ _seq_id_to_num_tokens_computed.get(1)) + assert seq_id_to_num_tokens_computed is None + + # Sequence num_new_tokens > prompt_limit marked FINISHED_IGNORED + block_size = 2 + max_seq_group = 3 + scheduler = initialize_scheduler( + block_size=block_size, + num_cpu_blocks=16, + num_gpu_blocks=16, + max_num_seqs=max_seq_group, + max_model_len=30, + enable_prefix_caching=True, + ) + + seq_length = 31 + seq_tokens_prompt_limit: list[list[int]] = [] + seq_tokens_prompt_limit.append(list(range(seq_length))) + seq_and_seq_groups = [ + create_dummy_prompt("0", + prompt_tokens=seq_tokens_prompt_limit[0], + block_size=block_size) + ] + for _, seq_group in seq_and_seq_groups: + scheduler.add_seq_group(seq_group) + scheduler._schedule_default() + seq_id_to_num_tokens_computed = ( + scheduler.block_manager._computed_blocks_tracker. + _seq_id_to_num_tokens_computed.get(0)) + assert seq_id_to_num_tokens_computed is None + + # Budget can not allocate, AllocStatus is NEVER marked FINISHED_IGNORED + block_size = 2 + max_seq_group = 3 + scheduler = initialize_scheduler( + block_size=block_size, + num_cpu_blocks=160, + num_gpu_blocks=160, + max_num_seqs=max_seq_group, + max_model_len=320, + enable_prefix_caching=True, + ) + + seq_length = 320 + num_seqs = 1 + seq_tokens_never: list[list[int]] = [] + for i in range(num_seqs): + seq_tokens_never.append(list(range(seq_length))) + + seq_and_seq_groups = [ + create_dummy_prompt(f"{i}", + prompt_tokens=seq_tokens_never[i], + block_size=block_size) + for i in range(len(seq_tokens_never)) + ] + + for _, seq_group in seq_and_seq_groups: + scheduler.add_seq_group(seq_group) + + scheduler._schedule_default() + seq_id_to_num_tokens_computed = ( + scheduler.block_manager._computed_blocks_tracker. 
+ _seq_id_to_num_tokens_computed.get(0)) + assert seq_id_to_num_tokens_computed is None + + # Budget can not allocate, AllocStatus is LATER + block_size = 2 + max_seq_group = 3 + scheduler = initialize_scheduler( + block_size=block_size, + num_cpu_blocks=160, + num_gpu_blocks=160, + max_num_seqs=max_seq_group, + max_model_len=320, + enable_prefix_caching=True, + ) + + seq_length = 160 + num_seqs = 2 + seq_tokens_later: list[list[int]] = [] + for i in range(num_seqs): + seq_tokens_later.append(list(range(seq_length))) + + seq_and_seq_groups = [ + create_dummy_prompt(f"{i}", + prompt_tokens=seq_tokens_later[i], + block_size=block_size) + for i in range(len(seq_tokens_later)) + ] + + for _, seq_group in seq_and_seq_groups: + scheduler.add_seq_group(seq_group) + + scheduler._schedule_default() + seq_id_to_num_tokens_computed = ( + scheduler.block_manager._computed_blocks_tracker. + _seq_id_to_num_tokens_computed.get(1)) + assert seq_id_to_num_tokens_computed is None diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py index 4676dc992a879..707891f6bdd8d 100644 --- a/tests/entrypoints/llm/test_generate.py +++ b/tests/entrypoints/llm/test_generate.py @@ -25,6 +25,12 @@ TOKEN_IDS = [ ] +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + """We can run both engines for this test.""" + pass + + @pytest.fixture(scope="module") def llm(): # pytest caches the fixture so we use weakref.proxy to @@ -104,3 +110,22 @@ def test_multiple_sampling_params(llm: LLM): # sampling_params is None, default params should be applied outputs = llm.generate(PROMPTS, sampling_params=None) assert len(PROMPTS) == len(outputs) + + +def test_max_model_len(): + max_model_len = 20 + llm = LLM( + model=MODEL_NAME, + max_model_len=max_model_len, + gpu_memory_utilization=0.10, + enforce_eager=True, # reduce test time + ) + sampling_params = SamplingParams(max_tokens=max_model_len + 10) + outputs = llm.generate(PROMPTS, sampling_params) + for output in 
outputs: + num_total_tokens = len(output.prompt_token_ids) + len( + output.outputs[0].token_ids) + # Total tokens must not exceed max_model_len. + # It can be less if generation finishes due to other reasons (e.g., EOS) + # before reaching the absolute model length limit. + assert num_total_tokens <= max_model_len diff --git a/tests/entrypoints/openai/correctness/test_mteb.py b/tests/entrypoints/openai/correctness/test_mteb_embed.py similarity index 73% rename from tests/entrypoints/openai/correctness/test_mteb.py rename to tests/entrypoints/openai/correctness/test_mteb_embed.py index 437c485113520..12a86f9bdd59e 100644 --- a/tests/entrypoints/openai/correctness/test_mteb.py +++ b/tests/entrypoints/openai/correctness/test_mteb_embed.py @@ -7,34 +7,30 @@ import pytest from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS, MTEB_EMBED_TOL, OpenAIClientMtebEncoder, - run_mteb_embed_task, - run_mteb_embed_task_st) + run_mteb_embed_task) from tests.utils import RemoteOpenAIServer os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" -MODEL_NAME = "BAAI/bge-m3" -DTYPE = "float16" -MAIN_SCORE = 0.7873427091972599 +MODEL_NAME = "intfloat/e5-small" +MAIN_SCORE = 0.7422994752439667 @pytest.fixture(scope="module") def server(): args = [ - "--task", "embed", "--dtype", DTYPE, "--enforce-eager", - "--max-model-len", "512" + "--task", "embed", "--enforce-eager", "--disable-uvicorn-access-log" ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server -def test_mteb(server): +def test_mteb_embed(server): client = server.get_client() encoder = OpenAIClientMtebEncoder(MODEL_NAME, client) vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS) - st_main_score = MAIN_SCORE or run_mteb_embed_task_st( - MODEL_NAME, MTEB_EMBED_TASKS) + st_main_score = MAIN_SCORE print("VLLM main score: ", vllm_main_score) print("SentenceTransformer main score: ", st_main_score) diff --git a/tests/entrypoints/openai/correctness/test_mteb_score.py 
b/tests/entrypoints/openai/correctness/test_mteb_score.py new file mode 100644 index 0000000000000..f90fc0b9be002 --- /dev/null +++ b/tests/entrypoints/openai/correctness/test_mteb_score.py @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os + +import pytest + +# yapf conflicts with isort for this block +# yapf: disable +from tests.models.language.pooling.mteb_utils import (MTEB_RERANK_LANGS, + MTEB_RERANK_TASKS, + MTEB_RERANK_TOL, + RerankClientMtebEncoder, + ScoreClientMtebEncoder, + run_mteb_rerank) +# yapf: enable +from tests.utils import RemoteOpenAIServer + +os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" + +MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2" +MAIN_SCORE = 0.33437 + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", "score", "--enforce-eager", "--disable-uvicorn-access-log" + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +def test_mteb_score(server): + url = server.url_for("score") + encoder = ScoreClientMtebEncoder(MODEL_NAME, url) + vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS, + MTEB_RERANK_LANGS) + st_main_score = MAIN_SCORE + + print("VLLM main score: ", vllm_main_score) + print("SentenceTransformer main score: ", st_main_score) + print("Difference: ", st_main_score - vllm_main_score) + + assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL) + + +def test_mteb_rerank(server): + url = server.url_for("rerank") + encoder = RerankClientMtebEncoder(MODEL_NAME, url) + vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS, + MTEB_RERANK_LANGS) + st_main_score = MAIN_SCORE + + print("VLLM main score: ", vllm_main_score) + print("SentenceTransformer main score: ", st_main_score) + print("Difference: ", st_main_score - vllm_main_score) + + assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL) diff --git 
a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py index daa4a78c935a7..6e32887f5ed0a 100644 --- a/tests/entrypoints/openai/test_chat_template.py +++ b/tests/entrypoints/openai/test_chat_template.py @@ -16,7 +16,7 @@ chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja" assert chatml_jinja_path.exists() # Define models, templates, and their corresponding expected outputs -MODEL_TEMPLATE_GENERATON_OUTPUT = [ +MODEL_TEMPLATE_GENERATION_OUTPUT = [ ("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user Hello<|im_end|> <|im_start|>assistant @@ -91,7 +91,7 @@ def test_no_load_chat_template_literallike(): @pytest.mark.parametrize( "model,template,add_generation_prompt,continue_final_message,expected_output", - MODEL_TEMPLATE_GENERATON_OUTPUT) + MODEL_TEMPLATE_GENERATION_OUTPUT) def test_get_gen_prompt(model, template, add_generation_prompt, continue_final_message, expected_output): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 1cb0a39df5139..8117e774951ee 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -74,19 +74,29 @@ async def test_bad_requests(mary_had_lamb): language="hh", temperature=0.0) - # Expect audio too long: repeat the timeseries - mary_had_lamb.seek(0) - audio, sr = librosa.load(mary_had_lamb) - repeated_audio = np.tile(audio, 10) - # Repeated audio to buffer - buffer = io.BytesIO() - sf.write(buffer, repeated_audio, sr, format='WAV') - buffer.seek(0) - with pytest.raises(openai.BadRequestError): - await client.audio.transcriptions.create(model=model_name, - file=buffer, - language="en", - temperature=0.0) + +@pytest.mark.asyncio +async def test_long_audio_request(mary_had_lamb): + model_name = "openai/whisper-large-v3-turbo" + server_args = 
["--enforce-eager"] + + mary_had_lamb.seek(0) + audio, sr = librosa.load(mary_had_lamb) + repeated_audio = np.tile(audio, 10) + # Repeated audio to buffer + buffer = io.BytesIO() + sf.write(buffer, repeated_audio, sr, format='WAV') + buffer.seek(0) + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + transcription = await client.audio.transcriptions.create( + model=model_name, + file=buffer, + language="en", + response_format="text", + temperature=0.0) + out = json.loads(transcription)['text'] + assert out.count("Mary had a little lamb") == 10 @pytest.mark.asyncio diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index 2d381a99be60c..7269d19183bf2 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -10,6 +10,7 @@ import torch from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.utils import opcheck from vllm import _custom_ops as ops +from vllm.attention.layer import Attention, MultiHeadAttention from vllm.platforms import current_platform from vllm.utils import get_max_shared_memory_bytes @@ -506,3 +507,18 @@ def test_multi_query_kv_attention_with_alibi( device, use_alibi=True, ) + + +@pytest.mark.parametrize("attention_cls", [Attention, MultiHeadAttention]) +def test_num_heads_not_divisble_by_num_kv_heads(attention_cls: type) -> None: + head_size = 64 + scale = float(1.0 / (head_size**0.5)) + num_heads = 16 + num_kv_heads = 5 + with pytest.raises(AssertionError): + _ = attention_cls( + num_heads=num_heads, + head_size=head_size, + scale=scale, + num_kv_heads=num_kv_heads, + ) diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py index e508505c2b05d..7895076155801 100644 --- a/tests/kernels/attention/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -72,8 +72,8 @@ def test_copy_blocks( # destination blocks. 
assert 2 * num_mappings <= num_blocks src_blocks = random.sample(range(num_blocks), num_mappings) - remainig_blocks = list(set(range(num_blocks)) - set(src_blocks)) - dst_blocks = random.sample(remainig_blocks, 2 * num_mappings) + remaining_blocks = list(set(range(num_blocks)) - set(src_blocks)) + dst_blocks = random.sample(remaining_blocks, 2 * num_mappings) block_mapping: list[tuple[int, int]] = [] for i in range(num_mappings): src = src_blocks[i] @@ -189,12 +189,12 @@ def test_reshape_and_cache( # Run the reference implementation. reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) - block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_indicies_lst = block_indicies.cpu().tolist() + block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + block_indices_lst = block_indices.cpu().tolist() block_offsets = slot_mapping % block_size block_offsets_lst = block_offsets.cpu().tolist() for i in range(num_tokens): - block_idx = block_indicies_lst[i] + block_idx = block_indices_lst[i] block_offset = block_offsets_lst[i] cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] cloned_value_cache[block_idx, :, :, block_offset] = value[i] @@ -322,12 +322,12 @@ def test_reshape_and_cache_flash( kv_dtype=kv_cache_dtype) # Run the reference implementation. 
- block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_indicies_lst = block_indicies.cpu().tolist() + block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + block_indices_lst = block_indices.cpu().tolist() block_offsets = slot_mapping % block_size block_offsets_lst = block_offsets.cpu().tolist() for i in range(num_tokens): - block_idx = block_indicies_lst[i] + block_idx = block_indices_lst[i] block_offset = block_offsets_lst[i] if kv_cache_layout == "NHD": cloned_key_cache[block_idx, block_offset, :, :] = key[i] diff --git a/tests/kernels/attention/test_encoder_decoder_attn.py b/tests/kernels/attention/test_encoder_decoder_attn.py index c6ce7b0cce40d..a2e6986460904 100644 --- a/tests/kernels/attention/test_encoder_decoder_attn.py +++ b/tests/kernels/attention/test_encoder_decoder_attn.py @@ -46,7 +46,7 @@ CUDA_DEVICE = "cuda:0" MAX_DEC_SEQ_LENS = [128] MAX_ENC_SEQ_LENS = [128] -# Narrow teest-cases for unsupported-scenario +# Narrow test-cases for unsupported-scenario # tests HEAD_SIZES_FOR_UNSUPP = [HEAD_SIZES[0]] @@ -99,7 +99,7 @@ class TestResources(NamedTuple): Attributes: * scale: 1/sqrt(d) scale factor for attn - * attn_backend: implementatino of abstraction + * attn_backend: implementations of abstraction attention interface using a particular kernel library i.e. 
XFormers diff --git a/tests/kernels/core/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py index db0fdcbf5ef22..d1fd960bf115c 100644 --- a/tests/kernels/core/test_rotary_embedding.py +++ b/tests/kernels/core/test_rotary_embedding.py @@ -39,10 +39,10 @@ def rotary_embedding_opcheck(rot, @pytest.mark.parametrize("head_size", [32, 108]) @pytest.mark.parametrize("seq_len", [11, 1024]) @pytest.mark.parametrize("use_key", [True, False]) -@pytest.mark.parametrize("head_stride_is_contingous", [True, False]) +@pytest.mark.parametrize("head_stride_is_contiguous", [True, False]) def test_rotary_embedding_opcheck(dist_init, device, max_position, is_neox_style, rotary_dim, head_size, - seq_len, use_key, head_stride_is_contingous): + seq_len, use_key, head_stride_is_contiguous): batch_size = 1 base = 10000 num_heads = 7 @@ -52,7 +52,7 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position, positions = torch.randint(0, max_position, (batch_size, seq_len), device=device) - head_stride = head_size + (64 if head_stride_is_contingous else 0) + head_stride = head_size + (64 if head_stride_is_contiguous else 0) query = torch.randn(batch_size, seq_len, @@ -72,7 +72,7 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position, # if we have a contiguous head stride, test the alternate # [..., num_heads * head_dim] shape/layout - if head_stride_is_contingous: + if head_stride_is_contiguous: rotary_embedding_opcheck( rot, positions, query.flatten(start_dim=-2), key.flatten(start_dim=-2) if use_key else None) diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py index abed1252a3ce6..ccf0ff6abd169 100644 --- a/tests/kernels/mamba/test_mamba_ssm_ssd.py +++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py @@ -107,15 +107,15 @@ def generate_random_inputs(batch_size, return A, dt, X, B, C -def generate_continous_batched_examples(example_lens_by_batch, - num_examples, - full_length, - last_taken, - exhausted, - 
n_heads, - d_head, - itype, - device='cuda'): +def generate_continuous_batched_examples(example_lens_by_batch, + num_examples, + full_length, + last_taken, + exhausted, + n_heads, + d_head, + itype, + device='cuda'): # this function generates a random examples of certain length # and then cut according to "example_lens_by_batch" and feed @@ -269,11 +269,10 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, exhausted: dict = {} # map: eg -> boolean indicating example is exhausted states = None - for Y_min, cu_seqlens, seq_idx, (A, dt, X, B, - C) in generate_continous_batched_examples( - cases, num_examples, seqlen, - last_taken, exhausted, n_heads, - d_head, itype): + for Y_min, cu_seqlens, seq_idx, ( + A, dt, X, B, C) in generate_continuous_batched_examples( + cases, num_examples, seqlen, last_taken, exhausted, n_heads, + d_head, itype): chunk_indices, chunk_offsets = \ _query_start_loc_to_chunk_indices_offsets( diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 474745f94815f..ce420901e3177 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -29,6 +29,7 @@ MNK_FACTORS = [ (224, 1024, 1536), (224, 3072, 1024), (224, 3072, 1536), + (1024 * 128, 1024, 1024), ] vllm_config = VllmConfig(parallel_config=ParallelConfig( diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 7238813a299d6..bed374cf4d564 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -15,7 +15,8 @@ import vllm.model_executor.layers.fused_moe # noqa from tests.kernels.utils import opcheck, stack_and_dev, torch_moe from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk +from vllm.model_executor.layers.fused_moe.fused_moe import ( + fused_topk, modular_triton_fused_moe) from 
vllm.model_executor.layers.fused_moe.moe_torch_iterative import ( fused_moe as iterative_moe) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( @@ -76,6 +77,13 @@ def test_fused_moe( else: e_map = None + m_fused_moe = modular_triton_fused_moe(use_fp8_w8a8=False, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False, + per_channel_quant=False, + block_shape=None) + with set_current_vllm_config(vllm_config): torch_output = torch_moe(a, w1, w2, score, topk, e_map) iterative_output = iterative_moe(a, @@ -103,7 +111,20 @@ def test_fused_moe( expert_map=e_map, renormalize=False) + topk_weights, topk_ids, _ = fused_topk(a, score, topk, False) + m_triton_output = m_fused_moe(a, + w1, + w2, + topk_weights, + topk_ids, + global_num_experts=e, + expert_map=e_map) + torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0) + torch.testing.assert_close(m_triton_output, + torch_output, + atol=2e-2, + rtol=0) torch.testing.assert_close(iterative_output, torch_output, atol=2e-2, diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index ef3e6adcfa364..d90202dfcb3bd 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -1,10 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + import pytest import torch -from tests.pplx_utils import ProcessGroupInfo, parallel_launch from vllm import _custom_ops as ops from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul @@ -14,6 +15,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEModularKernel) from vllm.platforms import current_platform +from .deepep_utils import ProcessGroupInfo, parallel_launch + try: from pplx_kernels import AllToAll from pplx_kernels.nvshmem import 
(nvshmem_alloc_empty_unique_id, @@ -64,6 +67,7 @@ def pplx_cutlass_moe( out_dtype, per_act_token: bool, per_out_ch: bool, + group_name: Optional[str], ): from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( PplxPrepareAndFinalize) @@ -84,7 +88,7 @@ def pplx_cutlass_moe( else: scale_elems = (hidden_dim + block_size - 1) // block_size - ata = AllToAll.internode( + args = dict( max_num_tokens=max_num_tokens, num_experts=num_experts, experts_per_token=topk, @@ -96,6 +100,12 @@ def pplx_cutlass_moe( hidden_dim_scale_bytes=scale_elems * torch.float32.itemsize, ) + if group_name is None: + ata = AllToAll.internode(**args) + else: + args["group_name"] = group_name + ata = AllToAll.intranode(**args) + w1 = w1.to(device) w2 = w2.to(device) w1_scale = w1_scale.to(device) @@ -113,7 +123,10 @@ def pplx_cutlass_moe( ) experts = CutlassExpertsFp8((num_experts + world_size - 1) // world_size, - out_dtype, per_act_token, per_out_ch) + out_dtype, + per_act_token, + per_out_ch, + use_batched_format=True) fused_cutlass_experts = FusedMoEModularKernel( prepare_finalize, @@ -184,11 +197,17 @@ def _pplx_moe( w2_full: torch.Tensor, per_act_token: bool, per_out_ch: bool, + use_internode: bool, ): - uid = nvshmem_get_unique_id( - ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() - torch.distributed.broadcast(uid, src=0) - nvshmem_init(uid, pgi.rank, pgi.world_size) + if use_internode: + uid = nvshmem_get_unique_id( + ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() + torch.distributed.broadcast(uid, src=0) + nvshmem_init(uid, pgi.rank, pgi.world_size) + else: + group_ranks = list(range(pgi.world_size)) + cpu_group = torch.distributed.new_group(group_ranks, backend="gloo") + group_name = cpu_group.group_name with set_current_vllm_config(vllm_config): torch_output = torch_moe2(a_full, w1_full, w2_full, topk_weights, @@ -196,7 +215,7 @@ def _pplx_moe( pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale, w2_scale, topk_weights, topk_ids, a1_scale, 
out_dtype, per_act_token, - per_out_ch) + per_out_ch, group_name) torch_output = chunk_by_rank(torch_output, pgi.rank, pgi.world_size).to(pplx_output.device) @@ -207,7 +226,8 @@ def _pplx_moe( torch.testing.assert_close(pplx_output, torch_output, atol=0.05, rtol=0) - nvshmem_finalize() + if use_internode: + nvshmem_finalize() @pytest.mark.parametrize("m", [2, 224]) @@ -218,6 +238,7 @@ def _pplx_moe( @pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("world_dp_size", [[2, 1]]) #, [4, 2]]) +@pytest.mark.parametrize("use_internode", [False]) @pytest.mark.skipif( (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( current_platform.get_device_capability()), @@ -232,6 +253,7 @@ def test_cutlass_moe_pplx( per_act_token: bool, per_out_ch: bool, world_dp_size: tuple[int, int], + use_internode: bool, ): current_platform.seed_everything(7) @@ -284,4 +306,5 @@ def test_cutlass_moe_pplx( parallel_launch(world_size, _pplx_moe, dp_size, a, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, a_scale1, - dtype, a, w1_d, w2_d, per_act_token, per_out_ch) + dtype, a, w1_d, w2_d, per_act_token, per_out_ch, + use_internode) diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index bbfe31d0e650f..2d6a8f39cec5f 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -18,7 +18,6 @@ try: except ImportError: has_pplx = False -from tests.pplx_utils import ProcessGroupInfo, parallel_launch from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import override_config @@ -30,6 +29,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEModularKernel) from vllm.platforms import current_platform +from .deepep_utils import ProcessGroupInfo, parallel_launch + requires_pplx = 
pytest.mark.skipif( not has_pplx, reason="Requires PPLX kernels", @@ -153,7 +154,10 @@ def batched_moe( num_experts = w1.shape[0] fused_experts = FusedMoEModularKernel( - BatchedPrepareAndFinalize(a.shape[0], world_size=1, dp_size=1, rank=0), + BatchedPrepareAndFinalize(max_num_tokens=a.shape[0], + world_size=1, + dp_size=1, + rank=0), BatchedExperts(max_num_tokens=a.shape[0], dp_size=1, world_size=1)) return fused_experts(a, w1, w2, topk_weight, topk_ids, num_experts) @@ -229,9 +233,15 @@ def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor: return t[(r * chunk):(r + 1) * chunk] -def pplx_prepare_finalize(pgi: ProcessGroupInfo, dp_size: int, a: torch.Tensor, - topk_weight: torch.Tensor, topk_ids: torch.Tensor, - num_experts: int) -> torch.Tensor: +def pplx_prepare_finalize( + pgi: ProcessGroupInfo, + dp_size: int, + a: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + group_name: Optional[str], +) -> torch.Tensor: from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( PplxPrepareAndFinalize) @@ -245,7 +255,7 @@ def pplx_prepare_finalize(pgi: ProcessGroupInfo, dp_size: int, a: torch.Tensor, world_size = pgi.world_size max_num_tokens = rank_chunk(num_tokens, 0, world_size) - ata = AllToAll.internode( + args = dict( max_num_tokens=max_num_tokens, num_experts=num_experts, experts_per_token=topk, @@ -259,6 +269,12 @@ def pplx_prepare_finalize(pgi: ProcessGroupInfo, dp_size: int, a: torch.Tensor, torch.float32.itemsize)), ) + if group_name is None: + ata = AllToAll.internode(**args) + else: + args["group_name"] = group_name + ata = AllToAll.intranode(**args) + topk_ids = topk_ids.to(dtype=torch.uint32) prepare_finalize = PplxPrepareAndFinalize( @@ -274,7 +290,7 @@ def pplx_prepare_finalize(pgi: ProcessGroupInfo, dp_size: int, a: torch.Tensor, chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device) chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device) - b_a, 
b_a_scale, expert_num_tokens = prepare_finalize.prepare( + b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare( a_chunk, None, None, @@ -318,11 +334,19 @@ def _pplx_prepare_finalize( score: torch.Tensor, topk: torch.Tensor, num_experts: int, + use_internode: bool, ): - uid = nvshmem_get_unique_id( - ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() - torch.distributed.broadcast(uid, src=0) - nvshmem_init(uid, pgi.rank, pgi.world_size) + if use_internode: + uid = nvshmem_get_unique_id( + ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() + torch.distributed.broadcast(uid, src=0) + nvshmem_init(uid, pgi.rank, pgi.world_size) + group_name = None + else: + group_ranks = list(range(pgi.world_size)) + cpu_group = torch.distributed.new_group(group_ranks, backend="gloo") + group_name = cpu_group.group_name + device = pgi.device topk_weight, topk_ids, _ = fused_topk(a, score, topk, False) @@ -335,14 +359,15 @@ def _pplx_prepare_finalize( a.dtype) pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight, topk_ids, - num_experts) + num_experts, group_name) torch_output = chunk_by_rank(torch_output, pgi.rank, pgi.world_size).to(pplx_output.device) torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0) - nvshmem_finalize() + if use_internode: + nvshmem_finalize() # TODO (bnell): this test point does not work for odd M due to how the test is @@ -353,6 +378,7 @@ def _pplx_prepare_finalize( @pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("world_dp_size", [[2, 1]]) +@pytest.mark.parametrize("use_internode", [False]) @requires_pplx def test_pplx_prepare_finalize( mnk: tuple[int, int, int], @@ -360,6 +386,7 @@ def test_pplx_prepare_finalize( topk: int, dtype: torch.dtype, world_dp_size: tuple[int, int], + use_internode: bool, ): current_platform.seed_everything(7) m, n, k = mnk @@ -369,10 +396,11 @@ def test_pplx_prepare_finalize( score = torch.randn((m, e), 
device=device, dtype=dtype) parallel_launch(world_size, _pplx_prepare_finalize, dp_size, a, score, - topk, e) + topk, e, use_internode) def pplx_moe( + group_name: Optional[str], rank: int, world_size: int, dp_size: int, @@ -394,7 +422,7 @@ def pplx_moe( topk = topk_ids.shape[1] max_num_tokens = rank_chunk(a.shape[0], 0, world_size) - ata = AllToAll.internode( + args = dict( max_num_tokens=max_num_tokens, num_experts=num_experts, experts_per_token=topk, @@ -408,6 +436,12 @@ def pplx_moe( torch.float32.itemsize)), ) + if group_name is None: + ata = AllToAll.internode(**args) + else: + args["group_name"] = group_name + ata = AllToAll.intranode(**args) + topk_ids = topk_ids.to(dtype=torch.uint32) prepare_finalize = PplxPrepareAndFinalize( @@ -522,11 +556,18 @@ def _pplx_moe( w2: torch.Tensor, score: torch.Tensor, topk: int, + use_internode: bool, ): - uid = nvshmem_get_unique_id( - ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() - torch.distributed.broadcast(uid, src=0) - nvshmem_init(uid, pgi.rank, pgi.world_size) + if use_internode: + uid = nvshmem_get_unique_id( + ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() + torch.distributed.broadcast(uid, src=0) + nvshmem_init(uid, pgi.rank, pgi.world_size) + group_name = None + else: + group_ranks = list(range(pgi.world_size)) + cpu_group = torch.distributed.new_group(group_ranks, backend="gloo") + group_name = cpu_group.group_name m, k = a.shape e, _, n = w2.shape @@ -536,8 +577,8 @@ def _pplx_moe( with set_current_vllm_config(vllm_config), override_config(moe_config): topk_weight, topk_ids, _ = fused_topk(a, score, topk, False) torch_output = torch_moe2(a, w1, w2, topk_weight, topk_ids) - pplx_output = pplx_moe(pgi.rank, pgi.world_size, dp_size, a, w1, w2, - topk_weight, topk_ids) + pplx_output = pplx_moe(group_name, pgi.rank, pgi.world_size, dp_size, + a, w1, w2, topk_weight, topk_ids) # TODO (bnell): fix + re-enable #batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, # topk_ids) @@ 
-548,7 +589,8 @@ def _pplx_moe( torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0) #torch.testing.assert_close(batched_output, torch_output, atol=2e-2, rtol=0) - nvshmem_finalize() + if use_internode: + nvshmem_finalize() @pytest.mark.parametrize("mnk", PPLX_MOE_COMBOS) @@ -556,6 +598,7 @@ def _pplx_moe( @pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("world_dp_size", [[2, 1]]) +@pytest.mark.parametrize("use_internode", [False]) @requires_pplx def test_pplx_moe( mnk: tuple[int, int, int], @@ -563,6 +606,7 @@ def test_pplx_moe( topk: int, dtype: torch.dtype, world_dp_size: tuple[int, int], + use_internode: bool, ): current_platform.seed_everything(7) m, n, k = mnk @@ -572,4 +616,5 @@ def test_pplx_moe( w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 score = torch.randn((m, e), device="cuda", dtype=dtype) - parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk) + parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk, + use_internode) diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index 8c5ee98743d72..eec59573792db 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -13,7 +13,8 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( _valid_deep_gemm_shape, deep_gemm_moe_fp8) -from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk +from vllm.model_executor.layers.fused_moe.fused_moe import ( + fused_topk, modular_triton_fused_moe) from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( moe_align_block_size) from vllm.model_executor.layers.quantization.utils.fp8_utils import ( @@ -45,7 +46,7 @@ N = [128, 512, 7168, 7748, 13824] K = [256, 3884, 4096, 13824, 
16384] # Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8 # and its hidden size is 7168. -M_moe = [1, 2, 7, 83, 128, 2048] +M_moe = [1, 2, 7, 83, 128, 2048, 1024 * 128] M_moe_dg = [128, 192, 1335, 2048] N_moe = [128, 256, 1024, 4608] # [13824] K_moe = [256, 512, 7168] # [13824] @@ -214,6 +215,13 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): score = torch.randn((M, E), dtype=dtype) + m_fused_moe = modular_triton_fused_moe(use_fp8_w8a8=True, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False, + per_channel_quant=False, + block_shape=block_size) + # Set the context to avoid lots of warning spam. with set_current_vllm_config(vllm_config): out = fused_moe( @@ -231,6 +239,16 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_size) + topk_weights, topk_ids, _ = fused_topk(a, score, topk, False) + m_out = m_fused_moe(a, + w1, + w2, + topk_weights, + topk_ids, + global_num_experts=E, + w1_scale=w1_s, + w2_scale=w2_s) + #print(f"{out.sum()=}") #print(f"{ref_out.sum()=}") @@ -239,6 +257,11 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): torch.mean(torch.abs(ref_out.to(torch.float32)))) assert rel_diff < 0.03 + rel_diff = (torch.mean( + torch.abs(m_out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + assert rel_diff < 0.03 + def per_block_cast_to_fp8( x: torch.Tensor, diff --git a/tests/kernels/quantization/test_int8_quant.py b/tests/kernels/quantization/test_int8_quant.py index 63ccf4a917369..5a37b976db9eb 100644 --- a/tests/kernels/quantization/test_int8_quant.py +++ b/tests/kernels/quantization/test_int8_quant.py @@ -11,6 +11,7 @@ from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] HIDDEN_SIZES = [16, 67, 768, 5137, 8193] # Arbitrary values for testing +HIDDEN_SIZES 
+= list(range(1024, 1033)) # vectorized conversion edge cases NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing SEEDS = [0] SCALE = [0.1, 2.1] diff --git a/tests/kernels/test_flex_attention.py b/tests/kernels/test_flex_attention.py index 040ddac10258f..74d29e79d96c5 100644 --- a/tests/kernels/test_flex_attention.py +++ b/tests/kernels/test_flex_attention.py @@ -51,7 +51,6 @@ def test_flex_attention_vs_default_backend(monkeypatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") - m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") set_seed(seed) @@ -66,7 +65,6 @@ def test_flex_attention_vs_default_backend(monkeypatch): # Run with default backend with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") set_seed(seed) llm_default = LLM( model_name, diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 0737bb886e43e..4908f9a060f7f 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -164,11 +164,6 @@ def mixtral_lora_files(): return snapshot_download(repo_id="SangBinCho/mixtral-lora") -@pytest.fixture(scope="session") -def gemma_lora_files(): - return snapshot_download(repo_id="wskwon/gemma-7b-test-lora") - - @pytest.fixture(scope="session") def chatglm3_lora_files(): return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider") diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 23819f03dc51f..3ac3b80ec827c 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -4,9 +4,6 @@ import subprocess import sys from typing import Union -import pytest -import ray - import vllm from vllm import LLM from vllm.lora.request import LoRARequest @@ -121,37 +118,6 @@ def test_llama_lora(sql_lora_files): generate_and_test(llm, sql_lora_files) -# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks -# used by the engine yet. 
-@pytest.mark.skip_v1 -@create_new_process_for_each_test() -def test_llama_lora_warmup(sql_lora_files): - """Test that the LLM initialization works with a warmup LORA path and - is more conservative""" - - @ray.remote(num_gpus=1) - def get_num_gpu_blocks_lora(): - llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16) - num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks - return num_gpu_blocks_lora_warmup - - @ray.remote(num_gpus=1) - def get_num_gpu_blocks_no_lora(): - llm = vllm.LLM(MODEL_PATH, max_num_seqs=16) - num_gpu_blocks_no_lora_warmup = ( - llm.llm_engine.cache_config.num_gpu_blocks) - return num_gpu_blocks_no_lora_warmup - - num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote()) - num_gpu_blocks_no_lora_warmup = ray.get( - get_num_gpu_blocks_no_lora.remote()) - assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, ( - "The warmup with lora should be more " - "conservative than without lora, therefore the number of " - "memory blocks for the KV cache should be " - "less when using lora than when not using lora") - - @multi_gpu_test(num_gpus=4) @create_new_process_for_each_test() def test_llama_lora_tp4(sql_lora_files): diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index e9a52e1b63573..50c60341f0d88 100644 --- a/tests/lora/test_lora_functions.py +++ b/tests/lora/test_lora_functions.py @@ -15,13 +15,6 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf" LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test" LORA_RANK = 8 -# @pytest.fixture(autouse=True) -# def v1(run_with_both_engines_lora): -# # Simple autouse wrapper to run both engines for each test -# # This can be promoted up to conftest.py to run for every -# # test in a package -# pass - def make_lora_request(lora_id: int): return LoRARequest(lora_name=f"{lora_id}", diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index a21de070517b1..9d75512a248be 100644 --- a/tests/lora/test_phi.py +++ 
b/tests/lora/test_phi.py @@ -11,14 +11,6 @@ MODEL_PATH = "microsoft/phi-2" PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format( @@ -59,7 +51,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: # Skipping for V1 for now as we are hitting, # "Head size 80 is not supported by FlashAttention." error. -@pytest.mark.skip_v1 +@pytest.mark.skip(reason="Head size 80 is not supported by FlashAttention") def test_phi2_lora(phi2_lora_files): # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI, # Otherwise, the lora-test will fail due to CUDA OOM. diff --git a/tests/lora/test_transfomers_model.py b/tests/lora/test_transformers_model.py similarity index 100% rename from tests/lora/test_transfomers_model.py rename to tests/lora/test_transformers_model.py diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 6f13e663a78bb..9999c1be54ea5 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -16,6 +16,8 @@ from vllm.lora.request import LoRARequest from vllm.v1.worker.gpu_worker import Worker as V1Worker from vllm.worker.worker import Worker +NUM_LORAS = 16 + @patch.dict(os.environ, {"RANK": "0"}) def test_worker_apply_lora(sql_lora_files): @@ -58,12 +60,12 @@ def test_worker_apply_lora(sql_lora_files): device_config=DeviceConfig("cuda"), cache_config=CacheConfig( block_size=16, - gpu_memory_utilization=1.0, swap_space=0, cache_dtype="auto", ), - lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32, - max_loras=32), + lora_config=LoRAConfig(max_lora_rank=8, + max_cpu_loras=NUM_LORAS, + max_loras=NUM_LORAS), ) worker = 
worker_cls( vllm_config=vllm_config, @@ -78,9 +80,9 @@ def test_worker_apply_lora(sql_lora_files): set_active_loras(worker, []) assert worker.list_loras() == set() - n_loras = 32 lora_requests = [ - LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras) + LoRARequest(str(i + 1), i + 1, sql_lora_files) + for i in range(NUM_LORAS) ] set_active_loras(worker, lora_requests) @@ -89,12 +91,12 @@ def test_worker_apply_lora(sql_lora_files): for lora_request in lora_requests } - for i in range(32): + for i in range(NUM_LORAS): random.seed(i) iter_lora_requests = random.choices(lora_requests, - k=random.randint(1, n_loras)) + k=random.randint(1, NUM_LORAS)) random.shuffle(iter_lora_requests) - iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)] + iter_lora_requests = iter_lora_requests[:-random.randint(0, NUM_LORAS)] set_active_loras(worker, lora_requests) assert worker.list_loras().issuperset( {lora_request.lora_int_id diff --git a/tests/models/language/generation/test_bart.py b/tests/models/language/generation/test_bart.py index 7d8acab5e8343..b4c771840196c 100644 --- a/tests/models/language/generation/test_bart.py +++ b/tests/models/language/generation/test_bart.py @@ -118,7 +118,7 @@ def run_test( # default to enforce_eager=True if enforce_eager # is left unspecified. However, the # VllmRunner test fixture (which wraps around the LLM class) defaults to - # enforce_eager=False (a behavior which a number of already-exisitng + # enforce_eager=False (a behavior which a number of already-existing # decoder-only unit tests expect), so when testing an encoder/decoder # model we must explicitly specify enforce_eager=True in the VllmRunner # constructor. 
diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 0a047951db443..21d55c418c363 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -1,14 +1,18 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import tempfile from collections.abc import Sequence +from typing import Optional import mteb import numpy as np import pytest +import requests -from tests.models.utils import EmbedModelInfo +from tests.models.utils import EmbedModelInfo, RerankModelInfo -# Most models on the STS12 task (See #17175): +# Most embedding models on the STS12 task (See #17175): # - Model implementation and minor changes in tensor dtype # results in differences less than 1e-4 # - Different model results in differences more than 1e-3 @@ -16,6 +20,11 @@ from tests.models.utils import EmbedModelInfo MTEB_EMBED_TASKS = ["STS12"] MTEB_EMBED_TOL = 1e-4 +# See #19344 +MTEB_RERANK_TASKS = ["NFCorpus"] +MTEB_RERANK_LANGS = ["en"] +MTEB_RERANK_TOL = 1e-3 + class VllmMtebEncoder(mteb.Encoder): @@ -39,6 +48,27 @@ class VllmMtebEncoder(mteb.Encoder): embeds = embeds[np.argsort(r)] return embeds + def predict( + self, + sentences: list[tuple[str, str, + Optional[str]]], # query, corpus, prompt + *args, + **kwargs, + ) -> np.ndarray: + r = self.rng.permutation(len(sentences)) + sentences = [sentences[i] for i in r] + + queries = [s[0] for s in sentences] + corpus = [s[1] for s in sentences] + + outputs = self.model.score(queries, + corpus, + truncate_prompt_tokens=-1, + use_tqdm=False) + scores = np.array(outputs) + scores = scores[np.argsort(r)] + return scores + class OpenAIClientMtebEncoder(mteb.Encoder): @@ -62,21 +92,72 @@ class OpenAIClientMtebEncoder(mteb.Encoder): return embeds +class ScoreClientMtebEncoder(mteb.Encoder): + + def __init__(self, model_name: str, url): + super().__init__() + self.model_name = model_name + self.url 
= url + self.rng = np.random.default_rng(seed=42) + + def predict( + self, + sentences: list[tuple[str, str, + Optional[str]]], # query, corpus, prompt + *args, + **kwargs, + ) -> np.ndarray: + r = self.rng.permutation(len(sentences)) + sentences = [sentences[i] for i in r] + + outputs = [] + for query, corpus, prompt in sentences: + outputs.append(self.get_score(query, corpus)) + + scores = np.array(outputs) + scores = scores[np.argsort(r)] + return scores + + def get_score(self, query, corpus): + response = requests.post(self.url, + json={ + "model": self.model_name, + "text_1": query, + "text_2": corpus, + "truncate_prompt_tokens": -1, + }).json() + return response['data'][0]["score"] + + +class RerankClientMtebEncoder(ScoreClientMtebEncoder): + + def get_score(self, query, corpus): + response = requests.post(self.url, + json={ + "model": self.model_name, + "query": query, + "documents": [corpus], + "truncate_prompt_tokens": -1, + }).json() + return response['results'][0]["relevance_score"] + + def run_mteb_embed_task(encoder, tasks): tasks = mteb.get_tasks(tasks=tasks) evaluation = mteb.MTEB(tasks=tasks) - results = evaluation.run(encoder, verbosity=0, output_folder=None) + results = evaluation.run( + encoder, + verbosity=0, + output_folder=None, + encode_kwargs={ + "show_progress_bar": False, + }, + ) main_score = results[0].scores["test"][0]["main_score"] return main_score -def run_mteb_embed_task_st(model_name, tasks): - from sentence_transformers import SentenceTransformer - model = SentenceTransformer(model_name) - return run_mteb_embed_task(model, tasks) - - def mteb_test_embed_models(hf_runner, vllm_runner, model_info: EmbedModelInfo, @@ -118,3 +199,96 @@ def mteb_test_embed_models(hf_runner, print("Difference:", st_main_score - vllm_main_score) assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL) + + +def run_mteb_rerank(cross_encoder, tasks, languages): + with tempfile.TemporaryDirectory() as results_folder: + bm25s = 
mteb.get_model("bm25s") + tasks = mteb.get_tasks(tasks=tasks, languages=languages) + + subset = "default" + eval_splits = ["test"] + + evaluation = mteb.MTEB(tasks=tasks) + evaluation.run( + bm25s, + verbosity=0, + eval_splits=eval_splits, + save_predictions=True, + output_folder=f"{results_folder}/stage1", + encode_kwargs={"show_progress_bar": False}, + ) + + results = evaluation.run( + cross_encoder, + verbosity=0, + eval_splits=eval_splits, + top_k=10, + save_predictions=True, + output_folder=f"{results_folder}/stage2", + previous_results= + f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json", + encode_kwargs={"show_progress_bar": False}, + ) + main_score = results[0].scores["test"][0]["main_score"] + return main_score + + +def mteb_test_rerank_models(hf_runner, + vllm_runner, + model_info: RerankModelInfo, + vllm_extra_kwargs=None, + hf_model_callback=None): + if not model_info.enable_test: + # A model family has many models with the same architecture, + # and we don't need to test each one. + pytest.skip("Skipping test.") + + vllm_extra_kwargs = vllm_extra_kwargs or {} + vllm_extra_kwargs["dtype"] = model_info.dtype + + with vllm_runner(model_info.name, + task="score", + max_model_len=None, + **vllm_extra_kwargs) as vllm_model: + + if model_info.architecture: + assert (model_info.architecture + in vllm_model.model.llm_engine.model_config.architectures) + + vllm_main_score = run_mteb_rerank(VllmMtebEncoder(vllm_model), + tasks=MTEB_RERANK_TASKS, + languages=MTEB_RERANK_LANGS) + vllm_dtype = vllm_model.model.llm_engine.model_config.dtype + + with hf_runner(model_info.name, is_cross_encoder=True, + dtype="float32") as hf_model: + + original_predict = hf_model.predict + + def _predict( + sentences: list[tuple[str, str, + Optional[str]]], # query, corpus, prompt + *args, + **kwargs, + ): + # vllm and st both remove the prompt, fair comparison. 
+ prompts = [(s[0], s[1]) for s in sentences] + return original_predict(prompts, *args, **kwargs, batch_size=8) + + hf_model.predict = _predict + hf_model.original_predict = original_predict + + if hf_model_callback is not None: + hf_model_callback(hf_model) + + st_main_score = run_mteb_rerank(hf_model, + tasks=MTEB_RERANK_TASKS, + languages=MTEB_RERANK_LANGS) + st_dtype = next(hf_model.model.model.parameters()).dtype + + print("VLLM:", vllm_dtype, vllm_main_score) + print("SentenceTransformers:", st_dtype, st_main_score) + print("Difference:", st_main_score - vllm_main_score) + + assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL) diff --git a/tests/models/language/pooling/test_baai.py b/tests/models/language/pooling/test_baai.py index 1af3c05d3d907..3990e8ea92c8a 100644 --- a/tests/models/language/pooling/test_baai.py +++ b/tests/models/language/pooling/test_baai.py @@ -2,8 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from .embed_utils import EmbedModelInfo, correctness_test_embed_models -from .mteb_utils import mteb_test_embed_models +from ...utils import EmbedModelInfo, RerankModelInfo +from .embed_utils import correctness_test_embed_models +from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models MODELS = [ ########## BertModel @@ -57,6 +58,20 @@ MODELS = [ enable_test=True), ] +RERANK_MODELS = [ + ########## XLMRobertaForSequenceClassification + RerankModelInfo("BAAI/bge-reranker-base", + architecture="XLMRobertaForSequenceClassification", + enable_test=True), + RerankModelInfo("BAAI/bge-reranker-large", + architecture="XLMRobertaForSequenceClassification", + enable_test=False), + RerankModelInfo("BAAI/bge-reranker-v2-m3", + architecture="XLMRobertaForSequenceClassification", + dtype="float32", + enable_test=False) +] + @pytest.mark.parametrize("model_info", MODELS) def test_embed_models_mteb(hf_runner, vllm_runner, @@ -70,3 +85,9 @@ def 
test_embed_models_correctness(hf_runner, vllm_runner, example_prompts) -> None: correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts) + + +@pytest.mark.parametrize("model_info", RERANK_MODELS) +def test_rerank_models_mteb(hf_runner, vllm_runner, + model_info: RerankModelInfo) -> None: + mteb_test_rerank_models(hf_runner, vllm_runner, model_info) diff --git a/tests/models/language/pooling/test_cross_encoder.py b/tests/models/language/pooling/test_cross_encoder.py new file mode 100644 index 0000000000000..9a33063d7b469 --- /dev/null +++ b/tests/models/language/pooling/test_cross_encoder.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + +from .mteb_utils import RerankModelInfo, mteb_test_rerank_models + +RERANK_MODELS = [ + RerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2", + architecture="BertForSequenceClassification"), + RerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", + architecture="Qwen3ForSequenceClassification") +] + + +@pytest.mark.parametrize("model_info", RERANK_MODELS) +def test_rerank_models_mteb(hf_runner, vllm_runner, + model_info: RerankModelInfo) -> None: + mteb_test_rerank_models(hf_runner, vllm_runner, model_info) diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 05bd479f42b95..6a3a0f150b6d7 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -45,6 +45,15 @@ MODELS = [ EmbedModelInfo("Alibaba-NLP/gte-modernbert-base", architecture="ModernBertModel", enable_test=True), + ########## Qwen3ForCausalLM + EmbedModelInfo("Qwen/Qwen3-Embedding-0.6B", + architecture="Qwen3ForCausalLM", + dtype="float32", + enable_test=True), + EmbedModelInfo("Qwen/Qwen3-Embedding-4B", + architecture="Qwen3ForCausalLM", + dtype="float32", + enable_test=False), ] diff --git a/tests/models/language/pooling/test_jina.py 
b/tests/models/language/pooling/test_jina.py index 33255021ad6ac..0c44683e7486d 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -6,28 +6,10 @@ import pytest from vllm import PoolingParams -from .embed_utils import (EmbedModelInfo, check_embeddings_close, +from ...utils import EmbedModelInfo, RerankModelInfo +from .embed_utils import (check_embeddings_close, correctness_test_embed_models, matryoshka_fy) -from .mteb_utils import mteb_test_embed_models - -SCORING_MODELS = [ - "jinaai/jina-reranker-v2-base-multilingual", # Roberta -] - -TEXTS_1 = ["Organic skincare products for sensitive skin"] - -TEXTS_2 = [ - "Organic skincare for sensitive skin with aloe vera and chamomile.", - "New makeup trends focus on bold colors and innovative techniques", - "Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille", - "Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken", # noqa: E501 - "Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla", # noqa: E501 - "Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras", # noqa: E501 - "针对敏感肌专门设计的天然有机护肤产品", - "新的化妆趋势注重鲜艳的颜色和创新的技巧", - "敏感肌のために特別に設計された天然有機スキンケア製品", - "新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています", -] +from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models EMBEDDING_MODELS = [ EmbedModelInfo("jinaai/jina-embeddings-v3", @@ -35,47 +17,13 @@ EMBEDDING_MODELS = [ is_matryoshka=True) ] - -@pytest.fixture(scope="module", params=SCORING_MODELS) -def model_name(request): - yield request.param - - -@pytest.mark.parametrize("dtype", ["half"]) -def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str): - - text_pair = [TEXTS_1[0], TEXTS_2[0]] - - with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: - hf_outputs = hf_model.predict([text_pair]).tolist() - - with vllm_runner(model_name, task="score", dtype=dtype, - max_model_len=None) as vllm_model: - 
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) - - assert len(vllm_outputs) == 1 - assert len(hf_outputs) == 1 - - assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) - - -@pytest.mark.parametrize("dtype", ["half"]) -def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str): - - text_pairs = [[TEXTS_1[0], text] for text in TEXTS_2] - - with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: - hf_outputs = hf_model.predict(text_pairs).tolist() - - with vllm_runner(model_name, task="score", dtype=dtype, - max_model_len=None) as vllm_model: - vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) - - assert len(vllm_outputs) == 10 - assert len(hf_outputs) == 10 - - assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) - assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) +RERANK_MODELS = [ + RerankModelInfo( + "jinaai/jina-reranker-v2-base-multilingual", + architecture="XLMRobertaForSequenceClassification", + dtype="float32", + ) +] @pytest.mark.parametrize("model_info", EMBEDDING_MODELS) @@ -106,6 +54,12 @@ def test_embed_models_correctness(hf_runner, vllm_runner, hf_model_callback=hf_model_callback) +@pytest.mark.parametrize("model_info", RERANK_MODELS) +def test_rerank_models_mteb(hf_runner, vllm_runner, + model_info: RerankModelInfo) -> None: + mteb_test_rerank_models(hf_runner, vllm_runner, model_info) + + @pytest.mark.parametrize("model_info", EMBEDDING_MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dimensions", [16, 32]) diff --git a/tests/models/language/pooling/test_qwen3_reranker.py b/tests/models/language/pooling/test_qwen3_reranker.py new file mode 100644 index 0000000000000..b1e8fd6294ca1 --- /dev/null +++ b/tests/models/language/pooling/test_qwen3_reranker.py @@ -0,0 +1,91 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any + +import pytest +import torch + +from 
tests.conftest import HfRunner + +from .mteb_utils import RerankModelInfo, mteb_test_rerank_models + +RERANK_MODELS = [ + RerankModelInfo("Qwen/Qwen3-Reranker-0.6B", + architecture="Qwen3ForSequenceClassification", + dtype="float32", + enable_test=True), + RerankModelInfo("Qwen/Qwen3-Reranker-4B", + architecture="Qwen3ForSequenceClassification", + dtype="float32", + enable_test=False) +] + + +class Qwen3RerankerHfRunner(HfRunner): + + def __init__(self, + model_name: str, + dtype: str = "auto", + *args: Any, + **kwargs: Any) -> None: + from transformers import AutoModelForCausalLM, AutoTokenizer + super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM) + + self.tokenizer = AutoTokenizer.from_pretrained(model_name, + padding_side='left') + self.token_false_id = self.tokenizer.convert_tokens_to_ids("no") + self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes") + + def predict(self, prompts: list[list[str]], *args, + **kwargs) -> torch.Tensor: + + def process_inputs(pairs): + inputs = self.tokenizer(pairs, + padding=False, + truncation='longest_first', + return_attention_mask=False) + for i, ele in enumerate(inputs['input_ids']): + inputs['input_ids'][i] = ele + inputs = self.tokenizer.pad(inputs, + padding=True, + return_tensors="pt") + for key in inputs: + inputs[key] = inputs[key].to(self.model.device) + return inputs + + @torch.no_grad() + def compute_logits(inputs): + batch_scores = self.model(**inputs).logits[:, -1, :] + true_vector = batch_scores[:, self.token_true_id] + false_vector = batch_scores[:, self.token_false_id] + batch_scores = torch.stack([false_vector, true_vector], dim=1) + batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1) + scores = batch_scores[:, 1].exp() + return scores + + scores = [] + for prompt in prompts: + inputs = process_inputs([prompt]) + score = compute_logits(inputs) + scores.append(score[0].item()) + return torch.Tensor(scores) + + +@pytest.mark.parametrize("model_info", RERANK_MODELS) +def 
test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: + + assert model_info.architecture == "Qwen3ForSequenceClassification" + + vllm_extra_kwargs: dict[str, Any] = { + "hf_overrides": { + "architectures": ["Qwen3ForSequenceClassification"], + "classifier_from_token": ["no", "yes"], + "is_original_qwen3_reranker": True, + } + } + + if model_info.name == "Qwen/Qwen3-Reranker-4B": + vllm_extra_kwargs["max_num_seqs"] = 1 + + mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info, + vllm_extra_kwargs) diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py index 32f9472c12d5e..a424bd6798fdb 100644 --- a/tests/models/quantization/test_gguf.py +++ b/tests/models/quantization/test_gguf.py @@ -79,7 +79,7 @@ DOLPHIN_CONFIG = GGUFTestConfig( ) MODELS = [ - LLAMA_CONFIG, + # LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458 QWEN2_CONFIG, PHI3_CONFIG, GPT2_CONFIG, diff --git a/tests/models/registry.py b/tests/models/registry.py index e6543c197348c..fb93ba60c2e8d 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -205,6 +205,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "MiniMaxText01ForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01", trust_remote_code=True), + "MiniMaxM1ForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-M1-40k", + trust_remote_code=True), "MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"), "MixtralForCausalLM": _HfExamplesInfo("mistralai/Mixtral-8x7B-Instruct-v0.1", # noqa: E501 {"tiny": "TitanML/tiny-mixtral"}), # noqa: E501 @@ -238,6 +240,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"), "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"), "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"), + "Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"), # noqa: E501 "RWForCausalLM": 
_HfExamplesInfo("tiiuae/falcon-40b"), "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501 v0_only=True), diff --git a/tests/models/utils.py b/tests/models/utils.py index 943b4f5704468..cdf8d02df73c9 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -336,3 +336,10 @@ class EmbedModelInfo(NamedTuple): architecture: str = "" dtype: str = "auto" enable_test: bool = True + + +class RerankModelInfo(NamedTuple): + name: str + architecture: str = "" + dtype: str = "auto" + enable_test: bool = True diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index 9f1b3bbe8e226..0df00c98b72cf 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -8,6 +8,7 @@ from typing import Optional import pytest +from vllm.platforms import current_platform from vllm.utils import STR_BACKEND_ENV_VAR from ..models.utils import check_logprobs_close, check_outputs_equal @@ -71,6 +72,12 @@ def test_multi_step_llm( num_logprobs: corresponds to the `logprobs` argument to the OpenAI completions endpoint; `None` -> 1 logprob returned. 
""" + if current_platform.is_rocm() and \ + (attention_backend == "FLASHINFER" or enable_chunked_prefill): + pytest.skip( + "Multi-Step with FLASHINFER or Chunked-Prefill is not supported" + "on ROCm") + with monkeypatch.context() as m: m.setenv(STR_BACKEND_ENV_VAR, attention_backend) @@ -221,6 +228,9 @@ def test_multi_step_llm_w_prompt_logprobs( @pytest.mark.parametrize("num_prompts", NUM_PROMPTS) @pytest.mark.parametrize("num_logprobs", [None, 5]) @pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"]) +@pytest.mark.skipif( + current_platform.is_rocm(), + reason="Multi-Step + Chunked-Prefill not supported on ROCm") def test_multi_step_llm_chunked_prefill_prefix_cache( vllm_runner, example_prompts, diff --git a/tests/pplx_utils.py b/tests/pplx_utils.py deleted file mode 100644 index 2d5d5be80c3f7..0000000000000 --- a/tests/pplx_utils.py +++ /dev/null @@ -1,123 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -import os -import traceback -from typing import Callable - -import torch -from torch.multiprocessing import ( - spawn) # pyright: ignore[reportPrivateImportUsage] -from typing_extensions import Concatenate, ParamSpec - -P = ParamSpec("P") - - -@dataclasses.dataclass -class ProcessGroupInfo: - world_size: int - world_local_size: int - rank: int - node_rank: int - local_rank: int - device: torch.device - - -def _worker_parallel_launch( - local_rank: int, - world_size: int, - world_local_size: int, - node_rank: int, - init_method: str, - worker: Callable[Concatenate[ProcessGroupInfo, P], None], - *args: P.args, - **kwargs: P.kwargs, -) -> None: - rank = node_rank * world_local_size + local_rank - torch.cuda.set_device(local_rank) - device = torch.device("cuda", local_rank) - torch.distributed.init_process_group( - backend="cpu:gloo,cuda:nccl", - init_method=init_method, - rank=rank, - world_size=world_size, - device_id=device, - ) - barrier = torch.tensor([rank], 
device=device) - torch.distributed.all_reduce(barrier) - - try: - worker( - ProcessGroupInfo( - world_size=world_size, - world_local_size=world_local_size, - rank=rank, - node_rank=node_rank, - local_rank=local_rank, - device=device, - ), - *args, - **kwargs, - ) - except Exception as ex: - print(ex) - traceback.print_exc() - raise - finally: - torch.distributed.destroy_process_group() - - -def parallel_launch( - world_size: int, - worker: Callable[Concatenate[ProcessGroupInfo, P], None], - *args: P.args, - **kwargs: P.kwargs, -) -> None: - assert not kwargs - spawn( - _worker_parallel_launch, - args=( - world_size, - world_size, - 0, - "tcp://localhost:29500", - worker, - ) + args, - nprocs=world_size, - join=True, - ) - - -def parallel_launch_from_env( - worker: Callable[Concatenate[ProcessGroupInfo, P], None], - *args: P.args, - **kwargs: P.kwargs, -) -> None: - """ - Launches a worker function in parallel across all processes in the current - environment. The environment must have the following variables set: - - WORLD_SIZE: The total number of processes. - - WORLD_LOCAL_SIZE: The number of processes on the current node. - - NODE_RANK: The rank of the current - - MASTER_ADDR: The address of the master process. - - MASTER_PORT: The port of the master process. 
- """ - assert not kwargs - world_size = int(os.environ["WORLD_SIZE"]) - world_local_size = int(os.environ["WORLD_LOCAL_SIZE"]) - node_rank = int(os.environ["NODE_RANK"]) - assert "MASTER_ADDR" in os.environ - assert "MASTER_PORT" in os.environ - spawn( - _worker_parallel_launch, - args=( - world_size, - world_local_size, - node_rank, - "env://", - worker, - ) + args, - nprocs=world_local_size, - join=True, - ) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 2c07fe29fb0e6..516bf4513816a 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -14,9 +14,10 @@ from compressed_tensors.quantization import QuantizationType from tests.models.utils import check_logprobs_close from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 CompressedTensors24, CompressedTensorsLinearMethod, - CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24, - CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, - CompressedTensorsW8A16Fp8, CompressedTensorsWNA16) + CompressedTensorsW4A4Fp4, CompressedTensorsW4A16Fp4, + CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, + CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, + CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( sparse_cutlass_supported) from vllm.platforms import current_platform @@ -651,9 +652,13 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4): assert output -def test_compressed_tensors_nvfp4a16(vllm_runner): - # run weight only example - model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16" +@pytest.mark.parametrize( + "args", + [("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", + CompressedTensorsW4A16Fp4), + ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4)]) +def test_compressed_tensors_nvfp4(vllm_runner, args): + model, scheme = args with 
vllm_runner(model, enforce_eager=True) as llm: def check_model(model): @@ -662,7 +667,13 @@ def test_compressed_tensors_nvfp4a16(vllm_runner): qkv_proj = layer.self_attn.qkv_proj assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Fp4) + if isinstance(qkv_proj.scheme, scheme) or isinstance( + qkv_proj.scheme, CompressedTensorsW4A16Fp4 + ) and not CompressedTensorsW4A4Fp4.cutlass_fp4_supported(): + assert True + else: + raise AssertionError("FP4 Scheme Mismatch") + assert qkv_proj.scheme.group_size == 16 llm.apply_model(check_model) diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py index 54ec595854507..eef3568efea12 100644 --- a/tests/quantization/test_torchao.py +++ b/tests/quantization/test_torchao.py @@ -60,5 +60,20 @@ def test_opt_125m_int4wo_model_per_module_quant(vllm_runner): print(output) +@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available") +def test_qwenvl_int8wo_model_loading_with_params(vllm_runner): + torch._dynamo.reset() + model_name = "mobicham/Qwen2.5-VL-3B-Instruct_int8wo_ao" + with vllm_runner(model_name=model_name, + quantization="torchao", + dtype="bfloat16", + pt_load_map_location="cuda:0") as llm: + output = llm.generate_greedy(["The capital of France is"], + max_tokens=32) + + assert output + print(output) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py index 418471b8e5238..119841470bfb5 100644 --- a/tests/samplers/test_typical_acceptance_sampler.py +++ b/tests/samplers/test_typical_acceptance_sampler.py @@ -248,7 +248,7 @@ def test_temperature_zero_target_distribution(seed: int, device: str): size=(batch_size, 1), dtype=torch.int64) # The target probaility distribution is a temperature zero distribution - # with zero entroy. 
Since our draft token ids don't match the probability + # with zero entropy. Since our draft token ids don't match the probability # 1.0 tokens in the target distribution we will reject all of them and # fallback to the greedy sampling for selecting 1 token for each sequence. # Verify the same. diff --git a/tests/spec_decode/e2e/test_eagle_correctness.py b/tests/spec_decode/e2e/test_eagle_correctness.py index 98939461422e1..fd838285aba7c 100644 --- a/tests/spec_decode/e2e/test_eagle_correctness.py +++ b/tests/spec_decode/e2e/test_eagle_correctness.py @@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed: * Test greedy equality under various number of speculative tokens. With those tests, we can say at least, EAGLE would not break the -correctess for the target model outputs. +correctness for the target model outputs. """ import pytest diff --git a/tests/spec_decode/e2e/test_integration.py b/tests/spec_decode/e2e/test_integration.py index 7608618502966..f15a9224c0030 100644 --- a/tests/spec_decode/e2e/test_integration.py +++ b/tests/spec_decode/e2e/test_integration.py @@ -14,10 +14,13 @@ MAIN_MODEL = "JackFram/llama-68m" @pytest.mark.parametrize( "common_llm_kwargs", [{ + "model_name": "JackFram/llama-68m", # Verify equality when cuda graphs allowed. "enforce_eager": False, - "model_name": "JackFram/llama-68m", + + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize( "per_test_common_llm_kwargs", @@ -59,6 +62,9 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs, # Skip cuda graph recording for fast test. "enforce_eager": True, + + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", []) @pytest.mark.parametrize( @@ -117,6 +123,9 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs, # Skip cuda graph recording for fast test. 
"enforce_eager": True, + + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index 1629c69f8ee9d..4de7ee05605ad 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -17,7 +17,10 @@ from .conftest import run_equality_correctness_test "model_name": "JackFram/llama-160m", # Skip cuda graph recording for fast test. - "enforce_eager": True + "enforce_eager": True, + + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @@ -75,6 +78,9 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs, # Skip cuda graph recording for fast test. "enforce_eager": True, + + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @@ -128,6 +134,9 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs, # Skip cuda graph recording for fast test. "enforce_eager": True, + + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @@ -182,6 +191,9 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs, # Skip cuda graph recording for fast test. "enforce_eager": True, + + # The original model is float32, keep it for numerical stability. 
+ "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @@ -256,8 +268,12 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs, "common_llm_kwargs", [{ "model_name": "JackFram/llama-160m", + # Skip cuda graph recording for fast test. "enforce_eager": True, + + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py index 064a6e10ae6ef..bc9501bd57370 100644 --- a/tests/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/spec_decode/e2e/test_medusa_correctness.py @@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed: * Test greedy equality under various number of speculative tokens. With those tests, we can say at least, Medusa would not break the -correctess for the target model outputs. +correctness for the target model outputs. """ import pytest diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py index 9f778ca8d179b..0e41d93eaa190 100644 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -494,6 +494,9 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs, # Skip cuda graph recording for fast test. 
"enforce_eager": True, + + # Precision + "dtype": PRECISION, }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) diff --git a/tests/spec_decode/e2e/test_mtp_correctness.py b/tests/spec_decode/e2e/test_mtp_correctness.py index d4d4d519b7a14..d9c7be8ffe71f 100644 --- a/tests/spec_decode/e2e/test_mtp_correctness.py +++ b/tests/spec_decode/e2e/test_mtp_correctness.py @@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed: * Test greedy equality under various number of speculative tokens. With those tests, we can say at least, mtp would not break the -correctess for the target model outputs. +correctness for the target model outputs. """ import pytest diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index 6d385184d264a..ccc8e745ab371 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -57,6 +57,9 @@ from .conftest import (get_output_from_llm_generator, # Skip cuda graph recording for fast test. "enforce_eager": True, + + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize( "per_test_common_llm_kwargs", @@ -139,6 +142,9 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, # Print spec metrics. "disable_log_stats": False, + + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize( "per_test_common_llm_kwargs", @@ -216,6 +222,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( # Print spec metrics. "disable_log_stats": False, + + # The original model is float32, keep it for numerical stability. 
+ "dtype": "float32", }]) @pytest.mark.parametrize( "per_test_common_llm_kwargs", @@ -279,6 +288,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( [{ # Skip cuda graph recording for fast test. "enforce_eager": True, + + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize( "per_test_common_llm_kwargs", @@ -464,6 +476,8 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( # Skip cuda graph recording for fast test. "enforce_eager": True, + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [ { @@ -523,6 +537,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( # Skip cuda graph recording for fast test. "enforce_eager": True, + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize( "per_test_common_llm_kwargs", @@ -589,6 +605,8 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs, # Skip cuda graph recording for fast test. "enforce_eager": True, + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @@ -655,6 +673,8 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs, # Skip cuda graph recording for fast test. "enforce_eager": True, + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @@ -706,6 +726,8 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs, # Skip cuda graph recording for fast test. "enforce_eager": True, + # The original model is float32, keep it for numerical stability. 
+ "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @@ -763,6 +785,8 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, # Skip cuda graph recording for fast test. "enforce_eager": True, + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index c10329a9ba974..58d1a6ca7adda 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -22,8 +22,8 @@ However, we still need to verify below scenario could be passed: * Test greedy equality under preemption * Test greedy equality under various ngram sizes / speculative sizes -With those tests, we can say at least, ngram spec would not break the correctess -for the target model outputs. +With those tests, we can say at least, ngram spec would not break the +correctness for the target model outputs. """ import pytest @@ -40,6 +40,9 @@ from .conftest import run_equality_correctness_test # Print spec metrics. "disable_log_stats": False, + + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [ { @@ -97,6 +100,9 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, # Print spec metrics. "disable_log_stats": False, + + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [ { @@ -160,6 +166,9 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, # Skip cuda graph recording for fast test. 
"enforce_eager": True, + + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [ { @@ -221,6 +230,9 @@ def test_ngram_e2e_greedy_correctness_with_preemption( # Skip cuda graph recording for fast test. "enforce_eager": True, + + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @@ -281,6 +293,9 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs, # Skip cuda graph recording for fast test. "enforce_eager": True, + + # The original model is float32, keep it for numerical stability. + "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @@ -337,6 +352,9 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs, # Skip cuda graph recording for fast test. "enforce_eager": True, + + # The original model is float32, keep it for numerical stability. 
+ "dtype": "float32", }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) diff --git a/tests/test_config.py b/tests/test_config.py index ce383e1b420af..5d5c4453d30d2 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -13,32 +13,32 @@ from vllm.model_executor.layers.pooler import PoolingType from vllm.platforms import current_platform -class TestConfig1: +class _TestConfig1: pass @dataclass -class TestConfig2: +class _TestConfig2: a: int """docstring""" @dataclass -class TestConfig3: +class _TestConfig3: a: int = 1 @dataclass -class TestConfig4: +class _TestConfig4: a: Union[Literal[1], Literal[2]] = 1 """docstring""" @pytest.mark.parametrize(("test_config", "expected_error"), [ - (TestConfig1, "must be a dataclass"), - (TestConfig2, "must have a default"), - (TestConfig3, "must have a docstring"), - (TestConfig4, "must use a single Literal"), + (_TestConfig1, "must be a dataclass"), + (_TestConfig2, "must have a default"), + (_TestConfig3, "must have a docstring"), + (_TestConfig4, "must use a single Literal"), ]) def test_config(test_config, expected_error): with pytest.raises(Exception, match=expected_error): @@ -57,23 +57,23 @@ def test_compile_config_repr_succeeds(): assert 'inductor_passes' in val +@dataclass +class _TestConfigFields: + a: int + b: dict = field(default_factory=dict) + c: str = "default" + + def test_get_field(): - - @dataclass - class TestConfig: - a: int - b: dict = field(default_factory=dict) - c: str = "default" - with pytest.raises(ValueError): - get_field(TestConfig, "a") + get_field(_TestConfigFields, "a") - b = get_field(TestConfig, "b") + b = get_field(_TestConfigFields, "b") assert isinstance(b, Field) assert b.default is MISSING assert b.default_factory is dict - c = get_field(TestConfig, "c") + c = get_field(_TestConfigFields, "c") assert isinstance(c, Field) assert c.default == "default" assert c.default_factory is MISSING @@ -438,3 +438,31 @@ def 
test_load_config_pt_load_map_location(pt_load_map_location): config = VllmConfig(load_config=load_config) assert config.load_config.pt_load_map_location == pt_load_map_location + + +@pytest.mark.parametrize( + ("model_id", "max_model_len", "expected_max_len", "should_raise"), [ + ("BAAI/bge-reranker-base", None, 512, False), + ("BAAI/bge-reranker-base", 256, 256, False), + ("BAAI/bge-reranker-base", 513, 512, True), + ]) +def test_get_and_verify_max_len(model_id, max_model_len, expected_max_len, + should_raise): + """Test get_and_verify_max_len with different configurations.""" + model_config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + revision=None, + ) + + if should_raise: + with pytest.raises(ValueError): + model_config.get_and_verify_max_len(max_model_len) + else: + actual_max_len = model_config.get_and_verify_max_len(max_model_len) + assert actual_max_len == expected_max_len diff --git a/tests/test_utils.py b/tests/test_utils.py index a2fd845ea54b7..913188455d8e6 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -272,6 +272,15 @@ def test_dict_args(parser): "val5", "--hf_overrides.key-7.key_8", "val6", + # Test data type detection + "--hf_overrides.key9", + "100", + "--hf_overrides.key10", + "100.0", + "--hf_overrides.key11", + "true", + "--hf_overrides.key12.key13", + "null", ] parsed_args = parser.parse_args(args) assert parsed_args.model_name == "something.something" @@ -286,6 +295,12 @@ def test_dict_args(parser): "key-7": { "key_8": "val6", }, + "key9": 100, + "key10": 100.0, + "key11": True, + "key12": { + "key13": None, + }, } diff --git a/tests/utils.py b/tests/utils.py index ade28a481261c..a37872830dade 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -667,42 +667,54 @@ def get_physical_device_indices(devices): @_nvml() -def wait_for_gpu_memory_to_clear(devices: list[int], - threshold_bytes: int, +def wait_for_gpu_memory_to_clear(*, + 
devices: list[int], + threshold_bytes: Optional[int] = None, + threshold_ratio: Optional[float] = None, timeout_s: float = 120) -> None: + assert threshold_bytes is not None or threshold_ratio is not None # Use nvml instead of pytorch to reduce measurement error from torch cuda # context. devices = get_physical_device_indices(devices) start_time = time.time() while True: output: dict[int, str] = {} - output_raw: dict[int, float] = {} + output_raw: dict[int, tuple[float, float]] = {} for device in devices: if current_platform.is_rocm(): dev_handle = amdsmi_get_processor_handles()[device] mem_info = amdsmi_get_gpu_vram_usage(dev_handle) gb_used = mem_info["vram_used"] / 2**10 + gb_total = mem_info["vram_total"] / 2**10 else: dev_handle = nvmlDeviceGetHandleByIndex(device) mem_info = nvmlDeviceGetMemoryInfo(dev_handle) gb_used = mem_info.used / 2**30 - output_raw[device] = gb_used - output[device] = f'{gb_used:.02f}' + gb_total = mem_info.total / 2**30 + output_raw[device] = (gb_used, gb_total) + output[device] = f'{gb_used:.02f}/{gb_total:.02f}' - print('gpu memory used (GB): ', end='') + print('gpu memory used/total (GiB): ', end='') for k, v in output.items(): print(f'{k}={v}; ', end='') print('') + if threshold_bytes is not None: + is_free = lambda used, total: used <= threshold_bytes / 2**30 + threshold = f"{threshold_bytes/2**30} GiB" + else: + is_free = lambda used, total: used / total <= threshold_ratio + threshold = f"{threshold_ratio:.2f}" + dur_s = time.time() - start_time - if all(v <= (threshold_bytes / 2**30) for v in output_raw.values()): + if all(is_free(used, total) for used, total in output_raw.values()): print(f'Done waiting for free GPU memory on devices {devices=} ' - f'({threshold_bytes/2**30=}) {dur_s=:.02f}') + f'({threshold=}) {dur_s=:.02f}') break if dur_s >= timeout_s: raise ValueError(f'Memory of devices {devices=} not free after ' - f'{dur_s=:.02f} ({threshold_bytes/2**30=})') + f'{dur_s=:.02f} ({threshold=})') time.sleep(5) diff --git 
a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index ab7aa02823ab9..347f98c772ffe 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -900,3 +900,19 @@ def test_get_kv_cache_config(): with pytest.raises(NotImplementedError): get_kv_cache_config(vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 2 * 32) + + # Test num_gpu_blocks_override + vllm_config.cache_config.num_gpu_blocks_override = 16 + kv_cache_config_override_blocks = get_kv_cache_config( + vllm_config, kv_cache_specs_full, mem_per_block_per_layer * 2 * 32) + assert kv_cache_config_override_blocks == KVCacheConfig( + num_blocks=16, + kv_cache_tensors=[ + KVCacheTensor(size=mem_per_block_per_layer * 16, + shared_by=["layer_1"]), + KVCacheTensor(size=mem_per_block_per_layer * 16, + shared_by=["layer_2"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec()) + ]) \ No newline at end of file diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index bf4cb539ebef1..394336624aca8 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -117,7 +117,7 @@ def test_prefill(hash_algo): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == [[1, 2, 3, 4]] + assert blocks.get_block_ids() == ([1, 2, 3, 4], ) # Check full block metadata parent_block_hash = None @@ -141,13 +141,13 @@ def test_prefill(hash_algo): req1 = make_request("1", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert len(manager.req_to_block_hashes[req1.request_id]) == 3 - assert computed_blocks.get_block_ids() == [[1, 2, 3]] + assert computed_blocks.get_block_ids() == ([1, 2, 3], ) assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req1, num_new_tokens, 
len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == [[5]] + assert blocks.get_block_ids() == ([5], ) for block in computed_blocks.blocks[0]: assert block.ref_cnt == 2 @@ -175,13 +175,13 @@ def test_prefill(hash_algo): req2 = make_request("2", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert len(manager.req_to_block_hashes[req2.request_id]) == 3 - assert computed_blocks.get_block_ids() == [[1, 2, 3]] + assert computed_blocks.get_block_ids() == ([1, 2, 3], ) assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req2, num_new_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == [[6]] + assert blocks.get_block_ids() == ([6], ) # Although we only have 6 free blocks, we have 8 blocks in # the free block queue due to lazy removal. @@ -205,7 +205,7 @@ def test_prefill(hash_algo): len(computed_blocks.blocks[0]) * 16, computed_blocks) # This block ID order also checks the eviction order. 
- assert blocks.get_block_ids() == [[7, 8, 9, 10, 4, 5, 6, 3, 2, 1]] + assert blocks.get_block_ids() == ([7, 8, 9, 10, 4, 5, 6, 3, 2, 1], ) assert manager.block_pool.free_block_queue.num_free_blocks == 0 assert manager.block_pool.free_block_queue.free_list_head is None assert manager.block_pool.free_block_queue.free_list_tail is None @@ -236,8 +236,8 @@ def test_prefill_hybrid_model(): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == [[1, 2, 3, 4], [5, 6, 7, 8], - [9, 10, 11, 12]] + assert blocks.get_block_ids() == ([1, 2, 3, 4], [5, 6, 7, + 8], [9, 10, 11, 12]) # Check full block metadata parent_block_hash = None @@ -263,14 +263,14 @@ def test_prefill_hybrid_model(): req1 = make_request("1", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert len(manager.req_to_block_hashes[req1.request_id]) == 3 - assert computed_blocks.get_block_ids() == [[1, 2, 3], [0, 6, 7], - [0, 10, 11]] + assert computed_blocks.get_block_ids() == ([1, 2, 3], [0, 6, + 7], [0, 10, 11]) assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req1, num_new_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == [[13], [14], [15]] + assert blocks.get_block_ids() == ([13], [14], [15]) for block_per_group in computed_blocks.blocks: for block in block_per_group: if block != manager.block_pool.null_block: @@ -374,7 +374,7 @@ def test_prefill_plp(): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == [[1, 2, 3, 4]] + assert blocks.get_block_ids() == ([1, 2, 3, 4], ) req0_block_hashes = [b.block_hash for b in blocks.blocks[0]] # Check full block metadata @@ -400,13 +400,13 @@ def test_prefill_plp(): req1 = make_request("1", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = 
manager.get_computed_blocks(req1) assert len(manager.req_to_block_hashes[req1.request_id]) == 3 - assert computed_blocks.get_block_ids() == [[1, 2, 3]] + assert computed_blocks.get_block_ids() == ([1, 2, 3], ) assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req1, num_new_tokens, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == [[5]] + assert blocks.get_block_ids() == ([5], ) for block in computed_blocks.blocks[0]: assert block.ref_cnt == 2 @@ -444,7 +444,7 @@ def test_prefill_plp(): block_ids = blocks.get_block_ids() # Duplicate cached blocks have different ids but same hashes vs request #0 assert [b.block_hash for b in blocks.blocks[0]] == req0_block_hashes - assert block_ids != [[1, 2, 3, 4]] + assert block_ids != ([1, 2, 3, 4], ) # Request #2 block hashes are valid since request #0 hashes are. # Check block reference counts. @@ -474,7 +474,7 @@ def test_decode(): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == [[1, 2, 3, 4]] + assert blocks.get_block_ids() == ([1, 2, 3, 4], ) # Append slots without allocating a new block. req0.num_computed_tokens = 55 @@ -546,12 +546,12 @@ def test_evict(): # Touch the first 2 blocks. 
req2 = make_request("2", list(range(2 * 16 + 3))) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) - assert computed_blocks.get_block_ids() == [[1, 2]] + assert computed_blocks.get_block_ids() == ([1, 2], ) assert num_computed_tokens == 2 * 16 blocks = manager.allocate_slots(req2, 3, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == [[10]] + assert blocks.get_block_ids() == ([10], ) assert manager.block_pool.free_block_queue.num_free_blocks == 7 @@ -865,7 +865,7 @@ def test_mm_prefix_caching(): blocks = manager.allocate_slots(req0, 59, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == [[1, 2, 3, 4]] + assert blocks.get_block_ids() == ([1, 2, 3, 4], ) req0.num_computed_tokens = 59 # Append slots without allocating a new block. @@ -926,7 +926,7 @@ def test_cache_key_salting(): blocks = manager.allocate_slots(req0, 59, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == [[1, 2, 3, 4]] + assert blocks.get_block_ids() == ([1, 2, 3, 4], ) req0.num_computed_tokens = 59 # Append slots without allocating a new block. @@ -1042,7 +1042,7 @@ def test_reset_prefix_cache(): all_token_ids = full_block_token_ids + unique_token_ids req0 = make_request("0", all_token_ids) blocks = manager.allocate_slots(req0, 55) - assert blocks.get_block_ids() == [[1, 2, 3, 4]] + assert blocks.get_block_ids() == ([1, 2, 3, 4], ) unique_token_ids = [4] * 7 all_token_ids = full_block_token_ids + unique_token_ids @@ -1053,7 +1053,7 @@ def test_reset_prefix_cache(): blocks = manager.allocate_slots(req1, 7, len(computed_blocks.blocks[0]) * 16, computed_blocks) - assert blocks.get_block_ids() == [[5]] + assert blocks.get_block_ids() == ([5], ) # Failed to reset prefix cache because some blocks are not freed yet. 
assert not manager.reset_prefix_cache() diff --git a/tests/v1/e2e/test_correctness_sliding_window.py b/tests/v1/e2e/test_correctness_sliding_window.py index d8882b1d94324..277ea3c838505 100644 --- a/tests/v1/e2e/test_correctness_sliding_window.py +++ b/tests/v1/e2e/test_correctness_sliding_window.py @@ -30,7 +30,7 @@ model_config = { ]) @pytest.mark.parametrize("batch_size", [5]) @pytest.mark.parametrize("seed", [1]) -def test_sliding_window_retrival(monkeypatch, model, batch_size, seed): +def test_sliding_window_retrieval(monkeypatch, model, batch_size, seed): """ The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then asks for value of one of them (which is outside the sliding window). diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 7dff937c0fd9f..3ae6293972682 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -15,6 +15,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.inputs import PromptType from vllm.platforms import current_platform from vllm.sampling_params import RequestOutputKind +from vllm.utils import set_default_torch_num_threads from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.metrics.loggers import LoggingStatLogger @@ -107,7 +108,8 @@ async def test_load( with monkeypatch.context() as m, ExitStack() as after: m.setenv("VLLM_USE_V1", "1") - engine = AsyncLLM.from_engine_args(engine_args) + with set_default_torch_num_threads(1): + engine = AsyncLLM.from_engine_args(engine_args) after.callback(engine.shutdown) NUM_REQUESTS = 100 @@ -154,7 +156,8 @@ async def test_abort( with monkeypatch.context() as m, ExitStack() as after: m.setenv("VLLM_USE_V1", "1") - engine = AsyncLLM.from_engine_args(engine_args) + with set_default_torch_num_threads(1): + engine = AsyncLLM.from_engine_args(engine_args) after.callback(engine.shutdown) NUM_REQUESTS = 100 @@ -226,7 +229,8 @@ async def test_finished_flag( with monkeypatch.context() as m, ExitStack() 
as after: m.setenv("VLLM_USE_V1", "1") - engine = AsyncLLM.from_engine_args(engine_args) + with set_default_torch_num_threads(1): + engine = AsyncLLM.from_engine_args(engine_args) after.callback(engine.shutdown) sampling_params = SamplingParams( @@ -260,7 +264,8 @@ async def test_mid_stream_cancellation(monkeypatch: pytest.MonkeyPatch, with monkeypatch.context() as m, ExitStack() as after: m.setenv("VLLM_USE_V1", "1") - engine = AsyncLLM.from_engine_args(engine_args) + with set_default_torch_num_threads(1): + engine = AsyncLLM.from_engine_args(engine_args) after.callback(engine.shutdown) NUM_REQUESTS = 100 @@ -322,10 +327,11 @@ async def test_customize_loggers(monkeypatch): with monkeypatch.context() as m, ExitStack() as after: m.setenv("VLLM_USE_V1", "1") - engine = AsyncLLM.from_engine_args( - TEXT_ENGINE_ARGS, - stat_loggers=[MockLoggingStatLogger], - ) + with set_default_torch_num_threads(1): + engine = AsyncLLM.from_engine_args( + TEXT_ENGINE_ARGS, + stat_loggers=[MockLoggingStatLogger], + ) after.callback(engine.shutdown) await engine.do_log_stats() @@ -340,7 +346,8 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m, ExitStack() as after: m.setenv("VLLM_USE_V1", "1") - engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) + with set_default_torch_num_threads(1): + engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) after.callback(engine.shutdown) sampling_params = SamplingParams(max_tokens=100, diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 1cbbf30371afd..bc7894e92814e 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -12,13 +12,14 @@ from transformers import AutoTokenizer from vllm import SamplingParams from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform +from vllm.utils import set_default_torch_num_threads from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core import 
EngineCore from vllm.v1.executor.abstract import Executor, UniProcExecutor from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.outputs import ModelRunnerOutput -from ...utils import create_new_process_for_each_test +from ...utils import create_new_process_for_each_test, multi_gpu_test if not current_platform.is_cuda(): pytest.skip(reason="V1 currently only supported on CUDA.", @@ -56,9 +57,10 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch): vllm_config = engine_args.create_engine_config() executor_class = Executor.get_class(vllm_config) - engine_core = EngineCore(vllm_config=vllm_config, - executor_class=executor_class, - log_stats=True) + with set_default_torch_num_threads(1): + engine_core = EngineCore(vllm_config=vllm_config, + executor_class=executor_class, + log_stats=True) """Test basic request lifecycle.""" # First request. @@ -190,9 +192,10 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch): vllm_config = engine_args.create_engine_config() executor_class = Executor.get_class(vllm_config) - engine_core = EngineCore(vllm_config=vllm_config, - executor_class=executor_class, - log_stats=True) + with set_default_torch_num_threads(1): + engine_core = EngineCore(vllm_config=vllm_config, + executor_class=executor_class, + log_stats=True) """Test basic request lifecycle.""" # First request. request: EngineCoreRequest = make_request() @@ -286,9 +289,10 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): enforce_eager=True, ) vllm_config = engine_args.create_engine_config() - engine_core = EngineCore(vllm_config=vllm_config, - log_stats=False, - executor_class=DummyExecutor) + with set_default_torch_num_threads(1): + engine_core = EngineCore(vllm_config=vllm_config, + log_stats=False, + executor_class=DummyExecutor) assert engine_core.batch_queue is not None # Add two requests in a row. Each request have 12 prompt tokens. 
@@ -374,3 +378,37 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): # Odd steps schedules a new batch. assert output is None step += 1 + + +@multi_gpu_test(num_gpus=2) +def test_engine_core_tp(monkeypatch: pytest.MonkeyPatch): + """ + Test engine can initialize worker in tp properly + """ + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + """Setup the EngineCore.""" + engine_args = EngineArgs( + model=MODEL_NAME, + tensor_parallel_size=2, + # Reduce startup time. + enforce_eager=True, + ) + vllm_config = engine_args.create_engine_config() + executor_class = Executor.get_class(vllm_config) + + with set_default_torch_num_threads(1): + engine_core = EngineCore(vllm_config=vllm_config, + executor_class=executor_class, + log_stats=True) + + def get_worker_cache_config_field(worker, key: str): + return getattr(worker.cache_config, key) + + num_gpu_blocks = engine_core.collective_rpc( + get_worker_cache_config_field, args=("num_gpu_blocks", )) + num_cpu_blocks = engine_core.collective_rpc( + get_worker_cache_config_field, args=("num_cpu_blocks", )) + assert all(x is not None for x in num_gpu_blocks) + assert all(x is not None for x in num_cpu_blocks) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index c2dc3b4731b5a..d4db16fe86fab 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -19,6 +19,7 @@ from vllm.distributed.kv_events import (BlockStored, KVEventBatch, from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform from vllm.usage.usage_lib import UsageContext +from vllm.utils import set_default_torch_num_threads from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core import EngineCore from vllm.v1.engine.core_client import (AsyncMPClient, EngineCoreClient, @@ -138,13 +139,15 @@ def test_engine_core_client(monkeypatch: pytest.MonkeyPatch, vllm_config = 
engine_args.create_engine_config( UsageContext.UNKNOWN_CONTEXT) executor_class = Executor.get_class(vllm_config) - client = EngineCoreClient.make_client( - multiprocess_mode=multiprocessing_mode, - asyncio_mode=False, - vllm_config=vllm_config, - executor_class=executor_class, - log_stats=False, - ) + + with set_default_torch_num_threads(1): + client = EngineCoreClient.make_client( + multiprocess_mode=multiprocessing_mode, + asyncio_mode=False, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=False, + ) MAX_TOKENS = 20 params = SamplingParams(max_tokens=MAX_TOKENS) @@ -223,13 +226,15 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): vllm_config = engine_args.create_engine_config( usage_context=UsageContext.UNKNOWN_CONTEXT) executor_class = Executor.get_class(vllm_config) - client = EngineCoreClient.make_client( - multiprocess_mode=True, - asyncio_mode=True, - vllm_config=vllm_config, - executor_class=executor_class, - log_stats=True, - ) + + with set_default_torch_num_threads(1): + client = EngineCoreClient.make_client( + multiprocess_mode=True, + asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=True, + ) try: MAX_TOKENS = 20 @@ -312,13 +317,14 @@ def test_kv_cache_events( UsageContext.UNKNOWN_CONTEXT) executor_class = Executor.get_class(vllm_config) - client = EngineCoreClient.make_client( - multiprocess_mode=multiprocessing_mode, - asyncio_mode=False, - vllm_config=vllm_config, - executor_class=executor_class, - log_stats=False, - ) + with set_default_torch_num_threads(1): + client = EngineCoreClient.make_client( + multiprocess_mode=multiprocessing_mode, + asyncio_mode=False, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=False, + ) endpoint = publisher_config.endpoint.replace("*", "127.0.0.1") subscriber = MockSubscriber(endpoint, topic=publisher_config.topic, @@ -394,13 +400,14 @@ async def test_kv_cache_events_dp( UsageContext.UNKNOWN_CONTEXT) 
executor_class = Executor.get_class(vllm_config) - client = EngineCoreClient.make_client( - multiprocess_mode=multiprocessing_mode, - asyncio_mode=True, - vllm_config=vllm_config, - executor_class=executor_class, - log_stats=False, - ) + with set_default_torch_num_threads(1): + client = EngineCoreClient.make_client( + multiprocess_mode=multiprocessing_mode, + asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=False, + ) await asyncio.sleep(1) # Build endpoints for all DP ranks diff --git a/tests/v1/engine/test_fast_incdec_prefix_err.py b/tests/v1/engine/test_fast_incdec_prefix_err.py new file mode 100644 index 0000000000000..5c844e0e7095e --- /dev/null +++ b/tests/v1/engine/test_fast_incdec_prefix_err.py @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from transformers import AutoTokenizer + +from vllm.sampling_params import SamplingParams +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.detokenizer import IncrementalDetokenizer + +# ruff: noqa: E501 + + +def test_fast_inc_detok_invalid_utf8_err_case(): + """ + Test edge case where tokenizer can produce non-monotonic, + invalid UTF-8 output, which breaks the internal state of + tokenizers' DecodeStream. + See https://github.com/vllm-project/vllm/issues/17448. + + Thanks to reproducer from @fpaupier: + https://gist.github.com/fpaupier/0ed1375bd7633c5be6c894b1c7ac1be3. 
+ """ + tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it") + + # Create a test request + prompt_token_ids = [107, 4606, 236787, 107] + params = SamplingParams(skip_special_tokens=True) + request = EngineCoreRequest( + "test", + prompt_token_ids, + None, + None, + None, + params, + None, + 0.0, + None, + cache_salt=None, + data_parallel_rank=None, + ) + + detokenizer = IncrementalDetokenizer.from_new_request(tokenizer, request) + + assert detokenizer.__class__.__name__ == "FastIncrementalDetokenizer", \ + "Should use FastIncrementalDetokenizer by default" + + # Process tokens incrementally + test_tokens = [ + 236840, 107, 138, 236782, 107, 140, 236775, 6265, 1083, 623, 121908, + 147418, 827, 107, 140, 236775, 6265, 236779, 2084, 1083, 623, 203292, + 827, 107, 140, 236775, 6265, 236779, 7777, 1083, 623, 121908, 147418, + 569, 537, 236789, 65880, 569, 537, 236789, 62580, 853, 115693, 210118, + 35178, 16055, 1270, 759, 215817, 4758, 1925, 1117, 827, 107, 140, + 236775, 5654, 1083, 623, 110733, 46291, 827, 107, 140, 236775, 5654, + 236779, 2084, 1083, 623, 136955, 56731, 827, 107, 140, 236775, 5654, + 236779, 7777, 1083, 623, 194776, 2947, 496, 109811, 1608, 890, 215817, + 4758, 1925, 1117, 2789, 432, 398, 602, 31118, 569, 124866, 134772, 509, + 19478, 1640, 33779, 236743, 236770, 236819, 236825, 236771, 432, 398, + 432, 237167, 827, 107, 140, 236775, 77984, 1083, 623, 2709, 236745, + 2555, 513, 236789, 602, 31118, 569 + ] + + output = "" + for i, token_id in enumerate(test_tokens): + detokenizer.update([token_id], False) + + finished = i == len(test_tokens) - 1 + output += detokenizer.get_next_output_text(finished, delta=True) + + +# fmt: off + assert output == r'''[ + { + "source": "Résultats", + "source_type": "CONCEPT", + "source_description": "Résultats de l'analyse de l'impact des opérations israéliennes sur la frontière libanaise", + "target": "Israël", + "target_type": "ORGANIZATION", + "target_description": "Pays qui a obtenu à sa frontière 
libanaise « un niveau de calme inédit depuis les années 1960 »", + "relationship": "Obtention d'un niveau de''' diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 622ab6f35db33..a0bcb8f602e11 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -7,7 +7,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( from .utils import create_request, create_scheduler, create_vllm_config -def test_basic_inferface(): +def test_basic_interface(): """Unit test for basic NixlConnector interface functionality.""" vllm_config = create_vllm_config() @@ -25,7 +25,7 @@ def test_basic_inferface(): scheduler.add_request(request) - # Remote Prefill, triggers NixlConnectorMetdata. + # Remote Prefill, triggers NixlConnectorMetadata. scheduler_output = scheduler.schedule() kv_connector_metadata = scheduler_output.kv_connector_metadata assert kv_connector_metadata is not None diff --git a/tests/v1/sample/test_logprobs_e2e.py b/tests/v1/sample/test_logprobs_e2e.py index 085b2ee09743c..0b135613ff6bd 100644 --- a/tests/v1/sample/test_logprobs_e2e.py +++ b/tests/v1/sample/test_logprobs_e2e.py @@ -32,7 +32,7 @@ def test_prompt_logprobs_e2e(): ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" -def test_promt_logprobs_e2e_server(): +def test_prompt_logprobs_e2e_server(): with RemoteOpenAIServer(MODEL, SERVER_ARGS) as remote_server: url = f"{remote_server.url_for('v1')}/completions" diff --git a/tests/v1/sample/test_topk_topp_sampler.py b/tests/v1/sample/test_topk_topp_sampler.py index 63fdeb5a6de84..9d695cd91a972 100644 --- a/tests/v1/sample/test_topk_topp_sampler.py +++ b/tests/v1/sample/test_topk_topp_sampler.py @@ -28,7 +28,7 @@ def reset_default_device(): torch.set_default_device(original_device) -def test_topk_impl_equivalance(): +def test_topk_impl_equivalence(): torch.set_default_device(DEVICE) generator = 
Generator(device=DEVICE).manual_seed(33) diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py index 7117a66c29584..fe65976a58a1f 100644 --- a/tests/v1/tpu/test_basic.py +++ b/tests/v1/tpu/test_basic.py @@ -67,6 +67,43 @@ def test_basic( assert "1024" in output or "0, 1" in output +@pytest.mark.skipif(not current_platform.is_tpu(), + reason="This is a basic test for TPU only") +@pytest.mark.parametrize("max_tokens", [8]) +@pytest.mark.parametrize("max_num_seqs", [16]) +def test_phi3( + vllm_runner: type[VllmRunner], + monkeypatch: pytest.MonkeyPatch, + max_tokens: int, + max_num_seqs: int, +) -> None: + prompts = [ + "A robot may not injure a human being", + "It is only with the heart that one can see rightly;", + "The greatest glory in living lies not in never falling,", + ] + answers = [ + " or, by violating privacy", + " what is essential is love.", + " but in rising every time we fall.", + ] + # test head dim = 96 + model = "microsoft/Phi-3-mini-128k-instruct" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + with vllm_runner(model, + max_num_batched_tokens=256, + max_num_seqs=max_num_seqs) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens) + # vllm_outputs is a list of tuples whose first element is the token id + # and the second element is the output (including the prompt). 
+ for output, answer in zip(vllm_outputs, answers): + generated_text = output[1] + assert answer in generated_text + + TP_SIZE_8 = 8 diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 73c0da45d4ab3..0e7d305fef9ed 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import unittest.mock as mock import pytest @@ -17,24 +16,8 @@ from vllm.v1.worker.tpu_model_runner import ( TPUModelRunner, _get_padded_num_reqs_with_upper_limit, _get_padded_token_len, _get_req_paddings, _get_token_paddings) -# Mock torch_xla module since it may not be available in the test environments -torch_xla_patcher = mock.patch.dict( - "sys.modules", { - "torch_xla": mock.MagicMock(), - "torch_xla.core.xla_model": mock.MagicMock(), - "torch_xla.runtime": mock.MagicMock(), - }) -torch_xla_patcher.start() -# Mock the PallasAttentionBackend -pallas_attention_backend_patcher = mock.patch( - "vllm.v1.worker.tpu_model_runner.PallasAttentionBackend", ) -pallas_attention_backend_patcher.start() - - -@pytest.fixture -def model_runner(): - # Patchers have already been started at module level. 
+def get_vllm_config(): scheduler_config = SchedulerConfig( max_num_seqs=10, max_num_batched_tokens=512, @@ -60,18 +43,19 @@ def model_runner(): cache_config=cache_config, scheduler_config=scheduler_config, ) + return vllm_config + + +def get_model_runner(vllm_config): device = "xla:0" # Mocking TPU device - with mock.patch("vllm.v1.worker.tpu_model_runner.torch"), \ - mock.patch("vllm.v1.worker.tpu_model_runner.xm"), \ - mock.patch("vllm.v1.worker.tpu_model_runner.xr"): - return TPUModelRunner(vllm_config, device) + return TPUModelRunner(vllm_config, device) -@pytest.fixture(autouse=True, scope="session") -def cleanup_patches(): - yield - torch_xla_patcher.stop() - pallas_attention_backend_patcher.stop() +@pytest.fixture +def model_runner(): + # Patchers have already been started at module level. + vllm_config = get_vllm_config() + return get_model_runner(vllm_config) def _schedule_new_request(*req_ids: str) -> SchedulerOutput: @@ -87,7 +71,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: mm_hashes=[], mm_positions=[], sampling_params=SamplingParams(), - block_ids=[[0]], # block_ids should be list[list[int]] + block_ids=([0], ), # block_ids should be tuple[list[int]] num_computed_tokens=0, lora_request=None, )) @@ -132,10 +116,10 @@ def _is_req_state_block_table_match(model_runner, req_id: str) -> bool: # This is safe since we currently only use single KV cache groups block_table = multi_group_block_table[0] - # req_state.block_ids is now list[list[int]] for MultiGroupBlockTable + # req_state.block_ids is now tuple[list[int], ...] for MultiGroupBlockTable # Extract the first group's block IDs if isinstance(req_state.block_ids[0], list): - # New format: list[list[int]] - extract first group + # New format: tuple[list[int], ...] 
- extract first group req_block_ids = req_state.block_ids[0] else: # Legacy format: list[int] - use directly @@ -226,7 +210,7 @@ def test_update_states_request_resumed(model_runner): req_id=req_id, resumed_from_preemption=False, new_token_ids=[], - new_block_ids=[[]], + new_block_ids=([], ), num_computed_tokens=0, ) @@ -370,12 +354,14 @@ def test_get_req_paddings(): assert _get_req_paddings(8, 36) == [8, 16, 32, 36] -@pytest.mark.skip(reason="Test is broken on TPU when it's added.") -def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(): +def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order( + model_runner): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" error_msg = f"{layer_1} must come before the current layer" - with pytest.raises(ValueError, match=error_msg): + vllm_config = model_runner.vllm_config + with pytest.raises(ValueError, match=error_msg), \ + set_current_vllm_config(vllm_config): fwd_context = { # initialization below will fail because target layer is invalid; # the target layer needs to come before layer 1 @@ -399,13 +385,14 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(): assert fwd_context is not None -@pytest.mark.skip(reason="Test is broken on TPU when it's added.") -def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(): +def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(model_runner): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" invalid_layer = "model.layers.0.cross_attn.attn" error_msg = f"{invalid_layer} is not a valid Attention layer in the model" - with pytest.raises(ValueError, match=error_msg): + vllm_config = model_runner.vllm_config + with pytest.raises(ValueError, match=error_msg), \ + set_current_vllm_config(vllm_config): fwd_context = { layer_0: Attention( @@ -428,12 +415,13 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(): assert fwd_context is not None 
-@pytest.mark.skip(reason="Test is broken on TPU when it's added.") -def test_init_kv_cache_with_kv_sharing_target_same_as_current(): +def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" error_msg = f"{layer_1} cannot be the same as the current layer" - with pytest.raises(ValueError, match=error_msg): + vllm_config = model_runner.vllm_config + with pytest.raises(ValueError, match=error_msg), \ + set_current_vllm_config(vllm_config): fwd_context = { # initialization below will fail because target layer is invalid; # the target layer needs to come before layer 1 @@ -457,11 +445,10 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current(): assert fwd_context is not None -@pytest.mark.skip(reason="Test is broken on TPU when it's added.") -def test_init_kv_cache_without_kv_sharing(model_runner): +def test_init_kv_cache_without_kv_sharing(): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" - vllm_config = model_runner.vllm_config + vllm_config = get_vllm_config() with set_current_vllm_config(vllm_config): fwd_context = { layer_0: @@ -482,33 +469,38 @@ def test_init_kv_cache_without_kv_sharing(model_runner): # suppress var not used error assert fwd_context is not None # Set high context length to test max context length estimation - vllm_config.model_config.max_model_len = 3_000_000 + vllm_config.model_config.max_model_len = 1_000_000 vllm_ctx = vllm_config.compilation_config.static_forward_context + model_runner = get_model_runner(vllm_config) kv_cache_spec = model_runner.get_kv_cache_spec() assert len(kv_cache_spec) == 2 assert len(model_runner.shared_kv_cache_layers) == 0 available_memory = 20 * GiB_bytes - # page size for layer 0's kv_cache_spec is 32KB - num_expected_blocks = 327680 # 20GB / 32KB / 2 (num layers) + # page size for each layer KV can be calculated as + # 2 (non-MLA) * 8 (num_heads) * 128 (head_dim) + # 
* 2 (bfloat16, kv_cache dtype) * 128 (block_size) = 512KB + num_expected_blocks = 20480 # 20GB / 512KB / 2 (num layers) kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert kv_cache_config.num_blocks == num_expected_blocks - assert len(kv_cache_config.tensors) == 2 - assert kv_cache_config.tensors[layer_0].size == available_memory // 2 - assert kv_cache_config.tensors[layer_1].size == available_memory // 2 + assert len(kv_cache_config.kv_cache_tensors) == 2 + assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2 + assert kv_cache_config.kv_cache_tensors[1].size == available_memory // 2 max_context_len =\ estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV sharing should be 2x as large as without - assert max_context_len == 1310720 + # max_context_len = available_memory / (page_size / block_size) / num_caches + # max_context_len = 5GB / (512KB / 128) / 2 = 655360 + assert max_context_len == 655360 # important: override tensor size to prevent large mem alloc during test - # this will only allocate 2 block worth of memory (2 * 32kb) + # this will only allocate 2 block worth of memory (2 * 512kb) kv_cache_config.num_blocks = 1 - for layer in kv_cache_config.tensors: - kv_cache_config.tensors[layer].size =\ - kv_cache_spec[layer].page_size_bytes + for kv_cache_tensor in kv_cache_config.kv_cache_tensors: + kv_cache_tensor.size = ( + kv_cache_spec[kv_cache_tensor.shared_by[0]].page_size_bytes) model_runner.initialize_kv_cache(kv_cache_config) @@ -524,11 +516,10 @@ def test_init_kv_cache_without_kv_sharing(model_runner): assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1 -@pytest.mark.skip(reason="Test is broken on TPU when it's added.") -def test_init_kv_cache_with_kv_sharing_valid(model_runner): +def test_init_kv_cache_with_kv_sharing_valid(): layer_0 = "model.layers.0.self_attn.attn" layer_1 = "model.layers.1.self_attn.attn" - vllm_config = 
model_runner.vllm_config + vllm_config = get_vllm_config() with set_current_vllm_config(vllm_config): fwd_context = { layer_0: @@ -552,33 +543,34 @@ def test_init_kv_cache_with_kv_sharing_valid(model_runner): # Set high context length to test max context length estimation vllm_config.model_config.max_model_len = 3_000_000 vllm_ctx = vllm_config.compilation_config.static_forward_context + model_runner = get_model_runner(vllm_config) kv_cache_spec = model_runner.get_kv_cache_spec() assert len(kv_cache_spec) == 1 assert layer_0 in kv_cache_spec assert model_runner.shared_kv_cache_layers[layer_1] == layer_0 available_memory = 20 * GiB_bytes - # page size for layer 0's kv_cache_spec is 32KB + # page size for layer 0's kv_cache_spec is 512KB # with KV sharing, we can allocate (available_mem//page_size//1) blocks # which is twice as many as without KV sharing - num_expected_blocks = 655360 # 20GB / 32KB + num_expected_blocks = 2 * 20480 # 20GB / 512KB kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, available_memory) assert kv_cache_config.num_blocks == num_expected_blocks - assert len(kv_cache_config.tensors) == 1 + assert len(kv_cache_config.kv_cache_tensors) == 1 # Each layer now has twice the available memory for KV cache # compared to no KV sharing - assert kv_cache_config.tensors[layer_0].size == available_memory + assert kv_cache_config.kv_cache_tensors[0].size == available_memory max_context_len =\ estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) # max context len with KV sharing should be 2x as large as without - assert max_context_len == 2 * 1310720 + assert max_context_len == (2 * 655360) # important: override tensor size to prevent large mem alloc during test - # this will only allocate 1 block worth of memory (32kb) + # this will only allocate 1 block worth of memory (512kb) kv_cache_config.num_blocks = 1 - kv_cache_config.tensors[layer_0].size =\ + kv_cache_config.kv_cache_tensors[0].size =\ 
kv_cache_spec[layer_0].page_size_bytes model_runner.initialize_kv_cache(kv_cache_config) diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 72547e86b0e93..de6ebe4f6716b 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -203,7 +203,7 @@ def _construct_cached_request_state(req_id_suffix: int): sampling_params=_create_sampling_params(), mm_inputs=[], mm_positions=[], - block_ids=[[]], + block_ids=([], ), generator=None, num_computed_tokens=len(output_token_ids), output_token_ids=output_token_ids, diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index caacb1652e9a2..994432dfd593b 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -122,7 +122,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: mm_hashes=[], mm_positions=[], sampling_params=SamplingParams(), - block_ids=[[0]], + block_ids=([0], ), num_computed_tokens=0, lora_request=None, )) @@ -250,7 +250,7 @@ def test_update_states_request_resumed(model_runner): req_id=req_id, resumed_from_preemption=False, new_token_ids=[], - new_block_ids=[[]], + new_block_ids=([], ), num_computed_tokens=0, ) diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py index a5e61128d1e93..ec33d334ab650 100644 --- a/tests/worker/test_model_input.py +++ b/tests/worker/test_model_input.py @@ -209,32 +209,32 @@ def test_multi_step_model_runner_input(): received_model_input = (StatefulModelInput.from_broadcasted_tensor_dict( tensor_dict, attn_backend=attn_backend)) - receieved_frozen_input = received_model_input.frozen_model_input + received_frozen_input = received_model_input.frozen_model_input # Check that received copy has correct values. 
assert isinstance(received_model_input, StatefulModelInput) - assert receieved_frozen_input.input_tokens is not None - assert (receieved_frozen_input.input_tokens == + assert received_frozen_input.input_tokens is not None + assert (received_frozen_input.input_tokens == frozen_model_input.input_tokens).all() - assert receieved_frozen_input.input_positions is not None - assert (receieved_frozen_input.input_positions == + assert received_frozen_input.input_positions is not None + assert (received_frozen_input.input_positions == frozen_model_input.input_positions).all() - assert receieved_frozen_input.multi_modal_kwargs is None + assert received_frozen_input.multi_modal_kwargs is None assert (frozen_model_input.multi_modal_kwargs == frozen_model_input.multi_modal_kwargs) - assert receieved_frozen_input.lora_requests is None - assert (receieved_frozen_input.lora_requests == + assert received_frozen_input.lora_requests is None + assert (received_frozen_input.lora_requests == frozen_model_input.lora_requests) - assert receieved_frozen_input.lora_mapping is None + assert received_frozen_input.lora_mapping is None assert ( - receieved_frozen_input.lora_mapping == frozen_model_input.lora_mapping) + received_frozen_input.lora_mapping == frozen_model_input.lora_mapping) for field in dataclasses.fields(AttentionMetadata): - assert getattr(receieved_frozen_input.attn_metadata, field.name, + assert getattr(received_frozen_input.attn_metadata, field.name, None) == getattr(attn_metadata, field.name, None) # For sampling metadata, only selected_token_indices is copied. 
- assert (receieved_frozen_input.sampling_metadata.selected_token_indices == + assert (received_frozen_input.sampling_metadata.selected_token_indices == sampling_metadata.selected_token_indices) - assert receieved_frozen_input.sampling_metadata.seq_groups is None + assert received_frozen_input.sampling_metadata.seq_groups is None # check non frozen fields assert received_model_input.is_last_step == model_input.is_last_step diff --git a/tools/check_pickle_imports.py b/tools/check_pickle_imports.py new file mode 100644 index 0000000000000..30a2d49dd8ca7 --- /dev/null +++ b/tools/check_pickle_imports.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +import os +import sys + +import regex as re + +try: + import pathspec +except ImportError: + print( + "ERROR: The 'pathspec' library is required. " + "Install it with 'pip install pathspec'.", + file=sys.stderr) + sys.exit(2) + +# List of files (relative to repo root) that are allowed to import pickle or +# cloudpickle +# +# STOP AND READ BEFORE YOU ADD ANYTHING ELSE TO THIS LIST: +# The pickle and cloudpickle modules are known to be unsafe when deserializing +# data from potentially untrusted parties. They have resulted in multiple CVEs +# for vLLM and numerous vulnerabilities in the Python ecosystem more broadly. +# Before adding new uses of pickle/cloudpickle, please consider safer +# alternatives like msgpack or pydantic that are already in use in vLLM. Only +# add to this list if absolutely necessary and after careful security review. 
+ALLOWED_FILES = set([ + # pickle + 'vllm/utils.py', + 'vllm/v1/serial_utils.py', + 'vllm/v1/executor/multiproc_executor.py', + 'vllm/multimodal/hasher.py', + 'vllm/transformers_utils/config.py', + 'vllm/model_executor/models/registry.py', + 'tests/test_utils.py', + 'tests/tokenization/test_cached_tokenizer.py', + 'tests/model_executor/test_guided_processors.py', + 'vllm/distributed/utils.py', + 'vllm/distributed/parallel_state.py', + 'vllm/engine/multiprocessing/client.py', + 'vllm/distributed/device_communicators/custom_all_reduce_utils.py', + 'vllm/distributed/device_communicators/shm_broadcast.py', + 'vllm/engine/multiprocessing/engine.py', + 'benchmarks/kernels/graph_machete_bench.py', + 'benchmarks/kernels/benchmark_lora.py', + 'benchmarks/kernels/benchmark_machete.py', + 'benchmarks/fused_kernels/layernorm_rms_benchmarks.py', + 'benchmarks/cutlass_benchmarks/w8a8_benchmarks.py', + 'benchmarks/cutlass_benchmarks/sparse_benchmarks.py', + # cloudpickle + 'vllm/worker/worker_base.py', + 'vllm/executor/mp_distributed_executor.py', + 'vllm/executor/ray_distributed_executor.py', + 'vllm/entrypoints/llm.py', + 'tests/utils.py', + # pickle and cloudpickle + 'vllm/utils.py', + 'vllm/v1/serial_utils.py', + 'vllm/v1/executor/multiproc_executor.py', + 'vllm/transformers_utils/config.py', + 'vllm/model_executor/models/registry.py', + 'vllm/engine/multiprocessing/client.py', + 'vllm/engine/multiprocessing/engine.py', +]) + +PICKLE_RE = re.compile(r"^\s*(import\s+(pickle|cloudpickle)(\s|$|\sas)" + r"|from\s+(pickle|cloudpickle)\s+import\b)") + + +def is_python_file(path): + return path.endswith('.py') + + +def scan_file(path): + with open(path, encoding='utf-8') as f: + for line in f: + if PICKLE_RE.match(line): + return True + return False + + +def load_gitignore(repo_root): + gitignore_path = os.path.join(repo_root, '.gitignore') + patterns = [] + if os.path.exists(gitignore_path): + with open(gitignore_path, encoding='utf-8') as f: + patterns = f.read().splitlines() + # 
Always ignore .git directory + patterns.append('.git/') + return pathspec.PathSpec.from_lines('gitwildmatch', patterns) + + +def main(): + repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + spec = load_gitignore(repo_root) + bad_files = [] + for dirpath, _, filenames in os.walk(repo_root): + for filename in filenames: + if not is_python_file(filename): + continue + abs_path = os.path.join(dirpath, filename) + rel_path = os.path.relpath(abs_path, repo_root) + # Skip ignored files + if spec.match_file(rel_path): + continue + if scan_file(abs_path) and rel_path not in ALLOWED_FILES: + bad_files.append(rel_path) + if bad_files: + print("\nERROR: The following files import 'pickle' or 'cloudpickle' " + "but are not in the allowed list:") + for f in bad_files: + print(f" {f}") + print("\nIf this is intentional, update the allowed list in " + "tools/check_pickle_imports.py.") + sys.exit(1) + sys.exit(0) + + +def test_regex(): + test_cases = [ + # Should match + ("import pickle", True), + ("import cloudpickle", True), + ("import pickle as pkl", True), + ("import cloudpickle as cpkl", True), + ("from pickle import *", True), + ("from cloudpickle import dumps", True), + ("from pickle import dumps, loads", True), + ("from cloudpickle import (dumps, loads)", True), + (" import pickle", True), + ("\timport cloudpickle", True), + ("from pickle import loads", True), + # Should not match + ("import somethingelse", False), + ("from somethingelse import pickle", False), + ("# import pickle", False), + ("print('import pickle')", False), + ("import pickleas as asdf", False), + ] + for i, (line, should_match) in enumerate(test_cases): + result = bool(PICKLE_RE.match(line)) + assert result == should_match, ( + f"Test case {i} failed: '{line}' " + f"(expected {should_match}, got {result})") + print("All regex tests passed.") + + +if __name__ == '__main__': + if '--test-regex' in sys.argv: + test_regex() + else: + main() diff --git a/tools/ep_kernels/README.md 
b/tools/ep_kernels/README.md index 5c98e999da335..f1479146f053c 100644 --- a/tools/ep_kernels/README.md +++ b/tools/ep_kernels/README.md @@ -1,11 +1,10 @@ Large-scale cluster-level expert parallel, as described in the [DeepSeek-V3 Technical Report](http://arxiv.org/abs/2412.19437), is an efficient way to deploy sparse MoE models with many experts. However, such deployment requires many components beyond a normal Python package, including system package support and system driver support. It is impossible to bundle all these components into a Python package. -Here we break down the requirements in 3 steps: +Here we break down the requirements in 2 steps: 1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this. -2. Build and install the system libraries (GDR Copy). This step requires root access. You can do it inside a Docker container so that they can be shipped as a single image. -3. Build and install the system drivers (GDR Copy, and necessary modifications to NVIDIA driver to enable IBGDA). This step requires root access, and must be done on the host machine. +2. Configure NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine. -2 and 3 are necessary for multi-node deployment. +2 is necessary for multi-node deployment. All scripts accept a positional argument as workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`. 
@@ -21,7 +20,6 @@ bash install_python_libraries.sh ```bash bash install_python_libraries.sh -sudo bash install_system_libraries.sh -sudo bash install_system_drivers.sh +sudo bash configure_system_drivers.sh sudo reboot # Reboot is required to load the new driver ``` diff --git a/tools/ep_kernels/configure_system_drivers.sh b/tools/ep_kernels/configure_system_drivers.sh new file mode 100644 index 0000000000000..cf15c1daccaec --- /dev/null +++ b/tools/ep_kernels/configure_system_drivers.sh @@ -0,0 +1,7 @@ +set -ex + +# turn on IBGDA +echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf +update-initramfs -u + +echo "Please reboot the system to apply the changes" diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh index e5632f4b58758..83643c084bf9a 100644 --- a/tools/ep_kernels/install_python_libraries.sh +++ b/tools/ep_kernels/install_python_libraries.sh @@ -13,16 +13,6 @@ fi # install dependencies if not installed pip3 install cmake torch ninja -# build gdrcopy, required by nvshmem -pushd $WORKSPACE -wget https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.4.4.tar.gz -mkdir -p gdrcopy_src -tar -xvf v2.4.4.tar.gz -C gdrcopy_src --strip-components=1 -pushd gdrcopy_src -make -j$(nproc) -make prefix=$WORKSPACE/gdrcopy_install install -popd - # build nvshmem pushd $WORKSPACE mkdir -p nvshmem_src @@ -34,26 +24,30 @@ git init git apply -vvv nvshmem.patch # assume CUDA_HOME is set correctly -export GDRCOPY_HOME=$WORKSPACE/gdrcopy_install +if [ -z "$CUDA_HOME" ]; then + echo "CUDA_HOME is not set, please set it to your CUDA installation directory." 
+ exit 1 +fi + +# disable all features except IBGDA +export NVSHMEM_IBGDA_SUPPORT=1 + export NVSHMEM_SHMEM_SUPPORT=0 export NVSHMEM_UCX_SUPPORT=0 export NVSHMEM_USE_NCCL=0 -export NVSHMEM_IBGDA_SUPPORT=1 export NVSHMEM_PMIX_SUPPORT=0 export NVSHMEM_TIMEOUT_DEVICE_POLLING=0 -export NVSHMEM_USE_GDRCOPY=1 -export NVSHMEM_IBRC_SUPPORT=1 - -# remove MPI dependency +export NVSHMEM_USE_GDRCOPY=0 +export NVSHMEM_IBRC_SUPPORT=0 export NVSHMEM_BUILD_TESTS=0 export NVSHMEM_BUILD_EXAMPLES=0 export NVSHMEM_MPI_SUPPORT=0 +export NVSHMEM_BUILD_HYDRA_LAUNCHER=0 +export NVSHMEM_BUILD_TXZ_PACKAGE=0 +export NVSHMEM_TIMEOUT_DEVICE_POLLING=0 -cmake -S . -B $WORKSPACE/nvshmem_build/ -DCMAKE_INSTALL_PREFIX=$WORKSPACE/nvshmem_install - -cd $WORKSPACE/nvshmem_build/ -make -j$(nproc) -make install +cmake -G Ninja -S . -B $WORKSPACE/nvshmem_build/ -DCMAKE_INSTALL_PREFIX=$WORKSPACE/nvshmem_install +cmake --build $WORKSPACE/nvshmem_build/ --target install popd diff --git a/tools/ep_kernels/install_system_drivers.sh b/tools/ep_kernels/install_system_drivers.sh deleted file mode 100644 index 8b0669ef404ff..0000000000000 --- a/tools/ep_kernels/install_system_drivers.sh +++ /dev/null @@ -1,24 +0,0 @@ -set -ex - -# prepare workspace directory -WORKSPACE=$1 -if [ -z "$WORKSPACE" ]; then - export WORKSPACE=$(pwd)/ep_kernels_workspace -fi - -if [ ! 
-d "$WORKSPACE" ]; then - mkdir -p $WORKSPACE -fi - -# build and install gdrcopy driver -pushd $WORKSPACE -cd gdrcopy_src -./insmod.sh -# run gdrcopy_copybw to test the installation -$WORKSPACE/gdrcopy_install/bin/gdrcopy_copybw - -# turn on IBGDA -echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf -update-initramfs -u - -echo "Please reboot the system to apply the changes" diff --git a/tools/ep_kernels/install_system_libraries.sh b/tools/ep_kernels/install_system_libraries.sh deleted file mode 100644 index c148d5443900a..0000000000000 --- a/tools/ep_kernels/install_system_libraries.sh +++ /dev/null @@ -1,18 +0,0 @@ -set -ex - -# prepare workspace directory -WORKSPACE=$1 -if [ -z "$WORKSPACE" ]; then - export WORKSPACE=$(pwd)/ep_kernels_workspace -fi - -if [ ! -d "$WORKSPACE" ]; then - mkdir -p $WORKSPACE -fi - -# build and install gdrcopy system packages -pushd $WORKSPACE -cd gdrcopy_src/packages -apt install devscripts -y -CUDA=${CUDA_HOME:-/usr/local/cuda} ./build-deb-packages.sh -dpkg -i *.deb diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py index 7368ae95313d2..7386cdd9f7245 100644 --- a/tools/report_build_time_ninja.py +++ b/tools/report_build_time_ninja.py @@ -116,7 +116,7 @@ def ReadTargets(log, show_all): # If ninja.exe is rudely halted then the .ninja_log file may be # corrupt. Silently continue. continue - start, end, _, name, cmdhash = parts # Ignore restat. + start, end, _, name, cmdhash = parts # Ignore restart. # Convert from integral milliseconds to float seconds. 
start = int(start) / 1000.0 end = int(end) / 1000.0 diff --git a/typos.toml b/typos.toml new file mode 100644 index 0000000000000..f51ce2f362082 --- /dev/null +++ b/typos.toml @@ -0,0 +1,179 @@ +[files] +# these files may be written in non english words +extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*", + "benchmarks/sonnet.txt", "tests/lora/data/*", "build/*", + "vllm/third_party/*"] +ignore-hidden = true +ignore-files = true +ignore-dot = true +ignore-vcs = true +ignore-global = true +ignore-parent = true + +[default] +binary = false +check-filename = false +check-file = true +unicode = true +ignore-hex = true +identifier-leading-digits = false +locale = "en" +extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw", + ".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*", ".*fo.*", ".*ba.*", + ".*ot.*", ".*[Tt]h[rR].*"] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[default.extend-identifiers] +bbc5b7ede = "bbc5b7ede" +womens_doubles = "womens_doubles" +v_2nd = "v_2nd" +splitted_input = "splitted_input" +NOOPs = "NOOPs" +typ = "typ" +nin_shortcut = "nin_shortcut" +UperNetDecoder = "UperNetDecoder" +subtile = "subtile" +cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin" +SFOuput = "SFOuput" +# huggingface transformers repo uses these words +depthwise_seperable_out_channel = "depthwise_seperable_out_channel" +DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d" +depthwise_seperable_CNN = "depthwise_seperable_CNN" + +[default.extend-words] +iy = "iy" +tendencias = "tendencias" +# intel cpu features +tme = "tme" +dout = "dout" +Pn = "Pn" +arange = "arange" + +[type.py] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.py.extend-identifiers] +arange = "arange" +NDArray = "NDArray" +EOFError = "EOFError" + +[type.py.extend-words] + +[type.cpp] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + 
+[type.cpp.extend-identifiers] +countr_one = "countr_one" + +[type.cpp.extend-words] + +[type.rust] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.rust.extend-identifiers] +flate2 = "flate2" + +[type.rust.extend-words] +ser = "ser" + +[type.lock] +extend-glob = [] +check-file = false +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.lock.extend-identifiers] + +[type.lock.extend-words] + +[type.jl] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.jl.extend-identifiers] + +[type.jl.extend-words] +modul = "modul" +egals = "egals" +usig = "usig" +egal = "egal" + +[type.go] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.go.extend-identifiers] +flate = "flate" + +[type.go.extend-words] + +[type.css] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.css.extend-identifiers] +nd = "nd" + +[type.css.extend-words] + +[type.man] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.man.extend-identifiers] +Nd = "Nd" + +[type.man.extend-words] + +[type.cert] +extend-glob = [] +check-file = false +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.cert.extend-identifiers] + +[type.cert.extend-words] + +[type.sh] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.sh.extend-identifiers] +stap = "stap" +ot = "ot" + +[type.sh.extend-words] + +[type.vimscript] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.vimscript.extend-identifiers] +windo = "windo" + +[type.vimscript.extend-words] diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 
92de1f5efa830..ff992c33b3092 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib -import importlib from typing import TYPE_CHECKING, Optional, Union import torch @@ -595,7 +594,7 @@ if hasattr(torch.ops._C, "ggml_dequantize"): quant_type: int, row: torch.SymInt, ) -> torch.Tensor: - return torch.empty((1, row), dtype=X.dtype, device=W.device) + return torch.empty((X.shape[0], row), dtype=X.dtype, device=W.device) @register_fake("_C::ggml_mul_mat_a8") def _ggml_mul_mat_a8_fake( @@ -706,10 +705,8 @@ def cutlass_scaled_mm(a: torch.Tensor, cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0) if current_platform.is_rocm() or not cutlass_compatible_b: - triton_scaled_mm_module = importlib.import_module( - "vllm.model_executor.layers.quantization.compressed_tensors." - "triton_scaled_mm") - triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm + from vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm import ( # noqa + triton_scaled_mm) return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) out = torch.empty((m, n), dtype=out_dtype, device=a.device) @@ -1228,6 +1225,7 @@ def scaled_fp8_quant( num_token_padding: Optional[int] = None, scale_ub: Optional[torch.Tensor] = None, use_per_token_if_dynamic: bool = False, + output: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, torch.Tensor]: """ Quantize input tensor to FP8 and return quantized tensor and scale. 
@@ -1259,7 +1257,12 @@ def scaled_fp8_quant( out_dtype: torch.dtype = current_platform.fp8_dtype() if num_token_padding: shape = (max(num_token_padding, input.shape[0]), shape[1]) - output = torch.empty(shape, device=input.device, dtype=out_dtype) + if output is None: + output = torch.empty(shape, device=input.device, dtype=out_dtype) + else: + assert num_token_padding is None, \ + "padding not supported if output passed in" + assert output.dtype == out_dtype if scale is None: if use_per_token_if_dynamic: @@ -1267,7 +1270,7 @@ def scaled_fp8_quant( device=input.device, dtype=torch.float32) torch.ops._C.dynamic_per_token_scaled_fp8_quant( - output, input, scale, scale_ub) + output, input.contiguous(), scale, scale_ub) else: scale = torch.zeros(1, device=input.device, dtype=torch.float32) torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) @@ -1376,8 +1379,8 @@ def scaled_int8_quant( dtype=torch.float32) input_azp = None if symmetric else torch.empty_like(input_scales, dtype=torch.int32) - torch.ops._C.dynamic_scaled_int8_quant(output, input, input_scales, - input_azp) + torch.ops._C.dynamic_scaled_int8_quant(output, input.contiguous(), + input_scales, input_azp) return output, input_scales, input_azp @@ -1550,10 +1553,10 @@ def moe_wna16_gemm(input: torch.Tensor, output: torch.Tensor, def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, + token_expert_indices: torch.Tensor, gating_output: torch.Tensor) -> None: - torch.ops._moe_C.topk_softmax(topk_weights, topk_ids, - token_expert_indicies, gating_output) + torch.ops._moe_C.topk_softmax(topk_weights, topk_ids, token_expert_indices, + gating_output) def moe_wna16_marlin_gemm(input: torch.Tensor, output: Optional[torch.Tensor], diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 0ba5a5bf94c9b..990ea054f3380 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -284,9 +284,25 
@@ class AttentionImpl(ABC, Generic[T]): kv_cache: torch.Tensor, attn_metadata: T, output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: raise NotImplementedError + def fused_output_quant_supported(self, dtype: torch.dtype, static: bool, + group_shape: tuple[int, int]): + """ + Does this attention implementation support fused output quantization. + This is used by the AttnFusionPass to only fuse output quantization + onto implementations that support it. + + TODO(luka) merge parameters into QuantDescriptor + :param dtype: quantized dtype + :param static: static or dynamic quantization + :param group_shape: quant group shape. (-1, -1) for per-tensor. + :return: is fusion supported for this type of quantization + """ + return False + class MLAAttentionImpl(AttentionImpl[T], Generic[T]): @@ -300,6 +316,7 @@ class MLAAttentionImpl(AttentionImpl[T], Generic[T]): kv_cache: torch.Tensor, attn_metadata: T, output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: raise NotImplementedError diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index c1663516de358..fe9738d804cb1 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -65,7 +65,6 @@ class BlocksparseParams: assert self.block_size > 0 assert self.local_blocks >= 0 assert self.vert_stride >= 1 - assert self.num_heads % self.num_kv_heads == 0 tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() @@ -329,9 +328,8 @@ class BlocksparseFlashAttentionImpl(AttentionImpl): self.head_size = head_size self.scale = float(scale) self.alibi_slopes = alibi_slopes - self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.num_kv_heads = num_kv_heads - assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads 
self.local_blocks = self.blocksparse_params.local_blocks @@ -374,6 +372,7 @@ class BlocksparseFlashAttentionImpl(AttentionImpl): kv_cache: torch.Tensor, attn_metadata: BlocksparseFlashAttentionMetadata, output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. @@ -388,6 +387,11 @@ class BlocksparseFlashAttentionImpl(AttentionImpl): Returns: shape = [num_tokens, num_heads * head_size] """ + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for BlocksparseFlashAttentionImpl") + num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. query = query.view(-1, self.num_heads, self.head_size) diff --git a/vllm/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py index 963bccdf21bc0..f62a43b441f23 100644 --- a/vllm/attention/backends/dual_chunk_flash_attn.py +++ b/vllm/attention/backends/dual_chunk_flash_attn.py @@ -307,7 +307,6 @@ class DualChunkFlashAttentionImpl(FlashAttentionImpl): if sliding_window is not None else (-1, -1)) self.kv_cache_dtype = kv_cache_dtype - assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads if sliding_window is not None: # NOTE(woosuk): flash-attn's sliding window does not work with @@ -370,6 +369,8 @@ class DualChunkFlashAttentionImpl(FlashAttentionImpl): value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: DualChunkFlashAttentionMetadata, + output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with DualChunkFlashAttention. 
Args: @@ -383,6 +384,13 @@ class DualChunkFlashAttentionImpl(FlashAttentionImpl): Returns: shape = [num_tokens, num_heads * head_size] """ + assert output is None, "Output tensor not supported for DualChunk" + + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for FlashAttentionImpl") + ( query, query_succ, diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 73e3772682e69..bf8e373802f81 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -654,7 +654,6 @@ class FlashAttentionImpl(AttentionImpl): logits_soft_cap = 0 self.logits_soft_cap = logits_soft_cap - assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads support_head_sizes = FlashAttentionBackend.get_supported_head_sizes() @@ -673,6 +672,7 @@ class FlashAttentionImpl(AttentionImpl): kv_cache: torch.Tensor, attn_metadata: FlashAttentionMetadata, output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. @@ -692,6 +692,11 @@ class FlashAttentionImpl(AttentionImpl): """ assert output is not None, "Output tensor must be provided." + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for FlashAttentionImpl") + # NOTE(woosuk): FlashAttention2 does not support FP8 KV cache. 
if not flash_attn_supports_fp8() or output.dtype != torch.bfloat16: assert ( diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index a3937760f03b8..b7d80f5194c0f 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses -import os from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass @@ -50,8 +49,7 @@ if TYPE_CHECKING: from vllm.worker.model_runner import (ModelInputForGPUBuilder, ModelInputForGPUWithSamplingMetadata) -FLASHINFER_KV_CACHE_LAYOUT: str = os.getenv("FLASHINFER_KV_CACHE_LAYOUT", - "NHD").upper() +FLASHINFER_KV_CACHE_LAYOUT: str = envs.VLLM_KV_CACHE_LAYOUT or "NHD" class FlashInferBackend(AttentionBackend): @@ -957,7 +955,6 @@ class FlashInferImpl(AttentionImpl): self.kv_cache_dtype = kv_cache_dtype self.logits_soft_cap = logits_soft_cap - assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads if attn_type != AttentionType.DECODER: @@ -975,8 +972,14 @@ class FlashInferImpl(AttentionImpl): kv_cache: torch.Tensor, attn_metadata: FlashInferMetadata, output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for FlashInferImpl") + # TODO: directly write to output tensor num_heads: int = self.num_heads head_size: int = self.head_size diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 9bd513fd894f5..bf778a1e5016d 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -148,7 +148,6 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): alibi_slopes_tensor = torch.tensor(alibi_slopes, dtype=torch.bfloat16) self.alibi_slopes = 
alibi_slopes_tensor - assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads if self.prefill_impl == 'fsdpa': @@ -181,6 +180,7 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): kv_cache: torch.Tensor, attn_metadata: HPUAttentionMetadata, output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. @@ -193,6 +193,11 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): Returns: shape = [num_tokens, num_heads * head_size] """ + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for HPUAttentionImpl") + batch_size, seq_len, hidden_size = query.shape _, seq_len_kv, _ = key.shape diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 5051c6a7cc4fd..410ada3b0828b 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -145,7 +145,6 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): self.sliding_window = sliding_window self.kv_cache_dtype = kv_cache_dtype - assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads self.need_mask = (self.sliding_window is not None) if logits_soft_cap is None: @@ -192,6 +191,7 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): kv_cache: torch.Tensor, attn_metadata: IpexAttnMetadata, # type: ignore output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with IPEX varlen_attention and PagedAttention. 
@@ -206,6 +206,11 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): Returns: shape = [num_tokens, num_heads * head_size] """ + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for IpexAttentionImpl") + assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index 78cf952881303..0c3ff26d04c8b 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -1319,11 +1319,17 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): kv_cache: torch.Tensor, attn_metadata: T, output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: if output is not None: raise NotImplementedError( "output is not yet supported for MLAImplBase") + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for MLAImplBase") + if attn_metadata.is_profile_run and \ attn_metadata.context_chunk_workspace is not None: # During the profile run try to simulate to worse case output size diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 7ad67615d33d9..c900666955a32 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -121,9 +121,8 @@ class PallasAttentionBackendImpl(AttentionImpl): self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) - self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.num_kv_heads = num_kv_heads - assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads self.logits_soft_cap = logits_soft_cap if head_size % 128 != 0: @@ -172,6 +171,7 @@ class PallasAttentionBackendImpl(AttentionImpl): kv_cache: Tuple[torch.Tensor, 
torch.Tensor], attn_metadata: PallasMetadata, output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with Pallas attention. @@ -187,6 +187,11 @@ class PallasAttentionBackendImpl(AttentionImpl): Returns: shape = [batch_size, seq_len, num_heads * head_size] """ + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for PallasAttentionImpl") + assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 batch_size, seq_len, hidden_size = query.shape query = query.view(batch_size, seq_len, self.num_heads, self.head_size) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 4b460dc0b58cd..1e2c21f4e69d6 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -17,6 +17,7 @@ from vllm.attention.backends.utils import (CommonAttentionState, CommonMetadataBuilder) from vllm.attention.ops.paged_attn import (PagedAttention, PagedAttentionMetadata) +from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.platforms.rocm import use_rocm_custom_paged_attention @@ -37,11 +38,11 @@ def is_rocm_aiter_paged_attn_enabled() -> bool: @cache def _get_paged_attn_module() -> PagedAttention: """ - Initializes the appropriate PagedAttention module from `attention/ops`, + Initializes the appropriate PagedAttention module from `attention/ops`, which is used as helper function by `ROCmFlashAttentionImpl` and `ROCmFlashAttentionBackend`. - The choice of attention module depends on whether + The choice of attention module depends on whether AITER paged attention is enabled: - If enabled, `ROCmFlashAttentionImpl` uses `AITERPagedAttention`. - Otherwise, it defaults to using the original `PagedAttention`. 
@@ -527,7 +528,6 @@ class ROCmFlashAttentionImpl(AttentionImpl): if sliding_window is not None else (-1, -1)) self.kv_cache_dtype = kv_cache_dtype - assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads self.paged_attn_module = _get_paged_attn_module() @@ -584,6 +584,10 @@ class ROCmFlashAttentionImpl(AttentionImpl): logger.debug("Using naive (SDPA) attention in ROCmBackend") self.aiter_kv_scales_initialized = False + self.force_fp8_attention = ( + get_current_vllm_config() is not None + and get_current_vllm_config().model_config.override_attention_dtype + == "fp8") def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor: """torch.repeat_interleave(x, dim=1, repeats=n_rep)""" @@ -593,6 +597,15 @@ class ROCmFlashAttentionImpl(AttentionImpl): head_dim).reshape(tokens, n_kv_heads * n_rep, head_dim)) + def fused_output_quant_supported(self, dtype: torch.dtype, static: bool, + group_shape: tuple[int, int]): + if self.use_triton_flash_attn: + return dtype == current_platform.fp8_dtype( + ) and static and group_shape == (-1, -1) # per-tensor + + # Only supported in the Triton backend + return False + def forward( self, layer: AttentionLayer, @@ -602,6 +615,7 @@ class ROCmFlashAttentionImpl(AttentionImpl): kv_cache: torch.Tensor, attn_metadata: ROCmFlashAttentionMetadata, output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. @@ -655,6 +669,11 @@ class ROCmFlashAttentionImpl(AttentionImpl): """ assert output is not None, "Output tensor must be provided." 
+ if output_scale is not None and not self.use_triton_flash_attn: + raise NotImplementedError( + "fused output quantization only supported for Triton" + " implementation in ROCMFlashAttentionImpl for now") + query = query.view(-1, self.num_heads, self.head_size) if key is not None: assert value is not None @@ -770,9 +789,12 @@ class ROCmFlashAttentionImpl(AttentionImpl): query.dtype, seq_lens, make_attn_mask=causal_mask) # type: ignore + use_fp8_scales = (layer._q_scale and layer._k_scale and layer._v_scale and layer._prob_scale - and self.kv_cache_dtype == "fp8") + and (self.kv_cache_dtype == "fp8" + or self.force_fp8_attention)) + full_scales = ( layer._q_scale.item(), layer._k_scale.item(), layer._v_scale.item(), @@ -791,6 +813,7 @@ class ROCmFlashAttentionImpl(AttentionImpl): attn_masks[0][None] if attn_masks is not None else None, full_scales, + output_scale, ) elif self.use_naive_attn: if self.num_kv_heads != self.num_heads: @@ -868,6 +891,7 @@ class ROCmFlashAttentionImpl(AttentionImpl): decode_query.dtype, head_size, block_size, gqa_ratio, decode_meta.max_decode_seq_len, self.sliding_window, self.kv_cache_dtype, self.alibi_slopes) + if use_custom: max_seq_len = (decode_meta.max_decode_seq_len if self.attn_type != AttentionType.ENCODER_DECODER else @@ -879,7 +903,7 @@ class ROCmFlashAttentionImpl(AttentionImpl): assert _PARTITION_SIZE_ROCM % block_size == 0 tmp_output = torch.empty( size=(num_seqs, num_heads, max_num_partitions, head_size), - dtype=output.dtype, + dtype=query.dtype, device=output.device, ) exp_sums = torch.empty( @@ -913,9 +937,17 @@ class ROCmFlashAttentionImpl(AttentionImpl): self.kv_cache_dtype, layer._k_scale, layer._v_scale, + output_scale, ) else: - output[num_prefill_tokens:] = paged_attn.forward_decode( + # PagedAttention does not support fused quant, manually quantize + if output_scale is None: + out_pa = output[num_prefill_tokens:] + else: + out_pa = torch.empty_like(output[num_prefill_tokens:], + dtype=query.dtype) + + out_pa[:] = 
paged_attn.forward_decode( decode_query, key_cache, value_cache, @@ -936,6 +968,14 @@ class ROCmFlashAttentionImpl(AttentionImpl): layer._v_scale, ) + # Manually perform quantization + if output_scale is not None: + out_uq = out_pa.view(-1, self.num_heads * self.head_size) + out_q = output.view(-1, self.num_heads * self.head_size) + ops.scaled_fp8_quant(out_uq, + output_scale, + output=out_q[num_prefill_tokens:]) + # Reshape the output tensor. return output.view(-1, self.num_heads * self.head_size) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 23231c323f139..3e1336a5ac3b2 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -433,7 +433,6 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): self.sliding_window = sliding_window self.kv_cache_dtype = kv_cache_dtype - assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads self.need_mask = (self.alibi_slopes is not None or self.sliding_window is not None) @@ -459,6 +458,7 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): kv_cache: torch.Tensor, attn_metadata: TorchSDPAMetadata, # type: ignore output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with torch SDPA and PagedAttention. 
@@ -473,6 +473,10 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): Returns: shape = [num_tokens, num_heads * head_size] """ + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for TorchSDPABackendImpl") # For warming-up if attn_metadata is None: diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index e3f02a193614a..34e059067d84d 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -373,7 +373,7 @@ class CommonAttentionState(AttentionState): f"Expected attn_backend name to be either 'XFORMERS'," \ f"'ROCM_FLASH', or 'FLASH_ATTN', but " \ f"got '{self.runner.attn_backend.get_name()}'" - self._add_additonal_input_buffers_for_enc_dec_model( + self._add_additional_input_buffers_for_enc_dec_model( attn_metadata=attn_metadata, input_buffers=input_buffers) return input_buffers @@ -427,7 +427,7 @@ class CommonAttentionState(AttentionState): attn_metadata.max_encoder_seq_len = self.runner.max_seq_len_to_capture attn_metadata.num_encoder_tokens = 0 - def _add_additonal_input_buffers_for_enc_dec_model( + def _add_additional_input_buffers_for_enc_dec_model( self, attn_metadata, input_buffers: Dict[str, Any]): """ Saves additional input buffers specific to the encoder-decoder model diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 04ef928b7d7b3..b583240c73c41 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -415,7 +415,6 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]): self.sliding_window = sliding_window self.kv_cache_dtype = kv_cache_dtype - assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads supported_head_sizes = PagedAttention.get_supported_head_sizes() @@ -435,6 +434,7 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]): kv_cache: torch.Tensor, attn_metadata: "XFormersMetadata", 
output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. @@ -487,6 +487,11 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]): Returns: shape = [num_tokens, num_heads * head_size] """ + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for XFormersImpl") + attn_type = self.attn_type # Check that appropriate attention metadata attributes are # selected for the desired attention type diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index a5fbd1a1c0166..6d9c6f51b34df 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -80,6 +80,9 @@ class Attention(nn.Module): calculate_kv_scales = False if num_kv_heads is None: num_kv_heads = num_heads + assert num_heads % num_kv_heads == 0, \ + f"num_heads ({num_heads}) is not " \ + f"divisible by num_kv_heads ({num_kv_heads})" # The default k/v_scale is set to 1.0. 
This is ignored # when kv-cache is not fp8, and should be used with @@ -291,7 +294,9 @@ class MultiHeadAttention(nn.Module): self.scale = scale self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads - assert self.num_heads % self.num_kv_heads == 0 + assert self.num_heads % self.num_kv_heads == 0, \ + f"num_heads ({self.num_heads}) is not " \ + f"divisible by num_kv_heads ({self.num_kv_heads})" self.num_queries_per_kv = self.num_heads // self.num_kv_heads dtype = torch.get_default_dtype() @@ -430,6 +435,7 @@ def unified_attention_with_output( value: torch.Tensor, output: torch.Tensor, layer_name: str, + output_scale: Optional[torch.Tensor] = None, ) -> None: wait_for_kv_layer_from_connector(layer_name) forward_context: ForwardContext = get_forward_context() @@ -444,7 +450,8 @@ def unified_attention_with_output( value, kv_cache, attn_metadata, - output=output) + output=output, + output_scale=output_scale) maybe_save_kv_layer_to_connector(layer_name, kv_cache) @@ -455,6 +462,7 @@ def unified_attention_with_output_fake( value: torch.Tensor, output: torch.Tensor, layer_name: str, + output_scale: Optional[torch.Tensor] = None, ) -> None: return diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py index a26e713b1c624..49070e4c7ae6a 100644 --- a/vllm/attention/ops/triton_flash_attention.py +++ b/vllm/attention/ops/triton_flash_attention.py @@ -25,9 +25,14 @@ Not currently supported: import torch from vllm.platforms import current_platform -from vllm.platforms.rocm import on_gfx1x from vllm.triton_utils import tl, triton +# Avoid misleading ROCm warning. 
+if current_platform.is_rocm(): + from vllm.platforms.rocm import on_gfx1x +else: + on_gfx1x = lambda *args, **kwargs: False + torch_dtype: tl.constexpr = torch.float16 diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index 92c09e6dd0640..c65f09523a3c7 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -7,6 +7,7 @@ # - Chih-Chieh Yang # - Thomas Parnell +import torch import triton import triton.language as tl @@ -28,6 +29,24 @@ def apply_softcap(S, x): return x * (p1 - p2) / (p1 + p2) +@triton.jit +def find_seq_idx(query_start_len_ptr, target_idx, num_seqs, + BLOCK_Q: tl.constexpr, use_q_block_mode: tl.constexpr): + left: tl.int32 = 0 + right = num_seqs + while left < right: + mid = (left + right) // 2 + val = tl.load(query_start_len_ptr + mid) + mid_val = val // BLOCK_Q + mid if use_q_block_mode else val + + if mid_val <= target_idx: + left = mid + 1 + else: + right = mid + + return left - 1 + + @triton.jit def kernel_unified_attention_2d( output_ptr, # [num_tokens, num_query_heads, head_size] @@ -67,21 +86,12 @@ def kernel_unified_attention_2d( num_seqs: tl.int32, BLOCK_M: tl.constexpr, # int ): - q_block_global_idx = tl.program_id(0) kv_head_idx = tl.program_id(1) - left: tl.int32 = 0 - right = num_seqs - while left < right: - mid = (left + right) // 2 - mid_val = tl.load(query_start_len_ptr + mid) // BLOCK_Q + mid - if mid_val <= q_block_global_idx: - left = mid + 1 - else: - right = mid + seq_idx = find_seq_idx(query_start_len_ptr, q_block_global_idx, num_seqs, + BLOCK_Q, True) - seq_idx = left - 1 q_block_start_idx = tl.load(query_start_len_ptr + seq_idx) // BLOCK_Q + seq_idx @@ -242,6 +252,311 @@ def kernel_unified_attention_2d( ) +@triton.jit +def kernel_unified_attention_3d( + segm_output_ptr, + # [num_tokens, num_query_heads, num_segments, head_size] + segm_max_ptr, # [num_tokens, num_query_heads, num_segments] + segm_expsum_ptr, # 
[num_tokens, num_query_heads, num_segments] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, num_kv_heads, head_size // x, blk_size, x] + value_cache_ptr, # [num_blks, num_kv_heads, head_size, blk_size] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + BLOCK_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + num_seqs: tl.int32, + BLOCK_M: tl.constexpr, # int + NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int +): + q_block_global_idx = tl.program_id(0) + kv_head_idx = tl.program_id(1) + segm_idx = tl.program_id(2) + + seq_idx = find_seq_idx(query_start_len_ptr, q_block_global_idx, num_seqs, + BLOCK_Q, True) + + q_block_start_idx = tl.load(query_start_len_ptr + + seq_idx) // BLOCK_Q + seq_idx + + q_block_local_idx = q_block_global_idx - q_block_start_idx + + cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx) + cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1) + + cur_batch_query_len = cur_batch_in_all_stop_index \ + - cur_batch_in_all_start_index + + if q_block_local_idx * BLOCK_Q >= 
cur_batch_query_len: + return + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # number of segments for this particular sequence + num_segments = NUM_SEGMENTS_PER_SEQ + blocks_per_segment = cdiv_fn(seq_len, num_segments * BLOCK_SIZE) + + if segm_idx * blocks_per_segment * BLOCK_SIZE >= seq_len: + return + + offs_m = tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_SIZE_PADDED) + + query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv + + query_offset_0 = cur_batch_in_all_start_index + query_pos + query_offset_1 = kv_head_idx * num_queries_per_kv + \ + offs_m % num_queries_per_kv + + query_offset = (query_offset_0[:, None] * query_stride_0 + + query_offset_1[:, None] * query_stride_1 + offs_d[None, :]) + + dim_mask = tl.where(offs_d < HEAD_SIZE, 1, 0).to(tl.int1) + query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1) + query_mask_1 = tl.where(query_offset_1 < num_query_heads, 1, 0).to(tl.int1) + + # Q : (BLOCK_M, HEAD_SIZE_PADDED) + Q = tl.load( + query_ptr + query_offset, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + other=0.0, + ) + + block_table_offset = seq_idx * block_table_stride + + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) + + # context length for this particular sequences + context_len = seq_len - cur_batch_query_len + + # alibi slope for this head + if USE_ALIBI_SLOPES: + alibi_slope = tl.load(alibi_slopes_ptr + query_offset_1, + mask=query_mask_1, + other=0.0) + + num_blocks = cdiv_fn(seq_len, BLOCK_SIZE) + + # iterate through tiles within current segment + for j in range( + segm_idx * blocks_per_segment, + min((segm_idx + 1) * blocks_per_segment, num_blocks), + ): + physical_block_idx = tl.load(block_tables_ptr + block_table_offset + j) + + offs_n = tl.arange(0, BLOCK_SIZE) + + v_offset = (physical_block_idx * 
stride_v_cache_0 + + kv_head_idx * stride_v_cache_2 + + offs_d[None, :] * stride_v_cache_3 + + offs_n[:, None] * stride_v_cache_1) + + k_offset = (physical_block_idx * stride_k_cache_0 + + kv_head_idx * stride_k_cache_2 + + offs_d[:, None] * stride_k_cache_3 + + offs_n[None, :] * stride_k_cache_1) + + # K : (HEAD_SIZE, BLOCK_SIZE) + K_load = tl.load(key_cache_ptr + k_offset, + mask=dim_mask[:, None], + other=0.0) + + if K_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + K = K_load + else: + K = (K_load.to(tl.float32) * tl.load(k_scale)).to(Q.dtype) + else: + K = K_load + + # V : (BLOCK_SIZE, HEAD_SIZE) + V_load = tl.load(value_cache_ptr + v_offset, + mask=dim_mask[None, :], + other=0.0) + + if V_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + V = V_load + else: + V = (V_load.to(tl.float32) * tl.load(v_scale)).to(Q.dtype) + else: + V = V_load + + seq_offset = j * BLOCK_SIZE + offs_n + + seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 + + # S : (BLOCK_M, BLOCK_SIZE) + S = tl.zeros(shape=(BLOCK_M, BLOCK_SIZE), dtype=tl.float32) + + S += scale * tl.dot(Q, K) + + if USE_SOFTCAP: + S = apply_softcap(S, softcap) + + S = tl.where(query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, + S, float("-inf")) + + if SLIDING_WINDOW > 0: + S = tl.where((context_len + query_pos[:, None] - seq_offset) + < SLIDING_WINDOW, S, float("-inf")) + + if USE_ALIBI_SLOPES: + S += alibi_slope[:, None] * (seq_offset - context_len) + + # compute running maximum + # m_j : (BLOCK_M,) + m_j = tl.maximum(M, tl.max(S, axis=1)) + # For sliding window there's a chance the max is -inf due to masking of + # the entire row. 
In this case we need to set m_j 0 to avoid NaN + m_j = tl.where(m_j > float("-inf"), m_j, 0.0) + + # P : (BLOCK_M, BLOCK_SIZE,) + P = tl.exp(S - m_j[:, None]) + + # l_j : (BLOCK_M,) + l_j = tl.sum(P, axis=1) + + # alpha : (BLOCK_M, ) + alpha = tl.exp(M - m_j) + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc = acc * alpha[:, None] + + # update constants + L = L * alpha + l_j + M = m_j + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc += tl.dot(P.to(V.dtype), V) + + segm_output_offset = ( + query_offset_0[:, None].to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + query_offset_1[:, None] * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + segm_idx * HEAD_SIZE_PADDED + tl.arange(0, HEAD_SIZE_PADDED)[None, :]) + tl.store( + segm_output_ptr + segm_output_offset, + acc, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + ) + segm_offset = (query_offset_0.to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ) + + query_offset_1 * NUM_SEGMENTS_PER_SEQ + segm_idx) + tl.store(segm_max_ptr + segm_offset, M, mask=query_mask_0 & query_mask_1) + tl.store(segm_expsum_ptr + segm_offset, + L, + mask=query_mask_0 & query_mask_1) + + +@triton.jit +def reduce_segments( + output_ptr, # [num_tokens, num_query_heads, head_size] + segm_output_ptr, + #[num_tokens, num_query_heads, max_num_segments, head_size] + segm_max_ptr, # [num_tokens, num_query_heads, max_num_segments] + segm_expsum_ptr, # [num_tokens, num_query_heads, max_num_segments] + seq_lens_ptr, # [num_seqs] + num_seqs, # int + num_query_heads: tl.constexpr, # int + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + block_table_stride: tl.int64, # int + BLOCK_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int, must be power of 2 + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int +): + query_token_idx = 
tl.program_id(0) + query_head_idx = tl.program_id(1) + + seq_idx = find_seq_idx(query_start_len_ptr, query_token_idx, num_seqs, + BLOCK_Q, False) + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # number of segments for this particular sequence + num_segments = NUM_SEGMENTS_PER_SEQ + blocks_per_segment = cdiv_fn(seq_len, num_segments * BLOCK_SIZE) + + # create masks for subsequent loads + act_num_segments = cdiv_fn(seq_len, blocks_per_segment * BLOCK_SIZE) + segm_mask = tl.arange(0, NUM_SEGMENTS_PER_SEQ) < tl.full( + [NUM_SEGMENTS_PER_SEQ], act_num_segments, dtype=tl.int32) + dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1, + 0).to(tl.int1) + + # load segment maxima + segm_offset = (query_token_idx.to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ) + + query_head_idx * NUM_SEGMENTS_PER_SEQ + + tl.arange(0, NUM_SEGMENTS_PER_SEQ)) + segm_max = tl.load(segm_max_ptr + segm_offset, + mask=segm_mask, + other=float("-inf")) + overall_max = tl.max(segm_max) + + # load and rescale segment exp sums + segm_expsum = tl.load(segm_expsum_ptr + segm_offset, + mask=segm_mask, + other=0.0) + segm_expsum = segm_expsum * tl.exp(segm_max - overall_max) + overall_expsum = tl.sum(segm_expsum) + + # load, rescale, and add segment attention outputs + segm_output_offset = ( + query_token_idx.to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + query_head_idx * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + tl.arange(0, NUM_SEGMENTS_PER_SEQ)[:, None] * HEAD_SIZE_PADDED + + tl.arange(0, HEAD_SIZE_PADDED)[None, :]) + segm_output = tl.load( + segm_output_ptr + segm_output_offset, + mask=segm_mask[:, None] & dim_mask[None, :], + other=0.0, + ) + segm_output *= tl.exp(segm_max - overall_max)[:, None] + acc_sum = tl.sum(segm_output, axis=0) + # safely divide by overall_expsum, returning 0.0 if overall_expsum is 0 + acc = tl.where(overall_expsum == 0.0, 0.0, acc_sum / overall_expsum) + + # write 
result + output_offset = (query_token_idx * output_stride_0 + + query_head_idx * output_stride_1 + + tl.arange(0, HEAD_SIZE_PADDED)) + tl.store(output_ptr + output_offset, acc, mask=dim_mask) + + def unified_attention( q, k, @@ -291,44 +606,133 @@ def unified_attention( # = floor(q.shape[0] / BLOCK_Q) + num_seqs total_num_q_blocks = q.shape[0] // BLOCK_Q + num_seqs - kernel_unified_attention_2d[( - total_num_q_blocks, - num_kv_heads, - )]( - output_ptr=out, - query_ptr=q, - key_cache_ptr=k, - value_cache_ptr=v, - block_tables_ptr=block_table, - seq_lens_ptr=seqused_k, - alibi_slopes_ptr=alibi_slopes, - scale=softmax_scale, - k_scale=k_descale, - v_scale=v_descale, - softcap=softcap, - num_query_heads=num_query_heads, - num_queries_per_kv=num_queries_per_kv, - block_table_stride=block_table.stride(0), - query_stride_0=q.stride(0), - query_stride_1=q.stride(1), - output_stride_0=out.stride(0), - output_stride_1=out.stride(1), - BLOCK_SIZE=block_size, - HEAD_SIZE=head_size, - HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), - USE_ALIBI_SLOPES=use_alibi_slopes, - USE_SOFTCAP=(softcap > 0), - SLIDING_WINDOW=(1 + window_size[0]), - stride_k_cache_0=k.stride(0), - stride_k_cache_1=k.stride(1), - stride_k_cache_2=k.stride(2), - stride_k_cache_3=k.stride(3), - stride_v_cache_0=v.stride(0), - stride_v_cache_1=v.stride(1), - stride_v_cache_2=v.stride(2), - stride_v_cache_3=v.stride(3), - query_start_len_ptr=cu_seqlens_q, - BLOCK_Q=BLOCK_Q, - num_seqs=num_seqs, - BLOCK_M=BLOCK_M, - ) + # if batch contains a prefill + if max_seqlen_q > 1 or total_num_q_blocks * num_kv_heads > 128: + kernel_unified_attention_2d[( + total_num_q_blocks, + num_kv_heads, + )]( + output_ptr=out, + query_ptr=q, + key_cache_ptr=k, + value_cache_ptr=v, + block_tables_ptr=block_table, + seq_lens_ptr=seqused_k, + alibi_slopes_ptr=alibi_slopes, + scale=softmax_scale, + k_scale=k_descale, + v_scale=v_descale, + softcap=softcap, + num_query_heads=num_query_heads, + num_queries_per_kv=num_queries_per_kv, + 
block_table_stride=block_table.stride(0), + query_stride_0=q.stride(0), + query_stride_1=q.stride(1), + output_stride_0=out.stride(0), + output_stride_1=out.stride(1), + BLOCK_SIZE=block_size, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + USE_ALIBI_SLOPES=use_alibi_slopes, + USE_SOFTCAP=(softcap > 0), + SLIDING_WINDOW=(1 + window_size[0]), + stride_k_cache_0=k.stride(0), + stride_k_cache_1=k.stride(1), + stride_k_cache_2=k.stride(2), + stride_k_cache_3=k.stride(3), + stride_v_cache_0=v.stride(0), + stride_v_cache_1=v.stride(1), + stride_v_cache_2=v.stride(2), + stride_v_cache_3=v.stride(3), + query_start_len_ptr=cu_seqlens_q, + BLOCK_Q=BLOCK_Q, + num_seqs=num_seqs, + BLOCK_M=BLOCK_M, + ) + else: + # for initial version, NUM_SEGMENTS = 16 is chosen as a default + # value that showed good performance in tests + NUM_SEGMENTS = 16 + + segm_output = torch.empty( + q.shape[0], + num_query_heads, + NUM_SEGMENTS, + triton.next_power_of_2(head_size), + dtype=torch.float32, + device=q.device, + ) + segm_max = torch.empty( + q.shape[0], + num_query_heads, + NUM_SEGMENTS, + dtype=torch.float32, + device=q.device, + ) + segm_expsum = torch.empty( + q.shape[0], + num_query_heads, + NUM_SEGMENTS, + dtype=torch.float32, + device=q.device, + ) + + kernel_unified_attention_3d[( + total_num_q_blocks, num_kv_heads, NUM_SEGMENTS)]( + segm_output_ptr=segm_output, + segm_max_ptr=segm_max, + segm_expsum_ptr=segm_expsum, + query_ptr=q, + key_cache_ptr=k, + value_cache_ptr=v, + block_tables_ptr=block_table, + seq_lens_ptr=seqused_k, + alibi_slopes_ptr=alibi_slopes, + scale=softmax_scale, + k_scale=k_descale, + v_scale=v_descale, + softcap=softcap, + num_query_heads=num_query_heads, + num_queries_per_kv=num_queries_per_kv, + block_table_stride=block_table.stride(0), + query_stride_0=q.stride(0), + query_stride_1=q.stride(1), + BLOCK_SIZE=block_size, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + 
USE_ALIBI_SLOPES=use_alibi_slopes, + USE_SOFTCAP=(softcap > 0), + SLIDING_WINDOW=(1 + window_size[0]), + stride_k_cache_0=k.stride(0), + stride_k_cache_1=k.stride(1), + stride_k_cache_2=k.stride(2), + stride_k_cache_3=k.stride(3), + stride_v_cache_0=v.stride(0), + stride_v_cache_1=v.stride(1), + stride_v_cache_2=v.stride(2), + stride_v_cache_3=v.stride(3), + query_start_len_ptr=cu_seqlens_q, + BLOCK_Q=BLOCK_Q, + num_seqs=num_seqs, + BLOCK_M=BLOCK_M, + NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + ) + + reduce_segments[(q.shape[0], num_query_heads)]( + output_ptr=out, + segm_output_ptr=segm_output, + segm_max_ptr=segm_max, + segm_expsum_ptr=segm_expsum, + seq_lens_ptr=seqused_k, + num_seqs=num_seqs, + num_query_heads=num_query_heads, + output_stride_0=out.stride(0), + output_stride_1=out.stride(1), + block_table_stride=block_table.stride(0), + BLOCK_SIZE=block_size, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + query_start_len_ptr=cu_seqlens_q, + BLOCK_Q=BLOCK_Q, + NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + ) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 4da9f7368e631..3efbe5695711f 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -50,6 +50,11 @@ try: except ImportError: librosa = PlaceholderModule("librosa") +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + logger = logging.getLogger(__name__) # ----------------------------------------------------------------------------- @@ -458,6 +463,253 @@ class ShareGPTDataset(BenchmarkDataset): return samples +def add_dataset_parser(parser: FlexibleArgumentParser): + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.", + ) + parser.add_argument( + "--dataset-name", + type=str, + default="random", + choices=["sharegpt", "burstgpt", "sonnet", "random", 
"hf", "custom"], + help="Name of the dataset to benchmark on.", + ) + parser.add_argument( + "--dataset-path", + type=str, + default=None, + help="Path to the sharegpt/sonnet dataset. " + "Or the huggingface dataset ID if using HF dataset.", + ) + + # group for dataset specific arguments + custom_group = parser.add_argument_group("custom dataset options") + custom_group.add_argument( + "--custom-output-len", + type=int, + default=256, + help= + "Number of output tokens per request, used only for custom dataset.", + ) + custom_group.add_argument( + "--custom-skip-chat-template", + action="store_true", + help= + "Skip applying chat template to prompt, used only for custom dataset.", + ) + + sonnet_group = parser.add_argument_group("sonnet dataset options") + sonnet_group.add_argument( + "--sonnet-input-len", + type=int, + default=550, + help= + "Number of input tokens per request, used only for sonnet dataset.", + ) + sonnet_group.add_argument( + "--sonnet-output-len", + type=int, + default=150, + help= + "Number of output tokens per request, used only for sonnet dataset.", + ) + sonnet_group.add_argument( + "--sonnet-prefix-len", + type=int, + default=200, + help= + "Number of prefix tokens per request, used only for sonnet dataset.", + ) + + sharegpt_group = parser.add_argument_group("sharegpt dataset options") + sharegpt_group.add_argument( + "--sharegpt-output-len", + type=int, + default=None, + help="Output length for each request. 
Overrides the output length " + "from the ShareGPT dataset.", + ) + + random_group = parser.add_argument_group("random dataset options") + random_group.add_argument( + "--random-input-len", + type=int, + default=1024, + help= + "Number of input tokens per request, used only for random sampling.", + ) + random_group.add_argument( + "--random-output-len", + type=int, + default=128, + help= + "Number of output tokens per request, used only for random sampling.", + ) + random_group.add_argument( + "--random-range-ratio", + type=float, + default=0.0, + help="Range ratio for sampling input/output length, " + "used only for random sampling. Must be in the range [0, 1) to define " + "a symmetric sampling range" + "[length * (1 - range_ratio), length * (1 + range_ratio)].", + ) + random_group.add_argument( + "--random-prefix-len", + type=int, + default=0, + help=("Number of fixed prefix tokens before the random context " + "in a request. " + "The total input length is the sum of `random-prefix-len` and " + "a random " + "context length sampled from [input_len * (1 - range_ratio), " + "input_len * (1 + range_ratio)]."), + ) + + hf_group = parser.add_argument_group("hf dataset options") + hf_group.add_argument("--hf-subset", + type=str, + default=None, + help="Subset of the HF dataset.") + hf_group.add_argument("--hf-split", + type=str, + default=None, + help="Split of the HF dataset.") + hf_group.add_argument( + "--hf-output-len", + type=int, + default=None, + help="Output length for each request. 
Overrides the output lengths " + "from the sampled HF dataset.", + ) + + +def get_samples(args, tokenizer) -> list[SampleRequest]: + if args.dataset_name == "custom": + dataset = CustomDataset(dataset_path=args.dataset_path) + input_requests = dataset.sample( + num_requests=args.num_prompts, + tokenizer=tokenizer, + output_len=args.custom_output_len, + skip_chat_template=args.custom_skip_chat_template, + ) + + elif args.dataset_name == "sonnet": + dataset = SonnetDataset(dataset_path=args.dataset_path) + # For the "sonnet" dataset, formatting depends on the backend. + if args.endpoint_type == "openai-chat": + input_requests = dataset.sample( + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=False, + ) + else: + assert tokenizer.chat_template or tokenizer.default_chat_template, ( + "Tokenizer/model must have chat template for sonnet dataset.") + input_requests = dataset.sample( + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=True, + ) + + elif args.dataset_name == "hf": + # all following datasets are implemented from the + # HuggingFaceDataset base class + if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: + dataset_class = VisionArenaDataset + args.hf_split = "train" + args.hf_subset = None + elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: + dataset_class = InstructCoderDataset + args.hf_split = "train" + elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS: + dataset_class = MTBenchDataset + args.hf_split = "train" + elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: + dataset_class = ConversationDataset + elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: + dataset_class = AIMODataset + args.hf_split 
= "train" + elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501 + dataset_class = NextEditPredictionDataset + args.hf_split = "train" + elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS: + dataset_class = ASRDataset + args.hf_split = "train" + else: + supported_datasets = set([ + dataset_name for cls in HuggingFaceDataset.__subclasses__() + for dataset_name in cls.SUPPORTED_DATASET_PATHS + ]) + raise ValueError( + f"Unsupported dataset path: {args.dataset_path}. " + "Huggingface dataset only supports dataset_path" + f" from one of following: {supported_datasets}. " + "Please consider contributing if you would " + "like to add support for additional dataset formats.") + + if dataset_class.IS_MULTIMODAL and args.endpoint_type not in [ + "openai-chat", + "openai-audio", + ]: + # multi-modal benchmark is only available on OpenAI Chat backend. + raise ValueError( + "Multi-modal content is only supported on 'openai-chat' and " + "'openai-audio' backend.") + input_requests = dataset_class( + dataset_path=args.dataset_path, + dataset_subset=args.hf_subset, + dataset_split=args.hf_split, + random_seed=args.seed, + ).sample( + num_requests=args.num_prompts, + tokenizer=tokenizer, + output_len=args.hf_output_len, + ) + + else: + # For datasets that follow a similar structure, use a mapping. + dataset_mapping = { + "sharegpt": + lambda: ShareGPTDataset(random_seed=args.seed, + dataset_path=args.dataset_path).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + output_len=args.sharegpt_output_len, + ), + "burstgpt": + lambda: BurstGPTDataset(random_seed=args.seed, + dataset_path=args.dataset_path). 
+ sample(tokenizer=tokenizer, num_requests=args.num_prompts), + "random": + lambda: RandomDataset(dataset_path=args.dataset_path).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + prefix_len=args.random_prefix_len, + input_len=args.random_input_len, + output_len=args.random_output_len, + range_ratio=args.random_range_ratio, + ), + } + + try: + input_requests = dataset_mapping[args.dataset_name]() + except KeyError as err: + raise ValueError(f"Unknown dataset: {args.dataset_name}") from err + + return input_requests + + # ----------------------------------------------------------------------------- # Custom Dataset Implementation # ----------------------------------------------------------------------------- diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 019ebcf8d5041..4487d2d6841a1 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -32,12 +32,8 @@ import numpy as np from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase -from vllm.benchmarks.datasets import (AIMODataset, ASRDataset, BurstGPTDataset, - ConversationDataset, HuggingFaceDataset, - InstructCoderDataset, MTBenchDataset, - NextEditPredictionDataset, RandomDataset, - SampleRequest, ShareGPTDataset, - SonnetDataset, VisionArenaDataset) +from vllm.benchmarks.datasets import (SampleRequest, add_dataset_parser, + get_samples) from vllm.benchmarks.endpoint_request_func import (ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput, @@ -543,6 +539,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, def add_cli_args(parser: argparse.ArgumentParser): + add_dataset_parser(parser) parser.add_argument( "--endpoint-type", type=str, @@ -571,20 +568,6 @@ def add_cli_args(parser: argparse.ArgumentParser): default="/v1/completions", help="API endpoint.", ) - parser.add_argument( - "--dataset-name", - type=str, - default="random", - choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"], - help="Name of the 
dataset to benchmark on.", - ) - parser.add_argument( - "--dataset-path", - type=str, - default=None, - help="Path to the sharegpt/sonnet dataset. " - "Or the huggingface dataset ID if using HF dataset.", - ) parser.add_argument( "--max-concurrency", type=int, @@ -611,12 +594,6 @@ def add_cli_args(parser: argparse.ArgumentParser): "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 ) parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument( - "--num-prompts", - type=int, - default=1000, - help="Number of prompts to process.", - ) parser.add_argument( "--logprobs", type=int, @@ -648,7 +625,6 @@ def add_cli_args(parser: argparse.ArgumentParser): "bursty requests. A higher burstiness value (burstiness > 1) " "results in a more uniform arrival of requests.", ) - parser.add_argument("--seed", type=int, default=0) parser.add_argument( "--trust-remote-code", action="store_true", @@ -739,89 +715,6 @@ def add_cli_args(parser: argparse.ArgumentParser): "and the blog: https://hao-ai-lab.github.io/blogs/distserve", ) - # group for dataset specific arguments - sonnet_group = parser.add_argument_group("sonnet dataset options") - sonnet_group.add_argument( - "--sonnet-input-len", - type=int, - default=550, - help= - "Number of input tokens per request, used only for sonnet dataset.", - ) - sonnet_group.add_argument( - "--sonnet-output-len", - type=int, - default=150, - help= - "Number of output tokens per request, used only for sonnet dataset.", - ) - sonnet_group.add_argument( - "--sonnet-prefix-len", - type=int, - default=200, - help= - "Number of prefix tokens per request, used only for sonnet dataset.", - ) - - sharegpt_group = parser.add_argument_group("sharegpt dataset options") - sharegpt_group.add_argument( - "--sharegpt-output-len", - type=int, - default=None, - help="Output length for each request. 
Overrides the output length " - "from the ShareGPT dataset.", - ) - - random_group = parser.add_argument_group("random dataset options") - random_group.add_argument( - "--random-input-len", - type=int, - default=1024, - help= - "Number of input tokens per request, used only for random sampling.", - ) - random_group.add_argument( - "--random-output-len", - type=int, - default=128, - help= - "Number of output tokens per request, used only for random sampling.", - ) - random_group.add_argument( - "--random-range-ratio", - type=float, - default=0.0, - help="Range ratio for sampling input/output length, " - "used only for random sampling. Must be in the range [0, 1) to define " - "a symmetric sampling range" - "[length * (1 - range_ratio), length * (1 + range_ratio)].", - ) - random_group.add_argument( - "--random-prefix-len", - type=int, - default=0, - help="Number of fixed prefix tokens before random " - " context. The length range of context in a random " - " request is [random-prefix-len, " - " random-prefix-len + random-prefix-len * random-range-ratio).") - - hf_group = parser.add_argument_group("hf dataset options") - hf_group.add_argument("--hf-subset", - type=str, - default=None, - help="Subset of the HF dataset.") - hf_group.add_argument("--hf-split", - type=str, - default=None, - help="Split of the HF dataset.") - hf_group.add_argument( - "--hf-output-len", - type=int, - default=None, - help="Output length for each request. 
Overrides the output lengths " - "from the sampled HF dataset.", - ) - sampling_group = parser.add_argument_group("sampling parameters") sampling_group.add_argument( "--top-p", @@ -884,7 +777,6 @@ def main(args: argparse.Namespace): random.seed(args.seed) np.random.seed(args.seed) - endpoint_type = args.endpoint_type label = args.label model_id = args.model model_name = args.served_model_name @@ -907,115 +799,8 @@ def main(args: argparse.Namespace): "Please specify '--dataset-name' and the corresponding " "'--dataset-path' if required.") - if args.dataset_name == "sonnet": - dataset = SonnetDataset(dataset_path=args.dataset_path) - # For the "sonnet" dataset, formatting depends on the backend. - if args.backend == "openai-chat": - input_requests = dataset.sample( - num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - return_prompt_formatted=False, - ) - else: - assert tokenizer.chat_template or tokenizer.default_chat_template, ( - "Tokenizer/model must have chat template for sonnet dataset.") - input_requests = dataset.sample( - num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - return_prompt_formatted=True, - ) - - elif args.dataset_name == "hf": - # all following datasets are implemented from the - # HuggingFaceDataset base class - if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: - dataset_class = VisionArenaDataset - args.hf_split = "train" - args.hf_subset = None - elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: - dataset_class = InstructCoderDataset - args.hf_split = "train" - elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS: - dataset_class = MTBenchDataset - args.hf_split = "train" - elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: - dataset_class = 
ConversationDataset - args.hf_split = "train" - elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: - dataset_class = AIMODataset - args.hf_split = "train" - elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501 - dataset_class = NextEditPredictionDataset - args.hf_split = "train" - elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS: - dataset_class = ASRDataset - args.hf_split = "train" - else: - supported_datasets = set([ - dataset_name for cls in HuggingFaceDataset.__subclasses__() - for dataset_name in cls.SUPPORTED_DATASET_PATHS - ]) - raise ValueError( - f"Unsupported dataset path: {args.dataset_path}. " - "Huggingface dataset only supports dataset_path" - f" from one of following: {supported_datasets}. " - "Please consider contributing if you would " - "like to add support for additional dataset formats.") - - if dataset_class.IS_MULTIMODAL and endpoint_type not in [ - "openai-chat", - "openai-audio", - ]: - # multi-modal benchmark is only available on OpenAI Chat backend. - raise ValueError( - "Multi-modal content is only supported on 'openai-chat' and " - "'openai-audio' backend.") - input_requests = dataset_class( - dataset_path=args.dataset_path, - dataset_subset=args.hf_subset, - dataset_split=args.hf_split, - random_seed=args.seed, - ).sample( - num_requests=args.num_prompts, - tokenizer=tokenizer, - output_len=args.hf_output_len, - ) - - else: - # For datasets that follow a similar structure, use a mapping. - dataset_mapping = { - "sharegpt": - lambda: ShareGPTDataset(random_seed=args.seed, - dataset_path=args.dataset_path).sample( - tokenizer=tokenizer, - num_requests=args.num_prompts, - output_len=args.sharegpt_output_len, - ), - "burstgpt": - lambda: BurstGPTDataset(random_seed=args.seed, - dataset_path=args.dataset_path). 
- sample(tokenizer=tokenizer, num_requests=args.num_prompts), - "random": - lambda: RandomDataset(dataset_path=args.dataset_path).sample( - tokenizer=tokenizer, - num_requests=args.num_prompts, - prefix_len=args.random_prefix_len, - input_len=args.random_input_len, - output_len=args.random_output_len, - range_ratio=args.random_range_ratio, - ), - } - - try: - input_requests = dataset_mapping[args.dataset_name]() - except KeyError as err: - raise ValueError(f"Unknown dataset: {args.dataset_name}") from err + # Load the dataset. + input_requests = get_samples(args, tokenizer) goodput_config_dict = check_goodput_args(args) # Collect the sampling parameters. @@ -1043,7 +828,7 @@ def main(args: argparse.Namespace): benchmark_result = asyncio.run( benchmark( - endpoint_type=endpoint_type, + endpoint_type=args.endpoint_type, api_url=api_url, base_url=base_url, model_id=model_id, @@ -1073,7 +858,7 @@ def main(args: argparse.Namespace): # Setup current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") result_json["date"] = current_dt - result_json["endpoint_type"] = endpoint_type + result_json["endpoint_type"] = args.endpoint_type result_json["label"] = label result_json["model_id"] = model_id result_json["tokenizer_id"] = tokenizer_id @@ -1118,7 +903,7 @@ def main(args: argparse.Namespace): base_model_id = model_id.split("/")[-1] max_concurrency_str = (f"-concurrency{args.max_concurrency}" if args.max_concurrency is not None else "") - label = label or endpoint_type + label = label or args.endpoint_type file_name = f"{label}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa if args.result_filename: file_name = args.result_filename diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 5af3b7efed2d6..8bb8c3a2a2e4e 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -7,6 +7,7 @@ import os import pprint import time from collections.abc import Sequence +from contextlib import contextmanager from 
typing import Any, Callable, Optional import torch @@ -31,7 +32,7 @@ logger = init_logger(__name__) def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: if compilation_config.use_inductor: if envs.VLLM_USE_STANDALONE_COMPILE and is_torch_equal_or_newer( - "2.8.0"): + "2.8.0a"): logger.debug("Using InductorStandaloneAdaptor") return InductorStandaloneAdaptor() else: @@ -66,7 +67,25 @@ class CompilerManager: def compute_hash(self, vllm_config: VllmConfig) -> str: return self.compiler.compute_hash(vllm_config) - def initialize_cache(self, cache_dir: str, disable_cache: bool = False): + def initialize_cache(self, + cache_dir: str, + disable_cache: bool = False, + prefix: str = ""): + """ + Initialize the cache directory for the compiler. + + The organization of the cache directory is as follows: + cache_dir=/path/to/hash_str/rank_i_j/prefix/ + inside cache_dir, there will be: + - vllm_compile_cache.py + - computation_graph.py + - transformed_code.py + + for multiple prefixes, they can share the same + base cache dir of /path/to/hash_str/rank_i_j/ , + to store some common compilation artifacts. + """ + self.disable_cache = disable_cache self.cache_dir = cache_dir self.cache_file_path = os.path.join(cache_dir, "vllm_compile_cache.py") @@ -80,7 +99,8 @@ class CompilerManager: self.cache = ast.literal_eval(f.read()) self.compiler.initialize_cache(cache_dir=cache_dir, - disable_cache=disable_cache) + disable_cache=disable_cache, + prefix=prefix) def save_to_file(self): if self.disable_cache or not self.is_cache_updated: @@ -310,6 +330,25 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): return output +# the tag for the part of model being compiled, +# e.g. backbone/eagle_head +model_tag: str = "backbone" + + +@contextmanager +def set_model_tag(tag: str): + """Context manager to set the model tag.""" + global model_tag + assert tag != model_tag, \ + f"Model tag {tag} is the same as the current tag {model_tag}." 
+ old_tag = model_tag + model_tag = tag + try: + yield + finally: + model_tag = old_tag + + class VllmBackend: """The compilation backend for `torch.compile` with vLLM. It is used for compilation level of `CompilationLevel.PIECEWISE`, @@ -341,7 +380,17 @@ class VllmBackend: def __init__( self, vllm_config: VllmConfig, + prefix: str = "", ): + + # if the model is initialized with a non-empty prefix, + # then usually it's enough to use that prefix, + # e.g. launguage_model, vision_model, etc. + # when multiple parts are initialized as independent + # models, we need to use the model_tag to distinguish + # them, e.g. backbone (default), eagle_head, etc. + self.prefix = prefix or model_tag + global global_graph_pool if global_graph_pool is None: global_graph_pool = current_platform.graph_pool_handle() @@ -441,16 +490,13 @@ class VllmBackend: ) self.compilation_config.cache_dir = cache_dir - if compilation_counter.num_graphs_seen > 0: - cache_dir = self.compilation_config.cache_dir + \ - f'-{compilation_counter.num_graphs_seen}' - else: - cache_dir = self.compilation_config.cache_dir + cache_dir = self.compilation_config.cache_dir os.makedirs(cache_dir, exist_ok=True) self.compilation_config.cache_dir = cache_dir rank = vllm_config.parallel_config.rank dp_rank = vllm_config.parallel_config.data_parallel_rank - local_cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}") + local_cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}", + self.prefix) os.makedirs(local_cache_dir, exist_ok=True) self.compilation_config.local_cache_dir = local_cache_dir @@ -462,7 +508,8 @@ class VllmBackend: logger.info("Using cache directory: %s for vLLM's torch.compile", local_cache_dir) - self.compiler_manager.initialize_cache(local_cache_dir, disable_cache) + self.compiler_manager.initialize_cache(local_cache_dir, disable_cache, + self.prefix) # when dynamo calls the backend, it means the bytecode # transform and analysis are done diff --git 
a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 36c810ec2dc96..fd39a6127d00b 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -28,11 +28,22 @@ class CompilerInterface: # This is a class-level attribute. name: str - def initialize_cache(self, cache_dir: str, disable_cache: bool = False): + def initialize_cache(self, + cache_dir: str, + disable_cache: bool = False, + prefix: str = ""): """ when the vLLM process uses `cache_dir` as the cache directory, the compiler should initialize itself with the cache directory, e.g. by re-directing its own cache directory to a sub-directory. + + prefix can be used in combination with cache_dir to figure out the base + cache directory, e.g. there're multiple parts of model being compiled, + but we want to share the same cache directory for all of them. + + e.g. + cache_dir = "/path/to/dir/backbone", prefix = "backbone" + cache_dir = "/path/to/dir/eagle_head", prefix = "eagle_head" """ pass @@ -166,7 +177,10 @@ class InductorStandaloneAdaptor(CompilerInterface): usedforsecurity=False).hexdigest()[:10] return hash_str - def initialize_cache(self, cache_dir: str, disable_cache: bool = False): + def initialize_cache(self, + cache_dir: str, + disable_cache: bool = False, + prefix: str = ""): self.cache_dir = cache_dir def compile( @@ -242,18 +256,23 @@ class InductorAdaptor(CompilerInterface): usedforsecurity=False).hexdigest()[:10] return hash_str - def initialize_cache(self, cache_dir: str, disable_cache: bool = False): + def initialize_cache(self, + cache_dir: str, + disable_cache: bool = False, + prefix: str = ""): self.cache_dir = cache_dir + self.prefix = prefix + self.base_cache_dir = cache_dir[:-len(prefix)] if prefix else cache_dir if disable_cache: return # redirect the cache directory to a sub-directory # set flags so that Inductor and Triton store their cache # in the cache_dir, then users only need to copy the cache_dir # to another 
machine to reuse the cache. - inductor_cache = os.path.join(cache_dir, "inductor_cache") + inductor_cache = os.path.join(self.base_cache_dir, "inductor_cache") os.makedirs(inductor_cache, exist_ok=True) os.environ["TORCHINDUCTOR_CACHE_DIR"] = inductor_cache - triton_cache = os.path.join(cache_dir, "triton_cache") + triton_cache = os.path.join(self.base_cache_dir, "triton_cache") os.makedirs(triton_cache, exist_ok=True) os.environ["TRITON_CACHE_DIR"] = triton_cache @@ -298,14 +317,14 @@ class InductorAdaptor(CompilerInterface): nonlocal file_path compiled_fn = inductor_compiled_graph.current_callable file_path = compiled_fn.__code__.co_filename # noqa - if not file_path.startswith(self.cache_dir): + if not file_path.startswith(self.base_cache_dir): # hooked in the align_inputs_from_check_idxs function # in torch/_inductor/utils.py for cell in compiled_fn.__closure__: if not callable(cell.cell_contents): continue if cell.cell_contents.__code__.co_filename.startswith( - self.cache_dir): + self.base_cache_dir): # this is the real file path compiled from Inductor file_path = cell.cell_contents.__code__.co_filename break @@ -325,14 +344,15 @@ class InductorAdaptor(CompilerInterface): nonlocal file_path compiled_fn = inductor_compiled_graph.current_callable file_path = compiled_fn.__code__.co_filename # noqa - if not file_path.startswith(self.cache_dir): + if not file_path.startswith(self.base_cache_dir): # hooked in the align_inputs_from_check_idxs function # in torch/_inductor/utils.py for cell in compiled_fn.__closure__: if not callable(cell.cell_contents): continue code = cell.cell_contents.__code__ - if code.co_filename.startswith(self.cache_dir): + if code.co_filename.startswith( + self.base_cache_dir): # this is the real file path # compiled from Inductor file_path = code.co_filename diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py index c584c103f4410..165347cfccef7 100644 --- a/vllm/compilation/counter.py +++ b/vllm/compilation/counter.py @@ 
-15,7 +15,7 @@ class CompilationCounter: # not including the splitting ops num_piecewise_capturable_graphs_seen: int = 0 num_backend_compilations: int = 0 - num_cudagraph_caputured: int = 0 + num_cudagraph_captured: int = 0 # InductorAdapter.compile calls num_inductor_compiles: int = 0 # EagerAdapter.compile calls diff --git a/vllm/compilation/cuda_piecewise_backend.py b/vllm/compilation/cuda_piecewise_backend.py index 16a7098749f8e..18c3dfe0f171e 100644 --- a/vllm/compilation/cuda_piecewise_backend.py +++ b/vllm/compilation/cuda_piecewise_backend.py @@ -14,6 +14,7 @@ from vllm.compilation.backends import VllmBackend from vllm.compilation.counter import compilation_counter from vllm.compilation.monitor import end_monitoring_torch_compile from vllm.config import VllmConfig +from vllm.forward_context import get_forward_context from vllm.logger import init_logger from vllm.utils import weak_ref_tensors @@ -138,7 +139,10 @@ class CUDAPiecewiseBackend: if self.is_last_graph and not self.to_be_compiled_sizes: self.check_for_ending_compilation() - if not entry.use_cudagraph: + # Skip CUDA graphs if this entry doesn't use them OR + # if we're supposed to skip them globally + skip_cuda_graphs = get_forward_context().skip_cuda_graphs + if not entry.use_cudagraph or skip_cuda_graphs: return entry.runnable(*args) if entry.cudagraph is None: @@ -194,7 +198,7 @@ class CUDAPiecewiseBackend: entry.output = weak_ref_tensors(output) entry.cudagraph = cudagraph - compilation_counter.num_cudagraph_caputured += 1 + compilation_counter.num_cudagraph_captured += 1 # important: we need to return the output, rather than # the weak ref of the output, so that pytorch can correctly diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index 7e2c5b4fe66a6..9d908fcae3dfd 100644 --- a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing 
import Callable, NamedTuple, Optional +from typing import Callable, ClassVar, NamedTuple, Optional import torch import torch._inductor.pattern_matcher as pm @@ -34,36 +33,66 @@ RMS_OP = torch.ops._C.rms_norm.default RMS_ADD_OP = torch.ops._C.fused_add_rms_norm.default +# Use proxy as NamedTuple direct subclasses cannot have static members +class _GroupShape(NamedTuple): + row: int + col: int + + +class GroupShape(_GroupShape): + """ + This class describes the quantization group shape. + It includes static members for common shapes (per-tensor, per-token). + """ + + # Aliases for common quantization group shapes + PER_TENSOR: ClassVar['GroupShape'] + PER_TOKEN: ClassVar['GroupShape'] + + +GroupShape.PER_TENSOR = GroupShape(-1, -1) +GroupShape.PER_TOKEN = GroupShape(1, -1) + + class QuantKey(NamedTuple): """ Named tuple for identifying the type of quantization. dtype: quantized data type static: static quantization if True, dynamic if False - per_tensor: per-tensor quantization if True, per-token if False + group_shape: quantization group shape symmetric: symmetric if True, asymmetric if False + + TODO(luka) use QuantDescriptor once standardized: + https://github.com/vllm-project/vllm/issues/8913 + """ dtype: torch.dtype static: bool - per_tensor: bool = True + group_shape: GroupShape symmetric: bool = True def __str__(self): + group_shape = ('per_tensor' + if self.group_shape == GroupShape.PER_TENSOR else + ('per_token' if self.group_shape == GroupShape.PER_TOKEN + else str(self.group_shape))) + return (f"QuantKey({'static' if self.static else 'dynamic'}," - f"{fx.graph.dtype_abbrs[self.dtype]}," - f"{'per_tensor' if self.per_tensor else 'per_token'}," + f"{fx.graph.dtype_abbrs[self.dtype]},{group_shape}," f"{'a' if not self.symmetric else ''}symmetric)") -kFp8StaticTensorSym = QuantKey(FP8_DTYPE, True, True, True) -kFp8DynamicTensorSym = QuantKey(FP8_DTYPE, False, True, True) -kFp8DynamicTokenSym = QuantKey(FP8_DTYPE, False, False, True) +kFp8StaticTensorSym = 
QuantKey(FP8_DTYPE, True, GroupShape.PER_TENSOR, True) +kFp8DynamicTensorSym = QuantKey(FP8_DTYPE, False, GroupShape.PER_TENSOR, True) +kFp8DynamicTokenSym = QuantKey(FP8_DTYPE, False, GroupShape.PER_TOKEN, True) QUANT_OPS: dict[QuantKey, OpOverload] = { - kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default, # noqa + kFp8StaticTensorSym: + torch.ops._C.static_scaled_fp8_quant.default, # noqa: E501 kFp8DynamicTensorSym: - torch.ops._C.dynamic_scaled_fp8_quant.default, # noqa + torch.ops._C.dynamic_scaled_fp8_quant.default, # noqa: E501 kFp8DynamicTokenSym: - torch.ops._C.dynamic_per_token_scaled_fp8_quant.default, # noqa + torch.ops._C.dynamic_per_token_scaled_fp8_quant.default, # noqa: E501 } @@ -83,13 +112,13 @@ class FusedRMSQuantKey(NamedTuple): FUSED_OPS: dict[FusedRMSQuantKey, OpOverload] = { FusedRMSQuantKey(kFp8StaticTensorSym, False): - torch.ops._C.rms_norm_static_fp8_quant.default, # noqa + torch.ops._C.rms_norm_static_fp8_quant.default, # noqa: E501 FusedRMSQuantKey(kFp8StaticTensorSym, True): - torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, # noqa + torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, # noqa: E501 FusedRMSQuantKey(kFp8DynamicTokenSym, False): - torch.ops._C.rms_norm_dynamic_per_token_quant.default, # noqa + torch.ops._C.rms_norm_dynamic_per_token_quant.default, # noqa: E501 FusedRMSQuantKey(kFp8DynamicTokenSym, True): - torch.ops._C.rms_norm_dynamic_per_token_quant.default, # noqa + torch.ops._C.rms_norm_dynamic_per_token_quant.default, # noqa: E501 } @@ -177,10 +206,11 @@ class RMSNormStaticQuantPattern(RMSNormQuantPattern): quant_dtype: torch.dtype, symmetric=True): fused_key = FusedRMSQuantKey(fused_add=False, - quant=QuantKey(dtype=quant_dtype, - static=True, - per_tensor=True, - symmetric=symmetric)) + quant=QuantKey( + dtype=quant_dtype, + static=True, + group_shape=GroupShape.PER_TENSOR, + symmetric=symmetric)) super().__init__(epsilon, fused_key) def register(self, pm_pass: PatternMatcherPass): @@ 
-233,10 +263,11 @@ class FusedAddRMSNormStaticQuantPattern(RMSNormQuantPattern): quant_dtype: torch.dtype, symmetric=True): key = FusedRMSQuantKey(fused_add=True, - quant=QuantKey(dtype=quant_dtype, - static=True, - per_tensor=True, - symmetric=symmetric)) + quant=QuantKey( + dtype=quant_dtype, + static=True, + group_shape=GroupShape.PER_TENSOR, + symmetric=symmetric)) super().__init__(epsilon, key) def register(self, pm_pass: PatternMatcherPass, @@ -323,12 +354,12 @@ class RMSNormDynamicQuantPattern(RMSNormQuantPattern): def __init__(self, epsilon: float, quant_dtype: torch.dtype, - per_tensor: bool, + group_shape: GroupShape = GroupShape.PER_TOKEN, symmetric=True): key = FusedRMSQuantKey(fused_add=False, quant=QuantKey(dtype=quant_dtype, static=False, - per_tensor=per_tensor, + group_shape=group_shape, symmetric=symmetric)) super().__init__(epsilon, key) @@ -421,12 +452,12 @@ class FusedAddRMSNormDynamicQuantPattern(RMSNormQuantPattern): def __init__(self, epsilon: float, quant_dtype: torch.dtype, - per_tensor: bool = True, + group_shape: GroupShape = GroupShape.PER_TOKEN, symmetric=True): key = FusedRMSQuantKey(fused_add=True, quant=QuantKey(dtype=quant_dtype, static=False, - per_tensor=per_tensor, + group_shape=group_shape, symmetric=symmetric)) super().__init__(epsilon, key) @@ -566,16 +597,12 @@ class FusionPass(VllmInductorPass): self.patterns, self.record_match) # Fuse rms_norm + dynamic per-token fp8 quant - RMSNormDynamicQuantPattern(epsilon, FP8_DTYPE, - per_tensor=False).register( - self.patterns, self.record_match) + RMSNormDynamicQuantPattern(epsilon, FP8_DTYPE).register( + self.patterns, self.record_match) # Fuse fused_add_rms_norm + dynamic per-token fp8 quant - FusedAddRMSNormDynamicQuantPattern(epsilon, - FP8_DTYPE, - per_tensor=False).register( - self.patterns, - self.record_match) + FusedAddRMSNormDynamicQuantPattern(epsilon, FP8_DTYPE).register( + self.patterns, self.record_match) # WARNING: This is a hack to clear the pattern matcher cache # 
and allow multiple values of epsilon. diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py new file mode 100644 index 0000000000000..cf57e5ed282e2 --- /dev/null +++ b/vllm/compilation/fusion_attn.py @@ -0,0 +1,165 @@ +# SPDX-License-Identifier: Apache-2.0 + +import torch +import torch._inductor.pattern_matcher as pm +from torch._higher_order_ops.auto_functionalize import auto_functionalized +from torch._inductor.pattern_matcher import PatternMatcherPass +from torch._subclasses.fake_tensor import (FakeTensorMode, + unset_fake_temporarily) + +from vllm.attention import Attention +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.platforms import current_platform + +from .fusion import QUANT_OPS, GroupShape, QuantKey, empty_bf16, empty_fp32 +from .vllm_inductor_pass import VllmInductorPass + +logger = init_logger(__name__) + +ATTN_OP = torch.ops.vllm.unified_attention_with_output.default +RESHAPE_OP = torch.ops.aten.reshape.default + + +class AttentionStaticQuantPattern: + + def __init__( + self, + layer_name: str, + num_heads: int, + head_size: int, + quant_dtype: torch.dtype, + symmetric=True, + ): + self.layer_name = layer_name + self.num_heads = num_heads + self.head_size = head_size + self.quant_dtype = quant_dtype + self.quant_key = QuantKey(dtype=quant_dtype, + static=True, + group_shape=GroupShape.PER_TENSOR, + symmetric=symmetric) + assert self.quant_key in QUANT_OPS, \ + f"unsupported quantization scheme {self.quant_key}" + self.QUANT_OP = QUANT_OPS[self.quant_key] + + def empty_quant(self, *args, **kwargs): + kwargs = {'dtype': self.quant_dtype, 'device': "cuda", **kwargs} + return torch.empty(*args, **kwargs) + + def register_if_supported(self, pm_pass: PatternMatcherPass, + layer: Attention): + if layer.impl.fused_output_quant_supported(self.quant_dtype, + self.quant_key.static, + self.quant_key.group_shape): + self._register(pm_pass) + + def _register(self, pm_pass: PatternMatcherPass): + + def 
pattern(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + output_attn: torch.Tensor, output_quant: torch.Tensor, + scale: torch.Tensor): + view_7 = RESHAPE_OP(output_attn, + [-1, self.num_heads, self.head_size]) + + at1 = auto_functionalized(ATTN_OP, + query=q, + key=k, + value=v, + output=view_7, + layer_name=self.layer_name, + output_scale=None) + attn_out_view = RESHAPE_OP(at1[1], + [-1, self.num_heads * self.head_size]) + + at2 = auto_functionalized(self.QUANT_OP, + result=output_quant, + input=attn_out_view, + scale=scale) + return at2[1] + + def replacement(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + output_attn: torch.Tensor, output_quant: torch.Tensor, + scale: torch.Tensor): + view_7 = RESHAPE_OP(output_quant, + [-1, self.num_heads, self.head_size]) + + at1 = auto_functionalized(ATTN_OP, + query=q, + key=k, + value=v, + output=view_7, + layer_name=self.layer_name, + output_scale=scale) + + return RESHAPE_OP(at1[1], [-1, self.num_heads * self.head_size]) + + # Need custom fake mode, otherwise tracing happens with real tensors. + # That would not work for the unified_attention custom op. 
+ with unset_fake_temporarily(), FakeTensorMode(): + inputs = [ + empty_bf16(5, self.num_heads, self.head_size), # q + empty_bf16(5, self.num_heads, self.head_size), # k + empty_bf16(5, self.num_heads, self.head_size), # v + empty_bf16(5, self.num_heads * self.head_size), # attn_output + self.empty_quant(5, self.num_heads * + self.head_size), # quant_output + empty_fp32(1, 1) # scale + ] + + def wrap_trace_fn(process_fx, trace_fn): + + def wrapped(*args, **kwargs): + return process_fx(trace_fn(*args, **kwargs)) + + return wrapped + + def fx_view_to_reshape(gm: torch.fx.GraphModule): + from torch._inductor.fx_passes.post_grad import view_to_reshape + view_to_reshape(gm) + return gm + + pm.register_replacement( + pattern, replacement, inputs, + wrap_trace_fn(fx_view_to_reshape, pm.fwd_only), pm_pass) + + +class AttnFusionPass(VllmInductorPass): + """ + This pass fuses post-attention quantization onto attention if supported. + + It uses the pattern matcher and matches each layer manually, as strings + cannot be wildcarded. This also lets us check support on attention layers + upon registration instead of during pattern matching. + + Currently, only static fp8 quant is supported, but patterns could easily be + added for other quant schemes and dtypes. The bigger hurdle for wider + support are attention kernels, which need to support fusing output quant. + """ + + def __init__(self, config: VllmConfig): + super().__init__(config) + self.static_fwd_ctx = config.compilation_config.static_forward_context + + self.patterns = PatternMatcherPass(pass_name="attn_fusion_pass") + + for key, layer in self.static_fwd_ctx.items(): + pattern = AttentionStaticQuantPattern(key, layer.num_heads, + layer.head_size, + current_platform.fp8_dtype()) + pattern.register_if_supported(self.patterns, layer) + if len(self.static_fwd_ctx) == 0: + logger.warning( + "Attention + quant fusion is enabled, but " + "CompilationConfig.static_forward_context is empty. 
" + "Cannot access attention layers so no fusion " + "patterns were registered.") + + def __call__(self, graph: torch.fx.graph.Graph) -> None: + self.begin() + self.dump_graph(graph, "before_attn_fusion") + + count = self.patterns.apply(graph) + logger.debug("Fused quantization onto %s attention nodes", count) + self.dump_graph(graph, "after_attn_fusion") + self.end_and_log() diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py index 9ef3889323887..2db8b5441bd6f 100644 --- a/vllm/compilation/fx_utils.py +++ b/vllm/compilation/fx_utils.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import operator -from collections.abc import Iterable +from collections.abc import Iterable, Iterator from typing import Optional from torch import fx @@ -14,6 +14,10 @@ def is_func(node: fx.Node, target) -> bool: return node.op == "call_function" and node.target == target +def is_auto_func(node: fx.Node, op: OpOverload) -> bool: + return is_func(node, auto_functionalized) and node.args[0] == op + + # Returns the first specified node with the given op (if it exists) def find_specified_fn_maybe(nodes: Iterable[fx.Node], op: OpOverload) -> Optional[fx.Node]: @@ -60,3 +64,21 @@ def find_getitem(node: fx.Node, idx: int) -> fx.Node: ret = find_getitem_maybe(node, idx) assert ret is not None, f"Could not find getitem {idx} in node {node}" return ret + + +# An auto-functionalization-aware utility for finding nodes with a specific op +def find_op_nodes(op: OpOverload, graph: fx.Graph) -> Iterator[fx.Node]: + if not op._schema.is_mutable: + yield from graph.find_nodes(op="call_function", target=op) + + for n in graph.find_nodes(op="call_function", target=auto_functionalized): + if n.args[0] == op: + yield n + + +# Asserts that the node only has one user and returns it +# Even if a node has only 1 user, it might share storage with another node, +# which might need to be taken into account. 
+def get_only_user(node: fx.Node) -> fx.Node: + assert len(node.users) == 1 + return next(iter(node.users)) diff --git a/vllm/compilation/noop_elimination.py b/vllm/compilation/noop_elimination.py index 46f70dcdc6886..4888d4d1298e3 100644 --- a/vllm/compilation/noop_elimination.py +++ b/vllm/compilation/noop_elimination.py @@ -23,7 +23,23 @@ class NoOpEliminationPass(VllmInductorPass): in the 2D-case. Additionally, torch internal no-op elimination pass does not handle certain slice variants. + Cases handled: + 1. A chain of reshapes is equivalent to the last reshape called on the + base tensor (input of the first reshape). + 2. A reshape that produces the shape of the input is redundant + 3. A slice that produces the shape of the input is redundant + Example graph 1: + mul_1: "f16[s0, 4096]" = ... + view_1: "f16[s0, 128, 32]" = torch.reshape(mul_1, [-1, 128, 32]) + view_2: "f16[s0, 4096]" = torch.reshape(view_2, [-1, 4096]) + view_3: "f16[s0, 128, 32]" = torch.reshape(view_3, [-1, 128, 32]) + + Can be replaced with: + mul_1: "f16[s0, 4096]" = ... + view_3: "f16[s0, 128, 32]" = ... + + Example graph 2: getitem_1: "f16[s0, 4096]" = ... view_1: "f16[s0, 4096]" = torch.reshape(getitem_1, [-1, 4096]) at = auto_functionalized(static_scaled_fp8_quant, input = view_1, ...) @@ -34,7 +50,7 @@ class NoOpEliminationPass(VllmInductorPass): at = auto_functionalized(static_scaled_fp8_quant, input = getitem_1, ...) out: "f8e4m3fn[s0, 4096]" = at[1] - Example graph 2: + Example graph 3: arg0: "s0" = SymInt(s0) scaled_mm: "f16[s0, 4096]" = ... 
slice_1: "f16[s0, 4096]" = torch.slice(scaled_mm, -1, 0, arg0) @@ -58,6 +74,18 @@ class NoOpEliminationPass(VllmInductorPass): # Remove no-op reshapes/views: for node in graph.nodes: if is_func(node, torch.ops.aten.reshape.default): + # Case 1: rewrite reshape chains to reshapes on the base tensor + input = node.args[0] + # If the input is a reshape, rebind to that node + if is_func(input, torch.ops.aten.reshape.default): + # The new input is guaranteed not to be a reshape, + # because we process nodes in order + node.update_arg(0, input.args[0]) + if len(input.users) == 0: + graph.erase_node(input) + count += 1 + + # Case 2: remove this reshape if it produces the original shape input, shape = node.args[:2] input_shape = input.meta["val"].shape if len(shape) != len(input_shape): diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 621c89a144874..28a59905ecf86 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -10,6 +10,7 @@ from .activation_quant_fusion import ActivationQuantFusionPass from .collective_fusion import AsyncTPPass from .fix_functionalization import FixFunctionalizationPass from .fusion import FusionPass +from .fusion_attn import AttnFusionPass from .inductor_pass import CustomGraphPass, InductorPass, get_pass_context from .noop_elimination import NoOpEliminationPass from .sequence_parallelism import SequenceParallelismPass @@ -59,6 +60,9 @@ class PostGradPassManager(CustomGraphPass): if self.pass_config.enable_async_tp: self.passes += [AsyncTPPass(config)] + if self.pass_config.enable_attn_fusion: + self.passes += [AttnFusionPass(config)] + self.fix_functionalization = FixFunctionalizationPass(config) def add(self, pass_: InductorPass): diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py index 3ccbf52d9fd38..628e9e204c552 100644 --- a/vllm/compilation/vllm_inductor_pass.py +++ b/vllm/compilation/vllm_inductor_pass.py @@ -4,6 +4,7 @@ import 
time import torch +from torch._dynamo.utils import lazy_format_graph_code from vllm.config import PassConfig, VllmConfig # yapf: disable @@ -34,6 +35,8 @@ class VllmInductorPass(InductorPass): self.pass_name = self.__class__.__name__ def dump_graph(self, graph: torch.fx.Graph, stage: str, always=False): + lazy_format_graph_code(stage, graph.owning_module) + if stage in self.pass_config.dump_graph_stages or always: # Make sure filename includes rank in the distributed setting parallel = p_is_init() and get_tp_world_size() > 1 diff --git a/vllm/config.py b/vllm/config.py index bd5e9dd635766..c4fc320ec4d92 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -44,7 +44,8 @@ from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, - try_get_generation_config, try_get_safetensors_metadata, uses_mrope) + try_get_generation_config, try_get_safetensors_metadata, + try_get_tokenizer_config, uses_mrope) from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3, maybe_model_redirect from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, @@ -416,6 +417,8 @@ class ModelConfig: available.\n - "vllm" will use the vLLM model implementation.\n - "transformers" will use the Transformers model implementation.""" + override_attention_dtype: Optional[str] = None + """Override dtype for attention""" def compute_hash(self) -> str: """ @@ -516,6 +519,12 @@ class ModelConfig: from vllm.platforms import current_platform + if (self.override_attention_dtype is not None + and not current_platform.is_rocm()): + warnings.warn( + "override-attention-dtype is set but not using ROCm platform", + stacklevel=2) + if (self.enable_sleep_mode and not current_platform.is_sleep_mode_available()): raise ValueError( @@ -1420,13 +1429,19 @@ class ModelConfig: return getattr(self.hf_config, "matryoshka_dimensions", None) 
def get_and_verify_max_len(self, max_model_len: int): + tokenizer_config = try_get_tokenizer_config( + self.tokenizer, + trust_remote_code=self.trust_remote_code, + revision=self.tokenizer_revision) max_model_len = _get_and_verify_max_len( hf_config=self.hf_text_config, + tokenizer_config=tokenizer_config, max_model_len=max_model_len, disable_sliding_window=self.disable_sliding_window, sliding_window_len=self.get_hf_config_sliding_window(), spec_target_max_model_len=self.spec_target_max_model_len, encoder_config=self.encoder_config) + logger.info("Using max model len %s", max_model_len) return max_model_len @@ -1791,7 +1806,7 @@ class ParallelConfig: """The full name of the worker class to use. If "auto", the worker class will be determined based on the platform.""" sd_worker_cls: str = "auto" - """The full name of the worker class to use for speculative decofing. + """The full name of the worker class to use for speculative decoding. If "auto", the worker class will be determined based on the platform.""" worker_extension_cls: str = "" """The full name of the worker extension class to use. The worker extension @@ -3274,6 +3289,7 @@ def _get_and_verify_dtype( def _get_and_verify_max_len( hf_config: PretrainedConfig, + tokenizer_config: Optional[dict], max_model_len: Optional[int], disable_sliding_window: bool, sliding_window_len: Optional[Union[int, list[Optional[int]]]], @@ -3300,7 +3316,7 @@ def _get_and_verify_max_len( "max_seq_length", "seq_len", ] - # Choose the smallest "max_length" from the possible keys. 
+ # Choose the smallest "max_length" from the possible keys max_len_key = None for key in possible_keys: max_len = getattr(hf_config, key, None) @@ -3323,6 +3339,13 @@ def _get_and_verify_max_len( derived_max_model_len = min(derived_max_model_len, sliding_window_len_min) + # Consider model_max_length in tokenizer_config + if tokenizer_config: + tokenizer_model_max_length = tokenizer_config.get( + "model_max_length", derived_max_model_len) + derived_max_model_len = min(derived_max_model_len, + tokenizer_model_max_length) + # If none of the keys were found in the config, use a default and # log a warning. if derived_max_model_len == float("inf"): @@ -3795,9 +3818,10 @@ class PassConfig: its own stages (before, after, maybe in-between).""" dump_graph_dir: Path = Path(".") """Directory to dump the graphs.""" - # TODO(luka) better pass enabling system. enable_fusion: bool = True - """Whether to enable the custom fusion pass.""" + """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass.""" + enable_attn_fusion: bool = False + """Whether to enable the custom attention+quant fusion pass.""" enable_noop: bool = True """Whether to enable the custom no-op elimination pass.""" enable_sequence_parallelism: bool = False @@ -3805,6 +3829,8 @@ class PassConfig: enable_async_tp: bool = False """Whether to enable async TP.""" + # TODO(luka) better pass enabling system. + def uuid(self): """ Produces a hash unique to the pass configuration. @@ -3812,18 +3838,20 @@ class PassConfig: Do not include dump_graph_* in the hash - they don't affect compilation. 
""" - include = { - "enable_fusion", "enable_noop", "enable_sequence_parallelism", - "enable_async_tp" - } - dict_ = {k: v for k, v in asdict(self).items() if k in include} + exclude = {"dump_graph_stages", "dump_graph_dir"} + dict_ = {k: v for k, v in asdict(self).items() if k not in exclude} return InductorPass.hash_dict(dict_) def __post_init__(self) -> None: - if not self.enable_noop and self.enable_fusion: - logger.warning_once( - "Fusion enabled but reshape elimination disabled. " - "RMSNorm + quant (fp8) fusion might not work") + if not self.enable_noop: + if self.enable_fusion: + logger.warning_once( + "Fusion enabled but reshape elimination disabled. " + "RMSNorm/SiluMul + quant (fp8) fusion might not work") + if self.enable_attn_fusion: + logger.warning_once( + "Fusion enabled but reshape elimination disabled. " + "Attention + quant (fp8) fusion might not work") @config @@ -3930,12 +3958,14 @@ class CompilationConfig: constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`.""" # CudaGraph compilation - use_cudagraph: bool = False + use_cudagraph: bool = field(default_factory=lambda: envs.VLLM_USE_V1) """Whether to use cudagraph inside compilation. - False: cudagraph inside compilation is not used. - True: cudagraph inside compilation is used. It requires that all input buffers have fixed addresses, and all splitting ops write their outputs to input buffers. + In the vLLM V1 Engine, this flag only applies for + CompilationLevel.PIECEWISE (aka -O3). Note that this is orthogonal to the cudagraph capture logic outside of compilation. TODO: move outside cudagraph logic into compilation. @@ -4434,16 +4464,27 @@ class VllmConfig: self.compilation_config.custom_ops.append("+rms_norm") if envs.VLLM_USE_V1 and self.model_config is not None and \ not self.model_config.enforce_eager: - # FIXME(rob): Add function to set all of these. 
- if not self.compilation_config.custom_ops: - self.compilation_config.custom_ops = ["none"] - self.compilation_config.use_cudagraph = True + # By default, V1 uses piecewise CUDA graphs. If full_cuda_graph + # is set to True, full CUDA graphs will be used. self.compilation_config.cudagraph_num_of_warmups = 1 self.compilation_config.pass_config.enable_fusion = False self.compilation_config.pass_config.enable_noop = False self.compilation_config.level = CompilationLevel.PIECEWISE self.compilation_config.set_splitting_ops_for_v1() + # The behavior of custom ops with inductor depends on the config: + # - If use_inductor=True and custom_ops is empty: + # Inductor generates Triton kernels for all registered custom ops + # (default behavior) + # - If use_inductor=True and custom_ops is non-empty: + # Custom CUDA kernels are used for specified ops while inductor + # generates Triton kernels for remaining ops, including misc torch + # ops in the model. + if (not self.compilation_config.custom_ops + and self.compilation_config.use_inductor): + # Let inductor generate Triton kernels for the custom ops. + self.compilation_config.custom_ops = ["none"] + self._set_cudagraph_sizes() if self.cache_config.cpu_offload_gb > 0 and \ @@ -4468,7 +4509,6 @@ class VllmConfig: "full_cuda_graph is not supported with " "cascade attention. Disabling cascade attention.") self.model_config.disable_cascade_attn = True - self.cache_config.enable_prefix_caching = False if self.parallel_config.enable_microbatching and \ self.compilation_config.level >= CompilationLevel.PIECEWISE: @@ -4504,13 +4544,13 @@ class VllmConfig: # warning message here and will log it later. if not (current_platform.is_cuda() or current_platform.is_rocm()): # Hybrid KV cache manager is not supported on non-GPU platforms. 
- self.disable_hybrid_kv_cache_manager = True + self.scheduler_config.disable_hybrid_kv_cache_manager = True if self.kv_transfer_config is not None: # Hybrid KV cache manager is not compatible with KV transfer. - self.disable_hybrid_kv_cache_manager = True + self.scheduler_config.disable_hybrid_kv_cache_manager = True if self.kv_events_config is not None: # Hybrid KV cache manager is not compatible with KV events. - self.disable_hybrid_kv_cache_manager = True + self.scheduler_config.disable_hybrid_kv_cache_manager = True def update_sizes_for_sequence_parallelism(self, possible_sizes: list) -> list: @@ -4660,10 +4700,13 @@ class VllmConfig: _current_vllm_config: Optional[VllmConfig] = None +_current_prefix: Optional[str] = None @contextmanager -def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False): +def set_current_vllm_config(vllm_config: VllmConfig, + check_compile=False, + prefix: Optional[str] = None): """ Temporarily set the current vLLM config. Used during model initialization. @@ -4671,12 +4714,14 @@ def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False): so that all modules can access it, e.g. custom ops can access the vLLM config to determine how to dispatch. 
""" - global _current_vllm_config + global _current_vllm_config, _current_prefix old_vllm_config = _current_vllm_config + old_prefix = _current_prefix from vllm.compilation.counter import compilation_counter num_models_seen = compilation_counter.num_models_seen try: _current_vllm_config = vllm_config + _current_prefix = prefix yield except Exception: raise @@ -4700,6 +4745,7 @@ def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False): vllm_config.model_config.model) finally: _current_vllm_config = old_vllm_config + _current_prefix = old_prefix def get_current_vllm_config() -> VllmConfig: @@ -4713,6 +4759,15 @@ def get_current_vllm_config() -> VllmConfig: return _current_vllm_config +def get_current_model_prefix() -> str: + """ + Get the prefix of the model that's currently being initialized. + """ + assert _current_prefix is not None, \ + "Current model prefix is not set. " + return _current_prefix + + def contains_object_print(text): """ Check if the text looks like a printed Python object, e.g. 
diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index a33399204fafa..4ec5a775f465c 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -270,6 +270,10 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager): self.block_tables[seq_id].free() del self.block_tables[seq_id] + def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None: + seq_id = seq.seq_id + self._computed_blocks_tracker.remove_seq(seq_id) + def free_cross(self, seq_group: SequenceGroup) -> None: request_id = seq_group.request_id if request_id not in self.cross_block_tables: diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index ba290eeda12b5..69b9169ddd8a9 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -133,3 +133,7 @@ class BlockSpaceManager(ABC): @abstractmethod def get_num_cached_tokens(self, seq: Sequence) -> int: pass + + @abstractmethod + def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None: + pass \ No newline at end of file diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py index 71b22942a3edd..679515924e85d 100644 --- a/vllm/core/placeholder_block_space_manager.py +++ b/vllm/core/placeholder_block_space_manager.py @@ -98,3 +98,6 @@ class PlaceholderBlockSpaceManager(BlockSpaceManager): def get_num_cached_tokens(self, seq: Sequence) -> int: return 0 + + def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None: + return diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 44be855b1bfde..0ef0396996b62 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -901,6 +901,8 @@ class Scheduler: num_new_tokens=num_new_tokens_uncached, num_new_seqs=num_new_seqs, ): + self.remove_seq_from_computed_blocks_tracker( + seq_group, SequenceStatus.SWAPPED) break if lora_int_id > 0 and curr_loras is not None: @@ -1024,6 +1026,9 @@ class Scheduler: # Put the sequence back into the waiting queue 
waiting_queue.appendleft(seq_group) + self.remove_seq_from_computed_blocks_tracker( + seq_group, SequenceStatus.WAITING) + waiting_queue = deque(sorted(waiting_queue, key=self._get_priority)) self.waiting = waiting_queue @@ -1113,6 +1118,8 @@ class Scheduler: ) for seq in waiting_seqs: seq.status = SequenceStatus.FINISHED_IGNORED + self.remove_seq_from_computed_blocks_tracker( + seq_group, SequenceStatus.FINISHED_IGNORED) ignored_seq_groups.append(seq_group) waiting_queue.popleft() continue @@ -1126,6 +1133,8 @@ class Scheduler: can_allocate = self.block_manager.can_allocate( seq_group, num_lookahead_slots=num_lookahead_slots) if can_allocate == AllocStatus.LATER: + self.remove_seq_from_computed_blocks_tracker( + seq_group, SequenceStatus.WAITING) break elif can_allocate == AllocStatus.NEVER: logger.warning( @@ -1136,6 +1145,8 @@ class Scheduler: ) for seq in waiting_seqs: seq.status = SequenceStatus.FINISHED_IGNORED + self.remove_seq_from_computed_blocks_tracker( + seq_group, SequenceStatus.FINISHED_IGNORED) ignored_seq_groups.append(seq_group) waiting_queue.popleft() continue @@ -1145,6 +1156,8 @@ class Scheduler: if len(seq_groups) == 0: using_prompt_embeds = seq_group.uses_prompt_embeds() if using_prompt_embeds != seq_group.uses_prompt_embeds(): + self.remove_seq_from_computed_blocks_tracker( + seq_group, SequenceStatus.WAITING) leftover_waiting_sequences.appendleft(seq_group) waiting_queue.popleft() continue @@ -1159,6 +1172,8 @@ class Scheduler: and len(curr_loras) >= self.lora_config.max_loras): # We don't have a space for another LoRA, so # we ignore this request for now. + self.remove_seq_from_computed_blocks_tracker( + seq_group, SequenceStatus.WAITING) leftover_waiting_sequences.appendleft(seq_group) waiting_queue.popleft() continue @@ -1168,6 +1183,8 @@ class Scheduler: # We've reached the budget limit - since there might be # continuous prefills in the running queue, we should break # to avoid scheduling any new prefills. 
+ self.remove_seq_from_computed_blocks_tracker( + seq_group, SequenceStatus.WAITING) break num_new_seqs = seq_group.get_max_num_running_seqs() @@ -1175,6 +1192,8 @@ class Scheduler: num_new_tokens=num_new_tokens_uncached, num_new_seqs=num_new_seqs, ): + self.remove_seq_from_computed_blocks_tracker( + seq_group, SequenceStatus.WAITING) break # Can schedule this request. @@ -1688,6 +1707,20 @@ class Scheduler: """Free a sequence from a block table.""" self.block_manager.free(seq) + def remove_seq_from_computed_blocks_tracker( + self, seq_group: SequenceGroup, + status: Optional[SequenceStatus]) -> None: + seqs = seq_group.get_seqs(status=status) + for seq in seqs: + self._remove_seq_from_computed_blocks_tracker(seq) + + def _remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None: + """ + Free a sequence computed blocks tracker _seq_id_to_blocks_hashes + and _seq_id_to_num_tokens_computed. + """ + self.block_manager.remove_seq_from_computed_blocks_tracker(seq) + def _free_finished_seqs(self, seq_group: SequenceGroup) -> None: """Free finished seqs in a sequence group.""" for seq in seq_group.get_seqs(): diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index 710e96c33bef6..ede3dec5fe147 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -244,16 +244,11 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase): # Defaults for internode and intranode are taken from DeepEP tests. 
num_nvl_bytes = 1024 * 1024 * 1024 num_qps_per_rank = num_local_experts - num_rdma_bytes = None - - if self.internode: - num_rdma_bytes = 1024 * 1024 * 1024 - else: - num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint( - num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank, - hidden=token_hidden_size, - num_ranks=num_ep_ranks, - num_experts=num_global_experts) + num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint( + num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank, + hidden=token_hidden_size, + num_ranks=num_ep_ranks, + num_experts=num_global_experts) assert num_rdma_bytes is not None return dict(group=self.cpu_group, diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index b9bed06d791c5..493235d724f4e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -3,7 +3,6 @@ """ KV cache helper for store. """ - import torch import vllm.envs as envs @@ -94,15 +93,17 @@ class model_aware_kv_ops_helper: def get_kv_connector_cache_layout(): + # NOTE (NickLucche) When running disaggregated PD with NIXL, HND layout is + # used for faster transfer. vllm_config = get_current_vllm_config() kv_config = vllm_config.kv_transfer_config - if vllm_config.model_config is None: - logger.warning("Unable to detect current VLLM config. " \ + if vllm_config.model_config is None or kv_config is None: + logger.warning_once("Unable to detect current VLLM config. " \ "Defaulting to NHD kv cache layout.") else: use_mla = vllm_config.model_config.use_mla if not use_mla and kv_config.kv_connector == "NixlConnector": - logger.info("NixlConnector detected. Setting KV cache " \ + logger.info_once("NixlConnector detected. 
Setting KV cache " \ "layout to HND for better xfer performance.") return "HND" return "NHD" diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 7552fc889f2f1..bdab4850d4c19 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -703,8 +703,6 @@ class NixlConnectorWorker: assert self.block_size == remote_block_size, "Remote P worker with " \ "different block size is not supported" - assert self.num_blocks >= nixl_agent_meta.num_blocks - # Create dst descs and xfer side handles. TP workers have same #blocks. if engine_id in self.dst_num_blocks: assert self.dst_num_blocks[engine_id] == nixl_agent_meta.num_blocks diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 10f87c49baa9e..126160b09553d 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -938,6 +938,13 @@ def init_distributed_environment( assert distributed_init_method is not None, ( "distributed_init_method must be provided when initializing " "distributed environment") + if not torch.distributed.is_backend_available(backend): + logger.warning( + "Distributed backend %s is not available; " + "falling back to gloo.", backend) + assert torch.distributed.is_gloo_available(), ( + "Fallback Gloo backend is not available.") + backend = "gloo" # this backend is used for WORLD torch.distributed.init_process_group( backend=backend, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 2249243bdd29d..9451b55fdf09c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -430,6 +430,7 @@ class EngineArgs: override_generation_config: dict[str, Any] = \ get_field(ModelConfig, "override_generation_config") model_impl: str = ModelConfig.model_impl + override_attention_dtype: str = ModelConfig.override_attention_dtype 
calculate_kv_scales: bool = CacheConfig.calculate_kv_scales @@ -550,6 +551,8 @@ class EngineArgs: model_group.add_argument("--model-impl", choices=[f.value for f in ModelImpl], **model_kwargs["model_impl"]) + model_group.add_argument("--override-attention-dtype", + **model_kwargs["override_attention_dtype"]) # Model loading arguments load_kwargs = get_kwargs(LoadConfig) @@ -949,6 +952,7 @@ class EngineArgs: override_generation_config=self.override_generation_config, enable_sleep_mode=self.enable_sleep_mode, model_impl=self.model_impl, + override_attention_dtype=self.override_attention_dtype, ) def create_load_config(self) -> LoadConfig: @@ -1444,9 +1448,10 @@ class EngineArgs: _raise_or_fallback(feature_name=name, recommend_to_remove=False) return False - # Non-[CUDA, TPU] may be supported on V1, but off by default for now. + # Non-[CUDA, TPU, x86 CPU] may be supported on V1, + # but off by default for now. v0_hardware = not any( - (current_platform.is_cuda(), current_platform.is_tpu(), + (current_platform.is_cuda_alike(), current_platform.is_tpu(), (current_platform.is_cpu() and current_platform.get_cpu_architecture() == CpuArchEnum.X86))) if v0_hardware and _warn_or_fallback( # noqa: SIM103 @@ -1561,14 +1566,20 @@ class EngineArgs: UsageContext.LLM_CLASS: 16384, UsageContext.OPENAI_API_SERVER: 8192, } - default_max_num_seqs = 1024 + default_max_num_seqs = { + UsageContext.LLM_CLASS: 1024, + UsageContext.OPENAI_API_SERVER: 1024, + } else: # TODO(woosuk): Tune the default values for other hardware. default_max_num_batched_tokens = { UsageContext.LLM_CLASS: 8192, UsageContext.OPENAI_API_SERVER: 2048, } - default_max_num_seqs = 256 + default_max_num_seqs = { + UsageContext.LLM_CLASS: 256, + UsageContext.OPENAI_API_SERVER: 256, + } # tpu specific default values. if current_platform.is_tpu(): @@ -1585,6 +1596,17 @@ class EngineArgs: } } + # cpu specific default values. 
+ if current_platform.is_cpu(): + default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 4096, + UsageContext.OPENAI_API_SERVER: 2048, + } + default_max_num_seqs = { + UsageContext.LLM_CLASS: 128, + UsageContext.OPENAI_API_SERVER: 32, + } + use_context_value = usage_context.value if usage_context else None if (self.max_num_batched_tokens is None and usage_context in default_max_num_batched_tokens): @@ -1605,8 +1627,9 @@ class EngineArgs: "Setting max_num_batched_tokens to %d for %s usage context.", self.max_num_batched_tokens, use_context_value) - if self.max_num_seqs is None: - self.max_num_seqs = default_max_num_seqs + if (self.max_num_seqs is None + and usage_context in default_max_num_seqs): + self.max_num_seqs = default_max_num_seqs[usage_context] logger.debug("Setting max_num_seqs to %d for %s usage context.", self.max_num_seqs, use_context_value) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 72020a8ccf96b..3d7d28055dd00 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -6,12 +6,10 @@ import copy import time import weakref from functools import partial -from typing import (Any, AsyncGenerator, Callable, Coroutine, Dict, Iterable, - List, Mapping, Optional, Set, Tuple, Type, Union, overload) +from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List, + Mapping, Optional, Set, Tuple, Type, Union) from weakref import ReferenceType -from typing_extensions import deprecated - import vllm.envs as envs from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VllmConfig) @@ -36,7 +34,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import ExecuteModelRequest from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device, deprecate_kwargs, weak_bind +from vllm.utils import Device, weak_bind logger = init_logger(__name__) 
ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S @@ -429,24 +427,6 @@ class _AsyncLLMEngine(LLMEngine): return await ( self.get_tokenizer_group().get_lora_tokenizer_async(lora_request)) - @overload - @deprecated("'inputs' will be renamed to 'prompt") - async def add_request_async( - self, - request_id: str, - *, - inputs: PromptType, - params: Union[SamplingParams, PoolingParams], - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - ) -> None: - ... - - @overload async def add_request_async( self, request_id: str, @@ -459,32 +439,10 @@ class _AsyncLLMEngine(LLMEngine): priority: int = 0, data_parallel_rank: Optional[int] = None, ) -> None: - ... - - @deprecate_kwargs( - "inputs", - additional_message="Please use the 'prompt' parameter instead.", - ) - async def add_request_async( - self, - request_id: str, - prompt: Optional[PromptType] = None, - params: Optional[Union[SamplingParams, PoolingParams]] = None, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - *, - inputs: Optional[PromptType] = None, # DEPRECATED - ) -> None: - """Async version of - [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request].""" - if inputs is not None: - prompt = inputs - assert prompt is not None and params is not None - + """ + Async version of + [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]. 
+ """ if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " "not enabled!") @@ -521,8 +479,7 @@ class _AsyncLLMEngine(LLMEngine): params = await build_guided_decoding_logits_processor_async( sampling_params=params, tokenizer=await self.get_tokenizer_async(lora_request), - default_guided_backend=self.decoding_config. - guided_decoding_backend, + default_guided_backend=self.decoding_config.backend, reasoning_backend=self.decoding_config.reasoning_backend, model_config=self.model_config) @@ -894,28 +851,7 @@ class AsyncLLMEngine(EngineClient): raise await asyncio.sleep(0) - # This method does not need to be async, but kept that way - # for backwards compatibility. - @overload - @deprecated("'inputs' will be renamed to 'prompt") - def add_request( - self, - request_id: str, - *, - inputs: PromptType, - params: Union[SamplingParams, PoolingParams], - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - ) -> Coroutine[None, None, AsyncGenerator[Union[ - RequestOutput, PoolingRequestOutput], None]]: - ... - - @overload - def add_request( + async def add_request( self, request_id: str, prompt: PromptType, @@ -926,32 +862,7 @@ class AsyncLLMEngine(EngineClient): prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, data_parallel_rank: Optional[int] = None, - ) -> Coroutine[None, None, AsyncGenerator[Union[ - RequestOutput, PoolingRequestOutput], None]]: - ... 
- - @deprecate_kwargs( - "inputs", - additional_message="Please use the 'prompt' parameter instead.", - ) - async def add_request( - self, - request_id: str, - prompt: Optional[PromptType] = None, - params: Optional[Union[SamplingParams, PoolingParams]] = None, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - *, - inputs: Optional[PromptType] = None, # DEPRECATED ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: - if inputs is not None: - prompt = inputs - assert prompt is not None and params is not None - if not self.is_running: if self.start_engine_loop: self.start_background_loop() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index dbcf78f023611..8fccf9bd2aa00 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -11,10 +11,10 @@ from functools import partial from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict, Iterable, List, Literal, Mapping, NamedTuple, Optional) from typing import Sequence as GenericSequence -from typing import Set, Type, Union, cast, overload +from typing import Set, Type, Union, cast import torch -from typing_extensions import TypeVar, deprecated +from typing_extensions import TypeVar import vllm.envs as envs from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, @@ -58,8 +58,7 @@ from vllm.transformers_utils.tokenizer_group import ( TokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message) -from vllm.utils import (Counter, Device, deprecate_kwargs, - resolve_obj_by_qualname, weak_bind) +from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind from vllm.version import __version__ as VLLM_VERSION from vllm.worker.model_runner_base import 
InputProcessingError @@ -629,7 +628,6 @@ class LLMEngine: def stop_remote_worker_execution_loop(self) -> None: self.model_executor.stop_remote_worker_execution_loop() - @overload def add_request( self, request_id: str, @@ -641,42 +639,6 @@ class LLMEngine: trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, - ) -> None: - ... - - @overload - @deprecated("'inputs' will be renamed to 'prompt") - def add_request( - self, - request_id: str, - *, - inputs: PromptType, - params: Union[SamplingParams, PoolingParams], - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, - priority: int = 0, - ) -> None: - ... - - @deprecate_kwargs( - "inputs", - additional_message="Please use the 'prompt' parameter instead.", - ) - def add_request( - self, - request_id: str, - prompt: Optional[PromptType] = None, - params: Optional[Union[SamplingParams, PoolingParams]] = None, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - tokenization_kwargs: Optional[dict[str, Any]] = None, - trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, - priority: int = 0, - *, - inputs: Optional[PromptType] = None, # DEPRECATED ) -> None: """Add a request to the engine's request pool. @@ -725,10 +687,6 @@ class LLMEngine: >>> # continue the request processing >>> ... 
""" - if inputs is not None: - prompt = inputs - assert prompt is not None and params is not None - if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " "not enabled!") diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index bf9f669031cb0..db968cd6b5d86 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -4,9 +4,7 @@ import uuid from dataclasses import dataclass, field from enum import Enum -from typing import List, Mapping, Optional, Union, overload - -from typing_extensions import deprecated +from typing import List, Mapping, Optional, Union from vllm import PoolingParams from vllm.inputs import PromptType @@ -14,7 +12,7 @@ from vllm.lora.request import LoRARequest from vllm.outputs import RequestOutput from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams -from vllm.utils import Device, deprecate_kwargs +from vllm.utils import Device VLLM_RPC_SUCCESS_STR = "SUCCESS" @@ -38,7 +36,6 @@ class RPCProcessRequest: prompt_adapter_request: Optional[PromptAdapterRequest] = None priority: int = 0 - @overload def __init__( self, prompt: PromptType, @@ -49,44 +46,6 @@ class RPCProcessRequest: prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> None: - ... - - @overload - @deprecated("'inputs' will be renamed to 'prompt") - def __init__( - self, - *, - inputs: PromptType, - params: Union[SamplingParams, PoolingParams], - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, - priority: int = 0, - ) -> None: - ... 
- - @deprecate_kwargs( - "inputs", - additional_message="Please use the 'prompt' parameter instead.", - ) - def __init__( - self, - prompt: Optional[PromptType] = None, - params: Optional[Union[SamplingParams, PoolingParams]] = None, - request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, - priority: int = 0, - *, - inputs: Optional[PromptType] = None, # DEPRECATED - ) -> None: - if inputs is not None: - prompt = inputs - assert (prompt is not None and params is not None - and request_id is not None) - super().__init__() self.prompt = prompt diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index f2f4424859331..9e018ec7f344c 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -6,13 +6,12 @@ import copy import pickle from contextlib import contextmanager, suppress from typing import (Any, AsyncGenerator, Dict, Iterator, List, Mapping, - Optional, Union, cast, overload) + Optional, Union, cast) import cloudpickle import psutil import zmq import zmq.asyncio -from typing_extensions import deprecated from zmq import Frame # type: ignore[attr-defined] from zmq.asyncio import Socket @@ -49,7 +48,7 @@ from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs -from vllm.utils import Device, deprecate_kwargs +from vllm.utils import Device logger = init_logger(__name__) @@ -442,7 +441,6 @@ class MQLLMEngineClient(EngineClient): def dead_error(self) -> BaseException: return ENGINE_DEAD_ERROR(self._errored_with) - @overload def generate( self, prompt: PromptType, @@ -452,39 +450,6 @@ class MQLLMEngineClient(EngineClient): trace_headers: Optional[Mapping[str, 
str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, - ) -> AsyncGenerator[RequestOutput, None]: - ... - - @overload - @deprecated("'inputs' will be renamed to 'prompt") - def generate( - self, - *, - inputs: PromptType, - sampling_params: SamplingParams, - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, - priority: int = 0, - ) -> AsyncGenerator[RequestOutput, None]: - ... - - @deprecate_kwargs( - "inputs", - additional_message="Please use the 'prompt' parameter instead.", - ) - def generate( - self, - prompt: Optional[PromptType] = None, - sampling_params: Optional[SamplingParams] = None, - request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, - priority: int = 0, - *, - inputs: Optional[PromptType] = None # DEPRECATED ) -> AsyncGenerator[RequestOutput, None]: """Generate outputs for a request. @@ -506,16 +471,12 @@ class MQLLMEngineClient(EngineClient): Any priority other than 0 will lead to an error if the scheduling policy is not "priority". 
""" - if inputs is not None: - prompt = inputs - assert (prompt is not None and sampling_params is not None - and request_id is not None) + return cast( + AsyncGenerator[RequestOutput, None], + self._process_request(prompt, sampling_params, request_id, + lora_request, trace_headers, + prompt_adapter_request, priority)) - return self._process_request(prompt, sampling_params, request_id, - lora_request, trace_headers, - prompt_adapter_request, priority) - - @overload def encode( self, prompt: PromptType, @@ -524,37 +485,6 @@ class MQLLMEngineClient(EngineClient): lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, - ) -> AsyncGenerator[PoolingRequestOutput, None]: - ... - - @overload - @deprecated("'inputs' will be renamed to 'prompt") - def encode( - self, - *, - inputs: PromptType, - pooling_params: PoolingParams, - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> AsyncGenerator[PoolingRequestOutput, None]: - ... - - @deprecate_kwargs( - "inputs", - additional_message="Please use the 'prompt' parameter instead.", - ) - def encode( - self, - prompt: Optional[PromptType] = None, - pooling_params: Optional[PoolingParams] = None, - request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - *, - inputs: Optional[PromptType] = None # DEPRECATED ) -> AsyncGenerator[PoolingRequestOutput, None]: """Generate outputs for a request from a pooling model. @@ -575,11 +505,6 @@ class MQLLMEngineClient(EngineClient): The output `PoolingRequestOutput` objects from the LLMEngine for the request. 
""" - if inputs is not None: - prompt = inputs - assert (prompt is not None and pooling_params is not None - and request_id is not None) - return cast( AsyncGenerator[PoolingRequestOutput, None], self._process_request(prompt, diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 7925d91f60640..3fb2f71b5e999 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -82,7 +82,7 @@ class StopChecker: return # Check if the sequence has reached max_model_len. - if seq.get_len() > self._get_max_model_len(lora_req): + if seq.get_len() >= self._get_max_model_len(lora_req): seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 56f8754c266bb..3d1e5dc14d2f3 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -17,6 +17,7 @@ from typing import Any, Optional from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse +import vllm.envs as envs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.launcher import serve_http @@ -29,7 +30,6 @@ from vllm.version import __version__ as VLLM_VERSION logger = init_logger("vllm.entrypoints.api_server") -TIMEOUT_KEEP_ALIVE = 5 # seconds. 
app = FastAPI() engine = None @@ -134,7 +134,7 @@ async def run_server(args: Namespace, host=args.host, port=args.port, log_level=args.log_level, - timeout_keep_alive=TIMEOUT_KEEP_ALIVE, + timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE, ssl_keyfile=args.ssl_keyfile, ssl_certfile=args.ssl_certfile, ssl_ca_certs=args.ssl_ca_certs, diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 51807a953e021..9040877a422ff 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -95,10 +95,9 @@ class ServeSubcommand(CLISubcommand): type=str, default='', required=False, - help="Read CLI options from a config file." - "Must be a YAML with the following options:" - "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference" - ) + help="Read CLI options from a config file. " + "Must be a YAML with the following options: " + "https://docs.vllm.ai/en/latest/configuration/serve_args.html") serve_parser = make_arg_parser(serve_parser) show_filtered_argument_or_group_from_help(serve_parser, "serve") diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index fd28bf39e2d56..c11e627ee2361 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -10,6 +10,7 @@ from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union, import cloudpickle import torch.nn as nn +from pydantic import ValidationError from tqdm.auto import tqdm from typing_extensions import TypeVar, deprecated @@ -179,7 +180,8 @@ class LLM: hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None, override_pooler_config: Optional[PoolerConfig] = None, - compilation_config: Optional[Union[int, dict[str, Any]]] = None, + compilation_config: Optional[Union[int, dict[str, Any], + CompilationConfig]] = None, **kwargs, ) -> None: """LLM constructor.""" @@ -194,6 +196,23 @@ class LLM: if isinstance(worker_cls, type): kwargs["worker_cls"] = cloudpickle.dumps(worker_cls) + if 
"kv_transfer_config" in kwargs and isinstance( + kwargs["kv_transfer_config"], dict): + from vllm.config import KVTransferConfig + raw_config_dict = kwargs["kv_transfer_config"] + try: + kwargs["kv_transfer_config"] = KVTransferConfig( + **raw_config_dict) + except ValidationError as e: + logger.error( + "Failed to convert 'kv_transfer_config' dict to " + "KVTransferConfig object. Dict: %s. Error: %s", + raw_config_dict, e) + # Consider re-raising a more specific vLLM error or ValueError + # to provide better context to the user. + raise ValueError( + f"Invalid 'kv_transfer_config' provided: {e}") from e + if hf_overrides is None: hf_overrides = {} @@ -281,7 +300,7 @@ class LLM: sampling_params: Optional[Union[SamplingParams, Sequence[SamplingParams]]] = None, *, - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, @@ -297,7 +316,7 @@ class LLM: sampling_params: Optional[Union[SamplingParams, list[SamplingParams]]] = None, prompt_token_ids: Optional[list[int]] = None, - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, @@ -313,7 +332,7 @@ class LLM: sampling_params: Optional[Union[SamplingParams, list[SamplingParams]]] = None, prompt_token_ids: Optional[list[list[int]]] = None, - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, @@ -330,7 +349,7 @@ class LLM: list[SamplingParams]]] = None, *, prompt_token_ids: list[int], - 
use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, @@ -347,7 +366,7 @@ class LLM: list[SamplingParams]]] = None, *, prompt_token_ids: list[list[int]], - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, @@ -362,7 +381,7 @@ class LLM: prompts: None, sampling_params: None, prompt_token_ids: Union[list[int], list[list[int]]], - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, @@ -382,7 +401,7 @@ class LLM: sampling_params: Optional[Union[SamplingParams, Sequence[SamplingParams]]] = None, prompt_token_ids: Optional[Union[list[int], list[list[int]]]] = None, - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, @@ -404,7 +423,10 @@ class LLM: When it is a single value, it is applied to every prompt. When it is a list, the list must have the same length as the prompts and it is paired one by one with the prompt. - use_tqdm: Whether to use tqdm to display the progress bar. + use_tqdm: If `True`, shows a tqdm progress bar. + If a callable (e.g., `functools.partial(tqdm, leave=False)`), + it is used to create the progress bar. + If `False`, no progress bar is created. lora_request: LoRA request to use for generation, if any. 
prompt_adapter_request: Prompt Adapter request to use for generation, if any. @@ -519,7 +541,6 @@ class LLM: Sequence) and len(lora_request) != len(prompts): raise ValueError( "Lora request list should be the same length as the prompts") - return lora_request if lora_request is None or isinstance(lora_request, LoRARequest): return [lora_request] * len(prompts) @@ -679,7 +700,7 @@ class LLM: list[list[ChatCompletionMessageParam]]], sampling_params: Optional[Union[SamplingParams, list[SamplingParams]]] = None, - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[LoRARequest] = None, chat_template: Optional[str] = None, chat_template_content_format: ChatTemplateContentFormatOption = "auto", @@ -710,7 +731,10 @@ class LLM: is a single value, it is applied to every prompt. When it is a list, the list must have the same length as the prompts and it is paired one by one with the prompt. - use_tqdm: Whether to use tqdm to display the progress bar. + use_tqdm: If `True`, shows a tqdm progress bar. + If a callable (e.g., `functools.partial(tqdm, leave=False)`), + it is used to create the progress bar. + If `False`, no progress bar is created. lora_request: LoRA request to use for generation, if any. chat_template: The template to use for structuring the chat. If not provided, the model's default chat template will be used. 
@@ -824,7 +848,7 @@ class LLM: Sequence[PoolingParams]]] = None, *, truncate_prompt_tokens: Optional[int] = None, - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[PoolingRequestOutput]: @@ -839,7 +863,7 @@ class LLM: Sequence[PoolingParams]]] = None, prompt_token_ids: Optional[list[int]] = None, truncate_prompt_tokens: Optional[int] = None, - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[PoolingRequestOutput]: @@ -854,7 +878,7 @@ class LLM: Sequence[PoolingParams]]] = None, prompt_token_ids: Optional[list[list[int]]] = None, truncate_prompt_tokens: Optional[int] = None, - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[PoolingRequestOutput]: @@ -870,7 +894,7 @@ class LLM: *, prompt_token_ids: list[int], truncate_prompt_tokens: Optional[int] = None, - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[PoolingRequestOutput]: @@ -886,7 +910,7 @@ class LLM: *, prompt_token_ids: list[list[int]], truncate_prompt_tokens: Optional[int] = None, - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[PoolingRequestOutput]: @@ -900,7 +924,7 @@ class LLM: pooling_params: None, prompt_token_ids: Union[list[int], list[list[int]]], 
truncate_prompt_tokens: Optional[int] = None, - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[PoolingRequestOutput]: @@ -919,7 +943,7 @@ class LLM: Sequence[PoolingParams]]] = None, prompt_token_ids: Optional[Union[list[int], list[list[int]]]] = None, truncate_prompt_tokens: Optional[int] = None, - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[PoolingRequestOutput]: @@ -936,7 +960,10 @@ class LLM: for more details about the format of each prompts. pooling_params: The pooling parameters for pooling. If None, we use the default pooling parameters. - use_tqdm: Whether to use tqdm to display the progress bar. + use_tqdm: If `True`, shows a tqdm progress bar. + If a callable (e.g., `functools.partial(tqdm, leave=False)`), + it is used to create the progress bar. + If `False`, no progress bar is created. lora_request: LoRA request to use for generation, if any. prompt_adapter_request: Prompt Adapter request to use for generation, if any. @@ -1006,7 +1033,7 @@ class LLM: /, *, truncate_prompt_tokens: Optional[int] = None, - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, @@ -1025,7 +1052,10 @@ class LLM: for more details about the format of each prompts. pooling_params: The pooling parameters for pooling. If None, we use the default pooling parameters. - use_tqdm: Whether to use tqdm to display the progress bar. + use_tqdm: If `True`, shows a tqdm progress bar. 
+ If a callable (e.g., `functools.partial(tqdm, leave=False)`), + it is used to create the progress bar. + If `False`, no progress bar is created. lora_request: LoRA request to use for generation, if any. prompt_adapter_request: Prompt Adapter request to use for generation, if any. @@ -1052,7 +1082,7 @@ class LLM: prompts: Union[PromptType, Sequence[PromptType]], /, *, - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[ClassificationRequestOutput]: @@ -1067,7 +1097,10 @@ class LLM: prompts: The prompts to the LLM. You may pass a sequence of prompts for batch inference. See [PromptType][vllm.inputs.PromptType] for more details about the format of each prompts. - use_tqdm: Whether to use tqdm to display the progress bar. + use_tqdm: If `True`, shows a tqdm progress bar. + If a callable (e.g., `functools.partial(tqdm, leave=False)`), + it is used to create the progress bar. + If `False`, no progress bar is created. lora_request: LoRA request to use for generation, if any. prompt_adapter_request: Prompt Adapter request to use for generation, if any. 
@@ -1093,7 +1126,7 @@ class LLM: text_1: list[Union[str, TextPrompt, TokensPrompt]], text_2: list[Union[str, TextPrompt, TokensPrompt]], truncate_prompt_tokens: Optional[int] = None, - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[ScoringRequestOutput]: @@ -1127,7 +1160,7 @@ class LLM: text_1: list[str], text_2: list[str], truncate_prompt_tokens: Optional[int] = None, - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[ScoringRequestOutput]: @@ -1179,7 +1212,7 @@ class LLM: /, *, truncate_prompt_tokens: Optional[int] = None, - use_tqdm: bool = True, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[ScoringRequestOutput]: @@ -1199,7 +1232,10 @@ class LLM: text_2: The texts to pair with the query to form the input to the LLM. See [PromptType][vllm.inputs.PromptType] for more details about the format of each prompts. - use_tqdm: Whether to use tqdm to display the progress bar. + use_tqdm: If `True`, shows a tqdm progress bar. + If a callable (e.g., `functools.partial(tqdm, leave=False)`), + it is used to create the progress bar. + If `False`, no progress bar is created. lora_request: LoRA request to use for generation, if any. prompt_adapter_request: Prompt Adapter request to use for generation, if any. 
@@ -1380,7 +1416,7 @@ class LLM: params: Union[SamplingParams, Sequence[SamplingParams], PoolingParams, Sequence[PoolingParams]], *, - use_tqdm: bool, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]], prompt_adapter_request: Optional[PromptAdapterRequest], tokenization_kwargs: Optional[dict[str, Any]] = None, @@ -1418,7 +1454,8 @@ class LLM: # Add requests to the engine. it = prompts if use_tqdm: - it = tqdm(it, desc="Adding requests") + tqdm_func = use_tqdm if callable(use_tqdm) else tqdm + it = tqdm_func(it, desc="Adding requests") for i, prompt in enumerate(it): self._add_request( @@ -1475,12 +1512,15 @@ class LLM: return params def _run_engine( - self, *, use_tqdm: bool + self, + *, + use_tqdm: Union[bool, Callable[..., tqdm]] = True ) -> list[Union[RequestOutput, PoolingRequestOutput]]: # Initialize tqdm. if use_tqdm: num_requests = self.llm_engine.get_num_unfinished_requests() - pbar = tqdm( + tqdm_func = use_tqdm if callable(use_tqdm) else tqdm + pbar = tqdm_func( total=num_requests, desc="Processed prompts", dynamic_ncols=True, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2f8819bca60da..62f1c6a7c12bf 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -103,8 +103,6 @@ from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path, from vllm.v1.metrics.prometheus import get_prometheus_registry from vllm.version import __version__ as VLLM_VERSION -TIMEOUT_KEEP_ALIVE = 5 # seconds - prometheus_multiproc_dir: tempfile.TemporaryDirectory # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765) @@ -1360,7 +1358,7 @@ async def run_server_worker(listen_address, # NOTE: When the 'disable_uvicorn_access_log' value is True, # no access log will be output. 
access_log=not args.disable_uvicorn_access_log, - timeout_keep_alive=TIMEOUT_KEEP_ALIVE, + timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE, ssl_keyfile=args.ssl_keyfile, ssl_certfile=args.ssl_certfile, ssl_ca_certs=args.ssl_ca_certs, diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 79f0f200c74ed..5f2d07e677bbf 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -272,6 +272,7 @@ class ChatCompletionRequest(OpenAIBaseModel): truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None prompt_logprobs: Optional[int] = None allowed_token_ids: Optional[list[int]] = None + bad_words: list[str] = Field(default_factory=list) # --8<-- [end:chat-completion-sampling-params] # --8<-- [start:chat-completion-extra-params] @@ -550,6 +551,7 @@ class ChatCompletionRequest(OpenAIBaseModel): else RequestOutputKind.FINAL_ONLY, guided_decoding=guided_decoding, logit_bias=self.logit_bias, + bad_words= self.bad_words, allowed_token_ids=self.allowed_token_ids, extra_args=({"kv_transfer_params": self.kv_transfer_params} if self.kv_transfer_params else None)) @@ -700,22 +702,26 @@ class ChatCompletionRequest(OpenAIBaseModel): # ensure that if "tool_choice" is specified as an object, # it matches a valid tool + correct_usage_message = 'Correct usage: `{"type": "function",' \ + ' "function": {"name": "my_function"}}`' if isinstance(data["tool_choice"], dict): valid_tool = False - specified_function = data["tool_choice"].get("function") - if not specified_function: + function = data["tool_choice"].get("function") + if not isinstance(function, dict): raise ValueError( - "Expected field `function` in `tool_choice`." - " Correct usage: `{\"type\": \"function\"," - " \"function\": {\"name\": \"my_function\"}}`") - specified_function_name = specified_function.get("name") - if not specified_function_name: + f"Invalid value for `function`: `{function}` in " + f"`tool_choice`! 
{correct_usage_message}") + if "name" not in function: + raise ValueError(f"Expected field `name` in `function` in " + f"`tool_choice`! {correct_usage_message}") + function_name = function["name"] + if not isinstance(function_name, + str) or len(function_name) == 0: raise ValueError( - "Expected field `name` in `function` in `tool_choice`." - "Correct usage: `{\"type\": \"function\", " - "\"function\": {\"name\": \"my_function\"}}`") + f"Invalid `name` in `function`: `{function_name}`" + f" in `tool_choice`! {correct_usage_message}") for tool in data["tools"]: - if tool["function"]["name"] == specified_function_name: + if tool["function"]["name"] == function_name: valid_tool = True break if not valid_tool: diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 79eac184a2129..2a0d4cd74a284 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -873,7 +873,7 @@ class OpenAIServingChat(OpenAIServing): total_tokens=num_prompt_tokens + completion_tokens, ) - data = chunk.model_dump_json(exclude_none=True) + data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" # once the final token is handled, if stream_options.include_usage diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index f58611c49b88c..9f333c02ab529 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -174,8 +174,8 @@ class ServingScores(OpenAIServing): for t1, t2 in input_pairs)) for prompt_inputs, (t1, t2) in zip(tokenized_prompts, input_pairs): - - request_prompt = f"{t1}{tokenizer.sep_token}{t2}" + sep_token = tokenizer.sep_token if tokenizer.sep_token else '' + request_prompt = f"{t1}{sep_token}{t2}" input_ids = prompt_inputs["input_ids"] text_token_prompt = \ diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index 
f667c7e9b3a96..60d66434ea5ab 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -2,11 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import io +import math import time from collections.abc import AsyncGenerator from math import ceil from typing import Final, Optional, Union, cast +import numpy as np from fastapi import Request from vllm.config import ModelConfig @@ -143,6 +145,8 @@ ISO639_1_OTHER_LANGS = { # As per https://platform.openai.com/docs/guides/speech-to-text#overview. # TODO configurable MAX_AUDIO_CLIP_FILESIZE_MB = 25 +OVERLAP_CHUNK_SECOND = 1 +MIN_ENERGY_WINDOW_SIZE = 1600 # 1600 ~ 100ms for 16000 Hz audio class OpenAIServingTranscription(OpenAIServing): @@ -178,7 +182,7 @@ class OpenAIServingTranscription(OpenAIServing): self, request: TranscriptionRequest, audio_data: bytes, - ) -> tuple[PromptType, float]: + ) -> tuple[list[PromptType], float]: # Validate request # TODO language should be optional and can be guessed. # For now we default to en. 
See @@ -206,22 +210,22 @@ class OpenAIServingTranscription(OpenAIServing): y, sr = librosa.load(bytes_) duration = librosa.get_duration(y=y, sr=sr) - if duration > self.max_audio_clip_s: - raise ValueError( - f"Maximum clip duration ({self.max_audio_clip_s}s) " - "exceeded.") - - prompt = { - "encoder_prompt": { - "prompt": "", - "multi_modal_data": { - "audio": (y, sr), + chunks = [y] if duration < 30 else self._split_audio(y, sr) + prompts = [] + for i, chunk in enumerate(chunks): + prompt = { + "encoder_prompt": { + "prompt": "", + "multi_modal_data": { + "audio": (chunk, sr), + }, }, - }, - "decoder_prompt": - f"<|startoftranscript|>{lang_token}<|transcribe|><|notimestamps|>{request.prompt}" - } - return cast(PromptType, prompt), duration + "decoder_prompt": + f"<|startoftranscript|>{lang_token}<|transcribe|><|notimestamps|>{request.prompt}" + if i == 0 else "" + } + prompts.append(cast(PromptType, prompt)) + return prompts, duration # TODO (varun) : Make verbose response work ! async def create_transcription( @@ -268,7 +272,7 @@ class OpenAIServingTranscription(OpenAIServing): "Currently do not support PromptAdapter for Transcription." 
) - prompt, duration_s = await self._preprocess_transcription( + prompts, duration_s = await self._preprocess_transcription( request=request, audio_data=audio_data, ) @@ -277,7 +281,8 @@ class OpenAIServingTranscription(OpenAIServing): logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) - result_generator: Optional[AsyncGenerator[RequestOutput, None]] = None + list_result_generator: Optional[list[AsyncGenerator[RequestOutput, + None]]] = None try: # Unlike most decoder-only models, whisper generation length is not # constrained by the size of the input audio, which is mapped to a @@ -288,32 +293,36 @@ class OpenAIServingTranscription(OpenAIServing): self._log_inputs( request_id, - prompt['decoder_prompt'], # type: ignore + prompts[0]['decoder_prompt'], # type: ignore params=sampling_params, lora_request=None, prompt_adapter_request=None) - result_generator = self.engine_client.generate( - prompt, - sampling_params, - request_id, - ) + list_result_generator = [ + self.engine_client.generate( + prompt, + sampling_params, + request_id, + ) for prompt in prompts + ] except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) if request.stream: return self.transcription_stream_generator(request, - result_generator, + list_result_generator, request_id, request_metadata, duration_s) # Non-streaming response. 
try: - assert result_generator is not None - async for op in result_generator: - result = op - return TranscriptionResponse(text=result.outputs[0].text) + assert list_result_generator is not None + text = "" + for result_generator in list_result_generator: + async for op in result_generator: + text += op.outputs[0].text + return TranscriptionResponse(text=text) except asyncio.CancelledError: return self.create_error_response("Client disconnected") except ValueError as e: @@ -322,7 +331,7 @@ class OpenAIServingTranscription(OpenAIServing): async def transcription_stream_generator( self, request: TranscriptionRequest, - result_generator: AsyncGenerator[RequestOutput, None], + list_result_generator: list[AsyncGenerator[RequestOutput, None]], request_id: str, request_metadata: RequestResponseMetadata, audio_duration_s: float) -> AsyncGenerator[str, None]: created_time = int(time.time()) @@ -335,60 +344,65 @@ class OpenAIServingTranscription(OpenAIServing): include_usage = request.stream_include_usage \ if request.stream_include_usage else False include_continuous_usage = request.stream_continuous_usage_stats\ - if include_usage and request.stream_continuous_usage_stats\ - else False + if include_usage and request.stream_continuous_usage_stats\ + else False try: - async for res in result_generator: - # On first result. - if res.prompt_token_ids is not None: - # Do not account the 4-tokens `<|startoftranscript|>..` - # Could be negative when language token is not specified. - num_prompt_tokens = max(len(res.prompt_token_ids) - 4, 0) - # NOTE(NickLucche) user can't pass encoder prompts directly - # at least not to Whisper. One indicator of the encoder - # amount of processing is the log-mel spectogram length. - num_prompt_tokens += ceil(audio_duration_s * - self.model_sr / self.hop_length) + for result_generator in list_result_generator: + async for res in result_generator: + # On first result. 
+ if res.prompt_token_ids is not None: + # Do not account the 4-tokens `<|startoftranscript|>..` + # Could be negative when language token + # is not specified. + num_prompt_tokens = max( + len(res.prompt_token_ids) - 4, 0) + # NOTE(NickLucche) user can't pass encoder + # prompts directly at least not to Whisper. + # One indicator of the encoder amount of processing + # is the log-mel spectogram length. + num_prompt_tokens += ceil( + audio_duration_s * self.model_sr / self.hop_length) - # We need to do it here, because if there are exceptions in - # the result_generator, it needs to be sent as the FIRST - # response (by the try...catch). + # We need to do it here, because if there are exceptions in + # the result_generator, it needs to be sent as the FIRST + # response (by the try...catch). - # Just one output (n=1) supported. - assert len(res.outputs) == 1 - output = res.outputs[0] + # Just one output (n=1) supported. + assert len(res.outputs) == 1 + output = res.outputs[0] - delta_message = DeltaMessage(content=output.text) - completion_tokens += len(output.token_ids) + delta_message = DeltaMessage(content=output.text) + completion_tokens += len(output.token_ids) - if output.finish_reason is None: - # Still generating, send delta update. - choice_data = TranscriptionResponseStreamChoice( - delta=delta_message) - else: - # Model is finished generating. - choice_data = TranscriptionResponseStreamChoice( - delta=delta_message, - finish_reason=output.finish_reason, - stop_reason=output.stop_reason) + if output.finish_reason is None: + # Still generating, send delta update. + choice_data = TranscriptionResponseStreamChoice( + delta=delta_message) + else: + # Model is finished generating. 
+ choice_data = TranscriptionResponseStreamChoice( + delta=delta_message, + finish_reason=output.finish_reason, + stop_reason=output.stop_reason) - chunk = TranscriptionStreamResponse(id=request_id, - object=chunk_object_type, - created=created_time, - choices=[choice_data], - model=model_name) + chunk = TranscriptionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[choice_data], + model=model_name) - # handle usage stats if requested & if continuous - if include_continuous_usage: - chunk.usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=num_prompt_tokens + completion_tokens, - ) + # handle usage stats if requested & if continuous + if include_continuous_usage: + chunk.usage = UsageInfo( + prompt_tokens=num_prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=num_prompt_tokens + completion_tokens, + ) - data = chunk.model_dump_json(exclude_unset=True) - yield f"data: {data}\n\n" + data = chunk.model_dump_json(exclude_unset=True) + yield f"data: {data}\n\n" # Once the final token is handled, if stream_options.include_usage # is sent, send the usage. 
@@ -422,3 +436,52 @@ class OpenAIServingTranscription(OpenAIServing): yield f"data: {data}\n\n" # Send the final done message after all response.n are finished yield "data: [DONE]\n\n" + + def _split_audio(self, audio_data: np.ndarray, + sample_rate: int) -> list[np.ndarray]: + chunk_size = sample_rate * self.max_audio_clip_s + overlap_size = sample_rate * OVERLAP_CHUNK_SECOND + chunks = [] + i = 0 + while i < audio_data.shape[-1]: + if i + chunk_size >= audio_data.shape[-1]: + # handle last chunk + chunks.append(audio_data[..., i:]) + break + + # Find the best split point in the overlap region + search_start = i + chunk_size - overlap_size + search_end = min(i + chunk_size, audio_data.shape[-1]) + split_point = self._find_split_point(audio_data, search_start, + search_end) + + # Extract chunk up to the split point + chunks.append(audio_data[..., i:split_point]) + i = split_point + return chunks + + def _find_split_point(self, wav: np.ndarray, start_idx: int, + end_idx: int) -> int: + """Find the best point to split audio by + looking for silence or low amplitude. 
+ Args: + wav: Audio tensor [1, T] + start_idx: Start index of search region + end_idx: End index of search region + Returns: + Index of best splitting point + """ + segment = wav[start_idx:end_idx] + + # Calculate RMS energy in small windows + min_energy = math.inf + quietest_idx = 0 + for i in range(0, + len(segment) - MIN_ENERGY_WINDOW_SIZE, + MIN_ENERGY_WINDOW_SIZE): + window = segment[i:i + MIN_ENERGY_WINDOW_SIZE] + energy = (window**2).mean()**0.5 + if energy < min_energy: + quietest_idx = i + start_idx + min_energy = energy + return quietest_idx diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index e5dcdf9a07602..92004de030d14 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -40,7 +40,7 @@ class Internlm2ToolParser(ToolParser): request.skip_special_tokens = False return request - def get_argments(self, obj): + def get_arguments(self, obj): if "parameters" in obj: return obj.get("parameters") elif "arguments" in obj: @@ -119,9 +119,9 @@ class Internlm2ToolParser(ToolParser): # now we know we're on the same tool call and we're streaming # arguments else: - prev_arguments = self.get_argments( + prev_arguments = self.get_arguments( self.prev_tool_call_arr[self.current_tool_id]) - cur_arguments = self.get_argments(tool_call_arr) + cur_arguments = self.get_arguments(tool_call_arr) # not arguments generated if not cur_arguments and not prev_arguments: @@ -170,7 +170,7 @@ class Internlm2ToolParser(ToolParser): # check to see if the name is defined and has been sent. 
if so, # stream the name - otherwise keep waiting # finish by setting old and returning None as base case - tool_call_arr["arguments"] = self.get_argments(tool_call_arr) + tool_call_arr["arguments"] = self.get_arguments(tool_call_arr) self.prev_tool_call_arr = [tool_call_arr] return delta except Exception: diff --git a/vllm/env_override.py b/vllm/env_override.py index 2bede4963f964..ef425d433320d 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -13,7 +13,7 @@ logger = init_logger(__name__) # that interact with vllm workers. # they are executed whenever `import vllm` is called. -if 'NCCL_CUMEM_ENABLE' in os.environ: +if os.environ.get('NCCL_CUMEM_ENABLE', '0') != '0': logger.warning( "NCCL_CUMEM_ENABLE is set to %s, skipping override. " "This may increase memory overhead with cudagraph+allreduce: " diff --git a/vllm/envs.py b/vllm/envs.py index ffb630079a847..a4a1784f97f90 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -15,7 +15,7 @@ if TYPE_CHECKING: VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60 VLLM_NCCL_SO_PATH: Optional[str] = None LD_LIBRARY_PATH: Optional[str] = None - VLLM_USE_TRITON_FLASH_ATTN: bool = False + VLLM_USE_TRITON_FLASH_ATTN: bool = True VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False VLLM_FLASH_ATTN_VERSION: Optional[int] = None LOCAL_RANK: int = 0 @@ -44,6 +44,7 @@ if TYPE_CHECKING: VLLM_PP_LAYER_PARTITION: Optional[str] = None VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_CPU_OMP_THREADS_BIND: str = "" + VLLM_CPU_NUM_OF_RESERVED_CPU: int = 0 VLLM_CPU_MOE_PREPACK: bool = True VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache") VLLM_XLA_CHECK_RECOMPILATION: bool = False @@ -71,6 +72,7 @@ if TYPE_CHECKING: VERBOSE: bool = False VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False VLLM_RPC_TIMEOUT: int = 10000 # ms + VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds VLLM_PLUGINS: Optional[list[str]] = None VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None VLLM_TORCH_PROFILER_DIR: Optional[str] = None @@ -110,6 +112,8 @@ if 
TYPE_CHECKING: VLLM_DP_SIZE: int = 1 VLLM_DP_MASTER_IP: str = "" VLLM_DP_MASTER_PORT: int = 0 + VLLM_MOE_DP_CHUNK_SIZE: int = 256 + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False VLLM_MARLIN_USE_ATOMIC_ADD: bool = False VLLM_V0_USE_OUTLINES_CACHE: bool = False VLLM_TPU_BUCKET_PADDING_GAP: int = 0 @@ -123,6 +127,8 @@ if TYPE_CHECKING: VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840 VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1 VLLM_SLEEP_WHEN_IDLE: bool = False + VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16 + VLLM_KV_CACHE_LAYOUT: Optional[str] = None def get_default_cache_root(): @@ -419,7 +425,12 @@ environment_variables: dict[str, Callable[[], Any]] = { # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31", # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'. "VLLM_CPU_OMP_THREADS_BIND": - lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "all"), + lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "auto"), + + # (CPU backend only) CPU cores not used by OMP threads . + # Those CPU cores will not be used by OMP threads of a rank. + "VLLM_CPU_NUM_OF_RESERVED_CPU": + lambda: int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")), # (CPU backend only) whether to use prepack for MoE layer. This will be # passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you might @@ -555,6 +566,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_RPC_TIMEOUT": lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")), + # Timeout in seconds for keeping HTTP connections alive in API server + "VLLM_HTTP_TIMEOUT_KEEP_ALIVE": + lambda: int(os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")), + # a list of plugin names to load, separated by commas. 
# if this is not set, it means all plugins will be loaded # if this is set to an empty string, no plugins will be loaded @@ -760,6 +775,18 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_DP_MASTER_PORT": lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")), + # In the context of executing MoE models with Data-Parallel, Expert-Parallel + # and Batched All-to-All dispatch/combine kernels, VLLM_MOE_DP_CHUNK_SIZE + # dictates the quantum of tokens that can be dispatched from a DP + # rank. All DP ranks process the activations in VLLM_MOE_DP_CHUNK_SIZE + # units. + "VLLM_MOE_DP_CHUNK_SIZE": + lambda: int(os.getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")), + + # Randomize inputs during dummy runs when using Data Parallel + "VLLM_RANDOMIZE_DP_DUMMY_INPUTS": + lambda: os.environ.get("VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0") == "1", + # Whether to use S3 path for model loading in CI via RunAI Streamer "VLLM_CI_USE_S3": lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1", @@ -847,6 +874,22 @@ environment_variables: dict[str, Callable[[], Any]] = { # latency penalty when a request eventually comes. "VLLM_SLEEP_WHEN_IDLE": lambda: bool(int(os.getenv("VLLM_SLEEP_WHEN_IDLE", "0"))), + + # Control the max chunk bytes (in MB) for the rpc message queue. + # Object larger than this threshold will be broadcast to worker + # processes via zmq. + "VLLM_MQ_MAX_CHUNK_BYTES_MB": + lambda: int(os.getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")), + + # KV Cache layout used throughout vllm. + # Some common values are: + # - NHD + # - HND + # Where N=num_blocks, H=num_heads and D=head_size. The default value will + # leave the layout choice to the backend. Mind that backends may only + # implement and support a subset of all possible layouts. 
+ "VLLM_KV_CACHE_LAYOUT": + lambda: os.getenv("VLLM_KV_CACHE_LAYOUT", None) } # --8<-- [end:env-vars-definition] diff --git a/vllm/forward_context.py b/vllm/forward_context.py index e964a3badb4eb..4671db4971132 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -117,6 +117,7 @@ class ForwardContext: virtual_engine: int # set dynamically for each forward pass # set dynamically for each forward pass dp_metadata: Optional[DPMetadata] = None + skip_cuda_graphs: bool = False _forward_context: Optional[ForwardContext] = None @@ -134,7 +135,8 @@ def create_forward_context(attn_metadata: Any, vllm_config: VllmConfig, virtual_engine: int = 0, num_tokens: Optional[int] = None, - num_tokens_across_dp: Optional[torch.Tensor] = None): + num_tokens_across_dp: Optional[torch.Tensor] = None, + skip_cuda_graphs: bool = False): dp_metadata: Optional[DPMetadata] = None if vllm_config.parallel_config.data_parallel_size > 1 and ( attn_metadata is not None or num_tokens is not None): @@ -146,7 +148,8 @@ def create_forward_context(attn_metadata: Any, static_forward_context, virtual_engine=virtual_engine, attn_metadata=attn_metadata, - dp_metadata=dp_metadata) + dp_metadata=dp_metadata, + skip_cuda_graphs=skip_cuda_graphs) @contextmanager @@ -165,11 +168,14 @@ def override_forward_context(forward_context: Optional[ForwardContext]): @contextmanager -def set_forward_context(attn_metadata: Any, - vllm_config: VllmConfig, - virtual_engine: int = 0, - num_tokens: Optional[int] = None, - num_tokens_across_dp: Optional[torch.Tensor] = None): +def set_forward_context( + attn_metadata: Any, + vllm_config: VllmConfig, + virtual_engine: int = 0, + num_tokens: Optional[int] = None, + num_tokens_across_dp: Optional[torch.Tensor] = None, + skip_cuda_graphs: bool = False, +): """A context manager that stores the current forward context, can be attention metadata, etc. Here we can inject common logic for every model forward pass. 
@@ -181,7 +187,8 @@ def set_forward_context(attn_metadata: Any, forward_context = create_forward_context(attn_metadata, vllm_config, virtual_engine, num_tokens, - num_tokens_across_dp) + num_tokens_across_dp, + skip_cuda_graphs) try: with override_forward_context(forward_context): diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 3dad021e31668..66e78833f52af 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -168,10 +168,12 @@ class InputProcessingContext(InputContext): try: output = hf_processor(**data, **merged_kwargs, return_tensors="pt") # this emulates output.to(dtype=self.model_config.dtype) - cast_output = json_map_leaves(maybe_cast_dtype, output) if isinstance(output, BatchFeature): + cast_output = json_map_leaves(maybe_cast_dtype, output.data) return BatchFeature(cast_output) + cast_output = json_map_leaves(maybe_cast_dtype, output) + logger.warning_once( f"{type(hf_processor).__name__} did not return `BatchFeature`. " "Make sure to match the behaviour of `ProcessorMixin` when " diff --git a/vllm/logging_utils/dump_input.py b/vllm/logging_utils/dump_input.py index 872f8e4dea4ac..ad89638e10614 100644 --- a/vllm/logging_utils/dump_input.py +++ b/vllm/logging_utils/dump_input.py @@ -59,28 +59,23 @@ def dump_engine_exception(config: VllmConfig, scheduler_stats: Optional[SchedulerStats]): # NOTE: ensure we can log extra info without risking raises # unexpected errors during logging - with contextlib.suppress(BaseException): + with contextlib.suppress(Exception): _dump_engine_exception(config, scheduler_output, scheduler_stats) def _dump_engine_exception(config: VllmConfig, scheduler_output: SchedulerOutput, scheduler_stats: Optional[SchedulerStats]): - pass - # logger.error("Dumping input data") - - # logger.error( - # "V1 LLM engine (v%s) with config: %s, ", - # VLLM_VERSION, - # config, - # ) - - # try: - # dump_obj = prepare_object_to_dump(scheduler_output) - # logger.error("Dumping scheduler output for model 
execution:") - # logger.error(dump_obj) - # if scheduler_stats: - # logger.error(scheduler_stats) - # except BaseException as exception: - # logger.error("Error preparing object to dump") - # logger.error(repr(exception)) + logger.error( + "Dumping input data for V1 LLM engine (v%s) with config: %s, ", + VLLM_VERSION, + config, + ) + try: + dump_obj = prepare_object_to_dump(scheduler_output) + logger.error("Dumping scheduler output for model execution: %s", + dump_obj) + if scheduler_stats: + logger.error("Dumping scheduler stats: %s", scheduler_stats) + except Exception: + logger.exception("Error preparing object to dump") diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 66e037a97d063..3d0c583175021 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1202,7 +1202,7 @@ class LinearScalingRotaryEmbeddingWithLoRA(BaseLayerWithLoRA): multiple LoRA adapters with a specialized kernel. Replace LinearScalingRotaryEmbedding with MultiLinearScalingRotaryEmbedding - which can handle multi lora adapters in a specialied kernel. + which can handle multi lora adapters in a specialized kernel. """ def __init__(self, base_layer: RotaryEmbedding) -> None: diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py index 0b0a7989f3907..8430cb91865f4 100644 --- a/vllm/lora/punica_wrapper/utils.py +++ b/vllm/lora/punica_wrapper/utils.py @@ -68,11 +68,11 @@ def convert_mapping( LoRA indices. sampler_indices: Tensor of shape [batch_size] mapping requests to LoRA indices for sampler. For generation, this will be the - same as base_indicies. For prefill, this will map requests + same as base_indices. For prefill, this will map requests to LoRA indices. sampler_indices_padded: Tensor of shape [batch_size] mapping requests to LoRA indices for sampler with padding. - Same as sampler_indicies, but -1 is replaced with + Same as sampler_indices, but -1 is replaced with max_loras. 
embeddings_indices: Tensor of shape [2, batch_size] mapping requests to embedding indices. First row is for embeddings diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index 76d71ca08856c..5492399efdf86 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -36,6 +36,9 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): assert (len(self.block_shape) == 2 and all( [v == self.DEEPGEMM_BLOCK_SHAPE for v in self.block_shape])) + def supports_chunking(self) -> bool: + return False + def workspace_shapes( self, a: torch.Tensor, @@ -44,18 +47,26 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): N: int, K: int, topk: int, - num_experts: int, - ) -> tuple[int, int, torch.dtype]: + global_num_experts: int, + local_num_experts: int, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: assert a.dim() == 2 - num_dp = self.world_size // self.dp_size + # FIXME (varun): We should be able to dispatch only from the leader + # DP ranks in the case of TP > 1. At the moment, all the Ranks + # end up sending their tokens. This needs to be fixed. 
+ num_dispatchers = self.world_size + num_experts = local_num_experts max_num_tokens = a.size( 0) if self.max_num_tokens is None else self.max_num_tokens - workspace13 = num_experts * max_num_tokens * num_dp * max(K, N) - workspace2 = num_experts * max_num_tokens * num_dp * (N // 2) - return (workspace13, workspace2, a.dtype) + workspace13 = (num_experts, max_num_tokens * num_dispatchers, + max(K, N)) + workspace2 = (num_experts, max_num_tokens * num_dispatchers, (N // 2)) + output = (num_experts, max_num_tokens * num_dispatchers, K) + return (workspace13, workspace2, output, a.dtype) def apply( self, + output: torch.Tensor, hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -72,16 +83,13 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): workspace13: torch.Tensor, workspace2: torch.Tensor, expert_num_tokens: Optional[torch.Tensor], - ) -> torch.Tensor: + ): import deep_gemm as dg assert hidden_states.ndim == 3 a1q = hidden_states _, N, K = w1.size() - if global_num_experts == -1: - global_num_experts = w1.size(0) - assert w2.size(1) == K E, max_num_tokens, N, K, top_k_num = mk._moe_problem_size( @@ -89,7 +97,6 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): workspace1 = _resize_cache(workspace13, (E, max_num_tokens, N)) workspace2 = _resize_cache(workspace2, (E, max_num_tokens, N // 2)) - workspace3 = _resize_cache(workspace13, (E, max_num_tokens, K)) # (from deepgemm docs) : A value hint (which is a value on CPU) # for the M expectation of each batch, correctly setting this value @@ -118,8 +125,6 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): dg.m_grouped_gemm_fp8_fp8_bf16_nt_masked((a2q, a2q_scale), (w2, w2_scale), - out=workspace3, + out=output, masked_m=expert_num_tokens, expected_m=expected_m) - - return workspace3 diff --git a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py index 
d62d519af8d7b..822cda8205bfe 100644 --- a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py @@ -64,6 +64,15 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): block_shape=self.block_shape, # type: ignore[arg-type] ) if (self.allow_deep_gemm and is_fp8_128_block_quantized) else None + assert (self.batched_deep_gemm_experts is not None + or self.batched_triton_experts is not None) + + def supports_chunking(self) -> bool: + bdge = self.batched_deep_gemm_experts + bte = self.batched_triton_experts + return ((bdge is None or bdge.supports_chunking()) + and (bte is None or bte.supports_chunking())) + def workspace_shapes( self, a: torch.Tensor, @@ -72,21 +81,23 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): N: int, K: int, topk: int, - num_experts: int, - ) -> tuple[int, int, torch.dtype]: + global_num_experts: int, + local_num_experts: int, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: # Note: the deep gemm workspaces are strictly larger than the triton # workspaces so we can be pessimistic here and allocate for DeepGemm # even if we fall back to triton later, e.g. if expert maps are set. 
if self.allow_deep_gemm and self.batched_deep_gemm_experts is not None: return self.batched_deep_gemm_experts.workspace_shapes( - a, aq, M, N, K, topk, num_experts) + a, aq, M, N, K, topk, global_num_experts, local_num_experts) else: assert self.batched_triton_experts is not None return self.batched_triton_experts.workspace_shapes( - a, aq, M, N, K, topk, num_experts) + a, aq, M, N, K, topk, global_num_experts, local_num_experts) def apply( self, + output: torch.Tensor, hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -103,7 +114,7 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): workspace13: torch.Tensor, workspace2: torch.Tensor, expert_num_tokens: Optional[torch.Tensor], - ) -> torch.Tensor: + ): use_batched_deep_gemm_experts = (self.allow_deep_gemm and self.batched_deep_gemm_experts is not None) @@ -111,7 +122,7 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): if use_batched_deep_gemm_experts else self.batched_triton_experts) assert experts is not None - return experts.apply(hidden_states, w1, w2, topk_ids, activation, - global_num_experts, expert_map, w1_scale, - w2_scale, w1_zp, w2_zp, a1q_scale, a2_scale, - workspace13, workspace2, expert_num_tokens) + experts.apply(output, hidden_states, w1, w2, topk_ids, activation, + global_num_experts, expert_map, w1_scale, w2_scale, + w1_zp, w2_zp, a1q_scale, a2_scale, workspace13, + workspace2, expert_num_tokens) diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json new file mode 100644 index 0000000000000..b506820759496 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 
16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 
64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..60ccde1351598 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json new file mode 100644 index 0000000000000..b0139b9f2af40 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + 
"num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + 
"3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..b9dc2d71f6dcf --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..3559f33f444b9 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json new file mode 100644 index 0000000000000..1fa444bca150a --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 
64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 6e7b1a4f2b6c9..3f9ceac8b6e36 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -14,6 +14,7 @@ from vllm.scalar_type import scalar_types def run_cutlass_moe_fp8( + output: torch.Tensor, hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -31,7 +32,8 @@ def run_cutlass_moe_fp8( out_dtype: torch.dtype, per_act_token: bool, per_out_ch: bool, -) -> torch.Tensor: + use_batched_format: bool, +): a1q = hidden_states assert w1_scale is not None @@ -61,23 +63,20 @@ def run_cutlass_moe_fp8( if expert_map is not None: assert expert_num_tokens is None - # We have 
two modes: PPLX and non-PPLX. We differentiate them by checking - # if expert_num_tokens is None (expert_num_tokens is a tensor which PPLX - # uses to track the number of tokens per expert). - # In the non-PPLX mode, the input tokens are not padded: thus, the shape + # We have two modes: batched experts and non-batched experts. + # In the non-batched mode, the input tokens are not padded: thus, the shape # of the input is [total_num_tokens, hidden_size]. The input and output # require shuffling by a_map and c_map such that the tokens assigned to # each expert are contiguous. - # In the PPLX mode, the input tokens are padded per expert to ensure that - # the PPLX dispatch and combine functions work correctly: thus, the shape + # In the batched mode, the input tokens are padded per expert to ensure that + # the batched dispatch and combine functions work correctly: thus, the shape # of the input is [num_experts, max_num_tokens_per_expert, hidden_size]. - # The PPLX input and output require no shuffling by a_map and c_map since + # The batched input and output require no shuffling by a_map and c_map since # their tokens are already contiguous for each expert as a result of # the dispatch function. 
- is_pplx = expert_num_tokens is not None - M = a1q.shape[0] # no pplx - padded_M = a1q.shape[1] # pplx + M = a1q.shape[0] # non batched expert M + padded_M = a1q.shape[1] # batched expert M _, K, N = w2.shape device = a1q.device @@ -95,7 +94,9 @@ def run_cutlass_moe_fp8( topk = local_topk_ids.shape[1] local_E = w1.shape[0] - if is_pplx: + if use_batched_format: + assert expert_num_tokens is not None + expert_offsets = torch.empty((local_E), dtype=torch.int32, device=device) @@ -167,7 +168,7 @@ def run_cutlass_moe_fp8( device=device, dtype=torch.int64) - if is_pplx: + if use_batched_format: c1 = _resize_cache(workspace13, (local_E * padded_M, N * 2)) c2 = _resize_cache(workspace2, (local_E * padded_M, N)) c3 = _resize_cache(workspace13, (local_E * padded_M, K)) @@ -192,12 +193,15 @@ def run_cutlass_moe_fp8( problem_sizes2, ab_strides2, ab_strides2, c_strides2, per_act_token, per_out_ch) - if is_pplx: - return c3.reshape(local_E, padded_M, K) + if use_batched_format: + output.copy_(c3.reshape(local_E, padded_M, K), non_blocking=True) else: - return c3[c_map].view(M, topk, K) + # We can't do this inplace because output may point to the same tensor + # as c3. + output.copy_(c3[c_map].view(M * topk, K), non_blocking=True) +# TODO (bnell): split class batched vs. non-batched? 
class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute): def __init__( @@ -206,12 +210,17 @@ class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute): out_dtype: torch.dtype, per_act_token: bool, per_out_ch: bool, + use_batched_format: bool = False, ): super().__init__() self.max_experts_per_worker = max_experts_per_worker self.out_dtype = out_dtype self.per_act_token = per_act_token self.per_out_ch = per_out_ch + self.use_batched_format = use_batched_format + + def supports_chunking(self) -> bool: + return not self.use_batched_format def workspace_shapes( self, @@ -221,15 +230,26 @@ class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute): N: int, K: int, topk: int, - num_experts: int, - ) -> tuple[int, int, torch.dtype]: - padded_M = aq.shape[1] - workspace1 = self.max_experts_per_worker * padded_M * max(N, K) - workspace2 = self.max_experts_per_worker * padded_M * (N // 2) - return (workspace1, workspace2, self.out_dtype) + global_num_experts: int, + local_num_experts: int, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: + workspace1: tuple[int, ...] = () + workspace2: tuple[int, ...] = () + output: tuple[int, ...] 
= () + if self.use_batched_format: + padded_M = aq.shape[1] + workspace1 = (self.max_experts_per_worker, padded_M, max(N, K)) + workspace2 = (self.max_experts_per_worker, padded_M, (N // 2)) + output = (self.max_experts_per_worker, padded_M, K) + else: + workspace1 = (M * topk, max(2 * N, K)) + workspace2 = (M * topk, N) + output = (M * topk, K) + return (workspace1, workspace2, output, self.out_dtype) def apply( self, + output: torch.Tensor, hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -246,16 +266,17 @@ class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute): workspace13: torch.Tensor, workspace2: torch.Tensor, expert_num_tokens: Optional[torch.Tensor], - ) -> torch.Tensor: + ): assert w1_zp is None, "w1_zp is not supported in CUTLASS MoE" assert w2_zp is None, "w2_zp is not supported in CUTLASS MoE" activation_callable = lambda i, o: self.activation(activation, i, o) - return run_cutlass_moe_fp8(hidden_states, w1, w2, topk_ids, - activation_callable, global_num_experts, - expert_map, w1_scale, w2_scale, a1q_scale, - a2_scale, workspace13, workspace2, - expert_num_tokens, self.out_dtype, - self.per_act_token, self.per_out_ch) + run_cutlass_moe_fp8(output, hidden_states, w1, w2, topk_ids, + activation_callable, global_num_experts, + expert_map, w1_scale, w2_scale, a1q_scale, + a2_scale, workspace13, workspace2, + expert_num_tokens, self.out_dtype, + self.per_act_token, self.per_out_ch, + self.use_batched_format) def cutlass_moe_fp8( @@ -325,6 +346,7 @@ def cutlass_moe_fp8( out_dtype=out_dtype, per_act_token=per_act_token, per_out_ch=per_out_ch, + use_batched_format=False, ), ) diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index c00e849b4ebb3..b4473b907381a 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -70,25 +70,27 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): 
super().__init__() self.block_shape = deep_gemm_block_shape() + def supports_chunking(self) -> bool: + return True + def workspace_shapes( - self, - a: torch.Tensor, - aq: torch.Tensor, - M: int, - N: int, - K: int, - topk: int, - num_experts: int, - ) -> tuple[int, int, torch.dtype]: + self, a: torch.Tensor, aq: torch.Tensor, M: int, N: int, K: int, + topk: int, global_num_experts: int, local_num_experts: int + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: + # We use global_num_experts due to how moe_align_block_size handles + # expert_maps. + num_experts = global_num_experts block_m = self.block_shape[0] M_sum = (M * topk) + num_experts * (block_m - 1) M_sum = round_up(M_sum, block_m) - workspace1 = M_sum * max(N * 2, K) - workspace2 = M_sum * N - return (workspace1, workspace2, a.dtype) + workspace1 = (M_sum, max(N * 2, K)) + workspace2 = (M_sum, max(N, K)) + output = (M * topk, K) + return (workspace1, workspace2, output, a.dtype) def apply( self, + output: torch.Tensor, hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -105,7 +107,7 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): workspace13: torch.Tensor, workspace2: torch.Tensor, expert_num_tokens: Optional[torch.Tensor], - ) -> torch.Tensor: + ): import deep_gemm as dg a1q = hidden_states @@ -135,26 +137,28 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): # Note: M_sum is different than the pre-permuted shape of a1q. 
M_sum = a1q.size(0) - workspace1 = _resize_cache(workspace13, (M_sum, N)) - workspace2 = _resize_cache(workspace2, (M_sum, N // 2)) - workspace3 = _resize_cache(workspace13, (M_sum, K)) + + mm1_out = _resize_cache(workspace13, (M_sum, N)) + act_out = _resize_cache(workspace2, (M_sum, N // 2)) + quant_out = _resize_cache(workspace13.view(dtype=torch.float8_e4m3fn), + (M_sum, N // 2)) + mm2_out = _resize_cache(workspace2, (M_sum, K)) dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous( - (a1q, a1q_scale), (w1, w1_scale), workspace1, expert_ids) + (a1q, a1q_scale), (w1, w1_scale), mm1_out, expert_ids) - self.activation(activation, workspace2, workspace1.view(-1, N)) + self.activation(activation, act_out, mm1_out.view(-1, N)) a2q_scale: Optional[torch.Tensor] = None - a2q, a2q_scale = per_token_group_quant_fp8(workspace2, + a2q, a2q_scale = per_token_group_quant_fp8(act_out, self.block_shape[1], - column_major_scales=True) + column_major_scales=True, + out_q=quant_out) dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous( - (a2q, a2q_scale), (w2, w2_scale), workspace3, expert_ids) + (a2q, a2q_scale), (w2, w2_scale), mm2_out, expert_ids) - workspace3 = workspace3[inv_perm, ...] 
- - return workspace3 + torch.index_select(mm2_out, 0, inv_perm, out=output) def deep_gemm_moe_fp8( diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index 48cf01638ade4..8c21d8aa53a64 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -5,6 +5,7 @@ import deep_ep import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm import _custom_ops as ops from vllm.model_executor.layers.fused_moe.utils import ( moe_kernel_quantize_input) @@ -193,20 +194,23 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): apply_router_weight_on_input: bool, output_dtype: torch.dtype): + hidden_dim = fused_expert_output.size(-1) if fused_expert_output.ndim == 2: - hidden_dim = fused_expert_output.size(-1) fused_expert_output = fused_expert_output.view( num_tokens, -1, hidden_dim) if not apply_router_weight_on_input: # The DeepEP combine kernels don't do the topk weight # multiplication. We multiply the weights locally. 
- fused_expert_output = fused_expert_output.to(torch.float32) - fused_expert_output = fused_expert_output * topk_weights.view( - fused_expert_output.size(0), -1, 1) - fused_expert_output = fused_expert_output.to(output_dtype) + m_x_topk = fused_expert_output.size(0) + fused_expert_output.mul_(topk_weights.view(m_x_topk, -1, 1)) - return fused_expert_output.sum(dim=1).to(output_dtype) + out = torch.empty((num_tokens, hidden_dim), + device=fused_expert_output.device, + dtype=output_dtype) + ops.moe_sum(fused_expert_output, out) + + return out def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index 68a3485ff1f6a..a12cfafd42ab6 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -335,9 +335,6 @@ def invoke_moe_batched_triton_kernel( BLOCK_M = config['BLOCK_SIZE_M'] BLOCK_N = config['BLOCK_SIZE_N'] BLOCK_K = config['BLOCK_SIZE_K'] - assert (torch.compiler.is_compiling() - or torch.cuda.is_current_stream_capturing() - or max_num_tokens % BLOCK_M == 0) grid = (expert_num_tokens.size(0), triton.cdiv(max_num_tokens, BLOCK_M) * triton.cdiv(B.size(1), BLOCK_N)) @@ -390,8 +387,8 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): that the PPLX dispatch/combine kernels use. 
""" - def __init__(self, max_num_tokens: Optional[int], world_size: int, - dp_size: int, rank: int): + def __init__(self, max_num_tokens: int, world_size: int, dp_size: int, + rank: int): super().__init__() self.world_size = world_size self.dp_size = dp_size @@ -430,14 +427,9 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): num_tokens, hidden_dim = a1.size() topk = topk_ids.size(1) - if self.max_num_tokens is None: - tokens_per_expert = torch.bincount(topk_ids.view(-1), - minlength=num_experts) - self.max_num_tokens = int(tokens_per_expert.max().item()) - else: - tokens_per_expert = torch.zeros(num_experts, - dtype=torch.int, - device=a1.device) + tokens_per_expert = torch.zeros(num_experts, + dtype=torch.int, + device=a1.device) assert num_experts % self.world_size == 0 @@ -497,9 +489,9 @@ class BatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): def __init__( self, + max_num_tokens: int, world_size: int, dp_size: int, - max_num_tokens: Optional[int] = None, use_fp8_w8a8: bool = False, use_int8_w8a8: bool = False, use_int8_w8a16: bool = False, @@ -518,6 +510,9 @@ class BatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): self.world_size = world_size self.dp_size = dp_size + def supports_chunking(self) -> bool: + return False + def workspace_shapes( self, a: torch.Tensor, @@ -526,19 +521,19 @@ class BatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): N: int, K: int, topk: int, - num_experts: int, - ) -> tuple[int, int, torch.dtype]: + global_num_experts: int, + local_num_experts: int, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: assert a.dim() == 2 - num_dp = self.world_size // self.dp_size - max_num_tokens = a.size( - 0) if self.max_num_tokens is None else self.max_num_tokens - #print(f"WORKSPACE {max_num_tokens} {num_dp}") - workspace13 = num_experts * max_num_tokens * num_dp * K - workspace2 = max_num_tokens * num_dp * N - return (workspace13, workspace2, a.dtype) + num_dp = self.dp_size + num_experts = 
local_num_experts + workspace13 = (num_experts, self.max_num_tokens * num_dp, K) + workspace2 = (self.max_num_tokens * num_dp, N) + return (workspace13, workspace2, workspace13, a.dtype) def apply( self, + output: torch.Tensor, hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -555,20 +550,12 @@ class BatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): workspace13: torch.Tensor, workspace2: torch.Tensor, expert_num_tokens: Optional[torch.Tensor], - ) -> torch.Tensor: + ): assert hidden_states.dim() == 3 assert expert_num_tokens is not None - hidden_dim = hidden_states.size(-1) - - if self.max_num_tokens is None: - max_num_tokens = hidden_states.size(1) - else: - max_num_tokens = self.max_num_tokens + max_num_tokens = self.max_num_tokens num_dp = self.world_size // self.dp_size - num_experts = global_num_experts - out = _resize_cache(workspace13, - (num_experts, max_num_tokens * num_dp, hidden_dim)) num_local_experts = w1.size(0) assert num_local_experts == w1.size(0), ( f"{num_local_experts} == {w1.size(0)}") @@ -585,15 +572,13 @@ class BatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): # Indexing expert_num_tokens doesn't work w/cudagraphs or inductor if (torch.compiler.is_compiling() or torch.cuda.is_current_stream_capturing()): - num = max_num_tokens * num_dp + num = hidden_states.shape[1] else: num = int(expert_num_tokens[expert].item()) tmp = _resize_cache(workspace2, (num, N)) input = hidden_states[expert, :num, :] @ w1[expert].transpose(0, 1) self.activation(activation, tmp, input) - out[expert, :num, :] = tmp @ w2[expert].transpose(0, 1) - - return out + output[expert, :num, :] = tmp @ w2[expert].transpose(0, 1) class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): @@ -630,6 +615,9 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): assert not use_int4_w4a16, "NYI" assert self.block_shape is None, "NYI" + def supports_chunking(self) -> bool: + return False + def workspace_shapes( self, a: torch.Tensor, @@ -638,18 
+626,22 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): N: int, K: int, topk: int, - num_experts: int, - ) -> tuple[int, int, torch.dtype]: + global_num_experts: int, + local_num_experts: int, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: assert a.dim() == 2 num_dp = self.world_size // self.dp_size + num_experts = local_num_experts max_num_tokens = a.size( 0) if self.max_num_tokens is None else self.max_num_tokens - workspace13 = num_experts * max_num_tokens * num_dp * max(K, N) - workspace2 = num_experts * max_num_tokens * num_dp * (N // 2) - return (workspace13, workspace2, a.dtype) + workspace13 = (num_experts, max_num_tokens * num_dp, max(K, N)) + workspace2 = (num_experts, max_num_tokens * num_dp, (N // 2)) + output = (num_experts, max_num_tokens * num_dp, K) + return (workspace13, workspace2, output, a.dtype) def apply( self, + output: torch.Tensor, hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -666,7 +658,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): workspace13: torch.Tensor, workspace2: torch.Tensor, expert_num_tokens: Optional[torch.Tensor], - ) -> torch.Tensor: + ): # Check constraints. 
if self.use_int4_w4a16: assert hidden_states.size(-1) // 2 == w1.size(2), ( @@ -723,8 +715,9 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): (E, max_num_tokens, N)) intermediate_cache2 = _resize_cache(workspace2, (E, max_num_tokens, N // 2)) - intermediate_cache3 = _resize_cache(workspace13, - (E, max_num_tokens, K)) + + if self.use_fp8_w8a8: + intermediate_cache1.fill_(0) # MM1 invoke_moe_batched_triton_kernel(A=hidden_states, @@ -761,7 +754,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): invoke_moe_batched_triton_kernel(A=qintermediate_cache2, B=w2, - C=intermediate_cache3, + C=output, expert_num_tokens=expert_num_tokens, compute_type=compute_type, A_scale=a2q_scale, @@ -772,4 +765,3 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): use_int4_w4a16=self.use_int4_w4a16, config=config, block_shape=self.block_shape) - return intermediate_cache3 diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index ba1498e65319e..437e80696ac65 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1542,6 +1542,9 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): use_int4_w4a16=use_int4_w4a16) self.per_channel_quant = per_channel_quant + def supports_chunking(self) -> bool: + return True + def workspace_shapes( self, a: torch.Tensor, @@ -1550,15 +1553,17 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): N: int, K: int, topk: int, - num_experts: int, - ) -> tuple[int, int, torch.dtype]: - factor = num_experts if a.dim() == 3 else 1 - workspace1 = M * topk * max(N * 2, K) * factor - workspace2 = M * topk * N * factor - return (workspace1, workspace2, a.dtype) + global_num_experts: int, + local_num_experts: int, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: + workspace1 = (M, topk, max(N * 2, K)) + workspace2 = (M, topk, N) + output = (M, topk, K) + 
return (workspace1, workspace2, output, a.dtype) def apply( self, + output: torch.Tensor, hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -1575,7 +1580,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): workspace13: torch.Tensor, workspace2: torch.Tensor, expert_num_tokens: Optional[torch.Tensor], - ) -> torch.Tensor: + ): # Check constraints. if self.use_int4_w4a16: assert hidden_states.size(-1) // 2 == w1.size(2), ( @@ -1632,8 +1637,6 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): (num_tokens, top_k_num, N)) intermediate_cache2 = _resize_cache(workspace2, (num_tokens * top_k_num, N // 2)) - intermediate_cache3 = _resize_cache(workspace13, - (num_tokens, top_k_num, K)) sorted_token_ids, expert_ids, num_tokens_post_padded = ( moe_align_block_size(topk_ids, config['BLOCK_SIZE_M'], @@ -1671,7 +1674,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): invoke_fused_moe_kernel(qintermediate_cache2, w2, - intermediate_cache3, + output, a2q_scale, w2_scale, w2_zp, @@ -1690,8 +1693,6 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): per_channel_quant=self.per_channel_quant, block_shape=self.block_shape) - return intermediate_cache3 - def modular_triton_fused_moe( use_fp8_w8a8: bool, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 485988fefe679..56e134be9be26 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -63,10 +63,6 @@ else: fused_moe_pallas = None # type: ignore logger = init_logger(__name__) -# Note: this limit is somewhat arbitrary and might be changed later. -# The size of the activations will be E x MOE_DP_CHUNK_SIZE x hidden_dim. -MOE_DP_CHUNK_SIZE = 256 - @dataclass class FusedMoEParallelConfig: @@ -220,7 +216,12 @@ class MoEConfig: # TODO: add more quantization params, blocked, per-token, etc. 
block_size: int = 128 - max_num_tokens: int = MOE_DP_CHUNK_SIZE + max_num_tokens: int = envs.VLLM_MOE_DP_CHUNK_SIZE + + def __post_init__(self): + if self.dp_size > 1: + logger.debug("Using MOEConfig::max_num_tokens=%d", + self.max_num_tokens) @property def tp_size(self): @@ -915,7 +916,7 @@ class FusedMoE(torch.nn.Module): moe_parallel_config=self.moe_parallel_config, in_dtype=params_dtype, quant_dtype=quant_dtype, - max_num_tokens=MOE_DP_CHUNK_SIZE, + max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE, ) self.moe_config = moe self.quant_config = quant_config @@ -954,12 +955,12 @@ class FusedMoE(torch.nn.Module): or self.moe_parallel_config.use_deepep_ll_kernels): act_dtype = vllm_config.model_config.dtype self.batched_hidden_states = torch.zeros( - (2, MOE_DP_CHUNK_SIZE, self.hidden_size), + (2, envs.VLLM_MOE_DP_CHUNK_SIZE, self.hidden_size), dtype=act_dtype, device=torch.cuda.current_device()) self.batched_router_logits = torch.zeros( - (2, MOE_DP_CHUNK_SIZE, self.global_num_experts), + (2, envs.VLLM_MOE_DP_CHUNK_SIZE, self.global_num_experts), dtype=act_dtype, device=torch.cuda.current_device()) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index e7aaf62fb3408..ed3b6b8a1af42 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -1,10 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod +from math import prod from typing import Optional import torch +import vllm.envs as envs +from vllm.model_executor.layers.fused_moe.utils import _resize_cache +from vllm.utils import cdiv + # # This file defines a set of base classes used to make MoE kernels more modular. # The goal is to be able to utilize different communication mechanisms with @@ -115,9 +120,9 @@ class FusedMoEPrepareAndFinalize(ABC): - quantized + dispatched a. 
- quantized + dispatched a1_scales. - Optional tensor as big as number of local experts that contains the - number of tokens assigned to each local expert. + number of tokens assigned to each local expert. - Optional dispatched expert topk IDs - - Optional dispatched expert topk weight + - Optional dispatched expert topk weight """ raise NotImplementedError @@ -159,7 +164,7 @@ class FusedMoEPrepareAndFinalize(ABC): Some PrepareFinalize All2All implementations are batched. Meaning, they can processes only as set of tokens at a time. This function returns the batch size i.e the maximum number of tokens - the implementation can process at a time. + the implementation can process at a time. Return None if there are no such restrictions. """ raise NotImplementedError @@ -171,6 +176,15 @@ class FusedMoEPermuteExpertsUnpermute(ABC): above. """ + # TODO (bnell): make this return a CHUNK_SIZE or None instead? + @abstractmethod + def supports_chunking(self) -> bool: + """ + A flag indicating whether or not this class supports activation + chunking. + """ + raise NotImplementedError + @abstractmethod def workspace_shapes( self, @@ -180,20 +194,24 @@ class FusedMoEPermuteExpertsUnpermute(ABC): N: int, K: int, topk: int, - num_experts: int, - ) -> tuple[int, int, torch.dtype]: + global_num_experts: int, + local_num_experts: int, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: """ - Compute the number of elements for the temporary outputs of the two - gemms and activation in the fused expert function. Since the - gemms are independent, the workspace for the first gemm can be shared - with the workspace for the last gemm. + Compute the shapes for the temporary and final outputs of the two gemms + and activation in the fused expert function. Since the gemms are + independent, the workspace for the first gemm can be shared with the + workspace for the last gemm. 
Returns a tuple of: - - Number of workspace13 elements: must be large enough to hold the + - workspace13 shape tuple: must be large enough to hold the result of either expert gemm. - - Number of workspace2 elements: must be large enough to hold the + - workspace2 shape tuple: must be large enough to hold the result of the activation function. + - output shape tuple: must be exact size of the final gemm output. - Workspace type: The dtype to use for the workspace tensors. + - Note: in order for activation chunking to work, the first dimension + of each tuple must be the number of tokens. """ raise NotImplementedError @@ -210,6 +228,7 @@ class FusedMoEPermuteExpertsUnpermute(ABC): @abstractmethod def apply( self, + output: torch.Tensor, hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -226,12 +245,13 @@ class FusedMoEPermuteExpertsUnpermute(ABC): workspace13: torch.Tensor, workspace2: torch.Tensor, expert_num_tokens: Optional[torch.Tensor], - ) -> torch.Tensor: + ): """ This function computes the intermediate result of a Mixture of Experts (MoE) layer using two sets of weights, w1 and w2. Parameters: + - output: (torch.Tensor): The unweighted, unreduced output tensor. - hidden_states: (torch.Tensor): The (quantized) input tensor to the MoE layer. - w1 (torch.Tensor): The first set of expert weights. @@ -259,13 +279,20 @@ class FusedMoEPermuteExpertsUnpermute(ABC): function. - expert_num_tokens: An optional tensor containing the number of tokens assigned to each expert when using batched experts format input. 
- - Returns: - - torch.Tensor: The unweighted, unreduced output tensor """ raise NotImplementedError +def _chunk_scales(scales: Optional[torch.Tensor], start: int, + end: int) -> Optional[torch.Tensor]: + if scales is not None: + if scales.numel() == 1: + return scales + else: + return scales[start:end] + return None + + class FusedMoEModularKernel(torch.nn.Module): """ This class combines a FusedMoEPrepareAndFinalize instance and @@ -288,61 +315,6 @@ class FusedMoEModularKernel(torch.nn.Module): self.prepare_finalize = prepare_finalize self.fused_experts = fused_experts - def _do_fused_experts( - self, - a1: torch.Tensor, # input to forward fn - a1q: torch.Tensor, # output of prepare fn - w1: torch.Tensor, - w2: torch.Tensor, - topk_ids: torch.Tensor, - expert_num_tokens: torch.Tensor, - activation: str, - global_num_experts: int, - expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], - w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], - a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor]) -> torch.Tensor: - - _, M, N, K, top_k = _moe_problem_size(a1q, w1, w2, topk_ids) - - # Use a1 here to decipher the correct workspace datatype - workspace13_shape, workspace2_shape, workspace_dtype = ( - self.fused_experts.workspace_shapes(a1, a1q, M, N, K, top_k, - global_num_experts)) - - # We can reuse the memory between cache1 and cache3 because by the time - # we need cache3, we're done with cache1 - workspace13 = torch.zeros(workspace13_shape, - device=a1.device, - dtype=workspace_dtype) - workspace2 = torch.zeros(workspace2_shape, - device=a1.device, - dtype=workspace_dtype) - - fused_out = self.fused_experts.apply( - a1q, - w1, - w2, - topk_ids, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - w1_scale=w1_scale, - w2_scale=w2_scale, - w1_zp=w1_zp, - w2_zp=w2_zp, - a1q_scale=a1q_scale, - a2_scale=a2_scale, - workspace13=workspace13, - 
workspace2=workspace2, - expert_num_tokens=expert_num_tokens, - ) - - return fused_out - def forward( self, hidden_states: torch.Tensor, @@ -401,19 +373,22 @@ class FusedMoEModularKernel(torch.nn.Module): a1 = hidden_states output = a1 if inplace else torch.zeros_like(a1) + local_num_experts = w1.size(0) if global_num_experts == -1: - global_num_experts = w1.size(0) + global_num_experts = local_num_experts (a1q, a1q_scale, expert_num_tokens, _expert_topk_ids, _expert_topk_weights) = self.prepare_finalize.prepare( a1, a1_scale, a2_scale, topk_weights, topk_ids, global_num_experts, expert_map, apply_router_weight_on_input) + # Maybe prepare gathered topk_ids and topk_weights from other EP ranks. topk_ids = topk_ids if _expert_topk_ids is None else _expert_topk_ids topk_weights = (topk_weights if _expert_topk_weights is None else _expert_topk_weights) fused_out = None + if a1q.numel() == 0: # This happens when none of the tokens from the all2all reach this # EP rank. Also, note that this is only relevant for CUDAGraph @@ -423,22 +398,110 @@ class FusedMoEModularKernel(torch.nn.Module): # and can never run into the tensor.numel() == 0 case. 
fused_out = torch.empty_like(a1q).to(dtype=a1.dtype) else: - fused_out = self._do_fused_experts( - a1=a1, - a1q=a1q, - w1=w1, - w2=w2, - topk_ids=topk_ids, - expert_num_tokens=expert_num_tokens, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - w1_scale=w1_scale, - w2_scale=w2_scale, - w1_zp=w1_zp, - w2_zp=w2_zp, - a1q_scale=a1q_scale, - a2_scale=a2_scale) + _, M, N, K, top_k = _moe_problem_size(a1q, w1, w2, topk_ids) + + if self.fused_experts.supports_chunking(): + CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE + num_chunks = cdiv(M, CHUNK_SIZE) + else: + CHUNK_SIZE = M + num_chunks = 1 + + if num_chunks == 1: + (workspace13_shape, workspace2_shape, fused_out_shape, + workspace_dtype) = self.fused_experts.workspace_shapes( + a1, a1q, M, N, K, top_k, global_num_experts, + local_num_experts) + else: + # Use the full M to get the final output shape. + _, _, fused_out_shape, _ = ( + self.fused_experts.workspace_shapes( + a1, a1q, M, N, K, top_k, global_num_experts, + local_num_experts)) + # Use the CHUNK_SIZE to get the workspace shapes. + workspace13_shape, workspace2_shape, _, workspace_dtype = ( + self.fused_experts.workspace_shapes( + a1, a1q, CHUNK_SIZE, N, K, top_k, global_num_experts, + local_num_experts)) + + # We can reuse the memory between cache1 and cache3 because by the + # time we need cache3, we're done with cache1. 
+ workspace13 = torch.empty(prod(workspace13_shape), + device=a1.device, + dtype=workspace_dtype) + workspace2 = torch.empty(prod(workspace2_shape), + device=a1.device, + dtype=workspace_dtype) + + if num_chunks == 1: + fused_out = _resize_cache(workspace13, fused_out_shape) + + self.fused_experts.apply( + fused_out, + a1q, + w1, + w2, + topk_ids, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, + a1q_scale=a1q_scale, + a2_scale=a2_scale, + workspace13=workspace13, + workspace2=workspace2, + expert_num_tokens=expert_num_tokens, + ) + else: + # The leading output dimension may not be equal to M, so + # we compute output indices separately. + M_out = fused_out_shape[0] + assert M_out >= M + factor = M_out // M + assert factor > 0 + OUT_CHUNK_SIZE = CHUNK_SIZE * factor + + fused_out = torch.empty(fused_out_shape, + device=a1q.device, + dtype=workspace_dtype) + + assert cdiv(M_out, OUT_CHUNK_SIZE) == num_chunks, ( + f"{cdiv(M_out, OUT_CHUNK_SIZE)} == {num_chunks}") + + for chunk in range(num_chunks): + begin_chunk_idx = chunk * CHUNK_SIZE + end_chunk_idx = min((chunk + 1) * CHUNK_SIZE, M) + begin_out_idx = chunk * OUT_CHUNK_SIZE + end_out_idx = min((chunk + 1) * OUT_CHUNK_SIZE, M_out) + curr_a1q = a1q[begin_chunk_idx:end_chunk_idx] + curr_a1q_scale = _chunk_scales(a1q_scale, begin_chunk_idx, + end_chunk_idx) + curr_a2_scale = _chunk_scales(a2_scale, begin_chunk_idx, + end_chunk_idx) + curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx] + + self.fused_experts.apply( + fused_out[begin_out_idx:end_out_idx], + curr_a1q, + w1, + w2, + curr_topk_ids, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, + a1q_scale=curr_a1q_scale, + a2_scale=curr_a2_scale, + workspace13=workspace13, + workspace2=workspace2, + expert_num_tokens=expert_num_tokens, + ) 
self.prepare_finalize.finalize(output, fused_out, topk_weights, topk_ids, apply_router_weight_on_input) diff --git a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py index 98e175b12ed45..9d990959e01fa 100644 --- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py +++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py @@ -159,6 +159,12 @@ def moe_align_block_size( Aligns the token distribution across experts to be compatible with block size for matrix multiplication. + Note: In the case of expert_parallel, moe_align_block_size initially + considers all experts as valid and aligns all tokens appropriately. + Before the function returns it marks the experts_ids that are not in + the current GPU rank as -1 so the MoE matmuls could skip those blocks. + This requires the num_experts input arg to be the num global experts. + Parameters: - topk_ids: A tensor of shape [total_tokens, top_k] representing the top-k expert indices for each token. diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py index 89481e5bd6b0a..20ee0d9f780a7 100644 --- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py @@ -18,7 +18,7 @@ def _moe_permute( expert_map: Optional[torch.Tensor], block_m: int, ) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor, - Optional[torch.Tensor]]: + torch.Tensor]: """ Determine the sorted_token_ids, expert_ids for the given problem size. Permute the hidden states and scales according to `sorted_token_ids`. 
diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py index 87de29444c01d..4bbfea446e291 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -34,6 +34,12 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): self.deep_gemm_expert = DeepGemmExperts( ) if self.allow_deep_gemm else None + def supports_chunking(self) -> bool: + dge = self.deep_gemm_expert + te = self.triton_expert + return ((dge is None or dge.supports_chunking()) + and (te is None or te.supports_chunking())) + def workspace_shapes( self, a: torch.Tensor, @@ -42,21 +48,24 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): N: int, K: int, topk: int, - num_experts: int, - ) -> tuple[int, int, torch.dtype]: + global_num_experts: int, + local_num_experts: int, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: # Note: the deep gemm workspaces are strictly larger than the triton # workspaces so we can be pessimistic here and allocate for DeepGemm # even if we fall back to triton later, e.g. if expert maps are set. 
if self.allow_deep_gemm and _valid_deep_gemm_shape(M, N, K): assert self.deep_gemm_expert is not None return self.deep_gemm_expert.workspace_shapes( - a, aq, M, N, K, topk, num_experts) + a, aq, M, N, K, topk, global_num_experts, local_num_experts) else: return self.triton_expert.workspace_shapes(a, aq, M, N, K, topk, - num_experts) + global_num_experts, + local_num_experts) def apply( self, + output: torch.Tensor, hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -73,45 +82,31 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): workspace13: torch.Tensor, workspace2: torch.Tensor, expert_num_tokens: Optional[torch.Tensor], - ) -> torch.Tensor: + ): N = w1.size(1) - if (self.allow_deep_gemm and self.use_fp8_w8a8 and N > 512 - and _valid_deep_gemm(hidden_states, w1, w2)): - assert self.deep_gemm_expert is not None - return self.deep_gemm_expert.apply( - hidden_states, - w1, - w2, - topk_ids, - activation, - global_num_experts, - expert_map, - w1_scale, - w2_scale, - w1_zp, - w2_zp, - a1q_scale, - a2_scale, - workspace13, - workspace2, - expert_num_tokens, - ) - else: - return self.triton_expert.apply( - hidden_states, - w1, - w2, - topk_ids, - activation, - global_num_experts, - expert_map, - w1_scale, - w2_scale, - w1_zp, - w2_zp, - a1q_scale, - a2_scale, - workspace13, - workspace2, - expert_num_tokens, - ) + + use_deep_gemm = (self.allow_deep_gemm and self.use_fp8_w8a8 and N > 512 + and _valid_deep_gemm(hidden_states, w1, w2)) + + experts = self.deep_gemm_expert if use_deep_gemm else self.triton_expert + assert experts is not None + + experts.apply( + output, + hidden_states, + w1, + w2, + topk_ids, + activation, + global_num_experts, + expert_map, + w1_scale, + w2_scale, + w1_zp, + w2_zp, + a1q_scale, + a2_scale, + workspace13, + workspace2, + expert_num_tokens, + ) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 6d9ea5387879b..cd3b0b3907d77 100644 --- 
a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -319,7 +319,7 @@ class MambaMixer2(CustomOp): n_groups == 1, # if there was only one group ) intermediate_settings = (intermediate_size, 0, False) - head_setings = (self.num_heads, 0, False) + head_settings = (self.num_heads, 0, False) # - the weight already has a "weight_loader" attribute # which set_weight_attrs will raise if we do not @@ -372,7 +372,7 @@ class MambaMixer2(CustomOp): intermediate_settings, group_shard_settings, group_shard_settings, - head_setings, # for dt + head_settings, # for dt ], self.tp_size, tp_rank, diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py index 58bfb661d332a..ad58a9918f03c 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py @@ -516,7 +516,7 @@ def _chunk_state_varlen_kernel( offs_n[None, :] * stride_chunk_states_dstate) else: - # - this seems repetitve, buts its to help the compiler + # - this seems repetitive, but it's to help the compiler if start_idx < pid_c * chunk_size: past_states_ptrs = chunk_states_ptr + ( offs_m[:, None] * stride_chunk_states_hdim + diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 258038bed40bd..6829d93d2d6c7 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -156,7 +156,10 @@ class MeanPool(SimplePooler): ) -> Union[list[torch.Tensor], torch.Tensor]: prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata) - cumsum = torch.cumsum(hidden_states, dim=0) + # Use float32 for torch.cumsum in MeanPool, + # otherwise precision will be lost significantly.
+ cumsum = torch.cumsum(hidden_states, dim=0, dtype=torch.float32) + start_indices = torch.cat([ torch.tensor([0], device=hidden_states.device), torch.cumsum(prompt_lens[:-1], dim=0) @@ -220,6 +223,13 @@ class PoolerHead(nn.Module): def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor], pooling_metadata: PoolingMetadata): + # Using float32 in PoolerHead + if isinstance(pooled_data, list): + for i in range(len(pooled_data)): + pooled_data[i] = pooled_data[i].to(torch.float32) + else: + pooled_data = pooled_data.to(torch.float32) + dimensions_list = [ pooling_param.dimensions for _, pooling_param in pooling_metadata.seq_groups diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index f8bc3ab5e7d1e..fe42e26a17061 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -1,19 +1,23 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Any, Optional, Union import torch from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) + QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.parameter import (GroupQuantScaleParameter, PackedvLLMParameter) +logger = init_logger(__name__) + class AWQConfig(QuantizationConfig): """Config class for AWQ. 
@@ -74,12 +78,42 @@ class AWQConfig(QuantizationConfig): config, ["modules_to_not_convert"], None) return cls(weight_bits, group_size, zero_point, modules_to_not_convert) - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["LinearMethodBase"]: + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional[Union["LinearMethodBase", "QuantizeMethodBase"]]: if isinstance(layer, LinearBase): if is_layer_skipped_awq(prefix, self.modules_to_not_convert): return UnquantizedLinearMethod() return AWQLinearMethod(self) + elif isinstance(layer, FusedMoE): + # Lazy import to avoid circular import. + from .awq_marlin import AWQMarlinConfig, AWQMoEMethod + from .moe_wna16 import MoeWNA16Config + from .utils.marlin_utils import check_moe_marlin_supports_layer + if not check_moe_marlin_supports_layer(layer, self.group_size): + logger.warning_once( + f"Layer '{prefix}' is not supported by AWQMoeMarlin. " + "Falling back to Moe WNA16 kernels.") + config = { + "quant_method": "awq", + "bits": self.weight_bits, + "group_size": self.group_size, + "zero_point": self.zero_point, + "lm_head": False, + } + return MoeWNA16Config.from_config(config).get_quant_method( + layer, prefix) + marlin_compatible_config_dict = { + "quant_method": "awq", + "bits": self.weight_bits, + "group_size": self.group_size, + "zero_point": self.zero_point, + "lm_head": False, + "modules_to_not_convert": self.modules_to_not_convert, + } + awq_marlin_config = AWQMarlinConfig.from_config( + marlin_compatible_config_dict) + return AWQMoEMethod(awq_marlin_config) return None diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 38935bc967855..53ba84ea8e754 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -71,9 +71,7 @@ class BitsAndBytesConfig(QuantizationConfig): @staticmethod def get_config_filenames() -> 
list[str]: - return [ - "adapter_config.json", - ] + return [] @classmethod def from_config(cls, config: dict[str, Any]) -> "BitsAndBytesConfig": diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 1ee4617e10544..e5702c871cc9a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -24,10 +24,10 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso CompressedTensorsMoEMethod) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, CompressedTensors24, - CompressedTensorsScheme, CompressedTensorsW4A16Fp4, - CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, - CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, - CompressedTensorsWNA16) + CompressedTensorsScheme, CompressedTensorsW4A4Fp4, + CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24, + CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, + CompressedTensorsW8A16Fp8, CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( find_matched_target, is_activation_quantization_format, should_ignore_layer) @@ -218,6 +218,26 @@ class CompressedTensorsConfig(QuantizationConfig): else: return False + def _is_fp4a4_nvfp4(self, weight_quant: BaseModel, input_quant: BaseModel): + + if weight_quant is None or input_quant is None: + return False + + is_tensor_group_quant = (weight_quant.strategy + == QuantizationStrategy.TENSOR_GROUP.value + and input_quant.strategy + == QuantizationStrategy.TENSOR_GROUP.value) + is_symmetric = weight_quant.symmetric and input_quant.symmetric + + is_group_size_16 = (weight_quant.group_size == 16 + and input_quant.group_size == 16) + is_float_type = (weight_quant.type == 
QuantizationType.FLOAT + and input_quant.type == QuantizationType.FLOAT.value) + is_4_bits = weight_quant.num_bits == 4 and input_quant.num_bits == 4 + + return (is_tensor_group_quant and is_float_type and is_4_bits + and is_group_size_16 and is_symmetric) + def _is_fp4a16_nvfp4(self, weight_quant: BaseModel, input_quant: BaseModel): @@ -353,6 +373,16 @@ class CompressedTensorsConfig(QuantizationConfig): actorder=weight_quant.actorder) if is_activation_quantization_format(self.quant_format): + if self._is_fp4a4_nvfp4(weight_quant, input_quant): + if CompressedTensorsW4A4Fp4.cutlass_fp4_supported(): + return CompressedTensorsW4A4Fp4() + else: + logger.warning_once( + "Current platform does not support cutlass NVFP4." + " Running CompressedTensorsW4A16Fp4.") + return CompressedTensorsW4A16Fp4( + has_input_global_scale=True) + if self._is_fp8_w8a8(weight_quant, input_quant): is_fp8_w8a8_supported = self._check_scheme_supported( CompressedTensorsW8A8Fp8.get_min_capability(), error=False) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index bc9d399cf135b..f14131c5f05b3 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -562,9 +562,12 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod): (moe.num_experts + prepare_finalize.world_size - 1) // prepare_finalize.world_size) experts = CutlassExpertsFp8( - max_experts_per_worker, moe.in_dtype, + max_experts_per_worker, + moe.in_dtype, self.input_quant.strategy == QuantizationStrategy.TOKEN, - self.weight_quant.strategy == QuantizationStrategy.CHANNEL) + self.weight_quant.strategy == QuantizationStrategy.CHANNEL, + use_batched_format=True, + ) if has_pplx and isinstance( prepare_finalize, diff --git 
a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index 25924c733e760..6e4e75df76043 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .compressed_tensors_scheme import CompressedTensorsScheme +from .compressed_tensors_w4a4_nvfp4 import CompressedTensorsW4A4Fp4 from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS, CompressedTensorsW4A16Sparse24) from .compressed_tensors_w4a16_nvfp4 import CompressedTensorsW4A16Fp4 @@ -18,5 +19,6 @@ __all__ = [ "CompressedTensorsW8A16Fp8", "CompressedTensorsW4A16Sparse24", "CompressedTensorsW8A8Int8", "CompressedTensorsW8A8Fp8", "WNA16_SUPPORTED_BITS", "W4A16SPARSE24_SUPPORTED_BITS", - "CompressedTensors24", "CompressedTensorsW4A16Fp4" + "CompressedTensors24", "CompressedTensorsW4A16Fp4", + "CompressedTensorsW4A4Fp4" ] diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py index 8202ce9514969..96dccf04d490f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py @@ -18,7 +18,8 @@ __all__ = ["CompressedTensorsW4A16Fp4"] class CompressedTensorsW4A16Fp4(CompressedTensorsScheme): - def __init__(self): + def __init__(self, has_input_global_scale: bool = False): + self.has_input_global_scale = has_input_global_scale self.group_size = 16 @classmethod @@ -64,6 +65,13 @@ class CompressedTensorsW4A16Fp4(CompressedTensorsScheme): layer.register_parameter("weight_scale", 
weight_scale) + if self.has_input_global_scale: + input_global_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), + dtype=torch.float32), + weight_loader=weight_loader) + layer.register_parameter("input_global_scale", input_global_scale) + def process_weights_after_loading(self, layer) -> None: # Process parameters for marlin repacking @@ -77,6 +85,10 @@ class CompressedTensorsW4A16Fp4(CompressedTensorsScheme): requires_grad=False) del layer.weight_global_scale + if self.has_input_global_scale: + layer.input_global_scale = torch.nn.Parameter( + layer.input_global_scale.data, requires_grad=False) + prepare_fp4_layer_for_marlin(layer) def apply_weights(self, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py new file mode 100644 index 0000000000000..32718972a627a --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py @@ -0,0 +1,143 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Callable, Optional + +import torch +from torch.nn.parameter import Parameter + +from vllm._custom_ops import (cutlass_scaled_fp4_mm, + cutlass_scaled_mm_supports_fp4, scaled_fp4_quant) +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.parameter import (GroupQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter) +from vllm.platforms import current_platform + +logger = init_logger(__name__) + +__all__ = ["CompressedTensorsW4A4Fp4"] + + +class CompressedTensorsW4A4Fp4(CompressedTensorsScheme): + + def __init__(self): + self.group_size = 16 + + @classmethod + def get_min_capability(cls) -> int: + return 100 + + @classmethod + def cutlass_fp4_supported(cls) -> bool: + if not 
current_platform.is_cuda(): + return False + capability_tuple = current_platform.get_device_capability() + capability = -1 if capability_tuple is None else capability_tuple.to_int( # noqa: E501 + ) + return cutlass_scaled_mm_supports_fp4(capability) + + def create_weights(self, layer: torch.nn.Module, + output_partition_sizes: list[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + output_size_per_partition = sum(output_partition_sizes) + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + + # Weight + weight = ModelWeightParameter(data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition // 2, + dtype=torch.uint8), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) + layer.register_parameter("weight_packed", weight) + + # Global Weight Scale + weight_global_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) + layer.register_parameter("weight_global_scale", weight_global_scale) + + # Per Group Weight Scale + weight_scale = GroupQuantScaleParameter(data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition // self.group_size, + dtype=torch.float8_e4m3fn, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) + + layer.register_parameter("weight_scale", weight_scale) + + input_global_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) + layer.register_parameter("input_global_scale", input_global_scale) + + def swizzle_blockscale(self, scale: torch.Tensor): + assert (scale.dtype == torch.float8_e4m3fn) + # Pad and blockwise interleave weight_scale + scale_ndim = scale.ndim + if scale.ndim == 2: + scale = scale.unsqueeze(0) + assert scale.ndim == 3 + B, M, K = scale.shape + 
round_up_multiple = lambda x, m: (x + m - 1) // m * m + M_padded = round_up_multiple(M, 128) + K_padded = round_up_multiple(K, 4) + padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype) + padded_scale[:B, :M, :K] = scale + batches, rows, cols = padded_scale.shape + assert rows % 128 == 0 + assert cols % 4 == 0 + padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32, + cols // 4, 4) + swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5)) + swizzled_scale = swizzled_scale.contiguous().cuda() + return (swizzled_scale.reshape(M, K) + if scale_ndim == 2 else swizzled_scale.reshape(B, M, K)) + + def process_weights_after_loading(self, layer) -> None: + + global_input_scale = layer.input_global_scale.max().to(torch.float32) + layer.input_global_scale = Parameter(global_input_scale, + requires_grad=False) + + layer.weight_global_scale = Parameter( + layer.weight_global_scale.max().to(torch.float32), + requires_grad=False) + + swizzled_weight_scale = self.swizzle_blockscale(layer.weight_scale) + layer.weight_scale_swizzled = Parameter(swizzled_weight_scale, + requires_grad=False) + + # required by cutlass kernel; need Parameter, not ModelWeightParameter + layer.weight = Parameter(layer.weight_packed.data, requires_grad=False) + + layer.alpha = Parameter(layer.input_global_scale * + layer.weight_global_scale, + requires_grad=False) + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + + output_dtype = x.dtype + output_shape = [x.shape[0], layer.weight.shape[0]] + + # quantize BF16 or FP16 to (FP4 and interleaved block scale) + x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_global_scale) + + out = cutlass_scaled_fp4_mm(x_fp4, layer.weight, x_blockscale, + layer.weight_scale_swizzled, + 1 / layer.alpha, output_dtype) + if bias is not None: + out = out + bias + return out.view(*output_shape) diff --git 
a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py index 9bcf1aa2bc1cd..d926b4c12db14 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py @@ -144,10 +144,10 @@ def triton_scaled_mm(input: torch.Tensor, scale_b = scale_b.reshape(-1, 1) if scale_b.dim() <= 1 else scale_b assert scale_a.dtype == scale_b.dtype and scale_a.is_floating_point() - assert scale_a.shape == torch.Size([1, 1]) or scale_a.shape == torch.Size( - [M, 1]) - assert scale_b.shape == torch.Size([1, 1]) or scale_b.shape == torch.Size( - [N, 1]) + assert scale_a.shape[1] == 1 and (scale_a.shape[0] == 1 + or scale_a.shape[0] == M) + assert scale_b.shape[1] == 1 and (scale_b.shape[0] == 1 + or scale_b.shape[0] == N) assert out_dtype.is_floating_point assert bias is None or bias.is_floating_point() assert is_weak_contiguous(input) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 402646498cee1..099d8613fc1a7 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -15,6 +15,7 @@ def is_activation_quantization_format(format: str) -> bool: CompressionFormat.naive_quantized.value, CompressionFormat.int_quantized.value, CompressionFormat.float_quantized.value, + CompressionFormat.nvfp4_pack_quantized.value ] return format in _ACTIVATION_QUANTIZATION_FORMATS diff --git a/vllm/model_executor/layers/quantization/deepgemm.py b/vllm/model_executor/layers/quantization/deepgemm.py new file mode 100644 index 0000000000000..1d40f4915a1be --- /dev/null +++ b/vllm/model_executor/layers/quantization/deepgemm.py @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: Apache-2.0 +import 
importlib.util +import logging + +import torch + +from vllm.platforms import current_platform +from vllm.triton_utils import triton +from vllm.utils import direct_register_custom_op + +has_deep_gemm = importlib.util.find_spec("deep_gemm") is not None +if has_deep_gemm: + import deep_gemm + +logger = logging.getLogger(__name__) + + +def prepare_block_fp8_matmul_inputs( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + block_size: list[int], + output_dtype: torch.dtype = torch.float16, +) -> tuple[int, int, int, torch.Tensor]: + assert len(block_size) == 2 + block_n, block_k = block_size[0], block_size[1] + + assert A.shape[-1] == B.shape[-1] + assert A.shape[:-1] == As.shape[:-1] + assert A.is_contiguous() + assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1] + + M = A.numel() // A.shape[-1] + + assert B.ndim == 2 + assert B.is_contiguous() + assert Bs.ndim == 2 + N, K = B.shape + assert triton.cdiv(N, block_n) == Bs.shape[0] + assert triton.cdiv(K, block_k) == Bs.shape[1] + + C_shape = A.shape[:-1] + (N, ) + C = A.new_empty(C_shape, dtype=output_dtype) + + return M, N, K, C + + +def w8a8_block_fp8_matmul_deepgemm( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + block_size: list[int], + output_dtype: torch.dtype, +) -> torch.Tensor: + M, N, K, C = prepare_block_fp8_matmul_inputs(A, B, As, Bs, block_size, + output_dtype) + # Deepgemm only supports output tensor type as bfloat16 + assert C.dtype == torch.bfloat16 + deep_gemm.gemm_fp8_fp8_bf16_nt((A, As), (B, Bs), C) + return C + + +def w8a8_block_fp8_matmul_deepgemm_fake( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + block_size: list[int], + output_dtype: torch.dtype, +) -> torch.Tensor: + M, N, K, C = prepare_block_fp8_matmul_inputs(A, B, As, Bs, block_size, + output_dtype) + return C + + +direct_register_custom_op( + op_name="w8a8_block_fp8_matmul_deepgemm", + op_func=w8a8_block_fp8_matmul_deepgemm, + 
mutates_args=[], + fake_impl=w8a8_block_fp8_matmul_deepgemm_fake, + dispatch_key=current_platform.dispatch_key, +) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index c785e0d1674da..b3042bfaed3d7 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -402,6 +402,7 @@ class Fp8LinearMethod(LinearMethodBase): if self.block_quant: assert self.quant_config.weight_block_size is not None + return torch.ops.vllm.apply_w8a8_block_fp8_linear( input=x, weight=layer.weight, diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 2171f729afad1..9c8f74545d37d 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -99,6 +99,10 @@ MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, qweight_type: int) -> torch.Tensor: + if qweight_type in IMATRIX_QUANT_TYPES: + mmvq_safe = 8 if qweight.shape[0] > 5120 else 16 + else: + mmvq_safe = 2 if qweight.shape[0] > 5120 else 6 # HACK: when doing chunked prefill we don't generate output tokens # so input to logits generator is empty which causes invalid parameter if x.shape[0] == 0: @@ -110,7 +114,7 @@ def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, if qweight_type in UNQUANTIZED_TYPES: return x @ qweight.T # enable MMVQ in contiguous batching with batch_size=1 - if x.shape[0] == 1 and qweight_type in MMVQ_QUANT_TYPES: + if x.shape[0] <= mmvq_safe and qweight_type in MMVQ_QUANT_TYPES: y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0]) # Use MMQ Kernel if it's available (standard + k-quants) elif qweight_type in MMQ_QUANT_TYPES: diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py index a7d9332032a28..9c909a3a430cb 100644 --- 
a/vllm/model_executor/layers/quantization/torchao.py +++ b/vllm/model_executor/layers/quantization/torchao.py @@ -17,25 +17,46 @@ from vllm.model_executor.utils import set_weight_attrs logger = init_logger(__name__) +def should_skip(prefix: str, skip_modules: list[str]) -> bool: + """ + Robust skipping logic: + should_skip("model.model.layers.1.q_proj", + ["model.model.layers.1.q_proj"]) # True + should_skip("model.model.layers.10.o_proj", ["o_proj"]) -> True + should_skip("visual.model.layers.1.q_proj", ["visual"]) -> True + should_skip("model.model.layers.1.q_proj", ["layers.1"]) -> True + should_skip("model.model.layers.11.q_proj", ["layers.1"]) -> False + """ + for s in skip_modules: + if prefix == s: + return True + if f".{s}." in f".{prefix}.": + return True + return False + + class TorchAOConfig(QuantizationConfig): """Config class for torchao.""" - def __init__(self, torchao_config) -> None: - self.torchao_config = torchao_config + def __init__(self, + torchao_config, + skip_modules: Optional[list[str]] = None) -> None: """ # TorchAO quantization relies on tensor subclasses. 
In order, # to enable proper caching this needs standalone compile - if is_torch_equal_or_newer("2.8.0"): + if is_torch_equal_or_newer("2.8.0a"): os.environ["VLLM_TEST_STANDALONE_COMPILE"] = "1" logger.info( "Using TorchAO: Setting VLLM_TEST_STANDALONE_COMPILE=1") # TODO: remove after the torch dependency is updated to 2.8 if is_torch_equal_or_newer( - "2.7.0") and not is_torch_equal_or_newer("2.8.0"): + "2.7.0") and not is_torch_equal_or_newer("2.8.0a"): os.environ["VLLM_DISABLE_COMPILE_CACHE"] = "1" logger.info("Using TorchAO: Setting VLLM_DISABLE_COMPILE_CACHE=1") """ + self.torchao_config = torchao_config + self.skip_modules = skip_modules or [] def __repr__(self) -> str: return f"TorchAOConfig({self.torchao_config})" @@ -67,11 +88,28 @@ class TorchAOConfig(QuantizationConfig): hf_config = cls.get_from_keys_or(config, ["quant_type"], None) assert hf_config is not None, "quant_type must be specified" - assert (len(hf_config) == 1 and "default" in hf_config - ), "Expected only one key 'default' in quant_type dictionary" + assert len(hf_config) == 1 and "default" in hf_config, ( + "Expected only one key 'default' in quant_type dictionary") quant_type = hf_config["default"] ao_config = config_from_dict(quant_type) - return cls(ao_config) + + # Adds skipped modules defined in "modules_to_not_convert" + skip_modules = config.get("modules_to_not_convert", []) or [] + + # Adds skipped modules defined in "module_fqn_to_config" + _data = quant_type.get("_data", {}) + if not isinstance(_data, dict): + _data = {} + + module_fqn = _data.get("module_fqn_to_config", {}) + if not isinstance(module_fqn, dict): + module_fqn = {} + + for layer, layer_cfg in module_fqn.items(): + if layer_cfg is None: + skip_modules.append(layer) + + return cls(ao_config, skip_modules) def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: @@ -80,13 +118,16 @@ class TorchAOConfig(QuantizationConfig): from torchao.quantization import ModuleFqnToConfig + 
if should_skip(prefix, self.skip_modules): + return UnquantizedLinearMethod() + module_fqn = prefix if isinstance(self.torchao_config, ModuleFqnToConfig): module_fqn_to_config = self.torchao_config.module_fqn_to_config c = module_fqn_to_config.get( module_fqn) or module_fqn_to_config.get("_default", None) if c is not None: - current_torchao_config = TorchAOConfig(c) + current_torchao_config = TorchAOConfig(c, self.skip_modules) return TorchAOLinearMethod(current_torchao_config) else: return UnquantizedLinearMethod() @@ -108,8 +149,17 @@ def torchao_quantize_param_data(param: torch.Tensor, """ from torchao.core.config import AOBaseConfig from torchao.quantization import quantize_ + assert isinstance(torchao_config, AOBaseConfig), f"{torchao_config}" - dummy_linear = torch.nn.Linear(param.shape[1], param.shape[0], bias=False) + """ + Avoid real weight allocation for faster load, since we will + end up setting it to param. + """ + with torch.device("meta"): + dummy_linear = torch.nn.Linear(param.shape[1], + param.shape[0], + bias=False) + dummy_linear.weight = param quantize_(dummy_linear, torchao_config) return dummy_linear.weight diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 270979c8e932e..754650ebeffb5 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -3,12 +3,14 @@ # Adapted from https://github.com/sgl-project/sglang/pull/2575 import functools +import importlib.util import json import os from typing import Any, Callable, Optional, Union import torch +import vllm.envs as envs from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.quant_utils import ( @@ -20,6 +22,7 @@ from vllm.triton_utils import tl, triton from vllm.utils import direct_register_custom_op logger = init_logger(__name__) +has_deep_gemm = 
importlib.util.find_spec("deep_gemm") is not None def is_fp8(x: Union[torch.dtype, torch.Tensor]) -> bool: @@ -98,6 +101,19 @@ def dispatch_w8a8_blockscale_func( return w8a8_block_fp8_matmul +def should_use_deepgemm(output_dtype: torch.dtype, weight: torch.Tensor): + """ + Check if DeepGEMM should be used based on the output dtype and weight shape. + DeepGEMM is only supported for bfloat16 output dtype and weights with shape + divisible by 128. + """ + + return (current_platform.is_cuda() + and current_platform.is_device_capability(90) and has_deep_gemm + and envs.VLLM_USE_DEEP_GEMM and output_dtype == torch.bfloat16 + and weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0) + + # TODO fix ROCm->Triton custom path: # https://github.com/vllm-project/vllm/issues/14397 def apply_w8a8_block_fp8_linear( @@ -114,6 +130,30 @@ def apply_w8a8_block_fp8_linear( # View input as 2D matrix for fp8 methods input_2d = input.view(-1, input.shape[-1]) output_shape = [*input.shape[:-1], weight.shape[0]] + output_dtype = input.dtype + + if should_use_deepgemm(output_dtype, weight): + + input_2d = input.view(-1, input.shape[-1]) + output_shape = [*input.shape[:-1], weight.shape[0]] + + q_input, x_scale = per_token_group_quant_fp8( + input_2d, + block_size[1], + column_major_scales=True, + ) + + import vllm.model_executor.layers.quantization.deepgemm # noqa: F401 + output = torch.ops.vllm.w8a8_block_fp8_matmul_deepgemm( + q_input, + weight, + x_scale, + weight_scale, + block_size, + output_dtype=output_dtype) + if bias is not None: + output += bias + return output.to(dtype=output_dtype).view(*output_shape) if current_platform.is_cuda(): if current_platform.has_device_capability(100): @@ -134,7 +174,6 @@ def apply_w8a8_block_fp8_linear( w8a8_blockscale_func = dispatch_w8a8_blockscale_func( use_cutlass, use_aiter_and_is_supported) - if use_cutlass: q_input, x_scale = per_token_group_quant_fp8( input_2d, block_size[1], column_major_scales=use_cutlass) @@ -234,8 +273,13 @@ def 
_per_token_group_quant_fp8( row = g_id // groups_per_row row_g_id = g_id % groups_per_row - y_ptr += (row * y_row_stride) + (row_g_id * group_size) - y_q_ptr += g_id * group_size + # Ensure offset calculations use int64 to prevent overflow + y_ptr_offset = (row.to(tl.int64) * y_row_stride) + (row_g_id.to(tl.int64) * + group_size) + y_ptr += y_ptr_offset + + y_q_ptr_offset = g_id.to(tl.int64) * group_size + y_q_ptr += y_q_ptr_offset y_s_ptr += g_id cols = tl.arange(0, BLOCK) # N <= BLOCK @@ -282,15 +326,23 @@ def _per_token_group_quant_fp8_colmajor( row = g_id // groups_per_row row_g_id = g_id % groups_per_row - y_ptr += (row * y_row_stride) + (row_g_id * group_size) - y_q_ptr += g_id * group_size + # Ensure offset calculations use int64 to prevent overflow + y_ptr_offset = (row.to(tl.int64) * y_row_stride) + (row_g_id.to(tl.int64) * + group_size) + y_ptr += y_ptr_offset + + y_q_ptr_offset = g_id.to(tl.int64) * group_size + y_q_ptr += y_q_ptr_offset # Convert g_id the flattened block coordinate to 2D so we can index # into the output y_scales matrix blocks_per_row = y_num_columns // group_size scale_col = g_id % blocks_per_row scale_row = g_id // blocks_per_row - y_s_ptr += scale_col * y_s_col_stride + scale_row + # Ensure offset calculation uses int64 for y_s_ptr + y_s_ptr_offset = (scale_col.to(tl.int64) * y_s_col_stride) + scale_row.to( + tl.int64) + y_s_ptr += y_s_ptr_offset cols = tl.arange(0, BLOCK) # group_size <= BLOCK mask = cols < group_size @@ -311,6 +363,7 @@ def per_token_group_quant_fp8( eps: float = 1e-10, dtype: Optional[torch.dtype] = None, column_major_scales: bool = False, + out_q: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, torch.Tensor]: """Function to perform per-token-group quantization on an input tensor `x`. It converts the tensor values into signed float8 values and returns the @@ -321,6 +374,8 @@ def per_token_group_quant_fp8( eps: The minimum to avoid dividing zero. dtype: The dype of output tensor. 
Note that only `torch.float8_e4m3fn` is supported for now. + column_major_scales: Outputs scales in column major. + out_q: Optional output tensor. If not provided, function will create. Returns: tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization. @@ -335,7 +390,11 @@ def per_token_group_quant_fp8( fp8_min = finfo.min fp8_max = finfo.max - x_q = torch.empty_like(x, device=x.device, dtype=dtype) + assert out_q is None or out_q.shape == x.shape + x_q = out_q + if x_q is None: + x_q = torch.empty_like(x, device=x.device, dtype=dtype) + M = x.numel() // group_size N = group_size if column_major_scales: diff --git a/vllm/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py index a694a191745d8..1fdf7d174e25e 100644 --- a/vllm/model_executor/layers/quantization/utils/int8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py @@ -219,7 +219,7 @@ def per_token_group_quant_int8( quantized tensor along with the scaling factor used for quantization. Args: - x: The input tenosr with ndim >= 2. + x: The input tensor with ndim >= 2. group_size: The group size used for quantization. eps: The minimum to avoid dividing zero. dtype: The dype of output tensor. 
Note that only `torch.int8` diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py index 6e8e98d544f8c..d5ce6d7ad757a 100644 --- a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +++ b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py @@ -2,10 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch -__all__ = [ - "break_fp4_bytes", - "dequantize_to_dtype", -] +from vllm.scalar_type import scalar_types + +__all__ = ["break_fp4_bytes", "dequantize_to_dtype", "ref_nvfp4_quant"] + +FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() kE2M1ToFloat = torch.tensor([0., 0.5, 1., 1.5, 2., 3., 4., 6.], dtype=torch.float32) @@ -60,3 +61,73 @@ def dequantize_to_dtype(tensor_fp4, # scale the tensor out = (tensor_f32 * tensor_sf_dtype.unsqueeze(-1)).reshape(m, k) return out.to(dtype) + + +def get_reciprocal(x): + if isinstance(x, torch.Tensor): + return torch.where(x == 0, torch.tensor(0.0, dtype=x.dtype), 1.0 / x) + elif isinstance(x, (float, int)): + return 0.0 if x == 0 else 1.0 / x + else: + raise TypeError("Input must be a float, int, or a torch.Tensor.") + + +def cast_to_fp4(x): + sign = torch.sign(x) + x = torch.abs(x) + x[(x >= 0.0) & (x <= 0.25)] = 0.0 + x[(x > 0.25) & (x < 0.75)] = 0.5 + x[(x >= 0.75) & (x <= 1.25)] = 1.0 + x[(x > 1.25) & (x < 1.75)] = 1.5 + x[(x >= 1.75) & (x <= 2.5)] = 2.0 + x[(x > 2.5) & (x < 3.5)] = 3.0 + x[(x >= 3.5) & (x <= 5.0)] = 4.0 + x[x > 5.0] = 6.0 + return x * sign + + +def ref_nvfp4_quant(x, global_scale, block_size): + assert global_scale.dtype == torch.float32 + assert x.ndim == 2 + m, n = x.shape + x = torch.reshape(x, (m, n // block_size, block_size)) + vec_max = torch.max(torch.abs(x), dim=-1, + keepdim=True)[0].to(torch.float32) + scale = global_scale * (vec_max * get_reciprocal(FLOAT4_E2M1_MAX)) + scale = torch.clamp(scale, max=448, min=-448) + scale = 
scale.to(torch.float8_e4m3fn).to(torch.float32) + output_scale = get_reciprocal(scale * get_reciprocal(global_scale)) + + scaled_x = x.to(torch.float32) * output_scale + clipped_x = torch.clamp(scaled_x, -6.0, 6.0).reshape(m, n) + # both outputs are float32 + return cast_to_fp4(clipped_x), scale.squeeze(-1) + + +def run_nvfp4_emulations(x: torch.Tensor, input_global_scale: torch.Tensor, + weight: torch.Tensor, + weight_scale_swizzled: torch.Tensor, + weight_global_scale: torch.Tensor): + group_size = 16 + x_m, x_k = x.shape + output_dtype = x.dtype + + # quantize input to (FP4 and interleaved block scale) + x_fp4, x_blockscale = ref_nvfp4_quant(x, input_global_scale, group_size) + + # dequantize input + x_fp4 = x_fp4.reshape(x_m, x_k // group_size, group_size) + x_blockscale = x_blockscale.unsqueeze(-1) / input_global_scale + x_dq = (x_fp4 * x_blockscale).reshape(x_m, x_k).to(output_dtype) + del x_fp4, x_blockscale + + # dequantize weight + w_fp4 = weight.data.view(torch.uint8) + w_dq = dequantize_to_dtype(w_fp4, weight_scale_swizzled.data, + weight_global_scale, output_dtype, x.device, + group_size) + + # matmul + out = torch.matmul(x_dq, w_dq.t()) + del w_dq, x_dq + return out diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index a6e58a77d42cd..db68f18726d38 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -283,14 +283,14 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler): batch_size, k, _ = draft_probs.shape batch_indices = torch.arange(batch_size, device=target_probs.device)[:, None] - probs_indicies = torch.arange(k, device=target_probs.device) + probs_indices = torch.arange(k, device=target_probs.device) # shape [batch_size, k] - selected_draft_probs = draft_probs[batch_indices, probs_indicies, + selected_draft_probs = draft_probs[batch_indices, probs_indices, draft_token_ids] # shape [batch_size, k] - 
selected_target_probs = target_probs[batch_indices, probs_indicies, + selected_target_probs = target_probs[batch_indices, probs_indices, draft_token_ids] uniform_rand = self._create_uniform_samples(seeded_seqs, batch_size, diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 0f636d83a6dd9..9ff3a7a7327d9 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -176,17 +176,17 @@ class VocabParallelEmbedding(torch.nn.Module): Therefore, the tensor format looks like the following: TP1, rank 0 (no sharding): |< --------BASE-------- >|< -BASE PADDING-- >|< -----LORA------ >|< -LORA PADDING-- >| - corresponding token_id: | 0 | 1 | ... | 1009 | -1 | ... | -1 | 1010 | ... | 1015 | -1 | ... | -1 | + corresponding token_id: | 0 | 1 | ... | 1009 | -1 | ... | -1 | 1010 | ... | 1025 | -1 | ... | -1 | index: | 0 | 1 | ... | 1009 | 1010 | ... | 1023 | 1024 | ... | 1039 | 1040 | ... | 1087 | TP2, rank 0: |< --------------------BASE--------------------- >|< -----LORA------ >|< -LORA PADDING- >| - corresponding token_id: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 1000 | ... | 1015 | -1 | ... | -1 | - index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 527 | 520 | ... | 543 | + corresponding token_id: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 1010 | ... | 1025 | -1 | ... | -1 | + index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 527 | 528 | ... | 543 | TP2, rank 1: |< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >| corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1 | ... | -1 | -1 | ... | -1 | -1 | ... | -1 | - index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 519 | 520 | ... | 543 | + index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 527 | 528 | ... | 543 | Args: num_embeddings: vocabulary size. 
diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index ebbb021cad645..3146c35a4e6fa 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -392,7 +392,8 @@ class BitsAndBytesModelLoader(BaseModelLoader): def _get_bnb_target_modules(self, model: nn.Module) -> None: for name, module in model.named_modules(): - if isinstance(module, (LinearBase, )): + if (isinstance(module, LinearBase) and + hasattr(module.quant_method, "quant_config")): if modules_info := self.modules_mapping.get_sub_modules(name): # Map vllm's names to transformers's names. rep_name, sub_modules = modules_info @@ -400,7 +401,7 @@ class BitsAndBytesModelLoader(BaseModelLoader): self.target_modules.append( name.replace(rep_name, sub_name)) # Add original module name even if the module has stacked map, - # in case model has a mixture of disk-merged and disk-splitted + # in case model has a mixture of disk-merged and disk-split # weights with same last name. 
self.target_modules.append(name) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index e6eaade090275..79e6fa7b16dc7 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -58,7 +58,9 @@ def initialize_model( all_params = [param.name for param in signatures.parameters.values()] if "vllm_config" in all_params and "prefix" in all_params: # new-style model class - with set_current_vllm_config(vllm_config, check_compile=True): + with set_current_vllm_config(vllm_config, + check_compile=True, + prefix=prefix): return model_class(vllm_config=vllm_config, prefix=prefix) msg = ("vLLM model class should accept `vllm_config` and `prefix` as " @@ -86,7 +88,9 @@ def initialize_model( kwargs["lora_config"] = vllm_config.lora_config if "scheduler_config" in all_params: kwargs["scheduler_config"] = vllm_config.scheduler_config - with set_current_vllm_config(vllm_config, check_compile=True): + with set_current_vllm_config(vllm_config, + check_compile=True, + prefix=prefix): return model_class(**kwargs) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index bb4177dfc4574..b69c7b6a9376d 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -601,11 +601,11 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] multimodal_embeddings = self._process_image_input(image_input) return multimodal_embeddings diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 7e15e57a4d032..6a95ac089ff4a 100644 --- 
a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -406,11 +406,11 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] return self._process_image_input(image_input, **kwargs) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 0de5de5e835ac..804a2f1785d5c 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -131,7 +131,7 @@ class BaiChuanAttention(nn.Module): self.num_heads = (self.total_num_heads // tensor_model_parallel_world_size) self.head_dim = hidden_size // self.total_num_heads - self.postion_embedding = position_embedding + self.position_embedding = position_embedding self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings @@ -151,7 +151,7 @@ class BaiChuanAttention(nn.Module): quant_config=quant_config, ) # Create the alibi slopes and slice them. 
- if self.postion_embedding == "ALIBI": + if self.position_embedding == "ALIBI": tp_rank = get_tensor_model_parallel_rank() head_start = tp_rank * self.num_heads head_end = (tp_rank + 1) * self.num_heads @@ -187,7 +187,7 @@ class BaiChuanAttention(nn.Module): ) -> torch.Tensor: qkv, _ = self.W_pack(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) - if self.postion_embedding != "ALIBI": + if self.position_embedding != "ALIBI": q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v) output, _ = self.o_proj(attn_output) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index cacec7342ac2e..389393987c811 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -414,15 +414,10 @@ class BertEmbeddingModel(nn.Module, SupportsV0Only, SupportsQuant): intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - hidden_states = self.model(input_ids=input_ids, - position_ids=positions, - inputs_embeds=inputs_embeds, - intermediate_tensors=intermediate_tensors) - - # convert the embedding output to float32, - # otherwise precision will be lost significantly - hidden_states = hidden_states.to(torch.float32) - return hidden_states + return self.model(input_ids=input_ids, + position_ids=positions, + inputs_embeds=inputs_embeds, + intermediate_tensors=intermediate_tensors) def pooler( self, diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index d1b84a9f04fa9..0f22393c79d98 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -432,12 +432,7 @@ class BertWithRope(nn.Module, SupportsV0Only, SupportsQuant): else: hidden_states = self.embeddings(input_ids=input_ids, token_type_ids=token_type_ids) - hidden_states = self.encoder(positions, hidden_states) - - # convert the embedding output to float32, - # otherwise 
precision will be lost significantly - hidden_states = hidden_states.to(torch.float32) - return hidden_states + return self.encoder(positions, hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 279541bed55a0..87fc6b5b02405 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -627,11 +627,11 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] vision_embeddings = self._process_image_input(image_input) return vision_embeddings diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index aea44261dd69f..21f29dc43c268 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -987,11 +987,11 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, def get_language_model(self) -> torch.nn.Module: return self.model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] assert self.model.vqmodel is not None image_tokens = self.model.get_image_tokens(image_input["data"].to( self.config.torch_dtype)) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index ee67cc64050e7..817c6bb9a7f92 100644 --- a/vllm/model_executor/models/commandr.py +++ 
b/vllm/model_executor/models/commandr.py @@ -51,7 +51,8 @@ from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant -from .utils import (extract_layer_index, is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, extract_layer_index, + is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -286,6 +287,7 @@ class CohereModel(nn.Module): cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config + self.quant_config = quant_config self.config = config lora_vocab = (lora_config.lora_extra_vocab_size * @@ -339,6 +341,62 @@ class CohereModel(nn.Module): hidden_states, _ = self.norm(hidden_states, residual) return hidden_states + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + + for param_name, shard_name, shard_id in stacked_params_mapping: + if shard_name not in name: + continue + name = name.replace(shard_name, param_name) + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant): packed_modules_mapping = { @@ -408,65 +466,6 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - - # Skip loading rotary embeddings since vLLM has its own - if "rotary_emb.inv_freq" in name: - continue - - if (self.quant_config is not None and - (scale_name := self.quant_config.get_cache_scale(name))): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else - loaded_weight[0]) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - - for param_name, shard_name, shard_id in stacked_params_mapping: - if shard_name not in name: - 
continue - name = name.replace(shard_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # lm_head is not used in vllm as it is tied with embed_token. - # To prevent errors, skip loading lm_head.weight. - if "lm_head.weight" in name: - continue - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params + loader = AutoWeightsLoader( + self, skip_prefixes=["lm_head", "rotary_emb.inv_freq"]) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 765718e575203..6341c65a5d4cf 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -344,7 +344,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): self.image_newline = nn.Parameter( torch.randn(self.projector_config.n_embed) * embed_std) # This is a typo in original implementation - self.view_seperator = nn.Parameter( + self.view_separator = nn.Parameter( torch.randn(self.projector_config.n_embed) * embed_std) else: raise ValueError( @@ -549,13 +549,13 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): if self.global_view_pos == "head": global_local_features = torch.cat([ global_features, - self.view_seperator[None, :], + self.view_separator[None, :], 
local_features, ]) else: global_local_features = torch.cat([ local_features, - self.view_seperator[None, :], + self.view_separator[None, :], global_features, ]) @@ -586,11 +586,11 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] vision_embeddings = self._process_image_input(image_input) return vision_embeddings diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index 2219321457b2a..c551ecd68ef86 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -74,6 +74,7 @@ class EAGLE(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config + self.dtype = vllm_config.model_config.dtype self.config = config architectures = getattr(self.config.model, "architectures", []) @@ -197,7 +198,7 @@ class EAGLE(nn.Module): return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - # This implementation is incompitable with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B + # This implementation is incompatible with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B # due to missing lm_head weights and its config being that of a # Llama model. 
Here's a compatible version with the same weights: # https://huggingface.co/abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm @@ -250,7 +251,7 @@ class EAGLE(nn.Module): lm_head_weight = torch.zeros( self.lm_head.org_vocab_size, self.lm_head.embedding_dim, - dtype=self.config.torch_dtype, + dtype=self.dtype, ) weight_loader = getattr(self.lm_head.weight, "weight_loader", diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 47760aabb9591..4b220ea483e81 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -1032,11 +1032,11 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal, def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] vision_embeddings = self._process_image_input(image_input) return vision_embeddings diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index cb141dbc5aa37..462f85c3dd623 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -324,11 +324,11 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] return self._process_image_input(image_input) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 23e25170799ba..b633c0003c637 100644 --- 
a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -568,11 +568,11 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] return self._process_image_input(image_input) @@ -634,13 +634,13 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, kwargs["has_images"] = True # NOTE(woosuk): Here, we distinguish the sequences by the position id 0. # This is a HACK. Fix this. - start_idices = (positions == 0).cpu().nonzero() - num_seqs = len(start_idices) + start_indices = (positions == 0).cpu().nonzero() + num_seqs = len(start_indices) seq_lens = [] for i in range(num_seqs): - start_idx = start_idices[i].item() + start_idx = start_indices[i].item() if i < num_seqs - 1: - end_idx = start_idices[i + 1].item() + end_idx = start_indices[i + 1].item() else: end_idx = len(input_ids) seq_lens.append(end_idx - start_idx) diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 034c7654f4d94..e9271367a472b 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -593,11 +593,11 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, def get_language_model(self) -> torch.nn.Module: return self.transformer - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] vision_embeddings = self._process_image_input(image_input) 
return vision_embeddings diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 831164ba88a4d..137aad926cb90 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -706,10 +706,10 @@ class GraniteSpeechForConditionalGeneration( def get_multimodal_embeddings( self, **kwargs: object, - ) -> Optional[MultiModalEmbeddings]: + ) -> MultiModalEmbeddings: """Compute the audio embeddings if audio inputs are present.""" audio_input = self._parse_and_validate_audio_input(**kwargs) if audio_input is None: - return None + return [] audio_features = self._process_audio_input(audio_input) return audio_features diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index f434b7a74e486..26b5b3ac15345 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -67,13 +67,15 @@ class GraniteMoeHybridMambaDecoderLayer(nn.Module): activation=config.hidden_act, quant_config=quant_config) - self.block_sparse_moe = GraniteMoeMoE( - num_experts=config.num_local_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - quant_config=quant_config, - prefix=f"{prefix}.block_sparse_moe") + self.block_sparse_moe = None + if getattr(config, "num_local_experts", 0) > 0: + self.block_sparse_moe = GraniteMoeMoE( + num_experts=config.num_local_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + quant_config=quant_config, + prefix=f"{prefix}.block_sparse_moe") self.shared_mlp = None if \ getattr(config, 'shared_intermediate_size', 0) == 0 \ @@ -105,13 +107,19 @@ class GraniteMoeHybridMambaDecoderLayer(nn.Module): residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) if self.shared_mlp is None: - hidden_states =
self.block_sparse_moe(hidden_states) + if self.block_sparse_moe is not None: + hidden_states = self.block_sparse_moe(hidden_states) + # else: skip else: # create a copy since block_sparse_moe modifies in-place - moe_hidden_states = hidden_states.clone() - moe_hidden_states = self.block_sparse_moe(moe_hidden_states) - hidden_states = moe_hidden_states + self.shared_mlp(hidden_states) - del moe_hidden_states + if self.block_sparse_moe is not None: + moe_hidden_states = hidden_states.clone() + moe_hidden_states = self.block_sparse_moe(moe_hidden_states) + hidden_states = moe_hidden_states + self.shared_mlp( + hidden_states) + del moe_hidden_states + else: + hidden_states = self.shared_mlp(hidden_states) hidden_states = residual + hidden_states * self.residual_multiplier return hidden_states, residual @@ -137,13 +145,15 @@ class GraniteMoeHybridAttentionDecoderLayer(nn.Module): quant_config=quant_config, prefix=f"{prefix}.self_attn") - self.block_sparse_moe = GraniteMoeMoE( - num_experts=config.num_local_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - quant_config=quant_config, - prefix=f"{prefix}.block_sparse_moe") + self.block_sparse_moe = None + if getattr(config, "num_local_experts", 0) > 0: + self.block_sparse_moe = GraniteMoeMoE( + num_experts=config.num_local_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + quant_config=quant_config, + prefix=f"{prefix}.block_sparse_moe") self.shared_mlp = None if \ getattr(config, 'shared_intermediate_size', 0) == 0 \ @@ -178,13 +188,19 @@ class GraniteMoeHybridAttentionDecoderLayer(nn.Module): residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) if self.shared_mlp is None: - hidden_states = self.block_sparse_moe(hidden_states) + if self.block_sparse_moe is not None: + hidden_states = self.block_sparse_moe(hidden_states) + # else: skip else: 
# create a copy since block_sparse_moe modifies in-place - moe_hidden_states = hidden_states.clone() - moe_hidden_states = self.block_sparse_moe(moe_hidden_states) - hidden_states = moe_hidden_states + self.shared_mlp(hidden_states) - del moe_hidden_states + if self.block_sparse_moe is not None: + moe_hidden_states = hidden_states.clone() + moe_hidden_states = self.block_sparse_moe(moe_hidden_states) + hidden_states = moe_hidden_states + self.shared_mlp( + hidden_states) + del moe_hidden_states + else: + hidden_states = self.shared_mlp(hidden_states) hidden_states = residual + hidden_states * self.residual_multiplier return hidden_states, residual diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index de8596282ca9c..be04ad0422df8 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -706,11 +706,11 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, def get_language_model(self) -> torch.nn.Module: return self.model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] return self._process_image_input(image_input) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index cb2a4062b84cf..0e7e4e73eca98 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -44,8 +44,8 @@ class SupportsMultiModal(Protocol): MRO of your model class. """ - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: """ Returns multimodal embeddings generated from multimodal kwargs to be merged with text embeddings. 
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 0c61369c5f518..9d5cceccff2ff 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1304,11 +1304,11 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: - return None + return [] # The result multimodal_embeddings is tuple of tensors, with each diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index a852be66bde82..9fb73261cd892 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -52,7 +52,7 @@ class Llama4MoE(nn.Module): renormalize: bool, ) -> tuple[torch.Tensor, torch.Tensor]: router_scores, router_indices = fast_topk(gating_output, topk, dim=-1) - # psuedo-standard is that the router scores are floats + # pseudo-standard is that the router scores are floats router_scores = torch.sigmoid(router_scores.float()) return (router_scores, router_indices.to(torch.int32)) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 725e1b2c19481..7dea260a58e0d 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -659,11 +659,11 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if
image_input is None: - return None + return [] return self._process_image_input(image_input) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 6f5f231875de5..60ede454ff272 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -478,11 +478,11 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] vision_embeddings = self._process_image_input(image_input) return vision_embeddings @@ -492,7 +492,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: - if multimodal_embeddings is None: + if not multimodal_embeddings: return self.language_model.get_input_embeddings(input_ids) inputs_embeds = embed_multimodal( diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index a3406d090db85..78084465e7a27 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -401,11 +401,11 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: video_input = self._parse_and_validate_video_input(**kwargs) if video_input is None: - return None + return [] vision_embeddings = self._process_video_pixels(video_input) return 
vision_embeddings diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index d90d3d4a0960d..265f63d7bd295 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -839,11 +839,11 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: mm_input_by_modality = self._parse_and_validate_multimodal_inputs( **kwargs) if not mm_input_by_modality: - return None + return [] # The result multimodal_embeddings is tuple of tensors, with each diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 4100fee0ec841..b923287dca3e0 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -878,11 +878,11 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): def get_language_model(self) -> torch.nn.Module: return self.llm - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: - return None + return [] return self._process_multimodal_inputs(modalities) diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 02800449bda3c..87480796ae98f 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -856,7 +856,7 @@ class MiniMaxText01Model(nn.Module): self._dtype = _dummy.dtype del _dummy - self.minimax_cache = MinimaxCacheManager(dtype=self._dtype, + self.minimax_cache =
MinimaxCacheManager(dtype=torch.float32, cache_shape=self.cache_shape) rope_theta = getattr(config, "rope_theta", 10000) @@ -1021,7 +1021,7 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, else: self.lm_head = PPMissingLayer() - + self.lm_head.float() flash_layer_count = sum(1 for attn_type in self.config.attn_type_list if attn_type == 1) self.kv_cache = [torch.tensor([]) for _ in range(flash_layer_count)] @@ -1054,7 +1054,7 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, + logits = self.logits_processor(self.lm_head, hidden_states.float(), sampling_metadata) return logits diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py index b2ededcaf67ce..bc00af2ec6b9e 100644 --- a/vllm/model_executor/models/minimax_vl_01.py +++ b/vllm/model_executor/models/minimax_vl_01.py @@ -318,11 +318,11 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, raise AssertionError("This line should be unreachable.") - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] return self._process_image_input(image_input) diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 9147240b2b2a9..59deacffd2851 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -495,11 +495,11 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> 
Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] vision_embeddings = self._process_image_input(image_input) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 3183c762d2b14..c8ad358c622d2 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -114,9 +114,9 @@ class MixtralMoE(nn.Module): f"Tensor parallel size {self.tp_size} is greater than " f"the number of experts {self.num_total_experts}.") # Split experts equally between ranks - self.expert_indicies = np.array_split(range( - self.num_total_experts), self.tp_size)[self.rank].tolist() - if not self.expert_indicies: + self.expert_indices = np.array_split(range(self.num_total_experts), + self.tp_size)[self.rank].tolist() + if not self.expert_indices: raise ValueError( f"Rank {self.rank} has no experts assigned to it.") @@ -125,7 +125,7 @@ class MixtralMoE(nn.Module): config.hidden_size, config.intermediate_size, quant_config=quant_config) - if idx in self.expert_indicies else None + if idx in self.expert_indices else None for idx in range(self.num_total_experts) ]) self.gate = ReplicatedLinear(config.hidden_size, @@ -146,7 +146,7 @@ class MixtralMoE(nn.Module): routing_weights /= routing_weights.sum(dim=-1, keepdim=True) final_hidden_states = None - for expert_idx in self.expert_indicies: + for expert_idx in self.expert_indices: expert_layer = self.experts[expert_idx] expert_mask = (selected_experts == expert_idx) expert_weights = (routing_weights * expert_mask).sum(dim=-1, diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 54fae279d531d..bf4bd309eea27 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -794,11 +794,10 @@ class 
Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings(self, - **kwargs) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, **kwargs) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] return self._process_image_input(image_input) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 1fa76b9ac7afa..70c60c6d528bf 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1473,11 +1473,11 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, def get_language_model(self) -> torch.nn.Module: return self.model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] return self._process_image_input(image_input) diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 5c11d54c61247..900a1f5de4581 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -283,7 +283,7 @@ class OvisProcessingInfo(BaseProcessingInfo): def get_image_size_with_most_features(self) -> ImageSize: height, width = self.get_hf_processor().get_image_size() hs = self.get_hf_config().visual_tokenizer_config.hidden_stride - # NOTE(Isotr0py): 9 is `max_partion` hardcoded in original code + # NOTE(Isotr0py): 9 is `max_partition` hardcoded in original code # https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/modeling_ovis.py#L96 return ImageSize(width=width * hs * 9, height=height * hs * 9) @@ -499,11 +499,11 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP): return 
tuple(vision_embeddings) - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] image_features = self._process_image_input(image_input) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index a0e2912578c51..cc2cebe4a4a37 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -338,11 +338,11 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] vision_embeddings = self._process_image_input(image_input) # https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa vision_embeddings = vision_embeddings * (self.config.hidden_size**-0.5) diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index 533655fd52004..754ddda233f42 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -145,7 +145,7 @@ class Phi3SmallSelfAttention(nn.Module): self.num_q_per_kv = self.num_heads // self.num_key_value_heads if self.tp_size > 1: assert self.num_key_value_heads % self.tp_size == 0 - self.num_kv_heads_per_partion = max( + self.num_kv_heads_per_partition = max( 1, self.num_key_value_heads // self.tp_size) self.num_heads_per_partition = self.num_heads // self.tp_size @@ -212,7 +212,7 @@ class Phi3SmallSelfAttention(nn.Module): bs_params = 
{ 'max_seqlen': self.max_position_embeddings, 'num_heads': self.num_heads_per_partition, - "num_kv_heads": self.num_kv_heads_per_partion, + "num_kv_heads": self.num_kv_heads_per_partition, "block_size": self.sparse_block_size, "local_blocks": self.local_blocks, "vert_stride": self.vert_stride, @@ -222,7 +222,7 @@ class Phi3SmallSelfAttention(nn.Module): self.attn = Attention(self.num_heads_per_partition, self.head_dim, self.scale, - num_kv_heads=self.num_kv_heads_per_partion, + num_kv_heads=self.num_kv_heads_per_partition, cache_config=cache_config, quant_config=quant_config, blocksparse_params=bs_params, @@ -243,8 +243,8 @@ class Phi3SmallSelfAttention(nn.Module): # NOTE: this is required by RotaryEmbed, which indeed does not have to # TODO: allow 3D QK for rotary forward q = q.reshape(-1, self.head_dim * self.num_heads_per_partition) - k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partion) - v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partion) + k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partition) + v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partition) q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 376c53d2cb99a..9cec7831ae0cf 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -655,11 +655,11 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] vision_embeddings = self._process_image_input(image_input) return vision_embeddings @@ -669,7 +669,7 @@ class Phi3VForCausalLM(nn.Module, 
SupportsMultiModal, SupportsPP, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.embed_tokens(input_ids) - if multimodal_embeddings is not None: + if multimodal_embeddings: inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, self.image_token_id) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 924e6436897d4..a3ca72d1f5cf9 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1112,11 +1112,11 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): image_attention_mask) return image_embeds - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: - return None + return [] # The result multimodal_embeddings is tuple of tensors, with each diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py index ae7a8a732c446..0b0d66ae771dd 100644 --- a/vllm/model_executor/models/phi4mm_audio.py +++ b/vllm/model_executor/models/phi4mm_audio.py @@ -41,7 +41,7 @@ class ConformerEncoderLayer(nn.Module): for the last pointwise conv after swish activation. depthwise_seperable_out_channel: int if set different to 0, the number of - depthwise_seperable_out_channel will be used as a + depthwise_seperable_out_channel will be used as a channel_out of the second conv1d layer. otherwise, it equal to 0, the second conv1d layer is skipped.
depthwise_multiplier: int @@ -126,7 +126,7 @@ class ConformerEncoderLayer(nn.Module): (Multi-Head Attention), 1 = typical Multi-Head Attention, 1 < attn_group_sizes < attention_heads = Grouped-Query Attention - attn_group_sizes = attenion_heads = Multi-Query Attention + attn_group_sizes = attention_heads = Multi-Query Attention """ def __init__( @@ -318,7 +318,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module): 1 = typical Multi-Head Attention, 1 < attention_group_size < attention_heads = Grouped-Query Attention - attention_group_size = attenion_heads = Multi-Query Attention + attention_group_size = attention_heads = Multi-Query Attention """ def __init__( @@ -744,7 +744,7 @@ class ConformerEncoder(TransformerEncoderBase): 1 = typical Multi-Head Attention, 1 < attention_group_size < attention_heads = Grouped-Query Attention - attention_group_size = attenion_heads = Multi-Query Attention + attention_group_size = attention_heads = Multi-Query Attention """ extra_multi_layer_output_idxs: list[int] diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index dddd19c7462be..cdb7e0d18d51e 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -147,15 +147,15 @@ class mp(torch.autograd.Function): grad_at_output = grad_at_output * multiplier - grad_at_scores_expaned = masked_gates * grad_at_output.mul(-1) - grad_at_scores_expaned.scatter_add_( + grad_at_scores_expanded = masked_gates * grad_at_output.mul(-1) + grad_at_scores_expanded.scatter_add_( dim=-1, index=selected_experts, src=grad_at_output, ) return ( - grad_at_scores_expaned, + grad_at_scores_expanded, None, None, None, diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 705586b6a6ea6..320c0e10d06a1 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -409,11 +409,11 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, def 
get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] return self._process_image_input(image_input) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 7172394e42005..ad1e8fcb39d54 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -772,13 +772,13 @@ class Qwen2_5OmniThinkerForConditionalGeneration( def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: mm_input_by_modality = self._parse_and_validate_multimodal_inputs( **kwargs) if not mm_input_by_modality: - return None + return [] # The result multimodal_embeddings is tuple of tensors, with each # tensor correspoending to a multimodal data item (image or video). 
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 7770ec711ce78..202cd5e860d18 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -965,9 +965,9 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, grid_thw_list = grid_thw.tolist() if image_input["type"] == "image_embeds": - image_embeds = image_input["image_embeds"].type(self.visual.dtype) + image_embeds = image_input["image_embeds"] else: - pixel_values = image_input["pixel_values"].type(self.visual.dtype) + pixel_values = image_input["pixel_values"] image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) # Split concatenated embeddings for each image item. @@ -985,10 +985,9 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, grid_thw_list = grid_thw.tolist() if video_input["type"] == "video_embeds": - video_embeds = video_input["video_embeds"].type(self.visual.dtype) + video_embeds = video_input["video_embeds"] else: - pixel_values_videos = video_input["pixel_values_videos"].type( - self.visual.dtype) + pixel_values_videos = video_input["pixel_values_videos"] video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw_list) @@ -1017,13 +1016,13 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: mm_input_by_modality = self._parse_and_validate_multimodal_inputs( **kwargs) if not mm_input_by_modality: - return None + return [] # The result multimodal_embeddings is tuple of tensors, with each # tensor correspoending to a multimodal data item (image or video). 
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 6951630c6f231..e77a8e05d2001 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -350,11 +350,11 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: audio_input = self._parse_and_validate_audio_input(**kwargs) if audio_input is None: - return None + return [] masked_audio_features = self._process_audio_input(audio_input) return masked_audio_features diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index a4f8a361ec710..49b709069cd23 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1208,9 +1208,9 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, assert grid_thw.ndim == 2 if image_input["type"] == "image_embeds": - image_embeds = image_input["image_embeds"].type(self.visual.dtype) + image_embeds = image_input["image_embeds"] else: - pixel_values = image_input["pixel_values"].type(self.visual.dtype) + pixel_values = image_input["pixel_values"] image_embeds = self.visual(pixel_values, grid_thw=grid_thw) # Split concatenated embeddings for each image item. 
@@ -1226,10 +1226,9 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, assert grid_thw.ndim == 2 if video_input["type"] == "video_embeds": - video_embeds = video_input["video_embeds"].type(self.visual.dtype) + video_embeds = video_input["video_embeds"] else: - pixel_values_videos = video_input["pixel_values_videos"].type( - self.visual.dtype) + pixel_values_videos = video_input["pixel_values_videos"] video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) # Split concatenated embeddings for each video item. @@ -1258,11 +1257,11 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: - return None + return [] # The result multimodal_embeddings is tuple of tensors, with each diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 393ce41a91a00..bad0f6b1ffb73 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -38,13 +38,15 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors +from vllm.sequence 
import IntermediateTensors, PoolerOutput -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import SupportsCrossEncoding, SupportsLoRA, SupportsPP from .qwen2 import Qwen2MLP as Qwen3MLP from .qwen2 import Qwen2Model from .utils import AutoWeightsLoader, PPMissingLayer, maybe_prefix @@ -319,3 +321,122 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): if self.config.tie_word_embeddings else None), ) return loader.load_weights(weights) + + +class Qwen3ForSequenceClassification(nn.Module, SupportsLoRA, + SupportsCrossEncoding): + + def __init__( + self, + vllm_config: "VllmConfig", + prefix: str = "", + ) -> None: + super().__init__() + + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + pooler_config = vllm_config.model_config.pooler_config + + self.vllm_config = vllm_config + self.config = config + self.quant_config = quant_config + self.prefix = prefix + self.model = Qwen3Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + self.score = RowParallelLinear(config.hidden_size, + config.num_labels, + quant_config=quant_config, + input_is_parallel=False, + bias=False, + prefix=maybe_prefix(prefix, "score")) + + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.LAST, + normalize=False, + softmax=True) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + return self.model(input_ids=input_ids, + positions=positions, + inputs_embeds=inputs_embeds, + intermediate_tensors=intermediate_tensors) + + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + hidden_states = self._pooler.extract_states(hidden_states, + pooling_metadata) + logits, _ = self.score(hidden_states) + pooled_data = self._pooler.head(logits, 
pooling_metadata) + pooled_outputs = [ + self._pooler.build_output(data.squeeze(-1)) for data in pooled_data + ] + return PoolerOutput(outputs=pooled_outputs) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + is_original_qwen3_reranker = getattr(self.config, + "is_original_qwen3_reranker", + False) + + if not is_original_qwen3_reranker: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) + + return self.load_weights_from_original_qwen3_reranker(weights) + + def load_weights_from_original_qwen3_reranker( + self, weights: Iterable[tuple[str, torch.Tensor]]): + tokens = getattr(self.config, "classifier_from_token", None) + assert tokens is not None and len(tokens) == 2, \ + ("Try loading the original Qwen3 Reranker?, see: " + "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/qwen3_reranker.py") + + self.config.num_labels = 1 + model_config = self.vllm_config.model_config + + device = self.score.weight.device + self.score = RowParallelLinear(self.config.hidden_size, + self.config.num_labels, + quant_config=self.quant_config, + input_is_parallel=False, + bias=False, + prefix=maybe_prefix( + self.prefix, "score")).to(device) + + if self.config.tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.lm_head = ParallelLMHead(self.config.vocab_size, + self.config.hidden_size, + quant_config=self.quant_config, + prefix=maybe_prefix( + self.prefix, "lm_head")) + + loader = AutoWeightsLoader(self) + loaded_weights = loader.load_weights(weights) + + from vllm.transformers_utils.tokenizer import get_tokenizer + tokenizer = get_tokenizer( + model_config.tokenizer, + revision=model_config.tokenizer_revision, + tokenizer_mode=model_config.tokenizer_mode, + trust_remote_code=model_config.trust_remote_code) + + a = tokenizer.convert_tokens_to_ids(tokens[0]) + b = tokenizer.convert_tokens_to_ids(tokens[1]) + weight = self.lm_head.weight.data[b].to( + device) - 
self.lm_head.weight.data[a].to(device) + self.score.weight.data.copy_(weight) + + del self.lm_head + loaded_weights.add("classifier.weight") + loaded_weights.discard("lm_head.weight") diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index e828ce9c98499..546737621a7c1 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -738,11 +738,11 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA, def get_language_model(self) -> torch.nn.Module: return self.transformer - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] vision_embeddings = self._process_image_input(image_input) return vision_embeddings diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index e82e366380694..83f7cc6eee0fd 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -36,6 +36,7 @@ _TEXT_GENERATION_MODELS = { "AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2 "ArcticForCausalLM": ("arctic", "ArcticForCausalLM"), "MiniMaxText01ForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"), + "MiniMaxM1ForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"), # baichuan-7b, upper case 'C' in the class name "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-13b, lower case 'c' in the class name @@ -172,6 +173,7 @@ _CROSS_ENCODER_MODELS = { "RobertaForSequenceClassification"), "ModernBertForSequenceClassification": ("modernbert", "ModernBertForSequenceClassification"), + "Qwen3ForSequenceClassification": ("qwen3", "Qwen3ForSequenceClassification"), # noqa: E501 } _MULTIMODAL_MODELS = { diff --git a/vllm/model_executor/models/skyworkr1v.py 
b/vllm/model_executor/models/skyworkr1v.py index 08c47facad974..9fba24ac5cecb 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -869,11 +869,11 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP): def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] return self._process_image_input(image_input) diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index 5aa3ddabc19ec..2645e700fcda1 100644 --- a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -585,11 +585,11 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] return self._process_image_input(image_input) def get_input_embeddings( diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 43836f2956c3b..f6b9d19694efa 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -546,11 +546,11 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> 
MultiModalEmbeddings: audio_input = self._parse_and_validate_audio_input(**kwargs) if audio_input is None: - return None + return [] audio_embeddings = self._process_audio_input(audio_input) return audio_embeddings diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 3ee5f7dba01f0..8cf2a009d6670 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -687,8 +687,8 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription, def get_language_model(self) -> torch.nn.Module: return self.model.decoder - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: # TODO: This method does not obey the interface for SupportsMultiModal. # Refactor this once encoder/decoder support is implemented in V1. audio_input = self._parse_and_validate_audio_input(**kwargs) diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index b7988359737ac..db8b2e2b39592 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -3,7 +3,6 @@ import pickle from collections.abc import Iterable, Mapping -from typing import TYPE_CHECKING, Optional import numpy as np import torch @@ -13,9 +12,6 @@ from PIL import Image from vllm.logger import init_logger from vllm.multimodal.image import convert_image_mode -if TYPE_CHECKING: - from vllm.inputs import TokensPrompt - logger = init_logger(__name__) MultiModalHashDict = Mapping[str, list[str]] @@ -91,28 +87,3 @@ class MultiModalHasher: hasher.update(v_bytes) return hasher.hexdigest() - - @classmethod - def hash_prompt_mm_data( - cls, prompt: "TokensPrompt") -> Optional["MultiModalHashDict"]: - """Hash multimodal data in the user input prompt if they exist.""" - - if "multi_modal_data" not in prompt: - return None - - mm_data = prompt["multi_modal_data"] - if not mm_data: - # mm_data can be None or an empty dict. 
- return None - - mm_items = { - modality: items if isinstance(items, list) else [items] - for modality, items in mm_data.items() - } - - mm_hashes = { - modality: [cls.hash_kwargs(**{modality: item}) for item in items] - for modality, items in mm_items.items() - } - - return mm_hashes diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 0bf5b1cf1c6c7..5cb720381d94b 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -680,7 +680,8 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): return self._items_by_modality.keys() @staticmethod - def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: + def _try_stack(nested_tensors: NestedTensors, + pin_memory: bool = False) -> NestedTensors: """ Stack the inner dimensions that have the same shape in a nested list of tensors. @@ -697,7 +698,9 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): if isinstance(nested_tensors, (int, float)): return torch.tensor(nested_tensors) - stacked = [MultiModalKwargs._try_stack(t) for t in nested_tensors] + stacked = [ + MultiModalKwargs._try_stack(t, pin_memory) for t in nested_tensors + ] if not is_list_of(stacked, torch.Tensor, check="all"): # Only tensors (not lists) can be stacked. return stacked @@ -713,10 +716,16 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): # The tensors have incompatible shapes and can't be stacked. return tensors_ - return torch.stack(tensors_) + outputs = torch.empty(len(tensors_), + *tensors_[0].shape, + dtype=tensors_[0].dtype, + device=tensors_[0].device, + pin_memory=pin_memory) + return torch.stack(tensors_, out=outputs) @staticmethod - def batch(inputs_list: list["MultiModalKwargs"]) -> BatchedTensorInputs: + def batch(inputs_list: list["MultiModalKwargs"], + pin_memory: bool = False) -> BatchedTensorInputs: """ Batch multiple inputs together into a dictionary. 
@@ -738,7 +747,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): item_lists[k].append(v) return { - k: MultiModalKwargs._try_stack(item_list) + k: MultiModalKwargs._try_stack(item_list, pin_memory) for k, item_list in item_lists.items() } diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 11a25f8515462..5e61d460fa428 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -324,7 +324,7 @@ def merge_and_sort_multimodal_metadata( Returns: list[str]: List of item modalities in order of their positions in the input sequence. - list[PlaceholderRange]: Sorted list of all PlaceholdeRanges from + list[PlaceholderRange]: Sorted list of all PlaceholderRanges from mm_positions. Optional[list[str]]: Sorted list of all hashes from mm_hashes if given, None otherwise. diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 71c964fbfbb5e..1dfd394db608d 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -89,10 +89,6 @@ class CpuPlatform(Platform): import vllm.envs as envs from vllm.utils import GiB_bytes model_config = vllm_config.model_config - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - if not model_config.enforce_eager: - model_config.enforce_eager = True model_config.disable_cascade_attn = True @@ -171,9 +167,21 @@ class CpuPlatform(Platform): compilation_config = vllm_config.compilation_config if (envs.VLLM_USE_V1 and vllm_config.compilation_config.level == CompilationLevel.PIECEWISE): + + # Note: vLLM V1 is using PIECEWISE level compilation, which will + # take time to compile kernels just-in-time with the inductor + # backend. For CPU CI tests, most of them are executed fast and + # compilations consume too much time, even with torch compile + # cache. So use VLLM_CPU_CI_ENV to indicate the CI environment, + # and just execute model with dynamo + eager mode to save time. + # VLLM_CPU_CI_ENV is only used as an internal variable. 
+ if os.environ.get("VLLM_CPU_CI_ENV", "0") != "0": + backend = "eager" + else: + backend = "inductor" + compilation_config.level = CompilationLevel.DYNAMO_ONCE - compilation_config.backend = "eager" - compilation_config.custom_ops += ["none"] + compilation_config.backend = backend compilation_config.inductor_compile_config.update({ "dce": True, diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 48d1aacba1858..2d07ddc36613a 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -6,7 +6,7 @@ pynvml. However, it should not initialize cuda context. import os from datetime import timedelta -from functools import wraps +from functools import cache, wraps from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union import torch @@ -226,15 +226,21 @@ class CudaPlatformBase(Platform): if selected_backend == _Backend.FLASHINFER: logger.info_once("Using FlashInfer backend on V1 engine.") return "vllm.v1.attention.backends.flashinfer.FlashInferBackend" - if selected_backend == _Backend.FLEX_ATTENTION: + elif selected_backend == _Backend.FLEX_ATTENTION: logger.info("Using FlexAttenion backend on V1 engine.") return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501 - if selected_backend == _Backend.TRITON_ATTN_VLLM_V1: + elif selected_backend == _Backend.TRITON_ATTN_VLLM_V1: logger.info_once("Using Triton backend on V1 engine.") return ("vllm.v1.attention.backends." "triton_attn.TritonAttentionBackend") + elif selected_backend == _Backend.FLASH_ATTN: + logger.info_once("Using Flash Attention backend on V1 engine.") + return ("vllm.v1.attention.backends." 
+ "flash_attn.FlashAttentionBackend") + + # Default backends for V1 engine + # Prefer FlashInfer for Blackwell GPUs if installed if cls.is_device_capability(100): - # Prefer FlashInfer for V1 on Blackwell GPUs if installed try: import flashinfer # noqa: F401 logger.info_once( @@ -248,10 +254,13 @@ class CudaPlatformBase(Platform): "Blackwell (SM 10.0) GPUs; it is recommended to " "install FlashInfer for better performance.") pass - if cls.has_device_capability(80): + # FlashAttention is the default for SM 8.0+ GPUs + elif cls.has_device_capability(80): logger.info_once("Using Flash Attention backend on V1 engine.") return ("vllm.v1.attention.backends." "flash_attn.FlashAttentionBackend") + + # Backends for V0 engine if selected_backend == _Backend.FLASHINFER: logger.info("Using FlashInfer backend.") return "vllm.attention.backends.flashinfer.FlashInferBackend" @@ -389,6 +398,7 @@ class CudaPlatformBase(Platform): class NvmlCudaPlatform(CudaPlatformBase): @classmethod + @cache @with_nvml_context def get_device_capability(cls, device_id: int = 0 @@ -486,6 +496,7 @@ class NvmlCudaPlatform(CudaPlatformBase): class NonNvmlCudaPlatform(CudaPlatformBase): @classmethod + @cache def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: major, minor = torch.cuda.get_device_capability(device_id) return DeviceCapability(major=major, minor=minor) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 9bc3b8e09ada7..52a7a903cd8ef 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -23,6 +23,7 @@ from transformers.models.auto.image_processing_auto import ( get_image_processor_config) from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) +from transformers.models.auto.tokenization_auto import get_tokenizer_config from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME from vllm import envs @@ -142,7 +143,9 @@ def list_repo_files( modelscope_list_repo_files) 
return modelscope_list_repo_files(repo_id, revision=revision, - token=token) + token=os.getenv( + "MODELSCOPE_API_TOKEN", + None)) return hf_list_repo_files(repo_id, revision=revision, repo_type=repo_type, @@ -867,3 +870,18 @@ def try_get_safetensors_metadata( "Error retrieving safetensors") except Exception: return None + + +def try_get_tokenizer_config( + pretrained_model_name_or_path: Union[str, os.PathLike], + trust_remote_code: bool, + revision: Optional[str] = None, +) -> Optional[dict[str, Any]]: + try: + return get_tokenizer_config( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + revision=revision, + ) + except Exception: + return None diff --git a/vllm/transformers_utils/processors/ovis.py b/vllm/transformers_utils/processors/ovis.py index 4fe76d0df622b..557d251c45f3b 100644 --- a/vllm/transformers_utils/processors/ovis.py +++ b/vllm/transformers_utils/processors/ovis.py @@ -68,7 +68,7 @@ class OvisProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - valid_kwargs = ["chat_template", "image_pad_token", "image_segement_len"] + valid_kwargs = ["chat_template", "image_pad_token", "image_segment_len"] image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py index 068fa303137c1..a003e4eb02c07 100644 --- a/vllm/triton_utils/importing.py +++ b/vllm/triton_utils/importing.py @@ -12,6 +12,36 @@ HAS_TRITON = ( find_spec("triton") is not None or find_spec("pytorch-triton-xpu") is not None # Not compatible ) +if HAS_TRITON: + try: + from triton.backends import backends + + # It's generally expected that x.driver exists and has + # an is_active method. + # The `x.driver and` check adds a small layer of safety. 
+ active_drivers = [ + x.driver for x in backends.values() + if x.driver and x.driver.is_active() + ] + if len(active_drivers) != 1: + logger.info( + "Triton is installed but %d active driver(s) found " + "(expected 1). Disabling Triton to prevent runtime errors.", + len(active_drivers)) + HAS_TRITON = False + except ImportError: + # This can occur if Triton is partially installed or triton.backends + # is missing. + logger.warning( + "Triton is installed, but `triton.backends` could not be imported. " + "Disabling Triton.") + HAS_TRITON = False + except Exception as e: + # Catch any other unexpected errors during the check. + logger.warning( + "An unexpected error occurred while checking Triton active drivers:" + " %s. Disabling Triton.", e) + HAS_TRITON = False if not HAS_TRITON: logger.info("Triton not installed or not compatible; certain GPU-related" diff --git a/vllm/utils.py b/vllm/utils.py index c19c0221cf838..dc408e1676f1c 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -190,6 +190,16 @@ TORCH_DTYPE_TO_NUMPY_DTYPE = { torch.int64: np.int64, } + +@contextlib.contextmanager +def set_default_torch_num_threads(num_threads: int): + """Sets the default number of threads for PyTorch to the given value.""" + old_num_threads = torch.get_num_threads() + torch.set_num_threads(num_threads) + yield + torch.set_num_threads(old_num_threads) + + P = ParamSpec('P') T = TypeVar("T") U = TypeVar("U") @@ -1466,7 +1476,7 @@ class FlexibleArgumentParser(ArgumentParser): pattern = re.compile(r"(?<=--)[^\.]*") # Convert underscores to dashes and vice versa in argument names - processed_args = [] + processed_args = list[str]() for arg in args: if arg.startswith('--'): if '=' in arg: @@ -1483,7 +1493,7 @@ class FlexibleArgumentParser(ArgumentParser): else: processed_args.append(arg) - def create_nested_dict(keys: list[str], value: str): + def create_nested_dict(keys: list[str], value: str) -> dict[str, Any]: """Creates a nested dictionary from a list of keys and a value. 
For example, `keys = ["a", "b", "c"]` and `value = 1` will create: @@ -1494,7 +1504,10 @@ class FlexibleArgumentParser(ArgumentParser): nested_dict = {key: nested_dict} return nested_dict - def recursive_dict_update(original: dict, update: dict): + def recursive_dict_update( + original: dict[str, Any], + update: dict[str, Any], + ): """Recursively updates a dictionary with another dictionary.""" for k, v in update.items(): if isinstance(v, dict) and isinstance(original.get(k), dict): @@ -1502,19 +1515,25 @@ class FlexibleArgumentParser(ArgumentParser): else: original[k] = v - delete = set() - dict_args: dict[str, dict] = defaultdict(dict) + delete = set[int]() + dict_args = defaultdict[str, dict[str, Any]](dict) for i, processed_arg in enumerate(processed_args): if processed_arg.startswith("--") and "." in processed_arg: if "=" in processed_arg: - processed_arg, value = processed_arg.split("=", 1) + processed_arg, value_str = processed_arg.split("=", 1) if "." not in processed_arg: # False positive, . 
was only in the value continue else: - value = processed_args[i + 1] + value_str = processed_args[i + 1] delete.add(i + 1) + key, *keys = processed_arg.split(".") + try: + value = json.loads(value_str) + except json.decoder.JSONDecodeError: + value = value_str + # Merge all values with the same key into a single dict arg_dict = create_nested_dict(keys, value) recursive_dict_update(dict_args[key], arg_dict) @@ -2269,6 +2288,8 @@ def kill_process_tree(pid: int): class MemorySnapshot: """Memory snapshot.""" torch_peak: int = 0 + free_memory: int = 0 + total_memory: int = 0 cuda_memory: int = 0 torch_memory: int = 0 non_torch_memory: int = 0 @@ -2288,8 +2309,8 @@ class MemorySnapshot: self.torch_peak = torch.cuda.memory_stats().get( "allocated_bytes.all.peak", 0) - self.cuda_memory = torch.cuda.mem_get_info( - )[1] - torch.cuda.mem_get_info()[0] + self.free_memory, self.total_memory = torch.cuda.mem_get_info() + self.cuda_memory = self.total_memory - self.free_memory # torch.cuda.memory_reserved() is how many bytes # PyTorch gets from cuda (by calling cudaMalloc, etc.) @@ -2302,6 +2323,8 @@ class MemorySnapshot: def __sub__(self, other: MemorySnapshot) -> MemorySnapshot: return MemorySnapshot( torch_peak=self.torch_peak - other.torch_peak, + free_memory=self.free_memory - other.free_memory, + total_memory=self.total_memory - other.total_memory, cuda_memory=self.cuda_memory - other.cuda_memory, torch_memory=self.torch_memory - other.torch_memory, non_torch_memory=self.non_torch_memory - other.non_torch_memory, @@ -2323,6 +2346,16 @@ class MemoryProfilingResult: after_profile: MemorySnapshot = field(default_factory=MemorySnapshot) profile_time: float = 0.0 + def __repr__(self) -> str: + return (f"Memory profiling takes {self.profile_time:.2f} seconds. 
" + f"Total non KV cache memory: " + f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; " + f"torch peak memory increase: " + f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; " + f"non-torch forward increase memory: " + f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; " + f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB.") + @contextlib.contextmanager def memory_profiling( diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index d7a580c2883c3..1c4604cc27e47 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -7,7 +7,8 @@ from vllm.attention.backends.torch_sdpa import (TorchSDPABackendImpl, TorchSDPAMetadata) from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.ops.ipex_attn import PagedAttention -from vllm.v1.attention.backends.utils import CommonAttentionMetadata +from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, + CommonAttentionMetadata) from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import AttentionSpec from vllm.v1.worker.block_table import BlockTable @@ -53,7 +54,7 @@ class TorchSDPABackend: return False -class TorchSDPAMetadataBuilderV1: +class TorchSDPAMetadataBuilderV1(AttentionMetadataBuilder[TorchSDPAMetadata]): def __init__(self, runner: CPUModelRunner, kv_cache_spec: AttentionSpec, block_table: BlockTable) -> None: @@ -118,9 +119,12 @@ class TorchSDPAMetadataBuilderV1: return True - def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int, - common_prefix_len: int, + def build(self, common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata): + num_reqs = common_attn_metadata.num_reqs + num_actual_tokens = common_attn_metadata.num_actual_tokens + max_query_len = common_attn_metadata.max_query_len + runner = self.runner block_table = self.block_table seq_lens_np = runner.seq_lens_np[:num_reqs] diff --git 
a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index cfcfcc06af5db..88be86749d8d3 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with FlashAttention.""" from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any, ClassVar, Optional import numpy as np import torch @@ -16,18 +16,16 @@ from vllm.attention.ops.merge_attn_states import merge_attn_states from vllm.attention.utils.fa_utils import (flash_attn_supports_fp8, get_flash_attn_version) from vllm.config import VllmConfig, get_layers_from_vllm_config -from vllm.distributed.kv_transfer.kv_connector.utils import ( - get_kv_connector_cache_layout) from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils import cdiv -from vllm.v1.attention.backends.utils import CommonAttentionMetadata +from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, + CommonAttentionMetadata, + get_kv_cache_layout) from vllm.v1.kv_cache_interface import AttentionSpec from vllm.v1.worker.block_table import BlockTable if TYPE_CHECKING: - from vllm.v1.core.sched.output import SchedulerOutput - from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_model_runner import GPUModelRunner if current_platform.is_cuda(): @@ -76,16 +74,15 @@ class FlashAttentionBackend(AttentionBackend): @staticmethod def get_kv_cache_stride_order() -> tuple[int, ...]: - # NOTE When running disaggregated PD with NIXL, HND layout is used for - # faster transfer. `stride_order` indicates the permutation that gets + # `stride_order` indicates the permutation that gets # us from `get_kv_cache_shape` to the actual memory layout we want. 
- cache_layout = get_kv_connector_cache_layout() + cache_layout = get_kv_cache_layout() if cache_layout == "NHD": stride_order = (0, 1, 2, 3, 4) elif cache_layout == "HND": stride_order = (0, 1, 3, 2, 4) else: - raise ValueError("Unknown cache layout format %s.", cache_layout) + raise ValueError(f"Unknown cache layout format {cache_layout}.") return stride_order @@ -308,7 +305,9 @@ def _get_sliding_window_configs( return sliding_window_configs -class FlashAttentionMetadataBuilder: +class FlashAttentionMetadataBuilder( + AttentionMetadataBuilder[FlashAttentionMetadata]): + full_cudagraph_supported: ClassVar[bool] = get_flash_attn_version() == 3 def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec, block_table: BlockTable): @@ -338,9 +337,6 @@ class FlashAttentionMetadataBuilder: # populated on first build() call. self.aot_sliding_window: Optional[tuple[int, int]] = None - def reorder_batch(self, input_batch: "InputBatch", - scheduler_output: "SchedulerOutput") -> bool: - return False def build_slice( self, @@ -353,7 +356,7 @@ class FlashAttentionMetadataBuilder: num_reqs = req_slice.stop - req_slice.start num_tokens = token_slice.stop - token_slice.start - max_seq_len = self.runner.seq_lens_np[req_slice].max() + max_seq_len = int(self.runner.seq_lens_np[req_slice].max()) query_start_loc = slice_query_start_locs( common_attn_metadata.query_start_loc, req_slice) seq_lens = common_attn_metadata.seq_lens[req_slice] @@ -502,9 +505,13 @@ class FlashAttentionMetadataBuilder: ) return attn_metadata - def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int, - common_prefix_len: int, - common_attn_metadata: CommonAttentionMetadata): + def build( + self, 
common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata + ) -> FlashAttentionMetadata: + num_reqs = common_attn_metadata.num_reqs + num_actual_tokens = common_attn_metadata.num_actual_tokens + max_query_len = common_attn_metadata.max_query_len return self.build_slice( req_slice=slice(0, num_reqs), token_slice=slice(0, num_actual_tokens), @@ -513,6 +520,11 @@ class FlashAttentionMetadataBuilder: common_attn_metadata=common_attn_metadata, ) + def can_run_in_cudagraph( + self, common_attn_metadata: CommonAttentionMetadata) -> bool: + # Full CUDA Graph always supported (FA2 support checked separately) + return True + def use_cascade_attention(self, *args, **kwargs) -> bool: return False #use_cascade_attention(*args, **kwargs) @@ -555,7 +567,6 @@ class FlashAttentionImpl(AttentionImpl): self.logits_soft_cap = logits_soft_cap self.kv_sharing_target_layer_name = kv_sharing_target_layer_name - assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads support_head_sizes = FlashAttentionBackend.get_supported_head_sizes() @@ -586,6 +597,7 @@ class FlashAttentionImpl(AttentionImpl): kv_cache: torch.Tensor, attn_metadata: FlashAttentionMetadata, output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. @@ -603,6 +615,11 @@ class FlashAttentionImpl(AttentionImpl): """ assert output is not None, "Output tensor must be provided." + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for FlashAttentionImpl") + if attn_metadata is None: # Profiling run. 
return output diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index f1b61c152a9d8..03a2ed7139c7c 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -18,7 +18,9 @@ from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger from vllm.v1.attention.backends.flash_attn import use_cascade_attention -from vllm.v1.attention.backends.utils import CommonAttentionMetadata +from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, + CommonAttentionMetadata, + get_kv_cache_layout) from vllm.v1.kv_cache_interface import AttentionSpec from vllm.v1.worker.block_table import BlockTable @@ -65,6 +67,19 @@ class FlashInferBackend(AttentionBackend): ) -> tuple[int, ...]: return (num_blocks, 2, block_size, num_kv_heads, head_size) + @staticmethod + def get_kv_cache_stride_order() -> tuple[int, ...]: + # `stride_order` indicates the permutation that gets us from + # `get_kv_cache_shape` to the actual memory layout we want. 
+ cache_layout = get_kv_cache_layout() + if cache_layout == "NHD": + stride_order = (0, 1, 2, 3, 4) + elif cache_layout == "HND": + stride_order = (0, 1, 3, 2, 4) + else: + raise ValueError(f"Unknown cache layout format {cache_layout}.") + return stride_order + @dataclass class PerLayerParameters: @@ -202,7 +217,7 @@ class FlashInferMetadata: f" received {self.head_dim}.") -class FlashInferMetadataBuilder: +class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): def __init__(self, runner: GPUModelRunner, kv_cache_spec: AttentionSpec, block_table: BlockTable): @@ -289,7 +304,7 @@ class FlashInferMetadataBuilder: def _get_prefill_wrapper(self): if self._prefill_wrapper is None: self._prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper( - self._get_workspace_buffer(), "NHD") + self._get_workspace_buffer(), get_kv_cache_layout()) return self._prefill_wrapper def _get_decode_wrapper(self): @@ -302,14 +317,14 @@ class FlashInferMetadataBuilder: num_qo_heads // num_kv_heads > 4) self._decode_wrapper = BatchDecodeWithPagedKVCacheWrapper( self._get_workspace_buffer(), - "NHD", + get_kv_cache_layout(), use_tensor_cores=use_tensor_cores) return self._decode_wrapper def _get_cascade_wrapper(self): if self._cascade_wrapper is None: self._cascade_wrapper = MultiLevelCascadeAttentionWrapper( - 2, self._get_workspace_buffer(), "NHD") + 2, self._get_workspace_buffer(), get_kv_cache_layout()) return self._cascade_wrapper def _plan(self, attn_metadata: FlashInferMetadata): @@ -399,9 +414,11 @@ class FlashInferMetadataBuilder: kv_data_type=attn_metadata.data_type, ) - def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int, - common_prefix_len: int, + def build(self, common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata): + num_reqs = common_attn_metadata.num_reqs + num_actual_tokens = common_attn_metadata.num_actual_tokens + assert self._num_decodes + self._num_prefills == num_reqs assert (self._num_decode_tokens + 
self._num_prefill_tokens == num_actual_tokens) @@ -508,7 +525,12 @@ class FlashInferImpl(AttentionImpl): logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, + use_irope: bool = False, ) -> None: + if use_irope: + logger.warning_once( + "Using irope in FlashInfer is not supported yet, it will fall" + " back to global attention for long context.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) @@ -524,7 +546,6 @@ class FlashInferImpl(AttentionImpl): self.logits_soft_cap = logits_soft_cap self.kv_sharing_target_layer_name = kv_sharing_target_layer_name - assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads if attn_type != AttentionType.DECODER: @@ -542,6 +563,7 @@ class FlashInferImpl(AttentionImpl): kv_cache: torch.Tensor, attn_metadata: FlashInferMetadata, output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashInfer. @@ -556,6 +578,11 @@ class FlashInferImpl(AttentionImpl): """ assert output is not None, "Output tensor must be provided." + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for FlashInferImpl") + if attn_metadata is None: # Profiling run. return output @@ -607,6 +634,7 @@ class FlashInferImpl(AttentionImpl): num_decode_tokens = attn_metadata.num_decode_tokens num_prefill_tokens = attn_metadata.num_prefill_tokens + stride_order = FlashInferBackend.get_kv_cache_stride_order() # Regular attention (common case). 
# Decodes are at the front and prefills are at the back, # according to reorder_batch() @@ -621,7 +649,7 @@ class FlashInferImpl(AttentionImpl): assert prefill_wrapper._sm_scale == self.scale prefill_wrapper.run( prefill_query, - kv_cache, + kv_cache.permute(*stride_order), k_scale=layer._k_scale_float, v_scale=layer._v_scale_float, out=output[num_decode_tokens:], @@ -637,7 +665,7 @@ class FlashInferImpl(AttentionImpl): assert decode_wrapper._sm_scale == self.scale decode_wrapper.run( decode_query, - kv_cache, + kv_cache.permute(*stride_order), k_scale=layer._k_scale_float, v_scale=layer._v_scale_float, out=output[:num_decode_tokens], diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index 5b473b1461a68..dd8d7994ed333 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -15,7 +15,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, is_quantized_kv_cache) from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.v1.attention.backends.utils import CommonAttentionMetadata +from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, + CommonAttentionMetadata) from vllm.v1.kv_cache_interface import AttentionSpec from vllm.v1.worker.block_table import BlockTable @@ -25,8 +26,6 @@ if current_platform.is_cuda(): logger = init_logger(__name__) if TYPE_CHECKING: - from vllm.v1.core.sched.output import SchedulerOutput - from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_model_runner import GPUModelRunner create_block_mask_compiled = torch.compile(create_block_mask, @@ -243,6 +242,7 @@ class FlexAttentionMetadata: None, self.num_actual_tokens, self.total_cache_tokens, + device=self.block_table.device, ) def __post_init__(self): @@ -256,7 +256,8 @@ class FlexAttentionMetadata: self.block_mask = self.build_block_mask() -class FlexAttentionMetadataBuilder: +class 
FlexAttentionMetadataBuilder( + AttentionMetadataBuilder[FlexAttentionMetadata]): def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec, block_table: BlockTable): @@ -272,13 +273,12 @@ class FlexAttentionMetadataBuilder: self.kv_cache_spec = kv_cache_spec self.block_table = block_table - def reorder_batch(self, input_batch: "InputBatch", - scheduler_output: "SchedulerOutput") -> bool: - return False - - def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int, - common_prefix_len: int, + def build(self, common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata): + num_reqs = common_attn_metadata.num_reqs + num_actual_tokens = common_attn_metadata.num_actual_tokens + max_query_len = common_attn_metadata.max_query_len + max_seq_len = self.runner.seq_lens_np[:num_reqs].max() query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens @@ -332,9 +332,6 @@ class FlexAttentionMetadataBuilder: ) return out - def use_cascade_attention(self, *args, **kwargs) -> bool: - return False - class FlexAttentionImpl(AttentionImpl): sliding_window: Optional[tuple[int, int]] @@ -380,7 +377,6 @@ class FlexAttentionImpl(AttentionImpl): raise NotImplementedError( "FlexAttention does not support logits soft cap yet.") - assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads if kv_sharing_target_layer_name is not None: @@ -414,6 +410,7 @@ class FlexAttentionImpl(AttentionImpl): kv_cache: torch.Tensor, attn_metadata: FlexAttentionMetadata, output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FLexAttention. @@ -427,6 +424,11 @@ class FlexAttentionImpl(AttentionImpl): shape = [num_tokens, num_heads * head_size] """ assert output is not None, "Output tensor must be provided." 
+ if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for FlexAttentionImpl") + enable_gqa = self.num_kv_heads != self.num_heads if attn_metadata is None: diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 9ada9e952b996..fbd95423b1c3b 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -207,7 +207,8 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, UnquantizedLinearMethod) from vllm.platforms import current_platform from vllm.utils import cdiv, round_down -from vllm.v1.attention.backends.utils import (CommonAttentionMetadata, +from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, + CommonAttentionMetadata, slice_query_start_locs) from vllm.v1.kv_cache_interface import AttentionSpec from vllm.v1.worker.block_table import BlockTable @@ -330,7 +331,7 @@ class MLACommonMetadata(Generic[D]): M = TypeVar("M", bound=MLACommonMetadata) -class MLACommonMetadataBuilder(Generic[M]): +class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): """ NOTE: Please read the comment at the top of the file before trying to understand this class @@ -486,6 +487,12 @@ class MLACommonMetadataBuilder(Generic[M]): block_table_tensor = block_table.get_device_tensor()[req_slice] slot_mapping = block_table.slot_mapping_cpu[token_slice].to( device, non_blocking=True).long() + # block_table_tensor = block_table.get_device_tensor()[:num_reqs] + # block_table.slot_mapping[:num_actual_tokens].copy_( + # block_table.slot_mapping_cpu[:num_actual_tokens], + # non_blocking=True) + # block_table.slot_mapping[num_actual_tokens:].fill_(-1) + # slot_mapping = block_table.slot_mapping[:num_actual_tokens] query_start_loc = slice_query_start_locs( common_attn_metadata.query_start_loc, req_slice) @@ -597,9 +604,13 @@ class MLACommonMetadataBuilder(Generic[M]): decode=decode_metadata, ) - def build(self, 
num_reqs: int, num_actual_tokens: int, max_query_len: int, - common_prefix_len: int, - common_attn_metadata: CommonAttentionMetadata): + def build(self, common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata) -> M: + num_reqs = common_attn_metadata.num_reqs + num_actual_tokens = common_attn_metadata.num_actual_tokens + max_query_len = common_attn_metadata.max_query_len + + assert self._num_decodes + self._num_prefills == num_reqs return self.build_slice( req_slice=slice(0, num_reqs), token_slice=slice(0, num_actual_tokens), @@ -608,9 +619,33 @@ class MLACommonMetadataBuilder(Generic[M]): common_attn_metadata=common_attn_metadata, ) + def build_for_cudagraph_capture( + self, common_attn_metadata: CommonAttentionMetadata) -> M: + """ + This method builds the metadata for full cudagraph capture. + Currently, only decode is supported for full cudagraphs with MLA. + """ + m = common_attn_metadata + assert m.num_reqs == m.num_actual_tokens, \ + "MLA only supports decode-only full CUDAGraph capture. " \ + "Make sure all cudagraph capture sizes <= max_num_seq." + + m.max_query_len = 1 # decode-only + + # Update state usually set in reorder_batch. + self._num_decodes = m.num_reqs + self._num_decode_tokens = m.num_actual_tokens + self._num_prefills = 0 + self._num_prefill_tokens = 0 + return self.build(0, m) + def use_cascade_attention(self, *args, **kwargs) -> bool: return False + def can_run_in_cudagraph( + self, common_attn_metadata: CommonAttentionMetadata) -> bool: + return common_attn_metadata.max_query_len == 1 + class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): """ @@ -909,10 +944,16 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): kv_cache: torch.Tensor, attn_metadata: M, output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: assert output is not None, "Output tensor must be provided." 
+ if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for MLACommonImpl") + if attn_metadata is None: # The zero fill is required when used with DP + EP # to ensure all ranks within a DP group compute the diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index 70aee058e2963..c8ec571989c68 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -40,12 +40,13 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **mla_args) + kv_sharing_target_layer_name, **mla_args) unsupported_features = [ alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 318b8ede14366..be26e0060db5e 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Any, Optional +from typing import Any, ClassVar, Optional import torch @@ -44,7 +44,7 @@ class FlashMLABackend(MLACommonBackend): @dataclass class FlashMLADecodeMetadata(MLACommonDecodeMetadata): - tile_scheduler_metadata: tuple[torch.Tensor, torch.Tensor] + tile_scheduler_metadata: torch.Tensor num_splits: torch.Tensor @@ -54,14 +54,18 @@ class FlashMLAMetadata(MLACommonMetadata[FlashMLADecodeMetadata]): class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): + full_cudagraph_supported: ClassVar[bool] = 
True # Decode-only def __init__(self, runner, kv_cache_spec: AttentionSpec, block_table: BlockTable): - super().__init__(runner, kv_cache_spec, block_table) + super().__init__(runner, kv_cache_spec, block_table, FlashMLAMetadata) self.num_q_heads = self.runner.model_config.get_num_attention_heads( self.runner.parallel_config) + self.cg_buf_tile_scheduler_metadata = None + self.cg_buf_num_splits = None + def _build_decode(self, block_table_tensor: torch.Tensor, seq_lens: torch.Tensor) -> FlashMLADecodeMetadata: tile_scheduler_metadata, num_splits = \ @@ -71,6 +75,30 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): 1, # MQA for the decode path ) + if self.runner.full_cuda_graph: + # First time around (CUDAGraph capture), allocate the static buffer + if self.cg_buf_tile_scheduler_metadata is None: + self.cg_buf_tile_scheduler_metadata = tile_scheduler_metadata + self.cg_buf_num_splits = num_splits + else: + assert self.cg_buf_num_splits is not None + + # Metadata per-SM, fixed size (#SMs, TileMetadataSize) + assert (self.cg_buf_tile_scheduler_metadata.size() == + tile_scheduler_metadata.size()) + self.cg_buf_tile_scheduler_metadata.\ + copy_(tile_scheduler_metadata) + tile_scheduler_metadata = self.cg_buf_tile_scheduler_metadata + + # Num splits is per-batch, varying size (batch_size,) + n = num_splits.size(0) + # make sure static buffer is large enough + assert n <= self.cg_buf_num_splits.size(0) + num_splits_view = self.cg_buf_num_splits[:n] + num_splits_view.copy_(num_splits) + self.cg_buf_num_splits[n:].fill_(0) # fill the rest with 0s + num_splits = num_splits_view + return FlashMLADecodeMetadata( block_table=block_table_tensor, seq_lens=seq_lens, diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 1f0406a7ac1f8..9fbca2e955e72 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -66,7 +66,7 @@ class 
AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): def __init__(self, runner, kv_cache_spec: AttentionSpec, block_table: BlockTable): - super().__init__(runner, kv_cache_spec, block_table) + super().__init__(runner, kv_cache_spec, block_table, AiterMLAMetadata) assert self.kv_cache_spec.block_size == 1, "AITER MLA" \ "only supports block size 1." diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py index 0f956ba88b9c1..1069578cfd292 100644 --- a/vllm/v1/attention/backends/pallas.py +++ b/vllm/v1/attention/backends/pallas.py @@ -17,6 +17,9 @@ from vllm.utils import cdiv, next_power_of_2 logger = init_logger(__name__) +# TPU requires the head size to be a multiple of 128. +TPU_HEAD_SIZE_ALIGNMENT = 128 + class PallasAttentionBackend(AttentionBackend): @@ -43,6 +46,14 @@ class PallasAttentionBackend(AttentionBackend): num_kv_heads: int, head_size: int, ) -> tuple[int, ...]: + padded_head_size = cdiv( + head_size, TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT + num_blocks = num_blocks * head_size // padded_head_size + if padded_head_size != head_size: + logger.warning_once( + "head size is padded to %d, and num_blocks is adjusted to %d" + " accordingly", padded_head_size, num_blocks) + head_size = padded_head_size return (num_blocks, block_size, num_kv_heads * 2, head_size) @staticmethod @@ -131,10 +142,7 @@ class PallasAttentionBackendImpl(AttentionImpl): self.logits_soft_cap = logits_soft_cap self.kv_sharing_target_layer_name = kv_sharing_target_layer_name - assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads - if head_size % 128 != 0: - raise NotImplementedError("Head size must be a multiple of 128.") if alibi_slopes is not None: raise NotImplementedError("Alibi slopes is not supported.") if kv_cache_dtype != "auto": @@ -161,6 +169,7 @@ class PallasAttentionBackendImpl(AttentionImpl): kv_cache: torch.Tensor, attn_metadata: PallasMetadata, output: 
Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with Pallas attention. @@ -173,6 +182,11 @@ class PallasAttentionBackendImpl(AttentionImpl): Returns: shape = [num_tokens, num_heads * head_size] """ + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for PallasAttentionBackendImpl") + # For determine_available_memory case. if kv_cache.numel() == 0: if output is None: @@ -182,6 +196,18 @@ class PallasAttentionBackendImpl(AttentionImpl): assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 num_tokens, hidden_size = query.shape query = query.view(num_tokens, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + if self.head_size % TPU_HEAD_SIZE_ALIGNMENT != 0: + padded_head_size = cdiv( + self.head_size, + TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT + query = torch.nn.functional.pad( + query, (0, padded_head_size - self.head_size), value=0.0) + key = torch.nn.functional.pad( + key, (0, padded_head_size - self.head_size), value=0.0) + value = torch.nn.functional.pad( + value, (0, padded_head_size - self.head_size), value=0.0) if self.kv_sharing_target_layer_name is None and kv_cache.numel() > 0: # Write input keys and values to the KV cache. 
@@ -208,6 +234,9 @@ class PallasAttentionBackendImpl(AttentionImpl): soft_cap=self.logits_soft_cap, ) + if self.head_size % TPU_HEAD_SIZE_ALIGNMENT != 0: + output = output[:, :, :self.head_size] + return output.reshape(num_tokens, hidden_size) @@ -226,11 +255,8 @@ def write_to_kv_cache( """ _, _, num_combined_kv_heads, head_size = kv_cache.shape - num_kv_heads = num_combined_kv_heads // 2 - - key = key.view(-1, num_kv_heads, head_size) - value = value.view(-1, num_kv_heads, head_size) - + head_size = cdiv(head_size, + TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT kv = torch.cat([key, value], axis=-1).reshape(-1, num_combined_kv_heads, head_size) diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 5db592b150107..9782ec087babb 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -114,7 +114,6 @@ class TritonAttentionImpl(AttentionImpl): self.use_irope = use_irope - assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads support_head_sizes = TritonAttentionBackend.get_supported_head_sizes() @@ -142,6 +141,7 @@ class TritonAttentionImpl(AttentionImpl): kv_cache: torch.Tensor, attn_metadata: FlashAttentionMetadata, output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. @@ -156,6 +156,11 @@ class TritonAttentionImpl(AttentionImpl): """ assert output is not None, "Output tensor must be provided." + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for TritonAttentionImpl") + if attn_metadata is None: # Profiling run. 
return output diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 51b44f74c4fcd..8aa78e7018129 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -1,15 +1,31 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import abc +import functools +from abc import abstractmethod from dataclasses import dataclass +from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar +import numpy as np import torch +if TYPE_CHECKING: + from vllm.v1.core.sched.output import SchedulerOutput + from vllm.v1.worker.gpu_input_batch import InputBatch + +import vllm.envs as envs +from vllm.distributed.kv_transfer.kv_connector.utils import ( + get_kv_connector_cache_layout) +from vllm.logger import init_logger + +logger = init_logger(__name__) + @dataclass class CommonAttentionMetadata: """ - Attention metadata attributes that can be shared by layers in different KV - cache groups and thus having different block table. + Per-batch attention metadata, shared across layers and backends. + AttentionMetadataBuilder instances use it to construct per-layer metadata. """ query_start_loc: torch.Tensor @@ -18,6 +34,67 @@ class CommonAttentionMetadata: """(batch_size,), the length of each request including both computed tokens and newly scheduled tokens""" + num_reqs: int + """Number of requests""" + num_actual_tokens: int + """Total number of tokens in batch""" + max_query_len: int + """Longest query in batch""" + + +M = TypeVar("M") + + +class AttentionMetadataBuilder(abc.ABC, Generic[M]): + # Does this backend/builder support CUDA Graphs for attention. + full_cudagraph_supported: ClassVar[bool] = False + + @abstractmethod + def build(self, common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata) -> M: + """ + Central method that builds attention metadata. + Some builders (MLA) require reorder_batch to be called prior to build. 
+ """ + raise NotImplementedError + + def can_run_in_cudagraph( + self, common_attn_metadata: CommonAttentionMetadata) -> bool: + """ + Can this batch (with given metadata) use CUDA Graphs for attention. + """ + return False + + def build_for_cudagraph_capture( + self, common_attn_metadata: CommonAttentionMetadata) -> M: + """ + Build attention metadata for CUDA graph capture. Uses build by default. + Subclasses that override this method should call self.build or + super().build_for_cudagraph_capture. + """ + return self.build(common_prefix_len=0, + common_attn_metadata=common_attn_metadata) + + def use_cascade_attention( + self, + common_prefix_len: int, + query_lens: np.ndarray, + num_query_heads: int, + num_kv_heads: int, + use_alibi: bool, + use_sliding_window: bool, + num_sms: int, + ) -> bool: + return False + + def reorder_batch(self, input_batch: "InputBatch", + scheduler_output: "SchedulerOutput") -> bool: + """ + This method can reorder the batch if desired by the backend. + :return: Has the batch been reordered (default False). + """ + return False + def slice_query_start_locs( query_start_loc: torch.Tensor, @@ -57,3 +134,16 @@ def validate_kv_sharing_target(current_layer_name, target_layer_name, raise ValueError( error_msg + f"must be the same type as the current layer ({expected}).") + + +@functools.lru_cache +def get_kv_cache_layout(): + # Override with format specified by the user. + cache_layout = envs.VLLM_KV_CACHE_LAYOUT + if cache_layout is None: + cache_layout = get_kv_connector_cache_layout() + else: + logger.info_once("`FLASHINFER_KV_CACHE_LAYOUT` environment variable " \ + "detected. 
Setting KV cache layout to %s.", cache_layout) + + return cache_layout diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 3b2a4f9360006..d21f94727cf61 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -89,8 +89,8 @@ class BlockPool: BlockHashWithGroupId(block_hash, group_id)) if not cached_blocks_one_group: return None - first_block_id = next(iter(cached_blocks_one_group)) - cached_blocks.append(cached_blocks_one_group[first_block_id]) + first_block = next(iter(cached_blocks_one_group.values())) + cached_blocks.append(first_block) return cached_blocks def cache_full_blocks( @@ -260,7 +260,7 @@ class BlockPool: return True return False - def touch(self, blocks: list[list[KVCacheBlock]]) -> None: + def touch(self, blocks: tuple[list[KVCacheBlock], ...]) -> None: """Touch a block increases its reference count by 1, and may remove the block from the free queue. This is used when a block is hit by another request with the same prefix. @@ -299,7 +299,7 @@ class BlockPool: bool: True if the prefix cache is successfully reset, False otherwise. 
""" - num_used_blocks = (self.num_gpu_blocks - self.get_num_free_blocks()) + num_used_blocks = self.num_gpu_blocks - self.get_num_free_blocks() if num_used_blocks != 1: # The null block is always marked as used logger.warning( "Failed to reset prefix cache because some " diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index 231bad1df9228..5620d9bee7a3b 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -5,8 +5,7 @@ from typing import Callable, Optional from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock from vllm.v1.core.single_type_kv_cache_manager import ( - FullAttentionManager, SingleTypeKVCacheManager, - get_manager_for_kv_cache_spec) + FullAttentionManager, get_manager_for_kv_cache_spec) from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheConfig from vllm.v1.request import Request @@ -30,25 +29,21 @@ class KVCacheCoordinator(ABC): self.block_pool = BlockPool(kv_cache_config.num_blocks, enable_caching, enable_kv_cache_events) - self.single_type_managers: list[SingleTypeKVCacheManager] = [] # Needs special handling for find_longest_cache_hit if eagle is enabled self.use_eagle = use_eagle - - for i in range(len(self.kv_cache_config.kv_cache_groups)): - kv_cache_spec = self.kv_cache_config.kv_cache_groups[ - i].kv_cache_spec - self.single_type_managers.append( - get_manager_for_kv_cache_spec( - kv_cache_spec=kv_cache_spec, - block_pool=self.block_pool, - kv_cache_group_id=i, - caching_hash_fn=caching_hash_fn, - )) + self.single_type_managers = tuple( + get_manager_for_kv_cache_spec( + kv_cache_spec=kv_cache_group.kv_cache_spec, + block_pool=self.block_pool, + kv_cache_group_id=i, + caching_hash_fn=caching_hash_fn, + ) for i, kv_cache_group in enumerate( + self.kv_cache_config.kv_cache_groups)) def get_num_blocks_to_allocate( self, request_id: str, num_tokens: int, - new_computed_blocks: 
list[list[KVCacheBlock]]) -> int: + new_computed_blocks: tuple[list[KVCacheBlock], ...]) -> int: """ Get the number of blocks needed to be allocated for the request. @@ -70,7 +65,7 @@ class KVCacheCoordinator(ABC): def save_new_computed_blocks( self, request_id: str, - new_computed_blocks: list[list[KVCacheBlock]]) -> None: + new_computed_blocks: tuple[list[KVCacheBlock], ...]) -> None: """ Add the new computed blocks to the request. @@ -84,7 +79,7 @@ class KVCacheCoordinator(ABC): new_computed_blocks[i]) def allocate_new_blocks(self, request_id: str, - num_tokens: int) -> list[list[KVCacheBlock]]: + num_tokens: int) -> tuple[list[KVCacheBlock], ...]: """ Allocate new blocks for the request to give it at least `num_tokens` token slots. @@ -97,11 +92,9 @@ class KVCacheCoordinator(ABC): Returns: The new allocated blocks. """ - new_blocks = [] - for manager in self.single_type_managers: - new_blocks.append( - manager.allocate_new_blocks(request_id, num_tokens)) - return new_blocks + return tuple( + manager.allocate_new_blocks(request_id, num_tokens) + for manager in self.single_type_managers) def cache_blocks(self, request: Request, block_hashes: list[BlockHash], num_computed_tokens: int) -> None: @@ -159,19 +152,20 @@ class KVCacheCoordinator(ABC): for manager in self.single_type_managers: manager.remove_skipped_blocks(request_id, num_computed_tokens) - def get_blocks(self, request_id: str) -> list[list[KVCacheBlock]]: + def get_blocks(self, request_id: str) -> tuple[list[KVCacheBlock], ...]: """ Get the blocks for the request. 
""" - return [ + return tuple( manager.req_to_blocks.get(request_id) or [] - for manager in self.single_type_managers - ] + for manager in self.single_type_managers) @abstractmethod def find_longest_cache_hit( - self, block_hashes: list[BlockHash], - max_cache_hit_length: int) -> tuple[list[list[KVCacheBlock]], int]: + self, + block_hashes: list[BlockHash], + max_cache_hit_length: int, + ) -> tuple[tuple[list[KVCacheBlock], ...], int]: pass @@ -195,8 +189,10 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator): "UnitaryKVCacheCoordinator assumes only one kv cache group") def find_longest_cache_hit( - self, block_hashes: list[BlockHash], - max_cache_hit_length: int) -> tuple[list[list[KVCacheBlock]], int]: + self, + block_hashes: list[BlockHash], + max_cache_hit_length: int, + ) -> tuple[tuple[list[KVCacheBlock], ...], int]: hit_blocks = self.single_type_managers[0].find_longest_cache_hit( block_hashes=block_hashes, max_length=max_cache_hit_length, @@ -275,11 +271,24 @@ class HybridKVCacheCoordinator(KVCacheCoordinator): "KVCacheCoordinator assumes the block_size of full attention " "layers is divisible by other layers now.") + if max(self.full_attention_group_ids) < min(self.other_group_ids): + self.full_attn_first = True + elif max(self.other_group_ids) < min(self.full_attention_group_ids): + self.full_attn_first = False + else: + raise ValueError( + "HybridKVCacheCoordinator assumes the full " + "attention group ids and other attention group ids " + "do not interleave, either full attention group ids " + "are before other attention group ids or vice versa." + "This is for simplifying merging hit_blocks_full_attn and " + "hit_blocks_other_attn to hit_blocks.") + def find_longest_cache_hit( self, block_hashes: list[BlockHash], max_cache_hit_length: int, - ) -> tuple[list[list[KVCacheBlock]], int]: + ) -> tuple[tuple[list[KVCacheBlock], ...], int]: """ Find the longest cache hit for the request. 
@@ -318,27 +327,25 @@ class HybridKVCacheCoordinator(KVCacheCoordinator): )) hit_length = len(hit_blocks_other_attn[0]) * self.other_block_size - # NOTE: the prefix cache hit length must be a multiply of block_size as + # NOTE: the prefix cache hit length must be a multiple of block_size as # we don't support partial block cache hit yet. The cache hit length - # of other attention is ensured to be a multiply of the block size of + # of other attention is ensured to be a multiple of the block size of # full attention layers in current implementation, because hit_length is - # a multiply of other attention's block size, and other attention's - # block size is a multiply of full attention's block size (verified in + # a multiple of other attention's block size, and other attention's + # block size is a multiple of full attention's block size (verified in # `verify_and_split_kv_cache_groups`). assert hit_length % self.full_attention_block_size == 0 # Truncate the full attention cache hit to the length of the # cache hit of the other attention. - for i in range(len(hit_blocks_full_attn)): - del hit_blocks_full_attn[i][hit_length // - self.full_attention_block_size:] + for group_hit_blocks in hit_blocks_full_attn: + del group_hit_blocks[hit_length // self.full_attention_block_size:] # Merge the hit blocks of full attention and other attention. - hit_blocks = hit_blocks_other_attn - for group_id, blocks in enumerate(hit_blocks_full_attn): - # NOTE: there is only one full attention group in most cases. So - # the time complexity of insert is fine. 
- hit_blocks.insert(group_id, blocks) + if self.full_attn_first: + hit_blocks = hit_blocks_full_attn + hit_blocks_other_attn + else: + hit_blocks = hit_blocks_other_attn + hit_blocks_full_attn return hit_blocks, hit_length @@ -351,8 +358,6 @@ def get_kv_cache_coordinator( use_eagle, enable_caching, caching_hash_fn, enable_kv_cache_events) - else: - return HybridKVCacheCoordinator(kv_cache_config, max_model_len, - use_eagle, enable_caching, - caching_hash_fn, - enable_kv_cache_events) + return HybridKVCacheCoordinator(kv_cache_config, max_model_len, use_eagle, + enable_caching, caching_hash_fn, + enable_kv_cache_events) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 35fb189fda346..99531e7d213dd 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -21,11 +21,11 @@ logger = init_logger(__name__) @dataclass class KVCacheBlocks: """ - The allocation result of KVCacheManager, work as the interface between - Scheduler and KVCacheManager, to hide KVCacheManager's internal data + The allocation result of KVCacheManager, work as the interface between + Scheduler and KVCacheManager, to hide KVCacheManager's internal data structure from the Scheduler. """ - blocks: list[list[KVCacheBlock]] + blocks: tuple[list[KVCacheBlock], ...] """ blocks[i][j] refers to the i-th kv_cache_group and the j-th block of tokens. We don't use block of tokens as the outer dimension because it assumes all @@ -37,21 +37,19 @@ class KVCacheBlocks: def __add__(self, other: "KVCacheBlocks") -> "KVCacheBlocks": """Adds two KVCacheBlocks instances.""" return KVCacheBlocks( - [blk1 + blk2 for blk1, blk2 in zip(self.blocks, other.blocks)]) + tuple(blk1 + blk2 + for blk1, blk2 in zip(self.blocks, other.blocks))) - def get_block_ids(self) -> list[list[int]]: + def get_block_ids(self) -> tuple[list[int], ...]: """ Converts the KVCacheBlocks instance to block_ids. 
Returns: - list[list[int]]: A two-level list where - * the outer list corresponds to KV cache groups + tuple[list[int], ...]: A tuple of lists where + * the outer tuple corresponds to KV cache groups * each inner list contains the block_ids of the blocks in that group """ - block_ids = [] - for group in self.blocks: - block_ids.append([blk.block_id for blk in group]) - return block_ids + return tuple([blk.block_id for blk in group] for group in self.blocks) def get_unhashed_block_ids(self) -> list[int]: """Get block_ids of unhashed blocks from KVCacheBlocks instance.""" @@ -63,7 +61,7 @@ class KVCacheBlocks: def new_empty(self) -> "KVCacheBlocks": """Creates a new KVCacheBlocks instance with no blocks.""" - return KVCacheBlocks([[] for _ in range(len(self.blocks))]) + return KVCacheBlocks(tuple([] for _ in range(len(self.blocks)))) class KVCacheManager: @@ -232,9 +230,8 @@ class KVCacheManager: if new_computed_blocks is not None: new_computed_block_list = new_computed_blocks.blocks else: - new_computed_block_list = [ - [] for _ in range(len(self.kv_cache_config.kv_cache_groups)) - ] + new_computed_block_list = tuple( + [] for _ in range(len(self.kv_cache_config.kv_cache_groups))) # Free the blocks that are skipped during the attention computation # (e.g., tokens outside the sliding window). 
@@ -267,7 +264,7 @@ class KVCacheManager: if self.enable_caching: self.block_pool.touch(new_computed_block_list) else: - assert all(not blocks for blocks in new_computed_block_list), ( + assert not any(new_computed_block_list), ( "Computed blocks should be empty when " "prefix caching is disabled") @@ -378,17 +375,19 @@ class KVCacheManager: """ return self.block_pool.take_events() - def get_block_ids(self, request_id: str) -> list[list[int]]: + def get_block_ids(self, request_id: str) -> tuple[list[int], ...]: """Get the block ids of a request.""" return KVCacheBlocks( self.coordinator.get_blocks(request_id)).get_block_ids() - def cache_blocks(self, request: Request, block_hashes: list[BlockHash], - num_computed_tokens: int) -> None: - """Cache the blocks for the request.""" - self.coordinator.cache_blocks(request, block_hashes, - num_computed_tokens) + def cache_blocks(self, request: Request, num_computed_tokens: int) -> None: + """Cache the blocks for the request, if enabled.""" + if self.enable_caching: + block_hashes = self.req_to_block_hashes[request.request_id] + self.coordinator.cache_blocks(request, block_hashes, + num_computed_tokens) def create_empty_block_list(self) -> KVCacheBlocks: """Creates a new KVCacheBlocks instance with no blocks.""" - return KVCacheBlocks([[] for _ in range(self.num_kv_cache_groups)]) + return KVCacheBlocks(tuple([] + for _ in range(self.num_kv_cache_groups))) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 6d4bcfe64a357..9489bcf433fd2 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -660,6 +660,7 @@ def get_num_blocks(vllm_config: VllmConfig, num_layers: int, logger.info( "Overriding num_gpu_blocks=%d with " "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override) + num_blocks = num_gpu_blocks_override return num_blocks diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index b404c70eb1e44..9b0a439fe7dcd 100644 --- 
a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -27,7 +27,7 @@ class NewRequestData: mm_hashes: list[str] mm_positions: list[PlaceholderRange] sampling_params: SamplingParams - block_ids: list[list[int]] + block_ids: tuple[list[int], ...] num_computed_tokens: int lora_request: Optional[LoRARequest] @@ -35,7 +35,7 @@ class NewRequestData: def from_request( cls, request: Request, - block_ids: list[list[int]], + block_ids: tuple[list[int], ...], ) -> NewRequestData: return cls( req_id=request.request_id, @@ -86,7 +86,7 @@ class CachedRequestData: # request's block IDs instead of appending to the existing block IDs. resumed_from_preemption: bool new_token_ids: list[int] - new_block_ids: list[list[int]] + new_block_ids: tuple[list[int], ...] num_computed_tokens: int @classmethod @@ -95,7 +95,7 @@ class CachedRequestData: request: Request, resumed_from_preemption: bool, new_token_ids: list[int], - new_block_ids: list[list[int]], + new_block_ids: tuple[list[int], ...], ) -> CachedRequestData: return cls( req_id=request.request_id, diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index b3293d9a541f7..2d2274ab6a4d5 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -180,7 +180,7 @@ class Scheduler(SchedulerInterface): # uses structured decoding. structured_output_request_ids: dict[str, int] = {} - req_to_new_block_ids: dict[str, list[list[int]]] = {} + req_to_new_block_ids: dict[str, tuple[list[int], ...]] = {} num_scheduled_tokens: dict[str, int] = {} token_budget = self.max_num_scheduled_tokens # Encoder-related. @@ -471,7 +471,7 @@ class Scheduler(SchedulerInterface): token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - # Count the number of prifix cached tokens. + # Count the number of prefix cached tokens. if request.num_cached_tokens < 0: request.num_cached_tokens = num_computed_tokens # Encoder-related. 
@@ -588,7 +588,7 @@ class Scheduler(SchedulerInterface): request: Request, num_scheduled_tokens: int, num_scheduled_spec_tokens: int, - new_block_ids: list[list[int]], + new_block_ids: tuple[list[int], ...], resumed_from_preemption: bool, ) -> CachedRequestData: # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating @@ -947,7 +947,7 @@ class Scheduler(SchedulerInterface): return SchedulerStats( num_running_reqs=len(self.running), num_waiting_reqs=len(self.waiting), - gpu_cache_usage=self.kv_cache_manager.usage, + kv_cache_usage=self.kv_cache_manager.usage, prefix_cache_stats=prefix_cache_stats, spec_decoding_stats=spec_decoding_stats, ) @@ -1015,11 +1015,8 @@ class Scheduler(SchedulerInterface): num_computed_tokens = min(num_computed_tokens, request.num_tokens) if num_computed_tokens == request.num_tokens: num_computed_tokens -= 1 - self.kv_cache_manager.cache_blocks( - request, - self.kv_cache_manager.req_to_block_hashes[request.request_id], - num_computed_tokens, - ) + # This will cache the blocks iff caching is enabled. + self.kv_cache_manager.cache_blocks(request, num_computed_tokens) # Update the request state for scheduling. request.num_computed_tokens = num_computed_tokens diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 98d758f820ad6..95222779c3afb 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -197,7 +197,7 @@ class SingleTypeKVCacheManager(ABC): block_pool: BlockPool, kv_cache_spec: KVCacheSpec, use_eagle: bool, - ) -> list[list[KVCacheBlock]]: + ) -> tuple[list[KVCacheBlock], ...]: """ Get the longest cache hit prefix of the blocks that is not longer than `max_length`. The prefix should be a common prefix hit for all the @@ -222,7 +222,7 @@ class SingleTypeKVCacheManager(ABC): element is a list of cached blocks for the i-th kv cache group in `kv_cache_group_ids`. 
For example, sliding window manager should return a list like - [[NULL, NULL, KVCacheBlock(7), KVCacheBlock(8)]] for block size 4 + ([NULL, NULL, KVCacheBlock(7), KVCacheBlock(8)]) for block size 4 and sliding window 8 and len(kv_cache_group_ids) = 1. """ @@ -254,27 +254,25 @@ class FullAttentionManager(SingleTypeKVCacheManager): block_pool: BlockPool, kv_cache_spec: KVCacheSpec, use_eagle: bool, - ) -> list[list[KVCacheBlock]]: + ) -> tuple[list[KVCacheBlock], ...]: assert isinstance(kv_cache_spec, FullAttentionSpec), ( "FullAttentionManager can only be used for full attention groups") - computed_blocks: list[list[KVCacheBlock]] = [ - [] for _ in range(len(kv_cache_group_ids)) - ] + computed_blocks: tuple[list[KVCacheBlock], ...] = tuple( + [] for _ in range(len(kv_cache_group_ids))) max_num_blocks = max_length // kv_cache_spec.block_size - for i in range(max_num_blocks): - block_hash = block_hashes[i] + for i, block_hash in zip(range(max_num_blocks), block_hashes): # block_hashes is a chain of block hashes. If a block hash is not # in the cached_block_hash_to_id, the following block hashes are # not computed yet for sure. 
if cached_block := block_pool.get_cached_block( block_hash, kv_cache_group_ids): - for j in range(len(kv_cache_group_ids)): - computed_blocks[j].append(cached_block[j]) + for computed, cached in zip(computed_blocks, cached_block): + computed.append(cached) else: break - if use_eagle and len(computed_blocks[0]) > 0: - for j in range(len(kv_cache_group_ids)): - computed_blocks[j].pop() + if use_eagle and computed_blocks[0]: + for computed in computed_blocks: + computed.pop() return computed_blocks def remove_skipped_blocks(self, request_id: str, @@ -311,7 +309,7 @@ class SlidingWindowManager(SingleTypeKVCacheManager): block_pool: BlockPool, kv_cache_spec: KVCacheSpec, use_eagle: bool, - ) -> list[list[KVCacheBlock]]: + ) -> tuple[list[KVCacheBlock], ...]: assert isinstance(kv_cache_spec, SlidingWindowSpec), ( "SlidingWindowManager can only be used for sliding window groups") @@ -332,23 +330,23 @@ class SlidingWindowManager(SingleTypeKVCacheManager): # sliding_window_contiguous_blocks), # which is good for low cache hit rate scenarios. max_num_blocks = max_length // kv_cache_spec.block_size - computed_blocks = [[block_pool.null_block] * max_num_blocks - for _ in range(len(kv_cache_group_ids))] + computed_blocks = tuple([block_pool.null_block] * max_num_blocks + for _ in range(len(kv_cache_group_ids))) num_contiguous_blocks = 0 match_found = False # Search from right to left and early stop when a match is found. for i in range(max_num_blocks - 1, -1, -1): if cached_block := block_pool.get_cached_block( block_hashes[i], kv_cache_group_ids): - for j in range(len(kv_cache_group_ids)): - computed_blocks[j][i] = cached_block[j] + for computed, cached in zip(computed_blocks, cached_block): + computed[i] = cached num_contiguous_blocks += 1 - if (num_contiguous_blocks >= sliding_window_contiguous_blocks): + if num_contiguous_blocks >= sliding_window_contiguous_blocks: # Trim the trailing blocks. 
# E.g., [NULL, NULL, 8, 3, NULL, 9] -> [NULL, NULL, 8, 3] # when sliding_window_contiguous_blocks=2. - for j in range(len(kv_cache_group_ids)): - del computed_blocks[j][i + num_contiguous_blocks:] + for computed in computed_blocks: + del computed[i + num_contiguous_blocks:] match_found = True break else: @@ -356,11 +354,11 @@ class SlidingWindowManager(SingleTypeKVCacheManager): if not match_found: # The first `num_contiguous_blocks` is a cache hit even if # `num_contiguous_blocks < sliding_window_contiguous_blocks`. - for j in range(len(kv_cache_group_ids)): - del computed_blocks[j][num_contiguous_blocks:] - if use_eagle and len(computed_blocks[0]) > 0: - for j in range(len(kv_cache_group_ids)): - computed_blocks[j].pop() + for computed in computed_blocks: + del computed[num_contiguous_blocks:] + if use_eagle and computed_blocks[0]: + for computed in computed_blocks: + computed.pop() return computed_blocks def remove_skipped_blocks(self, request_id: str, diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py index 4f6ba099c650c..031e9b85f24c6 100644 --- a/vllm/v1/engine/coordinator.py +++ b/vllm/v1/engine/coordinator.py @@ -183,11 +183,12 @@ class CoordinatorProc: # engines are paused, so that we can wake the other # engines. engine_to_exclude, wave = msgspec.msgpack.decode(buffer) - if wave < self.current_wave: - # If the wave number is stale, ensure the message is - # handled by all the engines. - engine_to_exclude = None if not self.engines_running: + if wave < self.current_wave: + # If the wave number is stale, ensure the message + # is handled by all the engines. + engine_to_exclude = None + self.engines_running = True self.stats_changed = True self._send_start_wave(publish_back, self.current_wave, @@ -203,22 +204,24 @@ class CoordinatorProc: assert outputs.utility_output is None eng_index = outputs.engine_index - if outputs.scheduler_stats: + scheduler_stats = outputs.scheduler_stats + if scheduler_stats: # 1. 
Updated request load stats - update our local # state with these. stats = self.engines[eng_index].request_counts - stats[0] = outputs.scheduler_stats.num_waiting_reqs - stats[1] = outputs.scheduler_stats.num_running_reqs + stats[0] = scheduler_stats.num_waiting_reqs + stats[1] = scheduler_stats.num_running_reqs self.stats_changed = True if (wave := outputs.wave_complete) is not None: # 2. Notification from rank 0 engine that we've # moved into the global paused state - # (engines_running==False) + # (engines_running==False). if self.current_wave <= wave: + new_wave = wave + 1 logger.debug("Moving DP wave from %d to %d.", - self.current_wave, wave) - self.current_wave = wave + 1 + self.current_wave, new_wave) + self.current_wave = new_wave self.engines_running = False self.stats_changed = True elif (wave := outputs.start_wave) is not None and ( diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f36a491a19702..57fcf8daa5a1b 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -84,6 +84,8 @@ class EngineCore: vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks + self.collective_rpc("initialize_cache", + args=(num_gpu_blocks, num_cpu_blocks)) self.structured_output_manager = StructuredOutputManager(vllm_config) @@ -209,11 +211,14 @@ class EngineCore: def execute_model(self, scheduler_output: SchedulerOutput): try: return self.model_executor.execute_model(scheduler_output) - except BaseException as err: + except Exception as err: + # We do not want to catch BaseException here since we're only + # interested in dumping info when the exception is due to an + # error from execute_model itself. 
+ # NOTE: This method is exception-free dump_engine_exception(self.vllm_config, scheduler_output, self.scheduler.make_stats()) - # Re-raise exception raise err def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index c6fe2d339c93d..35aceba0fe766 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -17,6 +17,14 @@ from vllm.v1.engine import EngineCoreRequest logger = init_logger(__name__) +# Only tokenizers >= 0.21.1 supports DecodeStream used for +# FastIncrementalDetokenizer. +USE_FAST_DETOKENIZER = version.parse( + tokenizers.__version__) >= version.parse("0.21.1") + +# Error string from https://github.com/huggingface/tokenizers/blob/909fdde2a4ffedd9295206f705eb612be2a91b12/tokenizers/src/tokenizer/mod.rs#L1042 +INVALID_PREFIX_ERR_MSG = "Invalid prefix encountered" + class IncrementalDetokenizer: @@ -46,10 +54,9 @@ class IncrementalDetokenizer: # No tokenizer => skipping detokenization. return IncrementalDetokenizer() - if (isinstance(tokenizer, PreTrainedTokenizerFast) and version.parse( - tokenizers.__version__) >= version.parse("0.21.1")): + if USE_FAST_DETOKENIZER and isinstance(tokenizer, + PreTrainedTokenizerFast): # Fast tokenizer => use tokenizers library DecodeStream. - # And only tokenizers >= 0.21.1 supports Fast Detokenizer. return FastIncrementalDetokenizer(tokenizer, request) # Fall back to slow python-based incremental detokenization. 
@@ -157,8 +164,11 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer): super().__init__(request) sampling_params = request.sampling_params + + self.request_id = request.request_id + self.skip_special_tokens = sampling_params.skip_special_tokens self.stream = DecodeStream( - skip_special_tokens=sampling_params.skip_special_tokens) + skip_special_tokens=self.skip_special_tokens) self.tokenizer: Tokenizer = tokenizer._tokenizer @@ -174,7 +184,7 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer): # Prime the stream. for tid in prompt_suffix: - self.stream.step(self.tokenizer, tid) + self._protected_step(tid) self.spaces_between_special_tokens = ( sampling_params.skip_special_tokens @@ -199,7 +209,7 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer): self.spaces_between_special_tokens = True def decode_next(self, next_token_id: int) -> str: - token = self.stream.step(self.tokenizer, next_token_id) + token = self._protected_step(next_token_id) if not self.spaces_between_special_tokens: special_token = self.added_token_ids.get(next_token_id) @@ -211,6 +221,23 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer): return token or "" + def _protected_step(self, next_token_id: int) -> Optional[str]: + try: + token = self.stream.step(self.tokenizer, next_token_id) + except Exception as e: + if str(e) != INVALID_PREFIX_ERR_MSG: + raise e + # Recover from edge case where tokenizer can produce non-monotonic, + # invalid UTF-8 output, which breaks the internal state of + # tokenizers' DecodeStream. + # See https://github.com/vllm-project/vllm/issues/17448. 
+ logger.warning( + "Encountered invalid prefix detokenization error" + " for request %s, resetting decode stream.", self.request_id) + self.stream = DecodeStream(self.skip_special_tokens) + token = self.stream.step(self.tokenizer, next_token_id) + return token + class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer): diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 0bd7383b5f0e4..2148680d5f565 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -20,6 +20,7 @@ from typing import Any, Callable, Optional, Union, cast import cloudpickle +import vllm.envs as envs from vllm.config import VllmConfig from vllm.distributed import (destroy_distributed_environment, destroy_model_parallel) @@ -72,7 +73,10 @@ class MultiprocExecutor(Executor): # Initialize worker and set up message queues for SchedulerOutputs # and ModelRunnerOutputs - self.rpc_broadcast_mq = MessageQueue(self.world_size, self.world_size) + max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024 + self.rpc_broadcast_mq = MessageQueue(self.world_size, + self.world_size, + max_chunk_bytes=max_chunk_bytes) scheduler_output_handle = self.rpc_broadcast_mq.export_handle() # Create workers diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 2d621ec31038f..11865a0fd1f27 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -127,7 +127,7 @@ class LoggingStatLogger(StatLoggerBase): generation_throughput, scheduler_stats.num_running_reqs, scheduler_stats.num_waiting_reqs, - scheduler_stats.gpu_cache_usage * 100, + scheduler_stats.kv_cache_usage * 100, self.prefix_caching_metrics.hit_rate * 100, ) self.spec_decoding_logging.log(log_fn=log_fn) @@ -185,22 +185,49 @@ class PrometheusStatLogger(StatLoggerBase): # # GPU cache # + # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc + # TODO: in 0.10, only enable if show_hidden_metrics=True self.gauge_gpu_cache_usage = 
self._gauge_cls( name="vllm:gpu_cache_usage_perc", - documentation="GPU KV-cache usage. 1 means 100 percent usage.", + documentation=( + "GPU KV-cache usage. 1 means 100 percent usage." + "DEPRECATED: Use vllm:kv_cache_usage_perc instead."), multiprocess_mode="mostrecent", labelnames=labelnames).labels(*labelvalues) + # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries + # TODO: in 0.10, only enable if show_hidden_metrics=True self.counter_gpu_prefix_cache_queries = self._counter_cls( name="vllm:gpu_prefix_cache_queries", documentation= - "GPU prefix cache queries, in terms of number of queried tokens.", + ("GPU prefix cache queries, in terms of number of queried tokens." + "DEPRECATED: Use vllm:prefix_cache_queries instead."), labelnames=labelnames).labels(*labelvalues) + # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits + # TODO: in 0.10, only enable if show_hidden_metrics=True self.counter_gpu_prefix_cache_hits = self._counter_cls( name="vllm:gpu_prefix_cache_hits", - documentation= - "GPU prefix cache hits, in terms of number of cached tokens.", + documentation=( + "GPU prefix cache hits, in terms of number of cached tokens." + "DEPRECATED: Use vllm:prefix_cache_hits instead."), + labelnames=labelnames).labels(*labelvalues) + + self.gauge_kv_cache_usage = self._gauge_cls( + name="vllm:kv_cache_usage_perc", + documentation="KV-cache usage. 
1 means 100 percent usage.", + labelnames=labelnames).labels(*labelvalues) + + self.counter_prefix_cache_queries = self._counter_cls( + name="vllm:prefix_cache_queries", + documentation=( + "Prefix cache queries, in terms of number of queried tokens."), + labelnames=labelnames).labels(*labelvalues) + + self.counter_prefix_cache_hits = self._counter_cls( + name="vllm:prefix_cache_hits", + documentation=( + "Prefix cache hits, in terms of number of cached tokens."), labelnames=labelnames).labels(*labelvalues) # @@ -400,13 +427,19 @@ class PrometheusStatLogger(StatLoggerBase): self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) - self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage) + self.gauge_gpu_cache_usage.set(scheduler_stats.kv_cache_usage) + self.gauge_kv_cache_usage.set(scheduler_stats.kv_cache_usage) self.counter_gpu_prefix_cache_queries.inc( scheduler_stats.prefix_cache_stats.queries) self.counter_gpu_prefix_cache_hits.inc( scheduler_stats.prefix_cache_stats.hits) + self.counter_prefix_cache_queries.inc( + scheduler_stats.prefix_cache_stats.queries) + self.counter_prefix_cache_hits.inc( + scheduler_stats.prefix_cache_stats.hits) + if scheduler_stats.spec_decoding_stats is not None: self.spec_decoding_prom.observe( scheduler_stats.spec_decoding_stats) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 50c8b07fe54d2..4a5d5fac49d1d 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -33,7 +33,7 @@ class SchedulerStats: num_running_reqs: int = 0 num_waiting_reqs: int = 0 - gpu_cache_usage: float = 0.0 + kv_cache_usage: float = 0.0 prefix_cache_stats: PrefixCacheStats = field( default_factory=PrefixCacheStats) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 4b5c9b7ec640e..153b67fe57147 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -138,15 +138,17 @@ class EagleProposer: 
max_query_len = query_lens.max().item() common_attn_metadata = CommonAttentionMetadata( - query_start_loc=cu_num_tokens, seq_lens=seq_lens) + query_start_loc=cu_num_tokens, + seq_lens=seq_lens, + num_reqs=batch_size, + num_actual_tokens=num_tokens, + max_query_len=max_query_len, + ) assert self.runner is not None # FIXME: need to consider multiple kv_cache_groups attn_metadata = self.runner.attn_metadata_builder.build( - num_reqs=batch_size, - num_actual_tokens=num_tokens, - max_query_len=max_query_len, common_prefix_len=0, common_attn_metadata=common_attn_metadata, ) @@ -320,8 +322,10 @@ class EagleProposer: target_attn_layer_names = set( get_layers_from_vllm_config(self.vllm_config, Attention).keys()) - self.model = get_model(vllm_config=self.vllm_config, - model_config=draft_model_config) + from vllm.compilation.backends import set_model_tag + with set_model_tag("eagle_head"): + self.model = get_model(vllm_config=self.vllm_config, + model_config=draft_model_config) draft_attn_layer_names = ( get_layers_from_vllm_config(self.vllm_config, Attention).keys() - @@ -329,16 +333,24 @@ class EagleProposer: self.attn_layer_names = list(draft_attn_layer_names) + if supports_multimodal(target_model): + # handle multimodality + self.model.config.image_token_index = ( + target_model.config.image_token_index) + target_language_model = target_model.get_language_model() + else: + target_language_model = target_model # share embed_tokens with the target model if needed if get_pp_group().world_size == 1 \ and self.model.model.embed_tokens.weight.shape \ - == target_model.model.embed_tokens.weight.shape: + == target_language_model.model.embed_tokens.weight.shape: logger.info( "Assuming the EAGLE head shares the same vocab embedding" \ " with the target model." 
) del self.model.model.embed_tokens - self.model.model.embed_tokens = target_model.model.embed_tokens + self.model.model.embed_tokens = ( + target_language_model.model.embed_tokens) else: logger.info( "The EAGLE head's vocab embedding will be loaded separately" \ @@ -349,12 +361,9 @@ class EagleProposer: # some model definition do not define lm_head explicitly # and reuse embed_tokens for lm_head, e.g., CohereForCausalLM if self.vllm_config.speculative_config.method != "eagle3" and \ - hasattr(target_model, "lm_head"): + hasattr(target_language_model, "lm_head"): logger.info("Loading EAGLE LM head weights from the target model.") - if supports_multimodal(target_model): - self.model.lm_head = target_model.get_language_model().lm_head - else: - self.model.lm_head = target_model.lm_head + self.model.lm_head = target_language_model.lm_head @torch.inference_mode() def dummy_run( diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py index f516bf486b8b5..309fd926aecd7 100644 --- a/vllm/v1/spec_decode/medusa.py +++ b/vllm/v1/spec_decode/medusa.py @@ -48,9 +48,11 @@ class MedusaProposer: return [list(row) for row in zip(*draft_tokens)] def load_model(self, target_model: nn.Module) -> None: - self.model = get_model(vllm_config=self.vllm_config, - model_config=self.vllm_config. - speculative_config.draft_model_config) + from vllm.compilation.backends import set_model_tag + with set_model_tag("medusa_head"): + self.model = get_model(vllm_config=self.vllm_config, + model_config=self.vllm_config. 
+ speculative_config.draft_model_config) @torch.inference_mode() def dummy_run(self, num_tokens: int) -> None: diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 5b497e66c4bf3..192c9067740c2 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -407,7 +407,7 @@ class CoreEngineActorManager: # For now, each DP rank can only be assigned to one node # TODO(rui): support allocating a single DP rank # to multiple nodes - available_engine_count = node_resources["GPU"] // world_size + available_engine_count = int(node_resources["GPU"]) // world_size if node_ip == dp_master_ip: assert available_engine_count >= local_engine_count, ( "Not enough resources to allocate DP ranks " diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 5cd5674fb5220..8f4e8d64c615d 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -112,11 +112,12 @@ class MultiGroupBlockTable: for block_size in block_sizes ] - def append_row(self, block_ids: list[list[int]], row_idx: int) -> None: + def append_row(self, block_ids: tuple[list[int], ...], + row_idx: int) -> None: for i, block_table in enumerate(self.block_tables): block_table.append_row(block_ids[i], row_idx) - def add_row(self, block_ids: list[list[int]], row_idx: int) -> None: + def add_row(self, block_ids: tuple[list[int], ...], row_idx: int) -> None: for i, block_table in enumerate(self.block_tables): block_table.add_row(block_ids[i], row_idx) diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index 607cfc0ef69cd..6631c9636eacd 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -60,7 +60,8 @@ class CPUModelRunner(GPUModelRunner): def warming_up_model(self) -> None: logger.info("Warming up model for the compilation...") # Only generate graph for the generic shape - self._dummy_run(max(16, self.max_num_reqs)) + with _set_global_compilation_settings(self.vllm_config): + self._dummy_run(max(16, 
self.max_num_reqs)) logger.info("Warming up done.") def _init_device_properties(self) -> None: @@ -71,16 +72,15 @@ class CPUModelRunner(GPUModelRunner): @contextmanager -def _set_global_compilation_settings(): +def _set_global_compilation_settings(config: VllmConfig): import torch._inductor.config - # Note: The CPPGEMM backend requires freezing parameters. - freezing_value = torch._inductor.config.freezing - torch._inductor.config.freezing = True - # Note: workaround for "ValueError: fast mode: can't pickle cyclic objects - # including object type dict" - force_disable_caches = torch._inductor.config.force_disable_caches - torch._inductor.config.force_disable_caches = True - yield - torch._inductor.config.freezing = freezing_value - torch._inductor.config.force_disable_caches = force_disable_caches + inductor_config = config.compilation_config.inductor_compile_config + try: + # Note: The MKLDNN and CPPGEMM backend requires freezing parameters. + freezing_value = torch._inductor.config.freezing + if inductor_config.get("max_autotune", False): + torch._inductor.config.freezing = True + yield + finally: + torch._inductor.config.freezing = freezing_value diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index 0b710b7bc203f..9a35e88120386 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import os +from importlib import util from typing import Optional import torch @@ -38,10 +39,14 @@ class CPUWorker(Worker): def init_device(self): # Setup OpenMP threads affinity. 
omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND - if omp_cpuids == "all": - self.local_omp_cpuid = "all" + self.local_omp_cpuid = "all" + if omp_cpuids == "auto": + self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes( + ) else: self.local_omp_cpuid = omp_cpuids.split("|")[self.rank] + + if self.local_omp_cpuid != "all": ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) if ret: logger.info(ret) @@ -99,3 +104,49 @@ class CPUWorker(Worker): assert isinstance(output, ModelRunnerOutput) return output if self.is_driver_worker else None + + def get_cpus_id_binding_based_on_numa_nodes(self) -> str: + """Return CPUs id binding based on NUMA nodes. + """ + rank_to_cpus = self.local_omp_cpuid + # Setup OpenMP thread affinity based on NUMA nodes automatically + world_size = self.vllm_config.parallel_config.world_size + libnuma_found = util.find_spec("numa") is not None + psutil_found = util.find_spec("psutil") is not None + if libnuma_found and psutil_found: + import psutil + from numa import info + cpu_count = psutil.cpu_count(logical=False) + cpus_allow_list = psutil.Process().cpu_affinity() + numa_size = info.get_num_configured_nodes() + cpu_count_per_numa = cpu_count // numa_size + num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU, + cpu_count_per_numa // 2) + + # check allow node_to_cpus list + node_to_cpus = [] + for i in range(numa_size): + node_intersect = set( + info.node_to_cpus(i)).intersection(cpus_allow_list) + if bool(node_intersect): + node_to_cpus.append(list(node_intersect)) + + if world_size > len(node_to_cpus): + logger.error( + "Auto thread-binding failed due to " + "world size: %d is larger than " + "allowed NUMA nodes number: %d." 
+ "Please try to bind threads manually.", world_size, + len(node_to_cpus)) + else: + end = cpu_count_per_numa - num_of_reserved_cpu + rank_to_cpus_list = node_to_cpus[self.rank][:end] + rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list) + logger.info("auto thread-binding list: %s", rank_to_cpus) + else: + logger.warning( + "Auto thread-binding is not supported due to " + "the lack of package numa and psutil," + "fallback to no thread-binding. To get better performance," + "please try to manually bind threads.") + return rank_to_cpus diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 34737029f6bf3..ebb770a7ddb29 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -30,7 +30,7 @@ class CachedRequestState: sampling_params: SamplingParams generator: Optional[torch.Generator] - block_ids: list[list[int]] + block_ids: tuple[list[int], ...] num_computed_tokens: int output_token_ids: list[int] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ebec98f7219c4..842700ea23583 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -6,18 +6,19 @@ import gc import threading import time import weakref +from contextlib import contextmanager from typing import TYPE_CHECKING, Any, Optional, TypeAlias, Union import numpy as np import torch import torch.distributed import torch.nn as nn +from tqdm import tqdm +import vllm.envs as envs from vllm.attention import AttentionType, get_attn_backend -from vllm.attention.backends.abstract import (AttentionBackend, - AttentionMetadataBuilder) +from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.layer import Attention -from vllm.attention.utils.fa_utils import get_flash_attn_version from vllm.config import (CompilationLevel, VllmConfig, get_layers_from_vllm_config) from vllm.distributed.kv_transfer import (get_kv_transfer_group, @@ -42,7 +43,8 @@ from 
vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, check_use_alibi, is_pin_memory_available, current_stream) from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata -from vllm.v1.attention.backends.utils import CommonAttentionMetadata +from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, + CommonAttentionMetadata) from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec, KVCacheConfig, KVCacheSpec, @@ -68,11 +70,15 @@ from .utils import (gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, if TYPE_CHECKING: import xgrammar as xgr + import xgrammar.kernels.apply_token_bitmask_inplace_torch_compile as xgr_torch_compile # noqa: E501 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.v1.core.sched.output import SchedulerOutput else: xgr = LazyLoader("xgr", globals(), "xgrammar") + xgr_torch_compile = LazyLoader( + "xgr_torch_compile", globals(), + "xgrammar.kernels.apply_token_bitmask_inplace_torch_compile") logger = init_logger(__name__) @@ -95,6 +101,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.vllm_config = vllm_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config + self.compilation_config = vllm_config.compilation_config self.lora_config = vllm_config.lora_config self.load_config = vllm_config.load_config self.parallel_config = vllm_config.parallel_config @@ -203,7 +210,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): block_sizes=[self.cache_config.block_size], ) - self.use_cuda_graph = (self.vllm_config.compilation_config.level + self.use_cuda_graph = (self.compilation_config.level == CompilationLevel.PIECEWISE and not self.model_config.enforce_eager) logger.info(f"self.use_cuda_graph {self.use_cuda_graph}") @@ -212,8 +219,9 @@ class GPUModelRunner(LoRAModelRunnerMixin): # self.cudagraph_batch_sizes sorts in ascending order. 
# The batch sizes in the config are in descending order. self.cudagraph_batch_sizes = list( - reversed( - self.vllm_config.compilation_config.cudagraph_capture_sizes)) + reversed(self.compilation_config.cudagraph_capture_sizes)) + + self.full_cuda_graph = self.compilation_config.full_cuda_graph # Cache the device properties. self._init_device_properties() @@ -474,10 +482,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): # Update the block IDs. if not req_data.resumed_from_preemption: # Append the new blocks to the existing block IDs. - for block_ids, new_block_ids in zip( # type: ignore[call-overload] - req_state.block_ids, - req_data.new_block_ids, - strict=True): + for block_ids, new_block_ids in zip(req_state.block_ids, + req_data.new_block_ids): block_ids.extend(new_block_ids) else: # The request is resumed from preemption. @@ -601,9 +607,16 @@ class GPUModelRunner(LoRAModelRunnerMixin): def _prepare_inputs( self, scheduler_output: "SchedulerOutput" - ) -> tuple[PerLayerAttnMetadata, torch.Tensor, + ) -> tuple[PerLayerAttnMetadata, bool, torch.Tensor, Optional[SpecDecodeMetadata], Optional[UBatchSlices], int, Optional[torch.Tensor]]: + """ + :return: tuple[ + attn_metadata: layer-to-attention_metadata mapping, + attention_cuda_graphs: whether attention can run in cudagraph + logits_indices, spec_decode_metadata + ] + """ total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 num_reqs = self.input_batch.num_reqs @@ -757,13 +770,21 @@ class GPUModelRunner(LoRAModelRunnerMixin): # Fill unused with -1. 
Needed for reshape_and_cache self.seq_lens[num_reqs:].fill_(0) - self.query_start_loc[num_reqs + 1:].fill_(-1) + # Note: pad query_start_loc to be non-decreasing, as kernels + # like FlashAttention requires that + self.query_start_loc[num_reqs + 1:].fill_( + self.query_start_loc_cpu[num_reqs].item()) query_start_loc = self.query_start_loc[:num_reqs + 1] seq_lens = self.seq_lens[:num_reqs] common_attn_metadata = CommonAttentionMetadata( - query_start_loc=query_start_loc, seq_lens=seq_lens) + query_start_loc=query_start_loc, + seq_lens=seq_lens, + num_reqs=num_reqs, + num_actual_tokens=total_num_scheduled_tokens, + max_query_len=max_num_scheduled_tokens, + ) attn_metadata: PerLayerAttnMetadata = {} if ubatch_slices is not None: @@ -776,13 +797,14 @@ class GPUModelRunner(LoRAModelRunnerMixin): # Prepare for cascade attention if enabled & beneficial. common_prefix_len = 0 + builder = self.attn_metadata_builders[kv_cache_group_id] if self.cascade_attn_enabled: common_prefix_len = self._compute_cascade_attn_prefix_len( num_scheduled_tokens, scheduler_output. num_common_prefix_blocks[kv_cache_group_id], kv_cache_group_spec.kv_cache_spec, - self.attn_metadata_builders[kv_cache_group_id], + builder, ) # Fill unused with -1. 
Needed for reshape_and_cache in full cuda @@ -814,15 +836,16 @@ class GPUModelRunner(LoRAModelRunnerMixin): else: attn_metadata_i = ( self.attn_metadata_builders[kv_cache_group_id].build( - num_reqs=num_reqs, - num_actual_tokens=total_num_scheduled_tokens, - max_query_len=max_num_scheduled_tokens, common_prefix_len=common_prefix_len, common_attn_metadata=common_attn_metadata)) for layer_name in kv_cache_group_spec.layer_names: assert type(attn_metadata) is dict attn_metadata[layer_name] = attn_metadata_i + attention_cuda_graphs = all( + b.can_run_in_cudagraph(common_attn_metadata) + for b in self.attn_metadata_builders) + use_spec_decode = len( scheduler_output.scheduled_spec_decode_tokens) > 0 if not use_spec_decode: @@ -851,8 +874,9 @@ class GPUModelRunner(LoRAModelRunnerMixin): if self.lora_config: self.set_active_loras(self.input_batch, num_scheduled_tokens) - return (attn_metadata, logits_indices, spec_decode_metadata, - ubatch_slices, num_pad_tokens, num_tokens_after_padding) + return (attn_metadata, attention_cuda_graphs, logits_indices, + spec_decode_metadata, ubatch_slices, num_pad_tokens, + num_tokens_after_padding) def _compute_cascade_attn_prefix_len( self, @@ -1091,7 +1115,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): encoder_outputs = [] for grouped_mm_inputs in grouped_mm_inputs_list: - batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs) + batched_mm_inputs = MultiModalKwargs.batch( + grouped_mm_inputs, pin_memory=self.pin_memory) batched_mm_inputs = MultiModalKwargs.as_kwargs( batched_mm_inputs, device=self.device, @@ -1230,7 +1255,10 @@ class GPUModelRunner(LoRAModelRunnerMixin): # so we receive it in that format. grammar_bitmask = torch.from_numpy(grammar_bitmask) - xgr.apply_token_bitmask_inplace( + # Force use of the torch.compile implementation from xgrammar to work + # around issues with the Triton kernel in concurrent structured output + # scenarios. See PR #19565 and issues #19493, #18376 for details. 
+ xgr_torch_compile.apply_token_bitmask_inplace_torch_compile( logits, grammar_bitmask.to(self.device, non_blocking=True), indices=out_indices, @@ -1245,7 +1273,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): num_tokens = tokens_slice.stop - tokens_slice.start tp = self.vllm_config.parallel_config.tensor_parallel_size - enabled_sp = self.vllm_config.compilation_config.pass_config. \ + enabled_sp = self.compilation_config.pass_config. \ enable_sequence_parallelism if enabled_sp: # When sequence parallelism is enabled, we always pad num_tokens @@ -1520,7 +1548,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): ubatch_slices: Optional[UBatchSlices] = None, scheduler_output: Optional["SchedulerOutput"] = None, is_dummy_run: bool = False, - num_tokens_across_dp: Optional[torch.Tensor] = None): + num_tokens_across_dp: Optional[torch.Tensor] = None, + skip_cuda_graphs: bool = False): num_dummy_tokens = num_scheduled_tokens if is_dummy_run else 1 @@ -1587,7 +1616,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): if attn_metadata is not None else None, self.vllm_config, num_tokens=num_tokens, - num_tokens_across_dp=num_tokens_across_dp) + num_tokens_across_dp=num_tokens_across_dp, + skip_cuda_graphs=skip_cuda_graphs) thread = threading.Thread(target=_ubatch_thread, args=( @@ -1626,7 +1656,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): set_forward_context(attn_metadata, vllm_config=self.vllm_config, num_tokens=num_scheduled_tokens or 1, - num_tokens_across_dp=num_tokens_across_dp), + num_tokens_across_dp=num_tokens_across_dp, + skip_cuda_graphs=skip_cuda_graphs), is_dummy_run) return model_output @@ -1649,7 +1680,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): # num_scheduled_tokens_old = scheduler_output.total_num_scheduled_tokens # num_pad_tokens, num_tokens_after_padding = self.get_dp_padding(num_scheduled_tokens_old) # Prepare the decoder inputs. 
- attn_metadata, logits_indices, spec_decode_metadata, ubatch_slices, num_pad_tokens, num_tokens_after_padding = ( + attn_metadata, attention_cuda_graphs, logits_indices, spec_decode_metadata, ubatch_slices, num_pad_tokens, num_tokens_after_padding = ( self._prepare_inputs(scheduler_output)) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens num_input_tokens = num_scheduled_tokens @@ -1662,6 +1693,11 @@ class GPUModelRunner(LoRAModelRunnerMixin): # logger.info("NORMAL BATCH DONE") num_input_tokens += num_pad + # Some attention backends only support CUDA Graphs in pure decode. + # If attention doesn't support CUDA Graphs for this batch, but we + # compiled with full CUDA graphs, we have to skip them entirely. + skip_cuda_graphs = self.full_cuda_graph and not attention_cuda_graphs + # logger.info("RUNNING MODEL") # Run the decoder. # Use persistent buffers for CUDA graphs. @@ -1671,7 +1707,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): num_scheduled_tokens=num_input_tokens, ubatch_slices=ubatch_slices, scheduler_output=scheduler_output, - num_tokens_across_dp=num_tokens_after_padding + num_tokens_across_dp=num_tokens_after_padding, + skip_cuda_graphs=skip_cuda_graphs, ) self.maybe_wait_for_kv_save() finished_sending, finished_recving = ( @@ -2142,15 +2179,45 @@ class GPUModelRunner(LoRAModelRunnerMixin): return prompt_logprobs_dict + @contextmanager + def maybe_randomize_inputs(self, input_ids: torch.Tensor): + """ + Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set. 
+ This is to help balance expert-selection + - during profile_run + - during DP rank dummy run + """ + dp_size = self.vllm_config.parallel_config.data_parallel_size + randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1 + if not randomize_inputs: + yield + else: + import functools + + @functools.cache + def rand_input_ids() -> torch.Tensor: + return torch.randint_like( + self.input_ids, + low=0, + high=self.model_config.get_vocab_size(), + dtype=input_ids.dtype) + + logger.debug("Randomizing dummy data for DP Rank") + input_ids.copy_(rand_input_ids()[:input_ids.size(0)], + non_blocking=True) + yield + input_ids.fill_(0) + @torch.inference_mode() def _dummy_run( self, num_tokens: int, skip_attn: bool = True, + # Maybe return a cudagraph here + capture_attn_cudagraph: bool = False, # For profiling runs we dont want microbatching but for # dp dummy runs we do. allow_microbatching: bool = False, - # Maybe return a cudagraph here ) -> torch.Tensor: should_microbatch = False @@ -2178,9 +2245,10 @@ class GPUModelRunner(LoRAModelRunnerMixin): num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32) - if skip_attn: - attn_metadata: Optional[dict[str, Any]] = None - else: + attn_metadata: Optional[dict[str, Any]] = None + if capture_attn_cudagraph: + attn_metadata = {} + query_start_loc = self.query_start_loc[:num_reqs + 1] # Make sure max_model_len is used at the graph capture time. 
self.seq_lens_np[:num_reqs] = self.max_model_len @@ -2190,19 +2258,19 @@ class GPUModelRunner(LoRAModelRunnerMixin): seq_lens = self.seq_lens[:num_reqs] common_attn_metadata = CommonAttentionMetadata( - query_start_loc=query_start_loc, seq_lens=seq_lens) + query_start_loc=query_start_loc, + seq_lens=seq_lens, + num_reqs=num_reqs, + num_actual_tokens=num_tokens, + max_query_len=num_tokens, + ) - attn_metadata = {} for kv_cache_group_id, kv_cache_group_spec in enumerate( self.kv_cache_config.kv_cache_groups): - attn_metadata_i = ( - self.attn_metadata_builders[kv_cache_group_id].build( - num_reqs=num_reqs, - num_actual_tokens=num_tokens, - max_query_len=num_tokens, - common_prefix_len=0, - common_attn_metadata=common_attn_metadata, - )) + + attn_metadata_i = self.attn_metadata_builders[ + kv_cache_group_id].build_for_cudagraph_capture( + common_attn_metadata) for layer_name in kv_cache_group_spec.layer_names: attn_metadata[layer_name] = attn_metadata_i @@ -2363,7 +2431,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): ).multi_modal_data batched_dummy_mm_inputs = MultiModalKwargs.batch( - [dummy_mm_kwargs] * max_num_mm_items) + [dummy_mm_kwargs] * max_num_mm_items, + pin_memory=self.pin_memory) batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs( batched_dummy_mm_inputs, device=self.device, @@ -2408,17 +2477,15 @@ class GPUModelRunner(LoRAModelRunnerMixin): # Capture the large shapes first so that the smaller shapes # can reuse the memory pool allocated for the large shapes. with graph_capture(device=self.device): - skip_attn = not self.vllm_config.compilation_config.full_cuda_graph + full_cg = self.full_cuda_graph allow_microbatching = self.parallel_config.enable_microbatching - for num_tokens in reversed(self.cudagraph_batch_sizes): - for _ in range(self.vllm_config.compilation_config. 
- cudagraph_num_of_warmups): - self._dummy_run(num_tokens, skip_attn=skip_attn, - allow_microbatching=allow_microbatching) - # print("CUDAGRAPH CAPTURE START") - self._dummy_run(num_tokens, skip_attn=skip_attn, - allow_microbatching=allow_microbatching) - # print("CUDAGRAPH CAPTURE END") + for num_tokens in tqdm(reversed(self.cudagraph_batch_sizes), + desc="Capturing CUDA graphs", + total=len(self.cudagraph_batch_sizes)): + for _ in range( + self.compilation_config.cudagraph_num_of_warmups): + self._dummy_run(num_tokens, capture_attn_cudagraph=full_cg) + self._dummy_run(num_tokens, capture_attn_cudagraph=full_cg) logger.info("CAPTURE MODEL END") end_time = time.perf_counter() @@ -2462,20 +2529,20 @@ class GPUModelRunner(LoRAModelRunnerMixin): "Non-Attention backend is not supported by V1 " "GPUModelRunner.") - if self.vllm_config.compilation_config.full_cuda_graph: - attn_backend_name = attn_backend_i.__name__ - flash_attn_version = get_flash_attn_version() - if attn_backend_name != "FlashAttentionBackend" or \ - flash_attn_version != 3: - raise ValueError( - f"full_cuda_graph is only supported with " - f"FA3. Current attention backend is " - f"{attn_backend_name}, FlashAttention version is " - f"{flash_attn_version}.") - block_table_i = self.input_batch.block_table[i] attn_metadata_builder_i = attn_backend_i.get_builder_cls()( - weakref.proxy(self), kv_cache_spec, block_table_i) + weakref.proxy(self), + kv_cache_spec, + block_table_i, + ) + + if (self.full_cuda_graph + and not attn_metadata_builder_i.full_cudagraph_supported): + raise ValueError( + f"Full CUDAGraph not supported for " + f"{attn_backend_i.__name__}. Turn off CompilationConfig." + f"full_cuda_graph or use a different attention backend.") + self.attn_backends.append(attn_backend_i) self.attn_metadata_builders.append(attn_metadata_builder_i) @@ -2515,9 +2582,9 @@ class GPUModelRunner(LoRAModelRunnerMixin): to be reshaped to the desired shape before being used by the models. 
Args: - kv_cache_config: The KV cache config + kv_cache_config: The KV cache config Returns: - dict[str, torch.Tensor]: A map between layer names to their + dict[str, torch.Tensor]: A map between layer names to their corresponding memory buffer for KV cache. """ kv_cache_raw_tensors: dict[str, torch.Tensor] = {} @@ -2544,11 +2611,11 @@ class GPUModelRunner(LoRAModelRunnerMixin): Reshape the KV cache tensors to the desired shape and dtype. Args: - kv_cache_config: The KV cache config - kv_cache_raw_tensors: The KV cache buffer of each layer, with + kv_cache_config: The KV cache config + kv_cache_raw_tensors: The KV cache buffer of each layer, with correct size but uninitialized shape. Returns: - Dict[str, torch.Tensor]: A map between layer names to their + Dict[str, torch.Tensor]: A map between layer names to their corresponding memory buffer for KV cache. """ kv_caches: dict[str, torch.Tensor] = {} @@ -2600,7 +2667,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): Args: kv_cache_config: The KV cache config Returns: - Dict[str, torch.Tensor]: A map between layer names to their + Dict[str, torch.Tensor]: A map between layer names to their corresponding memory buffer for KV cache. 
""" # Initialize the memory buffer for KV cache @@ -2618,10 +2685,9 @@ class GPUModelRunner(LoRAModelRunnerMixin): kv_caches, ) - bind_kv_cache( - kv_caches, - self.vllm_config.compilation_config.static_forward_context, - self.kv_caches) + bind_kv_cache(kv_caches, + self.compilation_config.static_forward_context, + self.kv_caches) return kv_caches def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 0bb790028fbdb..e485cfcfa8f99 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -22,7 +22,7 @@ from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors -from vllm.utils import GiB_bytes +from vllm.utils import GiB_bytes, MemorySnapshot, memory_profiling from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.utils import report_usage_stats @@ -112,6 +112,11 @@ class Worker(WorkerBase): buffer.data.copy_(self._sleep_saved_buffers[name].data) self._sleep_saved_buffers = {} + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + def init_device(self): if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until @@ -130,20 +135,22 @@ class Worker(WorkerBase): _check_if_gpu_supports_dtype(self.model_config.dtype) gc.collect() torch.cuda.empty_cache() - self.init_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info() - requested_memory = (total_gpu_memory * - self.cache_config.gpu_memory_utilization) - if self.init_gpu_memory < requested_memory: + + # take current memory snapshot + self.init_snapshot = MemorySnapshot() + self.requested_memory = (self.init_snapshot.total_memory * + 
self.cache_config.gpu_memory_utilization) + if self.init_snapshot.free_memory < self.requested_memory: GiB = lambda b: round(b / GiB_bytes, 2) raise ValueError( - f"Free memory on device ({GiB(self.init_gpu_memory)}/" - f"{GiB(total_gpu_memory)} GiB) on startup is less than " - f"desired GPU memory utilization " + f"Free memory on device " + f"({GiB(self.init_snapshot.free_memory)}/" + f"{GiB(self.init_snapshot.total_memory)} GiB) on startup " + f"is less than desired GPU memory utilization " f"({self.cache_config.gpu_memory_utilization}, " - f"{GiB(requested_memory)} GiB). Decrease GPU memory " + f"{GiB(self.requested_memory)} GiB). Decrease GPU memory " f"utilization or reduce GPU memory used by other processes." ) - else: raise RuntimeError( f"Not support device type: {self.device_config.device}") @@ -192,57 +199,39 @@ class Worker(WorkerBase): """ torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() + GiB = lambda b: b / GiB_bytes - _, total_gpu_memory = torch.cuda.mem_get_info() # Execute a forward pass with dummy inputs to profile the memory usage # of the model. - self.model_runner.profile_run() + with memory_profiling( + self.init_snapshot, + weights_memory=int( + self.model_runner.model_memory_usage)) as profile_result: + self.model_runner.profile_run() - free_gpu_memory, _ = torch.cuda.mem_get_info() + free_gpu_memory = profile_result.after_profile.free_memory # NOTE(woosuk): Here we assume that the other processes using the same # GPU did not change their memory usage during the profiling. - assert self.init_gpu_memory > free_gpu_memory, ( + assert self.init_snapshot.free_memory > free_gpu_memory, ( "Error in memory profiling. " - f"Initial free memory {self.init_gpu_memory/GiB_bytes} GiB, " - f"current free memory {free_gpu_memory/GiB_bytes} GiB. 
" - f"This happens when the GPU memory was not properly cleaned up " - f"before initializing the vLLM instance.") + f"Initial free memory {GiB(self.init_snapshot.free_memory)} GiB, " + f"current free memory {GiB(free_gpu_memory)} GiB. " + "This happens when other processes sharing the same container " + "release GPU memory while vLLM is profiling during initialization. " + "To fix this, ensure consistent GPU memory allocation or " + "isolate vLLM in its own container.") + available_kv_cache_memory = self.requested_memory \ + - profile_result.non_kv_cache_memory - # Get the peak memory allocation recorded by torch - peak_torch_memory = torch.cuda.memory_stats( - )["allocated_bytes.all.peak"] - - # Check for any memory left around that may have been allocated on the - # gpu outside of `torch`. NCCL operations, for example, can use a few - # GB during a forward pass. - torch.cuda.empty_cache() - torch_allocated_bytes = torch.cuda.memory_stats( - )["allocated_bytes.all.current"] - - # Reset after emptying torch cache - free_gpu_memory = torch.cuda.mem_get_info()[0] - - # Total forward allocation (current) is equal to the diff in free memory - fwd_alloc_bytes = self.init_gpu_memory - free_gpu_memory - # We assume current non-torch allocation is equal to peak - non_torch_alloc_bytes = max(0, fwd_alloc_bytes - torch_allocated_bytes) - # Total forward allocation (peak) is peak torch + non-torch - peak_memory = peak_torch_memory + non_torch_alloc_bytes - - available_kv_cache_memory = ( - total_gpu_memory * self.cache_config.gpu_memory_utilization - - peak_memory) - - GiB = lambda b: b / GiB_bytes logger.debug( "Initial free memory: %.2f GiB, free memory: %.2f GiB, " - "total GPU memory: %.2f GiB", GiB(self.init_gpu_memory), - GiB(free_gpu_memory), GiB(total_gpu_memory)) - logger.debug( - "Peak torch memory: %.2f GiB, non-torch forward-pass memory: " - "%.2f GiB, available KVCache memory: %.2f GiB", - GiB(peak_torch_memory), GiB(non_torch_alloc_bytes), - 
GiB(available_kv_cache_memory)) + "requested GPU memory: %.2f GiB", + GiB(self.init_snapshot.free_memory), GiB(free_gpu_memory), + GiB(self.requested_memory)) + logger.debug(profile_result) + logger.info("Available KV cache memory: %.2f GiB", + GiB(available_kv_cache_memory)) + gc.collect() return int(available_kv_cache_memory) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index d5f40e4d3103c..89c6373b37730 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -413,10 +413,8 @@ class TPUModelRunner(LoRAModelRunnerMixin): req_state.num_computed_tokens = req_data.num_computed_tokens if not req_data.resumed_from_preemption: # Append the new blocks to the existing block IDs. - for block_ids, new_block_ids in zip( # type: ignore[call-overload] - req_state.block_ids, - req_data.new_block_ids, - strict=True): + for block_ids, new_block_ids in zip(req_state.block_ids, + req_data.new_block_ids): block_ids.extend(new_block_ids) else: # The request is resumed from preemption. diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 16a9f0959b5c5..87af8e476707c 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -93,6 +93,11 @@ class TPUWorker: if self.model_config.seed is None: self.model_config.seed = 0 + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + def init_device(self): os.environ["PJRT_DEVICE"] = "TPU" # Note: Currently the XLA compiler wrongly uses 2D ring strategy on 1D @@ -101,7 +106,10 @@ class TPUWorker: # fix this. It will be removed after the bug in XLA compiler is fixed. 
os.environ["LIBTPU_INIT_ARGS"] = ( os.environ.get("LIBTPU_INIT_ARGS", "") + - " --xla_tpu_force_1d_allreduce_at_chunk_count=1") + " --xla_tpu_force_1d_allreduce_at_chunk_count=1" + " --xla_jf_conv_input_fusion=False") + # --xla_jf_conv_input_fusion=False is used to improve the perf of + # quantized matmul. torch.set_grad_enabled(False) torch.set_default_dtype(self.model_config.dtype) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 055cf01530f02..70339ff2f0051 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -4,11 +4,12 @@ from typing import Optional import torch +from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.v1.kv_cache_interface import KVCacheGroupSpec def sanity_check_mm_encoder_outputs( - mm_embeddings: object, + mm_embeddings: MultiModalEmbeddings, expected_num_items: int, ) -> None: """ diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index b04a9a1eb08d1..9e834befd68ab 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A CPU worker class.""" import os +from importlib import util from typing import Dict, List, Optional, Set, Tuple, Type import torch @@ -156,8 +157,10 @@ class CPUWorker(LocalOrDistributedWorkerBase): # Setup OpenMP threads affinity. omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND - if omp_cpuids == "all": - self.local_omp_cpuid = "all" + self.local_omp_cpuid = "all" + if omp_cpuids == "auto": + self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes( + ) else: self.local_omp_cpuid = omp_cpuids.split("|")[rank] @@ -399,3 +402,49 @@ class CPUWorker(LocalOrDistributedWorkerBase): return CPUCacheEngine.get_cache_block_size( self.cache_config.block_size, self.cache_config.cache_dtype, self.model_config, self.parallel_config) + + def get_cpus_id_binding_based_on_numa_nodes(self) -> str: + """Return CPUs id binding based on NUMA nodes. 
+ """ + rank_to_cpus = self.local_omp_cpuid + # Setup OpenMP thread affinity based on NUMA nodes automatically + world_size = self.vllm_config.parallel_config.world_size + libnuma_found = util.find_spec("numa") is not None + psutil_found = util.find_spec("psutil") is not None + if libnuma_found and psutil_found: + import psutil + from numa import info + cpu_count = psutil.cpu_count(logical=False) + cpus_allow_list = psutil.Process().cpu_affinity() + numa_size = info.get_num_configured_nodes() + cpu_count_per_numa = cpu_count // numa_size + num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU, + cpu_count_per_numa // 2) + + # check allow node_to_cpus list + node_to_cpus = [] + for i in range(numa_size): + node_intersect = set( + info.node_to_cpus(i)).intersection(cpus_allow_list) + if bool(node_intersect): + node_to_cpus.append(list(node_intersect)) + + if world_size > len(node_to_cpus): + logger.error( + "Auto thread-binding failed due to " + "world size: %d is larger than " + "allowed NUMA nodes number: %d." + "Please try to bind threads manually.", world_size, + len(node_to_cpus)) + else: + end = cpu_count_per_numa - num_of_reserved_cpu + rank_to_cpus_list = node_to_cpus[self.rank][:end] + rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list) + logger.info("auto thread-binding list: %s", rank_to_cpus) + else: + logger.warning( + "Auto thread-binding is not supported due to " + "the lack of package numa and psutil," + "fallback to no thread-binding. 
To get better performance," + "please try to manually bind threads.") + return rank_to_cpus diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 17123d2b48375..5860368298822 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -886,7 +886,7 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): num_decode_tokens=0, slot_mapping=slot_mapping, multi_modal_placeholder_index_maps= - None, # FIXME(kzawora): mutli-modality will not work here + None, # FIXME(kzawora): multi-modality will not work here enable_kv_scales_calculation=False, ) multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index cc0cc855e7be4..0680e60b52a14 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -277,7 +277,7 @@ class StatefulModelInput(BroadcastableModelInput): assert fmi.input_tokens.shape[0] >= self.num_seqs fmi_new_input_tokens: torch.Tensor = fmi.input_tokens[:self.num_seqs] - # Update frozen_model_input::input_positons. + # Update frozen_model_input::input_positions. assert fmi.input_positions is not None assert fmi.input_positions.shape[0] >= self.num_seqs fmi_new_input_positions: torch.Tensor = fmi.input_positions[:self. diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 5f1535271b9ac..336bc0bcec363 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -798,9 +798,9 @@ class ModelWrapper(nn.Module): """ batch_size, seq_len = token_ids.shape # Calculate the positions to sample from. 
- start_indicies = torch.arange( + start_indices = torch.arange( batch_size, dtype=torch.int32, device=input_lens.device) * seq_len - logits_indices = start_indicies + input_lens - 1 + logits_indices = start_indices + input_lens - 1 attn_metadata = get_forward_context().attn_metadata # FIXME(woosuk): This is a temporary hack to avoid using the existing @@ -822,14 +822,14 @@ class ModelWrapper(nn.Module): num_kv_heads, num_blocks, block_size, _ = kv_caches[0][0].shape slot_mapping = attn_metadata.slot_mapping slot_mapping = slot_mapping.flatten() - head_indicies = torch.arange(0, - num_kv_heads, - device=slot_mapping.device, - dtype=slot_mapping.dtype) - head_indicies *= block_size * num_blocks + head_indices = torch.arange(0, + num_kv_heads, + device=slot_mapping.device, + dtype=slot_mapping.dtype) + head_indices *= block_size * num_blocks slot_mapping = slot_mapping.repeat_interleave(num_kv_heads).view( -1, num_kv_heads) - slot_mapping = slot_mapping + head_indicies.view(1, -1) + slot_mapping = slot_mapping + head_indices.view(1, -1) slot_mapping = slot_mapping.flatten() attn_metadata.slot_mapping = slot_mapping