diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 37e2980eea974..2ef36089b6afb 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -8,7 +8,7 @@ This benchmark aims to: Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end. -Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176) +Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176) ## Setup diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index a1de41652c9a6..8c6ef7817aaf8 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,24 +1,22 @@ steps: # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9 - label: "Build arm64 wheel - CUDA 12.9" + depends_on: ~ id: build-wheel-arm64-cuda-12-9 agents: queue: arm64_cpu_queue_postmerge commands: # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" - - block: "Build CUDA 12.8 wheel" - key: block-build-cu128-wheel - - label: "Build wheel - CUDA 12.8" - depends_on: block-build-cu128-wheel + depends_on: ~ id: build-wheel-cuda-12-8 agents: queue: cpu_queue_postmerge @@ -30,12 +28,8 @@ steps: env: DOCKER_BUILDKIT: "1" - - block: "Build CUDA 12.6 wheel" - key: block-build-cu126-wheel - depends_on: ~ - - label: "Build wheel - CUDA 12.6" - depends_on: block-build-cu126-wheel + depends_on: ~ id: build-wheel-cuda-12-6 agents: queue: cpu_queue_postmerge @@ -102,8 +96,6 @@ steps: depends_on: - create-multi-arch-manifest - build-wheel-cuda-12-8 - - build-wheel-cuda-12-6 - - build-wheel-cuda-12-9 id: annotate-release-workflow agents: queue: cpu_queue_postmerge diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh index 94e0ac2398f34..fde48603ad3cd 100755 --- a/.buildkite/scripts/annotate-release.sh +++ b/.buildkite/scripts/annotate-release.sh @@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF To download the wheel: \`\`\` aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl . + aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl . -aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl . 
\`\`\` To download and upload the image: \`\`\` -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai -docker tag vllm/vllm-openai vllm/vllm-openai:latest -docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION} -docker push vllm/vllm-openai:latest -docker push vllm/vllm-openai:v${RELEASE_VERSION} +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64 +docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64 +docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 +docker push vllm/vllm-openai:latest-x86_64 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64 +docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64 +docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 +docker push vllm/vllm-openai:latest-aarch64 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 + +docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend +docker manifest push vllm/vllm-openai:latest +docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} \`\`\` EOF \ No newline at end of file diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 0f734763f13fd..64943d2a15a79 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -66,7 +66,6 
@@ function cpu_tests() { pytest -x -v -s tests/models/language/pooling -m cpu_model pytest -x -v -s tests/models/multimodal/generation \ - --ignore=tests/models/multimodal/generation/test_mllama.py \ --ignore=tests/models/multimodal/generation/test_pixtral.py \ -m cpu_model" diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index adb5c862eecd9..df95fcaa04382 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -394,6 +394,7 @@ steps: - pytest -v -s compile/test_async_tp.py - pytest -v -s compile/test_fusion_all_reduce.py - pytest -v -s compile/test_decorator.py + - pytest -v -s compile/test_noop_elimination.py - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -548,15 +549,6 @@ steps: commands: # LMEval+Transcription WER check - pytest -s entrypoints/openai/correctness/ -- label: Encoder Decoder tests # 12min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/encoder_decoder - commands: - - pytest -v -s encoder_decoder - - label: OpenAI-Compatible Tool Use # 23 min timeout_in_minutes: 35 mirror_hardwares: [amdexperimental] diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000000000..bc6342956109b --- /dev/null +++ b/.coveragerc @@ -0,0 +1,32 @@ +[run] +source = vllm +omit = + */tests/* + */test_* + */__pycache__/* + */build/* + */dist/* + */vllm.egg-info/* + */third_party/* + */examples/* + */benchmarks/* + */docs/* + +[report] +exclude_lines = + pragma: no cover + def __repr__ + if self.debug: + if settings.DEBUG + raise AssertionError + raise NotImplementedError + if 0: + if __name__ == .__main__.: + class .*\bProtocol\): + @(abc\.)?abstractmethod + +[html] +directory = htmlcov + +[xml] +output = coverage.xml diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 846b68054c0a1..e3dbd28fa91e9 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,17 +2,20 @@ # for more info about CODEOWNERS file # 
This lists cover the "core" components of vLLM that require careful review +/vllm/attention @LucasWilkinson /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill +/vllm/model_executor/layers/fused_moe @mgoin /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 /vllm/model_executor/layers/mamba @tdoublep /vllm/model_executor/model_loader @22quinn /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche +/vllm/v1/attention @LucasWilkinson /vllm/v1/sample @22quinn @houseroad /vllm/vllm_flash_attn @LucasWilkinson /vllm/lora @jeejeelee @@ -30,6 +33,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett /vllm/v1/spec_decode @benchislett @luccafong +/vllm/v1/attention/backends/flashinfer.py @mgoin /vllm/v1/attention/backends/triton_attn.py @tdoublep /vllm/v1/core @heheda12345 /vllm/v1/kv_cache_interface.py @heheda12345 @@ -41,7 +45,8 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche -/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256 +/tests/evals @mgoin +/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256 /tests/models 
@DarkLight1337 @ywang96 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche /tests/prefix_caching @comaniac @KuntaiDu @@ -101,4 +106,7 @@ mkdocs.yaml @hmellor /vllm/v1/worker/tpu* @NickLucche /vllm/platforms/tpu.py @NickLucche /vllm/v1/sample/tpu @NickLucche -/vllm/tests/v1/tpu @NickLucche \ No newline at end of file +/vllm/tests/v1/tpu @NickLucche + +# KVConnector installation files +/requirements/kv_connectors.txt @NickLucche diff --git a/CMakeLists.txt b/CMakeLists.txt index 3f1f9a781a07a..009c224dc7735 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,10 @@ cmake_minimum_required(VERSION 3.26) # cmake --install . --component _C project(vllm_extensions LANGUAGES CXX) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + + # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM") message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") @@ -779,6 +783,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() endif() + # Hadacore kernels + cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}") + if(HADACORE_ARCHS) + set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${HADACORE_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + message(STATUS "Building hadacore") + endif() + # if CUDA endif endif() diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 94f3f1ae11f27..837b2b0c10447 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -560,7 +560,7 @@ def save_configs( filename = os.path.join(save_dir, filename) print(f"Writing best config to {filename}...") with open(filename, "w") as f: - json.dump(configs, f, indent=4) + json.dump({"triton_version": triton.__version__, **configs}, f, indent=4) f.write("\n") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 
9c0ed1d09572e..8558976e2c392 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -480,7 +480,6 @@ function (define_gpu_extension_target GPU_MOD_NAME) ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}") endif() - set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17) target_compile_options(${GPU_MOD_NAME} PRIVATE $<$:${GPU_COMPILE_FLAGS}>) diff --git a/csrc/ops.h b/csrc/ops.h index c65bf431640d5..fd9c55b948959 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -347,6 +347,8 @@ std::tuple allocate_shared_buffer_and_handle( int64_t open_mem_handle(torch::Tensor& mem_handle); void free_shared_buffer(int64_t buffer); +torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace); + #ifdef USE_ROCM fptr_t init_custom_qr(int64_t rank, int64_t world_size, std::optional qr_max_size = std::nullopt); diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh index 939879b2c59fa..dbf79a0651159 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh @@ -146,6 +146,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, using ElementAB = typename Gemm::ElementAB; using ElementD = typename Gemm::ElementD; + using ElementBlockScale = typename Gemm::ElementBlockScale; int32_t m = a.size(0), n = b.size(1), k = a.size(1); @@ -166,26 +167,29 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) : ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1)); - auto a_ptr = static_cast(a.data_ptr()); - auto b_ptr = static_cast(b.data_ptr()); - auto a_scales_ptr = static_cast(a_scales.data_ptr()); - auto b_scales_ptr = static_cast(b_scales.data_ptr()); + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + 
auto a_scales_ptr = static_cast(a_scales.data_ptr()); + auto b_scales_ptr = static_cast(b_scales.data_ptr()); - auto mainloop_args = [&](){ - // layout_SFA and layout_SFB cannot be swapped since they are deduced. - if (swap_ab) { - return typename GemmKernel::MainloopArguments{ - b_ptr, b_stride, a_ptr, a_stride, - b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB - }; - } - else { - return typename GemmKernel::MainloopArguments{ - a_ptr, a_stride, b_ptr, b_stride, - a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB - }; - } - }(); + typename GemmKernel::MainloopArguments mainloop_args{}; + mainloop_args.layout_SFA = layout_SFA; + mainloop_args.layout_SFB = layout_SFB; + if (swap_ab) { + mainloop_args.ptr_A = b_ptr; + mainloop_args.dA = b_stride; + mainloop_args.ptr_B = a_ptr; + mainloop_args.dB = a_stride; + mainloop_args.ptr_SFA = b_scales_ptr; + mainloop_args.ptr_SFB = a_scales_ptr; + } else { + mainloop_args.ptr_A = a_ptr; + mainloop_args.dA = a_stride; + mainloop_args.ptr_B = b_ptr; + mainloop_args.dB = b_stride; + mainloop_args.ptr_SFA = a_scales_ptr; + mainloop_args.ptr_SFB = b_scales_ptr; + } auto prob_shape = swap_ab ? 
cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1); auto c_ptr = static_cast(out.data_ptr()); diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh index 78d5cf37fa6d0..811741aee58b3 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh @@ -125,6 +125,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, using ElementAB = typename Gemm::ElementAB; using ElementD = typename Gemm::ElementD; + using ElementBlockScale = typename Gemm::ElementBlockScale; int32_t m = a.size(0), n = b.size(1), k = a.size(1); @@ -143,17 +144,20 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, LayoutSFB layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1)); - auto a_ptr = static_cast(a.data_ptr()); - auto b_ptr = static_cast(b.data_ptr()); - auto a_scales_ptr = static_cast(a_scales.data_ptr()); - auto b_scales_ptr = static_cast(b_scales.data_ptr()); + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + auto a_scales_ptr = static_cast(a_scales.data_ptr()); + auto b_scales_ptr = static_cast(b_scales.data_ptr()); - auto mainloop_args = [&](){ - return typename GemmKernel::MainloopArguments{ - a_ptr, a_stride, b_ptr, b_stride, - a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB - }; - }(); + typename GemmKernel::MainloopArguments mainloop_args{}; + mainloop_args.ptr_A = a_ptr; + mainloop_args.dA = a_stride; + mainloop_args.ptr_B = b_ptr; + mainloop_args.dB = b_stride; + mainloop_args.ptr_SFA = a_scales_ptr; + mainloop_args.layout_SFA = layout_SFA; + mainloop_args.ptr_SFB = b_scales_ptr; + mainloop_args.layout_SFB = layout_SFB; auto prob_shape = cute::make_shape(m, n, k, 1); auto c_ptr = static_cast(out.data_ptr()); diff --git 
a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh index 86220264151e7..147eb8efc0778 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh @@ -115,6 +115,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, using ElementAB = typename Gemm::ElementAB; using ElementD = typename Gemm::ElementD; + using ElementBlockScale = typename Gemm::ElementBlockScale; int32_t m = a.size(0), n = b.size(1), k = a.size(1); @@ -135,17 +136,20 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, LayoutSFB layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1)); - auto a_ptr = static_cast(a.data_ptr()); - auto b_ptr = static_cast(b.data_ptr()); - auto a_scales_ptr = static_cast(a_scales.data_ptr()); - auto b_scales_ptr = static_cast(b_scales.data_ptr()); + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + auto a_scales_ptr = static_cast(a_scales.data_ptr()); + auto b_scales_ptr = static_cast(b_scales.data_ptr()); - auto mainloop_args = [&](){ - return typename GemmKernel::MainloopArguments{ - a_ptr, a_stride, b_ptr, b_stride, - a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB - }; - }(); + typename GemmKernel::MainloopArguments mainloop_args{}; + mainloop_args.ptr_A = a_ptr; + mainloop_args.dA = a_stride; + mainloop_args.ptr_B = b_ptr; + mainloop_args.dB = b_stride; + mainloop_args.ptr_SFA = a_scales_ptr; + mainloop_args.layout_SFA = layout_SFA; + mainloop_args.ptr_SFB = b_scales_ptr; + mainloop_args.layout_SFB = layout_SFB; auto prob_shape = cute::make_shape(m, n, k, 1); auto c_ptr = static_cast(out.data_ptr()); diff --git a/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu b/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu 
new file mode 100644 index 0000000000000..5369d409f9b21 --- /dev/null +++ b/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu @@ -0,0 +1,817 @@ +// clang-format off +// Adapted from: https://github.com/meta-pytorch/applied-ai/blob/main/kernels/cuda/inference/hadamard_transform/hadamard_transform_cuda.cu + +/*********** +Copyright 2024 Meta + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+***********/ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "core/registration.h" +#include "dispatch_utils.h" + +namespace hadacore { + +#ifndef __CUDACC__ +#define __launch_bounds__(x,y) +#endif + +#define MAX_WARPS_PER_SM 48 + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +using b16 = uint16_t; +using b32 = uint32_t; + +constexpr int launch_configs_big[7][3] = { + // default + {2, 1, 24}, + {2, 2, 16}, + {2, 4, 8}, + {2, 8, 4}, + {2, 16, 3}, + {4, 16, 2}, + {8, 16, 1} + // // extra coalescing + // {2, 1, 24}, + // {2, 2, 16}, + // {2, 4, 8}, + // {2, 8, 4}, + // {4, 8, 3}, + // {8, 8, 2}, + // {16, 8, 1} + // // less coalescing + // {2, 1, 24}, + // {2, 2, 16}, + // {2, 4, 8}, + // {2, 8, 4}, + // {1, 32, 1}, + // {2, 32, 1}, + // {4, 32, 1} +}; + +// a 4x2, b 2x2, c 2x2 +template +__device__ __forceinline__ void mma_m16_n8_k16_b16_b16_b16_noacc(b32 a0, b32 a1, b32 a2, b32 a3, b32 b0, b32 b1, b32& c0, b32& c1){ + static_assert(dtype == torch::ScalarType::Half || dtype == torch::ScalarType::BFloat16); + // d, a, b, c + b32 zero = 0; + if constexpr(dtype == torch::ScalarType::Half) { + asm ( + "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 " + "{%0, %1}, {%2, %3, %4, %5}, {%6, %7}, {%8, %9};\n\t" + : "=r"(c0), "=r"(c1) : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1), "r"(zero), "r"(zero) + ); + } else { + b32 temp0, temp1, temp2, temp3; + asm ( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n\t" + : "=r"(temp0), "=r"(temp1), "=r"(temp2), "=r"(temp3) : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1), "r"(zero), "r"(zero), "r"(zero), "r"(zero) + ); + asm ("cvt.rn.bf16x2.f32 %0, %1, %2;\n\t" : "=r"(c0) : "r"(temp1), "r"(temp0)); + asm ("cvt.rn.bf16x2.f32 %0, %1, %2;\n\t" : "=r"(c1) : "r"(temp3), "r"(temp2)); + } +} + +// a 4x2, b 4x2, c 4x2 +template +__device__ __forceinline__ void mma_m16_n16_k16_b16_b16_b16_noacc(b32 a0, 
b32 a1, b32 a2, b32 a3, b32 b0, b32 b1, b32 b2, b32 b3, b32& c0, b32& c1, b32& c2, b32& c3){ + mma_m16_n8_k16_b16_b16_b16_noacc(a0, a1, a2, a3, b0, b1, c0, c1); + mma_m16_n8_k16_b16_b16_b16_noacc(a0, a1, a2, a3, b2, b3, c2, c3); +} + +__device__ __forceinline__ void matrix_transpose_m8_n8_b16_inplace(b32& a0) { + asm ( + "movmatrix.sync.aligned.m8n8.trans.b16 " + "%0, %1;\n\t" + : "=r"(a0) : "r"(a0) + ); +} + +#define p_p(i) ((val_1p[i] & 0x0000FFFF) | val_1p[i] << 16) +#define p_n(i) ((val_1p[i] & 0x0000FFFF) | val_1n[i] << 16) +#define n_p(i) ((val_1n[i] & 0x0000FFFF) | val_1p[i] << 16) +#define n_n(i) ((val_1n[i] & 0x0000FFFF) | val_1n[i] << 16) + +template +__global__ void __launch_bounds__(32 * warps_per_block, blocks_per_sm) +// a is column major, b is row major +hadamard_transform_kernel(b16* a, b16* out, int total_num_chunks) { + static_assert(dtype == torch::ScalarType::Half || dtype == torch::ScalarType::BFloat16, "Only fp16 and bf16 supported currently"); + + b32 b_frag_all[num_chunks][4]; // for all chunks, holds matrix fragment (which takes 4 regs of b16x2 * 32 threads) + + int64_t blockid = blockIdx.x * warps_per_block + threadIdx.x / 32; + int64_t threadid = threadIdx.x % 32; + extern __shared__ b32 bfrag_arr[]; // num_chunks * warps_per_block * 128 + int64_t real_num_chunks = ((blockid + 1) * num_chunks) > total_num_chunks ? 
(total_num_chunks - (blockid * num_chunks)) : num_chunks; + int64_t diff_num_chunks = real_num_chunks - num_chunks; + + b32* a_start_ptr = (b32*) (a + blockid * num_chunks * 256); // offset a to where this warp starts + b32* out_start_ptr = (b32*) (out + blockid * num_chunks * 256); + b32* a_ptr = a_start_ptr + threadid * 4; + b32* b_frag_ptr = bfrag_arr + (blockid % warps_per_block) * num_chunks * 128 + threadid * 4; + + #if (__CUDA_ARCH__ < 900) // SM80, SM89 + uint64_t cache_policy; + asm volatile( + "createpolicy.fractional.L2::evict_first.b64 %0, 1.0;\n" + : "=l"(cache_policy) + ); + #endif + + #pragma unroll + for (int64_t k = 0; k < num_chunks; k++) { + size_t shared_ptr = __cvta_generic_to_shared(b_frag_ptr); + #if (__CUDA_ARCH__ >= 900) // SM90 + asm volatile( + "cp.async.cg.shared.global [%0], [%1], 16;\n" + "cp.async.commit_group;\n" + :: "l"(shared_ptr), "l"(a_ptr) + ); + #else // SM80, SM89 + asm volatile( + "cp.async.cg.shared.global.L2::cache_hint.L2::256B [%0], [%1], 16, %2;\n" + "cp.async.commit_group;\n" + :: "l"(shared_ptr), "l"(a_ptr), "l"(cache_policy) + ); + #endif + + a_ptr += 128; + b_frag_ptr += 128; + } + + // generate hadamard 16x16 (up to 2 of them) + constexpr b16 fp16_1p[4] = {0b0011100110101000, 0b0011100000000000, 0b0011010110101000, 0b0011010000000000}; + constexpr b16 fp16_1n[4] = {0b1011100110101000, 0b1011100000000000, 0b1011010110101000, 0b1011010000000000}; + constexpr b16 bf16_1p[4] = {0b0011111100110101, 0b0011111100000000, 0b0011111010110101, 0b0011111010000000}; + constexpr b16 bf16_1n[4] = {0b1011111100110101, 0b1011111100000000, 0b1011111010110101, 0b1011111010000000}; + + #define val_type_1p(i) (((dtype) == torch::ScalarType::Half) ? (fp16_1p[i]) : (bf16_1p[i])) + #define val_type_1n(i) (((dtype) == torch::ScalarType::Half) ? 
(fp16_1n[i]) : (bf16_1n[i])) + constexpr b16 val_1p[4] = {val_type_1p(0), val_type_1p(1), val_type_1p(2), val_type_1p(3)}; + constexpr b16 val_1n[4] = {val_type_1n(0), val_type_1n(1), val_type_1n(2), val_type_1n(3)}; + + constexpr b32 p_p[4] = {p_p(0), p_p(1), p_p(2), p_p(3)}; + constexpr b32 p_n[4] = {p_n(0), p_n(1), p_n(2), p_n(3)}; + constexpr b32 n_p[4] = {n_p(0), n_p(1), n_p(2), n_p(3)}; + constexpr b32 n_n[4] = {n_n(0), n_n(1), n_n(2), n_n(3)}; + const b32 had_16_p1[4][4] = { + { + 0b10001000010001000010001000010001, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b10001000010001000010001000010001 + }, + { + 0b11001100100010000011001100100010, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b11001100100010000011001100100010 + }, + { + 0b11111111101010101100110010011001, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b11111111101010101100110010011001 + }, + { + 0b11111111101010101100110010011001, + 0b11111111101010101100110010011001, + 0b11111111101010101100110010011001, + 0b00000000010101010011001101100110 + } + }; + const b32 had_16_p2[4][4] = { + { + 0b10000000010000000010000000010000, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b10000000010000000010000000010000 + }, + { + 0b11000000100001000011000000100001, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b11000000100001000011000000100001 + }, + { + 0b11110000101001011100001110010110, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b11110000101001011100001110010110 + }, + { + 0b11110000101001011100001110010110, + 0b11110000101001011100001110010110, + 0b11110000101001011100001110010110, + 0b00001111010110100011110001101001 + } + }; + const b32 had_16_mask[3][4] = { + { + 0b10001000010001000010001000010001, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b10001000010001000010001000010001 + 
}, + { + 0b11001100110011000011001100110011, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b11001100110011000011001100110011 + }, + { + 0b11111111111111111111111111111111, + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000000, + 0b11111111111111111111111111111111 + } + }; + b32 had_frag[8]; + #pragma unroll + for (int64_t i = 0; i < 2; i++) { + int64_t c_log_h = (i == 0) ? MIN(4, log_had_size) : log_had_size % 4; + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + if (c_log_h < 4) { + bool mask = had_16_mask[c_log_h - 1][j] & (1 << (31 - threadid)); + if (!mask) { + had_frag[i * 4 + j] = 0; + continue; + } + } + bool pred1 = had_16_p1[c_log_h - 1][j] & (1 << (31 - threadid)); + bool pred2 = had_16_p2[c_log_h - 1][j] & (1 << (31 - threadid)); + b32 val = pred1 ? (pred2 ? p_p[c_log_h - 1] : p_n[c_log_h - 1]) : (pred2 ? n_p[c_log_h - 1] : n_n[c_log_h - 1]); + had_frag[i * 4 + j] = val; + } + if constexpr(log_had_size <= 4 || log_had_size % 4 == 0) break; + } + + // log had size above 8, only used for above 2^8 = 256 size + constexpr int64_t part8_log_had_size = log_had_size - 8; + + b32* a_chunk_ptr = a_start_ptr; // first chunk starts at this warp's data starts + b32* out_chunk_ptr = out_start_ptr; + + #pragma unroll + for (int64_t l = 0; l < 2; l++) { + if constexpr(log_had_size <= 8) { // l == 0 guaranteed, redundant simplified version of else body, to help compiler warnings + b_frag_ptr = bfrag_arr + (blockid % warps_per_block) * num_chunks * 128; + } else { + b_frag_ptr = bfrag_arr + (blockid % warps_per_block) * num_chunks * (l == 0 ? 
128 : (128 >> part8_log_had_size)); + } + + if (l == 1) { + if constexpr(log_had_size > 8) { + __syncthreads(); // sync between first and second iterations if above size 256 + + if constexpr(log_had_size >= 12) { + // sizes 4k and above + + // a + threadblock offset + warp offset + // can then index into all chunks owned by this warp + b32* store = bfrag_arr + (128 >> part8_log_had_size) * (num_chunks * (blockid % warps_per_block)); + + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + #pragma unroll + for (int64_t k = 0; k < num_chunks; k++) { + // here, j represents register, and k represents 8-offset/chunk + uint64_t real_chunk_num = (num_chunks - (threadid % num_chunks) + k) % num_chunks; // chunk at which you have target thread #'s data + + int64_t real_thread_id = (threadid / num_chunks) * num_chunks + k; // target thread # + int64_t chunk_idx = 128 * real_chunk_num; // index due to fetching from another chunk (chunk in which this thread has the target thread's original data) + int64_t thread_group_idx = (real_thread_id / 4) * 16; // index due to fetching from another group of num_chunk threads (since shuffle is between num_chunk threads) + int64_t thread_idx = (real_thread_id % 4) * 2; // index due to original thread's position within the group of num_chunk threads + int64_t reg_idx = (j / 2) * 8 + (j % 2); // index due to target register + int64_t idx = chunk_idx + thread_group_idx + thread_idx + reg_idx; // final index + + // fix idx for majorness + int64_t rowidx = idx % (1 << part8_log_had_size); + int64_t colidx = idx >> part8_log_had_size; + + // store[rowidx * 128 + colidx] = data; + b32 data = store[rowidx * 128 + colidx]; + + // compiler generates excessive instructions, so we manually do the if statement + #pragma unroll + for (uint64_t i = 0; i < num_chunks; i++) { + asm volatile ( + "{\n\t" + " .reg .pred p0;\n\t" + " setp.eq.s64 p0, %1, %2;\n\t" + " @p0 mov.b32 %0, %3;\n\t" + "}\n\t" + : "+r"(b_frag_all[i][j]) // Output operand %0 + : 
"l"(real_chunk_num), "l"(i), "r"(data) // Input operands %1, %2, %3 + ); + } + } + } + + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + #pragma unroll + for (int64_t k = 1; k < num_chunks; k++) { + int64_t threadid_contig = threadid % num_chunks; + int64_t threadid_mul = threadid / num_chunks; + int64_t threadid2 = (threadid_contig + num_chunks - k) % num_chunks + threadid_mul * num_chunks; // thread to give your data to + b_frag_all[k][j] = __shfl_sync(0xFFFFFFFF, b_frag_all[k][j], threadid2); + } + } + } + } + } + + #pragma unroll + for (int64_t k = 0; k < num_chunks; k++) { + if constexpr(enable_mask) { + if (k >= real_num_chunks) + break; + } + if (l == 0) { + // bad fix for k not being recognized as a constexpr by compiler + // asm("cp.async.wait_group %0;\n" :: "n"(num_chunks - k - 1)); + #define SWITCH_WAIT_ASYNC_LOAD_GROUP(i) case i: asm volatile("cp.async.wait_group %0;\n" :: "n"(num_chunks - i - 1)); break; + if constexpr(enable_mask) { + switch(k + diff_num_chunks) { + SWITCH_WAIT_ASYNC_LOAD_GROUP(0) + SWITCH_WAIT_ASYNC_LOAD_GROUP(1) + SWITCH_WAIT_ASYNC_LOAD_GROUP(2) + SWITCH_WAIT_ASYNC_LOAD_GROUP(3) + SWITCH_WAIT_ASYNC_LOAD_GROUP(4) + SWITCH_WAIT_ASYNC_LOAD_GROUP(5) + SWITCH_WAIT_ASYNC_LOAD_GROUP(6) + SWITCH_WAIT_ASYNC_LOAD_GROUP(7) + SWITCH_WAIT_ASYNC_LOAD_GROUP(8) + SWITCH_WAIT_ASYNC_LOAD_GROUP(9) + SWITCH_WAIT_ASYNC_LOAD_GROUP(10) + SWITCH_WAIT_ASYNC_LOAD_GROUP(11) + SWITCH_WAIT_ASYNC_LOAD_GROUP(12) + SWITCH_WAIT_ASYNC_LOAD_GROUP(13) + SWITCH_WAIT_ASYNC_LOAD_GROUP(14) + SWITCH_WAIT_ASYNC_LOAD_GROUP(15) + SWITCH_WAIT_ASYNC_LOAD_GROUP(16) + SWITCH_WAIT_ASYNC_LOAD_GROUP(17) + SWITCH_WAIT_ASYNC_LOAD_GROUP(18) + SWITCH_WAIT_ASYNC_LOAD_GROUP(19) + SWITCH_WAIT_ASYNC_LOAD_GROUP(20) + SWITCH_WAIT_ASYNC_LOAD_GROUP(21) + SWITCH_WAIT_ASYNC_LOAD_GROUP(22) + SWITCH_WAIT_ASYNC_LOAD_GROUP(23) + SWITCH_WAIT_ASYNC_LOAD_GROUP(24) + SWITCH_WAIT_ASYNC_LOAD_GROUP(25) + SWITCH_WAIT_ASYNC_LOAD_GROUP(26) + SWITCH_WAIT_ASYNC_LOAD_GROUP(27) + 
SWITCH_WAIT_ASYNC_LOAD_GROUP(28) + SWITCH_WAIT_ASYNC_LOAD_GROUP(29) + SWITCH_WAIT_ASYNC_LOAD_GROUP(30) + SWITCH_WAIT_ASYNC_LOAD_GROUP(31) + } + } else { + switch(k) { + SWITCH_WAIT_ASYNC_LOAD_GROUP(0) + SWITCH_WAIT_ASYNC_LOAD_GROUP(1) + SWITCH_WAIT_ASYNC_LOAD_GROUP(2) + SWITCH_WAIT_ASYNC_LOAD_GROUP(3) + SWITCH_WAIT_ASYNC_LOAD_GROUP(4) + SWITCH_WAIT_ASYNC_LOAD_GROUP(5) + SWITCH_WAIT_ASYNC_LOAD_GROUP(6) + SWITCH_WAIT_ASYNC_LOAD_GROUP(7) + SWITCH_WAIT_ASYNC_LOAD_GROUP(8) + SWITCH_WAIT_ASYNC_LOAD_GROUP(9) + SWITCH_WAIT_ASYNC_LOAD_GROUP(10) + SWITCH_WAIT_ASYNC_LOAD_GROUP(11) + SWITCH_WAIT_ASYNC_LOAD_GROUP(12) + SWITCH_WAIT_ASYNC_LOAD_GROUP(13) + SWITCH_WAIT_ASYNC_LOAD_GROUP(14) + SWITCH_WAIT_ASYNC_LOAD_GROUP(15) + SWITCH_WAIT_ASYNC_LOAD_GROUP(16) + SWITCH_WAIT_ASYNC_LOAD_GROUP(17) + SWITCH_WAIT_ASYNC_LOAD_GROUP(18) + SWITCH_WAIT_ASYNC_LOAD_GROUP(19) + SWITCH_WAIT_ASYNC_LOAD_GROUP(20) + SWITCH_WAIT_ASYNC_LOAD_GROUP(21) + SWITCH_WAIT_ASYNC_LOAD_GROUP(22) + SWITCH_WAIT_ASYNC_LOAD_GROUP(23) + SWITCH_WAIT_ASYNC_LOAD_GROUP(24) + SWITCH_WAIT_ASYNC_LOAD_GROUP(25) + SWITCH_WAIT_ASYNC_LOAD_GROUP(26) + SWITCH_WAIT_ASYNC_LOAD_GROUP(27) + SWITCH_WAIT_ASYNC_LOAD_GROUP(28) + SWITCH_WAIT_ASYNC_LOAD_GROUP(29) + SWITCH_WAIT_ASYNC_LOAD_GROUP(30) + SWITCH_WAIT_ASYNC_LOAD_GROUP(31) + } + } + } + + if (l == 0) { + // loading for the first iteration + + // thread 0 loads [t0r0, t16r1, t0r2, t16r3] + // thread 16 loads [t0r1, t16r0, t0r3, t16r2] + // allows full coalescing, same for t1/t17, t2/t18, etc. + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + int64_t reg = ((threadid & 16) == 0) ? j : (j / 2 * 2 + (1 - j % 2)); + int64_t real_thread_id = (reg == 0 || reg == 2) ? 
threadid : (threadid ^ 16); + int64_t real_row = real_thread_id % 4; + int64_t real_col = real_thread_id / 4; + b_frag_all[k][j] = b_frag_ptr[(real_row + (reg % 2) * 4) + (real_col + (j / 2) * 8) * 8]; + } + + // for t16 swap r0/r1 and r2/r3 to have [t16r0, t0r1, t16r2, t0r3] + // so registers are in right order, same for t17, t18, etc. + if ((threadid & 16) != 0) { + b32 temp = b_frag_all[k][0]; + b_frag_all[k][0] = b_frag_all[k][1]; + b_frag_all[k][1] = temp; + + temp = b_frag_all[k][2]; + b_frag_all[k][2] = b_frag_all[k][3]; + b_frag_all[k][3] = temp; + } + + // t0 and t16 swap r1 and r3 to have their own data, + // same for t1/t17, t2/18, etc. + #pragma unroll + for (int64_t j = 1; j < 4; j += 2) { + b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], 16); + } + } else if constexpr(log_had_size > 8) { // condition is redundant to help compiler warnings + if constexpr(log_had_size < 12) { + // sizes 512, 1k, and 2k + + // for 512: + // thread 0 loads [t0r0, t0r1, t16r2, t16r3] + // thread 16 loads [t0r2, t0r3, t16r0, t16r1] + // same for t1/t17, t2/t18, etc. + // for 1k and 2k: + // thread 0 loads [t0r0, t0r1, t1r2, t1r3] + // thread 1 loads [t0r2, t0r3, t1r0, t1r1] + // same for t2/t3, t4/t5, etc. + // allows full coalescing for 512 and 1k, 16x coalescing for 2k + constexpr int64_t xor_val = log_had_size == 9 ? 16 : 1; + + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + int64_t reg = ((threadid & xor_val) == 0) ? j : (j + 2) % 4; + int64_t real_thread_id = reg < 2 ? 
threadid : (threadid ^ xor_val); + int64_t idx = (real_thread_id / 4 * 16) + (real_thread_id % 4 * 2) + (reg / 2 * 8) + (reg % 2); + int64_t rowidx = idx % (1 << part8_log_had_size); + int64_t colidx = idx >> part8_log_had_size; + b_frag_all[k][j] = b_frag_ptr[rowidx * 128 + colidx]; + } + + if ((threadid & xor_val) != 0) { + b32 temp = b_frag_all[k][0]; + b_frag_all[k][0] = b_frag_all[k][2]; + b_frag_all[k][2] = temp; + + temp = b_frag_all[k][1]; + b_frag_all[k][1] = b_frag_all[k][3]; + b_frag_all[k][3] = temp; + } + + #pragma unroll + for (int64_t j = 2; j < 4; j++) { + b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], xor_val); + } + } + } + + if (l == 1) { + // for second iteration, we load 2 consecutive b16s (1 b32) per register, + // but tensor core register layout requires 2 b16s that are in the + // same column/consecutive rows to be in the same register, so do the swap + b32 f0 = ((b_frag_all[k][1] & 0xFFFF) << 16) | (b_frag_all[k][0] & 0xFFFF); + b32 f1 = ((b_frag_all[k][3] & 0xFFFF) << 16) | (b_frag_all[k][2] & 0xFFFF); + b32 f2 = (b_frag_all[k][1] & 0xFFFF0000) | (b_frag_all[k][0] >> 16); + b32 f3 = (b_frag_all[k][3] & 0xFFFF0000) | (b_frag_all[k][2] >> 16); + b_frag_all[k][0] = f0; + b_frag_all[k][1] = f1; + b_frag_all[k][2] = f2; + b_frag_all[k][3] = f3; + } + + #pragma unroll + for(int64_t i = 0, remaining_log_had_size = log_had_size - l * 8; i < 2 && remaining_log_had_size > 0; i++) { + int64_t had_off = ((remaining_log_had_size < 4) && !(log_had_size <= 4 || log_had_size % 4 == 0)) ? 
4 : 0; + mma_m16_n16_k16_b16_b16_b16_noacc(had_frag[had_off + 0], had_frag[had_off + 1], had_frag[had_off + 2], had_frag[had_off + 3], b_frag_all[k][0], b_frag_all[k][1], b_frag_all[k][2], b_frag_all[k][3], b_frag_all[k][0], b_frag_all[k][1], b_frag_all[k][2], b_frag_all[k][3]); + + remaining_log_had_size -= 4; + if (remaining_log_had_size <= 0 && i == 0) { + // TODO: consider different storing so no need for transpose + matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][0]); + matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][1]); + matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][2]); + matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][3]); + } else { + // swap and use output directly as b_frag for next iteration as an actually free transpose + b32 temp = b_frag_all[k][1]; + b_frag_all[k][1] = b_frag_all[k][2]; + b_frag_all[k][2] = temp; + } + } + + if (l == 1) { + // invert swap from above for second iteration + b32 f0 = ((b_frag_all[k][2] & 0xFFFF) << 16) | (b_frag_all[k][0] & 0xFFFF); + b32 f1 = (b_frag_all[k][2] & 0xFFFF0000) | (b_frag_all[k][0] >> 16); + b32 f2 = ((b_frag_all[k][3] & 0xFFFF) << 16) | (b_frag_all[k][1] & 0xFFFF); + b32 f3 = (b_frag_all[k][3] & 0xFFFF0000) | (b_frag_all[k][1] >> 16); + b_frag_all[k][0] = f0; + b_frag_all[k][1] = f1; + b_frag_all[k][2] = f2; + b_frag_all[k][3] = f3; + } + + if (l == 0) { + // inverse of coalesced load for first iteration to store result + #pragma unroll + for (int64_t j = 1; j < 4; j += 2) { + b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], 16); + } + + if ((threadid & 16) != 0) { + b32 temp = b_frag_all[k][0]; + b_frag_all[k][0] = b_frag_all[k][1]; + b_frag_all[k][1] = temp; + + temp = b_frag_all[k][2]; + b_frag_all[k][2] = b_frag_all[k][3]; + b_frag_all[k][3] = temp; + } + + // if only going up to 256 size, store directly back to global memory, + // otherwise store back to shared memory for next iteration + b32* store = (log_had_size <= 8) ? 
out_chunk_ptr : b_frag_ptr; + + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + int64_t reg = ((threadid & 16) == 0) ? j : (j / 2 * 2 + (1 - j % 2)); + int64_t real_thread_id = (reg == 0 || reg == 2) ? threadid : (threadid ^ 16); + int64_t real_row = real_thread_id % 4; + int64_t real_col = real_thread_id / 4; + store[(real_row + (reg % 2) * 4) + (real_col + (reg / 2) * 8) * 8] = b_frag_all[k][j]; + } + } else if constexpr(log_had_size > 8) { // condition is redundant to help compiler warnings + if (log_had_size < 12) { + // inverse of coalesced load for sizes 512, 1k and 2k to store result + constexpr int xor_val = log_had_size == 9 ? 16 : 1; + #pragma unroll + for (int64_t j = 2; j < 4; j++) { + b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], xor_val); + } + + if ((threadid & xor_val) != 0) { + b32 temp = b_frag_all[k][0]; + b_frag_all[k][0] = b_frag_all[k][2]; + b_frag_all[k][2] = temp; + + temp = b_frag_all[k][1]; + b_frag_all[k][1] = b_frag_all[k][3]; + b_frag_all[k][3] = temp; + } + + b32* store = (b32*)(out + (blockid / warps_per_block) * (num_chunks * warps_per_block) * 256 + (256 >> part8_log_had_size) * (num_chunks * (blockid % warps_per_block) + k)); + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + int64_t reg = ((threadid & xor_val) == 0) ? j : (j + 2) % 4; + b32 data = b_frag_all[k][j]; + int64_t real_thread_id = reg < 2 ? threadid : (threadid ^ xor_val); + int64_t idx = (real_thread_id / 4 * 16) + (real_thread_id % 4 * 2) + (reg / 2 * 8) + (reg % 2); + int64_t rowidx = idx % (1 << part8_log_had_size); + int64_t colidx = idx >> part8_log_had_size; + store[rowidx * 128 + colidx] = data; + } + } + // for size 4k and above, wait to process all chunks so a final store can be performed coalesced + } + + a_chunk_ptr += 128; // (only affects first 256 size) move on to next chunk by skipping 256 elements in b16 (= 128 in b32) + out_chunk_ptr += 128; + if constexpr(log_had_size > 8) { + b_frag_ptr += (l == 0 ? 
128 : (128 >> part8_log_had_size)); + } else { // else is redundant, simplified version of if body, to help compiler warnings + b_frag_ptr += 128; + } + } + if (log_had_size <= 8) + break; + } + + if constexpr(log_had_size >= 12) { + // for sizes 4k and above, perform final coalesced store after processing all chunks + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + #pragma unroll + for (int64_t k = 1; k < num_chunks; k++) { + int64_t threadid_contig = threadid % num_chunks; + int64_t threadid_mul = threadid / num_chunks; + int64_t threadid2 = (threadid_contig + k) % num_chunks + threadid_mul * num_chunks; // thread to give your data to + b_frag_all[k][j] = __shfl_sync(0xFFFFFFFF, b_frag_all[k][j], threadid2); + } + } + + // a + threadblock offset + warp offset + // can then index into all chunks owned by this warp + b32* store = bfrag_arr + (128 >> part8_log_had_size) * (num_chunks * (blockid % warps_per_block)); + + #pragma unroll + for (int64_t j = 0; j < 4; j++) { + #pragma unroll + for (int64_t k = 0; k < num_chunks; k++) { + // here, j represents register, and k represents 8-offset/chunk + int64_t real_chunk_num = (num_chunks - (threadid % num_chunks) + k) % num_chunks; // chunk at which you have target thread #'s data + + // b32 data = b_frag_all[real_chunk_num][j]; // target thread data + b32 data; + #pragma unroll + for (int64_t i = 0; i < num_chunks; i++) { + if (real_chunk_num == i) data = b_frag_all[i][j]; + } + + int64_t real_thread_id = (threadid / num_chunks) * num_chunks + k; // target thread # + int64_t chunk_idx = 128 * real_chunk_num; // index due to fetching from another chunk (chunk in which this thread has the target thread's original data) + int64_t thread_group_idx = (real_thread_id / 4) * 16; // index due to fetching from another group of num_chunk threads (since shuffle is between num_chunk threads) + int64_t thread_idx = (real_thread_id % 4) * 2; // index due to original thread's position within the group of num_chunk threads + 
int64_t reg_idx = (j / 2) * 8 + (j % 2); // index due to target register + int64_t idx = chunk_idx + thread_group_idx + thread_idx + reg_idx; // final index + + // fix idx for majorness + int64_t rowidx = idx % (1 << part8_log_had_size); + int64_t colidx = idx >> part8_log_had_size; + + store[rowidx * 128 + colidx] = data; + } + } + + __syncthreads(); + store = ((b32*) out) + (blockid / warps_per_block) * (num_chunks * warps_per_block) * 128; + int4* store4 = (int4*) store; + int4* bfrag_arr4 = (int4*) bfrag_arr; + // flush smem, simply linearly write to store + // always divisible by 128*32b, so (32*4)*32b is ok + #pragma unroll + for (int64_t warp_off = 0; warp_off < (num_chunks * warps_per_block * 128 / 4); warp_off += 32 * warps_per_block) { + int64_t total_off = warp_off + threadid + (blockid % warps_per_block) * 32; + store4[total_off] = bfrag_arr4[total_off]; + } + } + +} +
+// Integer division rounded up; equals ceil(a / b) when a >= 0 and b > 0.
+constexpr int64_t ceil_div(int64_t a, int64_t b) { + return (a + b - 1) / b; +} +
+// Configures and launches hadamard_transform_kernel for `num_chunks` chunks.
+// - shared_size is chunks_per_warp * warps_per_block * 128 * 4; when it
+//   exceeds 48 KiB, CHECK_SHARED_LIM sets the kernel's
+//   cudaFuncAttributeMaxDynamicSharedMemorySize to 65536 before the launch.
+// - block_size is 32 * warps_per_block threads.
+// - With check_masking, a num_chunks that is not a multiple of
+//   chunks_per_warp * warps_per_block gets a grid rounded up via ceil_div;
+//   every other path divides exactly.
+// Launch failures are reported through C10_CUDA_KERNEL_LAUNCH_CHECK().
+// NOTE(review): presumably shared_size is in bytes and is the dynamic
+// shared-memory argument of the launch - confirm against the launch config.
+template +void __forceinline__ run_kernel(b16* a_mat, b16* out, int64_t num_chunks, cudaStream_t stream) { + int64_t shared_size = chunks_per_warp * warps_per_block * 128 * 4; + dim3 block_size = 32 * warps_per_block; + + #define CHECK_SHARED_LIM() { \ + if (shared_size > 48 * 1024) { \ + C10_CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536)); \ + } \ + } \ + + if constexpr(check_masking) { + if (num_chunks % (chunks_per_warp * warps_per_block) != 0) { + dim3 grid_size = ceil_div(ceil_div(num_chunks, chunks_per_warp), warps_per_block); + auto kernel = hadamard_transform_kernel; + CHECK_SHARED_LIM(); + kernel<<>>(a_mat, out, num_chunks); + } else { + dim3 grid_size = num_chunks / chunks_per_warp / warps_per_block; + auto kernel = hadamard_transform_kernel; + CHECK_SHARED_LIM(); + kernel<<>>(a_mat, out, num_chunks); + } + } else { + dim3 grid_size = num_chunks / chunks_per_warp / warps_per_block; + auto kernel = hadamard_transform_kernel; + CHECK_SHARED_LIM(); + kernel<<>>(a_mat, out, num_chunks); + } + + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} +
+// Selects a run_kernel instantiation for Hadamard size `had_size` and launches
+// it on `stream`. `numel` is the total element count and must be a multiple
+// of 256 (num_chunks = numel / 256). Inputs of at most 256 elements only
+// dispatch sizes 2^1..2^8; larger inputs dispatch 2^1..2^15. Neither switch
+// has a default case, so any other had_size launches nothing.
+// NOTE(review): the *_small / *_large constants look like the launch tunings
+// for small and large sizes - confirm which cases use which.
+template +void run_fht(void* a_mat_ptr, void* out_ptr, int64_t numel, int64_t had_size, cudaStream_t stream) { + int64_t num_chunks = numel / 256; // caller required to ensure divisible by 256 + // for size 256, use (2, 1) + // for size 32k use (8, 16) + constexpr int64_t chunks_per_warp_small = 1;// 8; + constexpr int64_t warps_per_block_small = 1;//2;//16; + constexpr int64_t blocks_per_sm_small = 24; + constexpr int64_t chunks_per_warp_large = 2; + constexpr int64_t warps_per_block_large = 1; + constexpr int64_t blocks_per_sm_large = 24; + + b16* a_mat = (b16*) a_mat_ptr; + b16* out = (b16*) out_ptr; + + if (numel <= 256) { + switch (had_size) { + case (1<<1): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<2): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<3): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<4): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<5): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<6): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<7): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<8): run_kernel(a_mat, out, num_chunks, stream); break; + } + } else { + switch (had_size) { + case (1<<1): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<2): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<3): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<4): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<5): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<6): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<7): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<8): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<9): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<10): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<11): run_kernel(a_mat, 
out, num_chunks, stream); break; + case (1<<12): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<13): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<14): run_kernel(a_mat, out, num_chunks, stream); break; + case (1<<15): run_kernel(a_mat, out, num_chunks, stream); break; + } + } +} + +template void run_fht(void* a_mat_ptr, void* out_ptr, int64_t numel, int64_t had_size, cudaStream_t stream); +template void run_fht(void* a_mat_ptr, void* out_ptr, int64_t numel, int64_t had_size, cudaStream_t stream); + +} // namespace hadacore + +constexpr bool is_power_of_two(int x) { return x && !(x & (x - 1)); }
+
+// Applies the Hadamard transform along the last dimension of `x`.
+//
+// `x` must be a CUDA tensor of dtype fp16 or bf16 whose last dimension is a
+// power of two no larger than 2^15; anything else fails a TORCH_CHECK.
+// The data is viewed as rows of `had_size` elements and padded with extra
+// rows up to a multiple of 256 elements, which is what run_fht requires; the
+// padding rows are dropped again before returning.
+//
+// inplace == true:  the result ends up in the storage of `x`, and `x` or a
+//                   view of it with the original shape is returned.
+// inplace == false: `x` is left untouched and a new tensor with the original
+//                   shape is returned.
+torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace) {
+    auto dtype = x.scalar_type();
+    TORCH_CHECK(dtype == torch::ScalarType::Half || dtype == torch::ScalarType::BFloat16, "Only fp16 and bf16 supported currently");
+    TORCH_CHECK(x.is_cuda());
+
+    const int had_size = x.size(-1);
+    TORCH_CHECK(is_power_of_two(had_size) && (had_size <= (1U << 15)),
+        "Only power of two Hadamard sizes up to 2^15 are supported, got ", had_size);
+
+    // Own a copy of the shape: sizes() only hands out a borrowed IntArrayRef.
+    const auto res_shape = x.sizes().vec();
+
+    // A size-1 transform is the identity and run_fht has no case for it; an
+    // empty tensor would otherwise reach a zero-sized kernel launch.
+    if (had_size == 1 || x.numel() == 0) {
+        return inplace ? x : x.clone();
+    }
+
+    // Work on a local handle so the caller's tensor is never rebound; the
+    // in-place copy-back below relies on `x` still naming the original storage.
+    torch::Tensor work = x.reshape({-1, had_size});
+
+    const auto numel = work.numel();
+    if (numel % 256 != 0) {
+        work = torch::nn::functional::pad(work, torch::nn::functional::PadFuncOptions({0, 0, 0, (256 - numel % 256) / had_size}));
+    }
+
+    // run_fht only receives a pointer and an element count, so the rows have
+    // to be densely packed, not merely unit-stride in the last dimension.
+    if (!work.is_contiguous()) {
+        work = work.contiguous();
+    }
+    torch::Tensor out = inplace ? work : torch::empty_like(work);
+
+    at::cuda::CUDAGuard device_guard{(char)work.get_device()};
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
+
+    VLLM_DISPATCH_HALF_TYPES(work.scalar_type(), "hadacore_transform_runfht", [&] {
+        auto constexpr SCALAR_TYPE = c10::CppTypeToScalarType<scalar_t>::value;
+        // Write into `out`: for the out-of-place path it is a separate buffer.
+        hadacore::run_fht<SCALAR_TYPE>(work.data_ptr(), out.data_ptr(), work.numel(), had_size, stream);
+    });
+
+    if (numel % 256 != 0) {
+        // Drop the padding rows added above.
+        out = out.index({torch::indexing::Slice(0, numel / had_size)});
+    }
+
+    // The kernel ran on a padded or compacted copy: copy the result back so
+    // the in-place contract holds for the caller's tensor.
+    if (inplace && out.data_ptr() != x.data_ptr()) {
+        x.copy_(out.view(res_shape));
+        return x;
+    }
+    return out.reshape(res_shape);
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { + m.impl("hadacore_transform", &hadacore_transform); +} diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index e3a0e15f5304f..dac9df6048f2a 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -30,6 +30,10 @@ #define __HIP__GFX9__ #endif +#if defined(__HIPCC__) && (defined(__gfx942__) || defined(__gfx950__)) + #define __HIP__FP8MFMA__ +#endif + #if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__)) #define __HIP__GFX11__ #endif @@ -51,6 +55,12 @@ #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) +enum class MFMAType { + F16 = 0, + Fp8 = 1, + Fp4 = 2, +}; + #if defined(__HIP__GFX9__) #define GCN_MFMA_INSTR1 __builtin_amdgcn_mfma_f32_16x16x4f32 @@ -112,6 +122,21 @@ __device__ __forceinline__ floatx4 gcn_mfma16x16x16_instr(const _B16x4& inpA, } } +template +__device__ __forceinline__ floatx4 gcn_mfma16x16x32_instr(const long& inpA, + const long& inpB, + const floatx4& inpC) { + if constexpr (std::is_same::value) { + return __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(inpA, inpB, inpC, absz, + cbid, blgp); + } else if constexpr (std::is_same::value) { + return __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(inpA, inpB, inpC, absz, + cbid, blgp); + } else { + static_assert(false, "unsupported 8b dtype"); + } +} + template __device__ __forceinline__ float to_float(const T& inp) { if constexpr (std::is_same::value) { @@ -256,12 +281,44 @@ __device__ __forceinline__ _B16x8 convert_b8x8_custom(const _B8x8 input) { return ret; } +typedef union u64_cvt { + half f16x4[4]; + int16_t b16x4[4]; + _B8x8 b8x8; + _B16x4 b64; + int64_t i64; +} _T8x8; + +__device__ __forceinline__ _B8x8 convert_b16x8(const _B16x8& input, + _T8x8& Mtemp) { + _T8x8 Qtmp8x8; + + for (int i = 0; i < 2; i++) { + floatx4 q_out = {0, 0, 0, 0}; + q_out = gcn_mfma16x16x16_instr<_Float16, 0, 0, 0>(Mtemp.b64, input.xy[i], + q_out); + Qtmp8x8.b16x4[i * 2] = + __builtin_amdgcn_cvt_pk_fp8_f32(q_out[0], q_out[1], 0, false); + Qtmp8x8.b16x4[i * 2 + 1] = + __builtin_amdgcn_cvt_pk_fp8_f32(q_out[2], q_out[3], 0, false); + } + return Qtmp8x8.b8x8; +}
+
+// Returns the maximum of `val` over the WARP_SIZE-wide group of lanes the
+// caller belongs to.
+// The exchange is an xor butterfly so that every lane ends up holding the
+// full maximum. With __shfl_down only lane 0 would, but the caller lets each
+// lane derive its own q_scale from the returned value, and those scales must
+// agree across lanes.
+__device__ float warpReduceMax(float val) {
+  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+    val = max(val, __shfl_xor(val, offset, WARP_SIZE));
+  }
+  return val;
+}
+
 // grid (num_seqs, num_partitions,num_kv_heads) // block (256) // clang-format off template + int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO, MFMAType MFMA_TYPE> __global__ 
__launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] @@ -367,6 +424,10 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq; int kphysical_block_number[TLOOP]; + #if defined(__HIP__FP8MFMA__) + float q_max = 0; + float q_scale = 1.0; + #endif // fetch k physical block numbers for (int token_depth = 0; token_depth < TLOOP; token_depth++) { @@ -416,6 +477,15 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( Qlocal[qkhe_depth][qkratio].xy[i] = shared_logits[qkhe_depth][rowid][lane16id % GQA_RATIO] [2 * qkratio + i]; + #if defined(__HIP__FP8MFMA__) + if constexpr (KV_DTYPE != vllm::Fp8KVCacheDataType::kAuto && + MFMA_TYPE == MFMAType::Fp8) { + scalar_t* qptr = + reinterpret_cast(&Qlocal[qkhe_depth][qkratio].xy[i]); + for (int k = 0; k < 4; k++) + q_max = fmax(fabs(to_float(qptr[k])), q_max); + } + #endif } } } @@ -515,6 +585,14 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( if constexpr (KV_DTYPE != vllm::Fp8KVCacheDataType::kAuto) { // multiply by k_scale if fp8 kv cache scale2 *= *k_scale; + #if defined(__HIP__FP8MFMA__) + q_max = warpReduceMax(q_max); + constexpr float FP8_E4M3_SCALE_TARGET = 224.0f; + if constexpr (MFMA_TYPE == MFMAType::Fp8) { + q_scale = q_max > 0 ? 
FP8_E4M3_SCALE_TARGET / q_max : 1.0f; + scale2 /= q_scale; + } + #endif } floatx4 d_out[TLOOP]; @@ -534,12 +612,41 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( auto Ktmp = Klocal[token_depth][qkhe_depth]; _B8x16 Ktmp8x16 = *reinterpret_cast<_B8x16*>(&Ktmp); for (int qkratio = 0; qkratio < QK_SIZE_RATIO; qkratio++) { - _B8x8 Ktmp8x8 = Ktmp8x16.xy[qkratio]; - _B16x8 Klocaltmp = convert_b8x8_custom(Ktmp8x8); - for (int i = 0; i < 2; i++) { - d_out[token_depth] = gcn_mfma16x16x16_instr( - Klocaltmp.xy[i], Qlocal[qkhe_depth][qkratio].xy[i], - d_out[token_depth]); + if constexpr (MFMA_TYPE == MFMAType::F16) { + _B8x8 Ktmp8x8 = Ktmp8x16.xy[qkratio]; + _B16x8 Klocaltmp = convert_b8x8_custom(Ktmp8x8); + for (int i = 0; i < 2; i++) { + d_out[token_depth] = gcn_mfma16x16x16_instr( + Klocaltmp.xy[i], Qlocal[qkhe_depth][qkratio].xy[i], + d_out[token_depth]); + } + } else { + #if defined(__HIP__FP8MFMA__) + _T8x8 Ktmp8x8, Qtmp8x8; + Ktmp8x8.b8x8 = Ktmp8x16.xy[qkratio]; + + for (int n = 0; n < 2; n++) { + scalar_t* qptr = reinterpret_cast( + &Qlocal[qkhe_depth][qkratio].xy[n]); + + Qtmp8x8.b16x4[n * 2] = + vllm::fp8::scaled_vec_conversion( + make_float2(to_float(qptr[0]), + to_float(qptr[1])), + q_scale); + Qtmp8x8.b16x4[n * 2 + 1] = + vllm::fp8::scaled_vec_conversion( + make_float2(to_float(qptr[2]), + to_float(qptr[3])), + q_scale); + } + + d_out[token_depth] = + gcn_mfma16x16x32_instr<__hip_fp8_e4m3, 0, 0, 0>( + Ktmp8x8.i64, Qtmp8x8.i64, d_out[token_depth]); + #else + UNREACHABLE_CODE + #endif } } } @@ -629,17 +736,36 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( // disable rtz conversion due to its impact on accuracy. 
constexpr bool LOGITS_RTZ_CONVERSION = false; + #if defined(__HIP__FP8MFMA__) + int rowid_8x8 = rowid / 2; + int offset = rowid % 2; + #endif + // write logits to shared mem for (int token_depth = 0; token_depth < TLOOP; token_depth++) { d_out[token_depth] *= inv_sum_scale; - if constexpr (LOGITS_RTZ_CONVERSION) { - // use rtz conversion for better performance, with negligible impact on - // accuracy - shared_logits[warpid][token_depth][lane16id][rowid] = - from_floatx4_rtz(d_out[token_depth]); + if constexpr (MFMA_TYPE != MFMAType::Fp8) { + if constexpr (LOGITS_RTZ_CONVERSION) { + // use rtz conversion for better performance, with negligible impact on + // accuracy + shared_logits[warpid][token_depth][lane16id][rowid] = + from_floatx4_rtz(d_out[token_depth]); + } else { + shared_logits[warpid][token_depth][lane16id][rowid] = + from_floatx4(d_out[token_depth]); + } } else { - shared_logits[warpid][token_depth][lane16id][rowid] = - from_floatx4(d_out[token_depth]); + #if defined(__HIP__FP8MFMA__) + // cast _B16x4* to _B8x8* + _T8x8& logits_8x8 = *reinterpret_cast<_T8x8*>( + &shared_logits[warpid][token_depth][lane16id][rowid_8x8]); + logits_8x8.b16x4[offset * 2] = __builtin_amdgcn_cvt_pk_fp8_f32( + d_out[token_depth][0], d_out[token_depth][1], 0, false); + logits_8x8.b16x4[offset * 2 + 1] = __builtin_amdgcn_cvt_pk_fp8_f32( + d_out[token_depth][2], d_out[token_depth][3], 0, false); + #else + UNREACHABLE_CODE + #endif } } @@ -692,19 +818,42 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( _B8x16 Vtmp8x16 = *reinterpret_cast<_B8x16*>(&Vtmp); for (int j = 0; j < ELEMS16_ELEMS8_RATIO; j++) { _B8x8 Vtmp8x8 = Vtmp8x16.xy[j]; - _B16x8 Vlocaltmp = convert_b8x8_custom(Vtmp8x8); - for (int i = 0; i < ELEMS8_ELEMS4_RATIO; i++) { - const int offset = - rowid * ELEMS16_ELEMS8_RATIO * ELEMS8_ELEMS4_RATIO + - j * ELEMS8_ELEMS4_RATIO + i; - const int offset1 = offset % ROWS_PER_WARP; - const int offset2 = offset / ROWS_PER_WARP; - // output format 
is 16 qheads across 16 lanes, 16 head elems - // spread across 4 rows - tmp_out = gcn_mfma16x16x16_instr( - Vlocaltmp.xy[i], - shared_logits[vtoken_depth][offset2][lane16id][offset1], - tmp_out); + if constexpr (MFMA_TYPE == MFMAType::F16) { + _B16x8 Vlocaltmp = convert_b8x8_custom(Vtmp8x8); + for (int i = 0; i < ELEMS8_ELEMS4_RATIO; i++) { + const int offset = + rowid * ELEMS16_ELEMS8_RATIO * ELEMS8_ELEMS4_RATIO + + j * ELEMS8_ELEMS4_RATIO + i; + const int offset1 = offset % ROWS_PER_WARP; + const int offset2 = offset / ROWS_PER_WARP; + // output format is 16 qheads across 16 lanes, 16 head elems + // spread across 4 rows + tmp_out = gcn_mfma16x16x16_instr( + Vlocaltmp.xy[i], + shared_logits[vtoken_depth][offset2][lane16id][offset1], + tmp_out); + } + } else { + #if defined(__HIP__FP8MFMA__) + for (int i = 0; i < ELEMS8_ELEMS4_RATIO / 2; i++) { + const int offset = + rowid * ELEMS16_ELEMS8_RATIO * ELEMS8_ELEMS4_RATIO + + j * ELEMS8_ELEMS4_RATIO + i; + const int offset1 = (offset % ROWS_PER_WARP) / 2; + const int offset2 = offset / ROWS_PER_WARP; + // output format is 16 qheads across 16 lanes, 16 head elems + // spread across 4 rows + tmp_out = gcn_mfma16x16x32_instr<__hip_fp8_e4m3, 0, 0, 0>( + reinterpret_cast<_T8x8*>(&Vtmp8x8)->i64, + reinterpret_cast<_T8x8*>( + &shared_logits[vtoken_depth][offset2][lane16id] + [offset1]) + ->i64, + tmp_out); + } + #else + UNREACHABLE_CODE + #endif } } } @@ -1570,7 +1719,8 @@ __device__ __forceinline__ _B16x8 from_floatx8(const floatx8& inp) { // clang-format off template + int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO, + MFMAType MFMA_TYPE> __global__ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] @@ -2337,7 +2487,8 @@ __device__ __forceinline__ _B16x8 from_floatx8(const floatx8& inp) { // clang-format off template + int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO, + MFMAType MFMA_TYPE> 
__global__ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] @@ -2969,7 +3120,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( template + int GQA_RATIO, MFMAType MFMA_TYPE> __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel( const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] @@ -3041,7 +3192,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( #define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO) \ paged_attention_ll4mi_QKV_mfma16_kernel \ + GQA_RATIO, MFMA_TYPE> \ <<>>( \ query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ block_tables_ptr, seq_lens_ptr, query_start_loc_ptr, \ @@ -3069,7 +3220,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( template + bool ALIBI_ENABLED, MFMAType MFMA_TYPE> void paged_attention_custom_launcher( torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, @@ -3225,7 +3376,7 @@ void paged_attention_custom_launcher( template + bool ALIBI_ENABLED, MFMAType MFMA_TYPE> void paged_attention_custom_launcher_navi( torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, @@ -3397,74 +3548,77 @@ void paged_attention_custom_launcher_navi( } #define CALL_CUSTOM_LAUNCHER(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, \ - PSIZE, ALIBI_ENABLED) \ + PSIZE, ALIBI_ENABLED, MFMA_TYPE) \ if (!is_navi) { \ paged_attention_custom_launcher( \ + OUTT, PSIZE, ALIBI_ENABLED, MFMA_TYPE>( \ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ num_kv_heads, scale, block_tables, seq_lens, query_start_loc, \ max_seq_len, alibi_slopes, k_scale, v_scale, fp8_out_scale); \ } else { \ - paged_attention_custom_launcher_navi< \ - T, KVT, KV_DTYPE, 
BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, ALIBI_ENABLED>( \ + paged_attention_custom_launcher_navi( \ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ num_kv_heads, scale, block_tables, seq_lens, query_start_loc, \ max_seq_len, alibi_slopes, k_scale, v_scale); \ } #define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \ - OUTT, PSIZE) \ + OUTT, PSIZE, MFMA_TYPE) \ if (alibi_slopes) { \ CALL_CUSTOM_LAUNCHER(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, \ - true); \ + true, MFMA_TYPE); \ } else { \ CALL_CUSTOM_LAUNCHER(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, \ - false); \ + false, MFMA_TYPE); \ } #if defined(__HIPCC__) && defined(__gfx90a__) - #define CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE) \ + #define CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \ + MFMA_TYPE) \ if (fp8_out_scale) { \ TORCH_CHECK(false, "fp8 out scale unsupported for gfx90a"); \ } else { \ CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, T, \ - 256); \ + 256, MFMA_TYPE); \ } #else - #define CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE) \ + #define CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \ + MFMA_TYPE) \ if (fp8_out_scale) { \ CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \ - uint8_t, 256); \ + uint8_t, 256, MFMA_TYPE); \ } else { \ CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, T, \ - 256); \ + 256, MFMA_TYPE); \ } #endif -#define CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, HEAD_SIZE) \ - switch (block_size) { \ - case 16: \ - CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, 16, HEAD_SIZE); \ - break; \ - case 32: \ - CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, 32, HEAD_SIZE); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ +#define CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, HEAD_SIZE, MFMA_TYPE) \ + switch (block_size) { \ + case 16: \ + CALL_CUSTOM_LAUNCHER_OUT(T, 
KVT, KV_DTYPE, 16, HEAD_SIZE, MFMA_TYPE); \ + break; \ + case 32: \ + CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, 32, HEAD_SIZE, MFMA_TYPE); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ } -#define CALL_CUSTOM_LAUNCHER_BLK_HEAD(T, KVT, KV_DTYPE) \ - switch (head_size) { \ - case 64: \ - CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, 64); \ - break; \ - case 128: \ - CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, 128); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported head size: ", head_size); \ - break; \ +#define CALL_CUSTOM_LAUNCHER_BLK_HEAD(T, KVT, KV_DTYPE, MFMA_TYPE) \ + switch (head_size) { \ + case 64: \ + CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, 64, MFMA_TYPE); \ + break; \ + case 128: \ + CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, 128, MFMA_TYPE); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported head size: ", head_size); \ + break; \ } bool is_navi_gpu() { @@ -3503,28 +3657,43 @@ void paged_attention( const std::optional& alibi_slopes, const std::string& kv_cache_dtype, torch::Tensor& k_scale, torch::Tensor& v_scale, - const std::optional& fp8_out_scale) { + const std::optional& fp8_out_scale, + const std::string& mfma_type) { // clang-format on bool is_navi = is_navi_gpu(); - const int head_size = query.size(2); if (kv_cache_dtype == "auto") { if (query.dtype() == at::ScalarType::Half) { - CALL_CUSTOM_LAUNCHER_BLK_HEAD(_Float16, _Float16, - vllm::Fp8KVCacheDataType::kAuto); + CALL_CUSTOM_LAUNCHER_BLK_HEAD( + _Float16, _Float16, vllm::Fp8KVCacheDataType::kAuto, MFMAType::F16); } else if (query.dtype() == at::ScalarType::BFloat16) { CALL_CUSTOM_LAUNCHER_BLK_HEAD(__hip_bfloat16, __hip_bfloat16, - vllm::Fp8KVCacheDataType::kAuto); + vllm::Fp8KVCacheDataType::kAuto, + MFMAType::F16); } else { TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); } } else if (kv_cache_dtype == "fp8" || kv_cache_dtype == "fp8_e4m3") { if (query.dtype() == at::ScalarType::Half) { - 
CALL_CUSTOM_LAUNCHER_BLK_HEAD(_Float16, uint8_t, - vllm::Fp8KVCacheDataType::kFp8E4M3); + if (mfma_type == "fp8") { + CALL_CUSTOM_LAUNCHER_BLK_HEAD(_Float16, uint8_t, + vllm::Fp8KVCacheDataType::kFp8E4M3, + MFMAType::Fp8); + } else { + CALL_CUSTOM_LAUNCHER_BLK_HEAD(_Float16, uint8_t, + vllm::Fp8KVCacheDataType::kFp8E4M3, + MFMAType::F16); + } } else if (query.dtype() == at::ScalarType::BFloat16) { - CALL_CUSTOM_LAUNCHER_BLK_HEAD(__hip_bfloat16, uint8_t, - vllm::Fp8KVCacheDataType::kFp8E4M3); + if (mfma_type == "fp8") { + CALL_CUSTOM_LAUNCHER_BLK_HEAD(__hip_bfloat16, uint8_t, + vllm::Fp8KVCacheDataType::kFp8E4M3, + MFMAType::Fp8); + } else { + CALL_CUSTOM_LAUNCHER_BLK_HEAD(__hip_bfloat16, uint8_t, + vllm::Fp8KVCacheDataType::kFp8E4M3, + MFMAType::F16); + } } else { TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); } diff --git a/csrc/rocm/ops.h b/csrc/rocm/ops.h index 34dcc9401aae8..b6ee2656746c1 100644 --- a/csrc/rocm/ops.h +++ b/csrc/rocm/ops.h @@ -19,4 +19,5 @@ void paged_attention( const std::optional& query_start_loc, int64_t block_size, int64_t max_seq_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, torch::Tensor& k_scale, - torch::Tensor& v_scale, const std::optional& fp8_out_scale); + torch::Tensor& v_scale, const std::optional& fp8_out_scale, + const std::string& mfma_type); diff --git a/csrc/rocm/torch_bindings.cpp b/csrc/rocm/torch_bindings.cpp index 66bdc448da3ca..c0c4daef64f05 100644 --- a/csrc/rocm/torch_bindings.cpp +++ b/csrc/rocm/torch_bindings.cpp @@ -48,7 +48,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) { " Tensor? alibi_slopes," " str kv_cache_dtype," " Tensor k_scale, Tensor v_scale," - " Tensor? fp8_out_scale) -> ()"); + " Tensor? 
fp8_out_scale," + " str mfma_type) -> ()"); rocm_ops.impl("paged_attention", torch::kCUDA, &paged_attention); } diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 81aca7b8860d5..f22e23519831f 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -613,6 +613,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "int pad_slot_id) -> ()"); ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd); + // Hadamard transforms + ops.def("hadacore_transform(Tensor! x, bool inplace) -> Tensor"); + #ifndef USE_ROCM // Compute per-token-group FP8 quantized tensor and scaling factor. ops.def( diff --git a/docker/Dockerfile b/docker/Dockerfile index 307e9658f7175..17f8e6043f895 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -196,6 +196,7 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0 # Flag to control whether to use pre-built vLLM wheels ARG VLLM_USE_PRECOMPILED="" +ARG VLLM_MAIN_CUDA_VERSION="" # if USE_SCCACHE is set, use sccache to speed up compilation RUN --mount=type=cache,target=/root/.cache/uv \ @@ -213,6 +214,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && export SCCACHE_IDLE_TIMEOUT=0 \ && export CMAKE_BUILD_TYPE=Release \ && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \ + && export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \ && export VLLM_DOCKER_BUILD_CONTEXT=1 \ && sccache --show-stats \ && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ @@ -375,7 +377,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # Install FlashInfer from source ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" # Keep this in sync with "flashinfer" extra in setup.py -ARG FLASHINFER_GIT_REF="v0.3.0" +ARG FLASHINFER_GIT_REF="v0.3.1" # Flag to control whether to compile FlashInfer AOT kernels # Set to "true" to enable AOT compilation: # docker build --build-arg FLASHINFER_AOT_COMPILE=true ... 
diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch index e147b97f0e056..ae12ed0f7cabb 100644 --- a/docker/Dockerfile.nightly_torch +++ b/docker/Dockerfile.nightly_torch @@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2. # build flashinfer for torch nightly from source around 10 mins -# release version: v0.2.2.post1 +# release version: v0.3.1 # todo(elainewy): cache flashinfer build result for faster build ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ @@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ echo "git clone flashinfer..." \ && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \ && cd flashinfer \ - && git checkout v0.2.2.post1 \ + && git checkout v0.3.1 \ && git submodule update --init --recursive \ && echo "finish git clone flashinfer..." \ && rm -rf build \ diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index dc742c8fcf2cd..87d34d207cde3 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -840,7 +840,6 @@ Some HF processors directly insert feature tokens without replacing anything in Examples: - BLIP-2 (insert at start of prompt): -- Florence2 (insert at start of prompt): - Molmo (insert after `<|endoftext|>` token): ### Handling prompt updates unrelated to multi-modal data diff --git a/docs/deployment/frameworks/streamlit.md b/docs/deployment/frameworks/streamlit.md index af0f0690c68e2..c119878f137a4 100644 --- a/docs/deployment/frameworks/streamlit.md +++ b/docs/deployment/frameworks/streamlit.md @@ -6,35 +6,33 @@ It can be quickly integrated with vLLM as a backend API server, enabling powerfu ## Prerequisites -- Setup vLLM environment +Set up the vLLM environment by installing all required packages: + +```bash +pip install vllm streamlit openai +``` ## Deploy -- Start the vLLM server with the supported 
chat completion model, e.g. +1. Start the vLLM server with a supported chat completion model, e.g. -```bash -vllm serve qwen/Qwen1.5-0.5B-Chat -``` + ```bash + vllm serve Qwen/Qwen1.5-0.5B-Chat + ``` -- Install streamlit and openai: +1. Use the script: -```bash -pip install streamlit openai -``` +1. Start the streamlit web UI and start to chat: -- Use the script: - -- Start the streamlit web UI and start to chat: - -```bash -streamlit run streamlit_openai_chatbot_webserver.py - -# or specify the VLLM_API_BASE or VLLM_API_KEY -VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" \ + ```bash streamlit run streamlit_openai_chatbot_webserver.py -# start with debug mode to view more details -streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug -``` + # or specify the VLLM_API_BASE or VLLM_API_KEY + VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" \ + streamlit run streamlit_openai_chatbot_webserver.py -![](../../assets/deployment/streamlit-chat.png) + # start with debug mode to view more details + streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug + ``` + + ![Chat with vLLM assistant in Streamlit](../../assets/deployment/streamlit-chat.png) diff --git a/docs/design/huggingface_integration.md b/docs/design/huggingface_integration.md index 5a7582c86d49f..412ce658b92a2 100644 --- a/docs/design/huggingface_integration.md +++ b/docs/design/huggingface_integration.md @@ -1,31 +1,31 @@ # Integration with Hugging Face -This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`. +This document describes how vLLM integrates with Hugging Face libraries. We will explain step by step what happens under the hood when we run `vllm serve`. -Let's say we want to serve the popular QWen model by running `vllm serve Qwen/Qwen2-7B`. +Let's say we want to serve the popular Qwen model by running `vllm serve Qwen/Qwen2-7B`. 1. 
The `model` argument is `Qwen/Qwen2-7B`. vLLM determines whether this model exists by checking for the corresponding config file `config.json`. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L162-L182) for the implementation. Within this process: - If the `model` argument corresponds to an existing local path, vLLM will load the config file directly from this path. - - If the `model` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the HuggingFace cache works. - - If the `model` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file. + - If the `model` argument is a Hugging Face model ID consisting of a username and model name, vLLM will first try to use the config file from the Hugging Face local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the Hugging Face cache works. 
+ - If the `model` argument is a Hugging Face model ID but it is not found in the cache, vLLM will download the config file from the Hugging Face model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file. 2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L185-L186) for the implementation. 3. Next, vLLM [inspects](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L189) the `model_type` field in the config dictionary to [generate](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L190-L216) the config object to use. There are some `model_type` values that vLLM directly supports; see [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L48) for the list. If the `model_type` is not in the list, vLLM will use [AutoConfig.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoConfig.from_pretrained) to load the config class, with `model`, `--revision`, and `--trust_remote_code` as the arguments. Please note that: - - HuggingFace also has its own logic to determine the config class to use. 
It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, HuggingFace will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example. - - The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled. + - Hugging Face also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, Hugging Face will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example. + - The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, Hugging Face will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled. 4. Subsequently, vLLM applies some historical patches to the config object. 
These are mostly related to RoPE configuration; see [here](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/config.py#L244) for the implementation. 5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the `architectures` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in [its registry](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/registry.py#L80). If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For `Qwen/Qwen2-7B`, the `architectures` field is `["Qwen2ForCausalLM"]`, which corresponds to the `Qwen2ForCausalLM` class in [vLLM's code](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/qwen2.py#L364). This class will initialize itself depending on various configs. -Beyond that, there are two more things vLLM depends on HuggingFace for. +Beyond that, there are two more things vLLM depends on Hugging Face for. -1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. 
After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24). +1. **Tokenizer**: vLLM uses the tokenizer from Hugging Face to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check Hugging Face's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24). -2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights. +2. 
**Model weight**: vLLM downloads the model weight from the Hugging Face model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights. - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385). Please note that: -This completes the integration between vLLM and HuggingFace. +This completes the integration between vLLM and Hugging Face. -In summary, vLLM reads the config file `config.json`, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository. +In summary, vLLM reads the config file `config.json`, tokenizer, and model weight from the Hugging Face model hub or a local directory. It uses the config class from either vLLM, Hugging Face transformers, or loads the config class from the model's repository. 
diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md index 01c5f5fc02f3e..9e64c6f2540af 100644 --- a/docs/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -165,7 +165,19 @@ There are scenarios where the PyTorch dependency cannot be easily installed with - Building vLLM with PyTorch nightly or a custom PyTorch build. - Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run `uv pip install --index-url https://download.pytorch.org/whl/nightly/cu128 torch torchvision torchaudio` to [install PyTorch nightly](https://pytorch.org/get-started/locally/) and then build vLLM on top of it. -To build vLLM using an existing PyTorch installation, it is recommended to use `uv`, because it has [a unique mechanism](https://docs.astral.sh/uv/concepts/projects/config/#disabling-build-isolation) for disabling build isolation for specific packages and vLLM leverages this mechanism to specify `torch` as the package to disable build isolation. +To build vLLM using an existing PyTorch installation: + +```bash +# install PyTorch first, either from PyPI or from source +git clone https://github.com/vllm-project/vllm.git +cd vllm +python use_existing_torch.py +uv pip install -r requirements/build.txt +uv pip install --no-build-isolation -e . +``` + +Alternatively: if you are exclusively using `uv` to create and manage virtual environments, it has [a unique mechanism](https://docs.astral.sh/uv/concepts/projects/config/#disabling-build-isolation) +for disabling build isolation for specific packages. 
vLLM can leverage this mechanism to specify `torch` as the package to disable build isolation for: ```bash # install PyTorch first, either from PyPI or from source diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index d2fbb1870dde6..0521a22c07029 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -228,7 +228,7 @@ outputs = llm.embed(["Follow the white rabbit."], print(outputs[0].outputs) ``` -A code example can be found here: +A code example can be found here: ### Online Inference @@ -258,4 +258,4 @@ Expected output: {"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}} ``` -An OpenAI client example can be found here: +An OpenAI client example can be found here: diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 6295a2aa8dc2f..73834ddd0c5d6 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -328,10 +328,9 @@ th { | `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | ✅︎ | | `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. 
| ✅︎ | ✅︎ | ✅︎ | +| `BailingMoeV2ForCausalLM` | Ling | `inclusionAI/Ling-mini-2.0`, etc. | ✅︎ | ✅︎ | ✅︎ | | `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ | | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | ✅︎ | -| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | | -| `MBartForConditionalGeneration` | mBART | `facebook/mbart-large-en-ro`, `facebook/mbart-large-50`, etc. | | | | | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ | | `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ | @@ -425,9 +424,6 @@ Some models are supported only via the [Transformers backend](#transformers). Th !!! note Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. -!!! note - Some mBART models' config files do not have an `architecture` defined. Therefore, you need to use `--hf-overrides '{"architectures": ["MBartForConditionalGeneration"]}'` to explicitly specify the use of the `MBartForConditionalGeneration` architecture. - ### Pooling Models See [this page](./pooling_models.md) for more information on how to use pooling models. @@ -530,7 +526,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A ``` !!! note - Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: . + Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: . 
```bash vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' @@ -624,9 +620,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | ✅︎ | | `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I+ | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ | | `DeepseekVLV2ForCausalLM`^ | DeepSeek-VL2 | T + I+ | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ | -| `DonutForConditionalGeneration`^ | Donut | T + I | `ByteDance/Dolphin`, `naver-clova-ix/donut-base-finetuned-docvqa`, etc. | | | | | `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I+/ V+ | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | ✅︎ | -| `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | | | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ | | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ | | `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | @@ -653,7 +647,6 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ | | `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ | | `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. 
| ✅︎ | ✅︎ | ✅︎ | -| `MllamaForConditionalGeneration` | Llama 3.2 | T + I+ | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | | | `MolmoForCausalLM` | Molmo | T + I+ | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ | | `NVLM_D_Model` | NVLM-D 1.0 | T + I+ | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ | | `Ovis` | Ovis2, Ovis1.6 | T + I+ | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ | diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index dfed15d4ace97..181a874efa3cb 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -239,7 +239,7 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai If the model has a [chat template][chat-template], you can replace `inputs` with a list of `messages` (same schema as [Chat API][chat-api]) which will be treated as a single prompt to the model. -Code example: +Code example: #### Multi-modal inputs @@ -313,7 +313,7 @@ and passing a list of `messages` in the request. Refer to the examples below for `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code example below for details. -Full example: +Full example: #### Extra parameters @@ -421,7 +421,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_ The input format is the same as [Embeddings API][embeddings-api], but the output data can contain an arbitrary nested list, not just a 1-D list of floats. -Code example: +Code example: [](){ #classification-api } @@ -431,7 +431,7 @@ Our Classification API directly supports Hugging Face sequence-classification mo We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities. 
-Code example: +Code example: #### Example Requests @@ -760,7 +760,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin [Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with popular open-source tools. -Code example: +Code example: #### Example Request diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index d404c87e8f5a7..340aaf54bb720 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -120,7 +120,7 @@ Please note that prefix caching is not yet supported for any of the above models Whisper is supported. Other models requiring cross-attention between separate encoder and decoder (e.g., `BartForConditionalGeneration`, -`MllamaForConditionalGeneration`) are not yet supported. +`MllamaForConditionalGeneration`) are not supported. ### Features diff --git a/examples/offline_inference/dolphin.py b/examples/offline_inference/dolphin.py deleted file mode 100644 index d2ba27cd1e027..0000000000000 --- a/examples/offline_inference/dolphin.py +++ /dev/null @@ -1,311 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import copy -import os -from dataclasses import dataclass - -import cv2 -import numpy as np -import regex as re -from PIL import Image -from transformers import DonutProcessor - -from vllm import LLM, SamplingParams -from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt -from vllm.multimodal.utils import fetch_image - - -# Copied from https://github.com/bytedance/Dolphin/utils/utils.py -@dataclass -class ImageDimensions: - original_w: int - original_h: int - padded_w: int - padded_h: int - - -# Copied from https://github.com/bytedance/Dolphin/utils/utils.py -def map_to_original_coordinates( - x1, y1, x2, y2, dims: ImageDimensions -) -> tuple[int, int, int, int]: - try: - top = (dims.padded_h - dims.original_h) // 2 - left = (dims.padded_w - 
dims.original_w) // 2 - orig_x1 = max(0, x1 - left) - orig_y1 = max(0, y1 - top) - orig_x2 = min(dims.original_w, x2 - left) - orig_y2 = min(dims.original_h, y2 - top) - if orig_x2 <= orig_x1: - orig_x2 = min(orig_x1 + 1, dims.original_w) - if orig_y2 <= orig_y1: - orig_y2 = min(orig_y1 + 1, dims.original_h) - return int(orig_x1), int(orig_y1), int(orig_x2), int(orig_y2) - except Exception as e: - print(f"map_to_original_coordinates error: {str(e)}") - return 0, 0, min(100, dims.original_w), min(100, dims.original_h) - - -# Copied from https://github.com/bytedance/Dolphin/utils/utils.py -def adjust_box_edges(image, boxes: list[list[float]], max_pixels=15, threshold=0.2): - if isinstance(image, str): - image = cv2.imread(image) - img_h, img_w = image.shape[:2] - new_boxes = [] - for box in boxes: - best_box = copy.deepcopy(box) - - def check_edge(img, current_box, i, is_vertical): - edge = current_box[i] - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - _, binary = cv2.threshold( - gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU - ) - if is_vertical: - line = binary[current_box[1] : current_box[3] + 1, edge] - else: - line = binary[edge, current_box[0] : current_box[2] + 1] - transitions = np.abs(np.diff(line)) - return np.sum(transitions) / len(transitions) - - edges = [(0, -1, True), (2, 1, True), (1, -1, False), (3, 1, False)] - current_box = copy.deepcopy(box) - current_box[0] = min(max(current_box[0], 0), img_w - 1) - current_box[1] = min(max(current_box[1], 0), img_h - 1) - current_box[2] = min(max(current_box[2], 0), img_w - 1) - current_box[3] = min(max(current_box[3], 0), img_h - 1) - - for i, direction, is_vertical in edges: - best_score = check_edge(image, current_box, i, is_vertical) - if best_score <= threshold: - continue - for step in range(max_pixels): - current_box[i] += direction - if i == 0 or i == 2: - current_box[i] = min(max(current_box[i], 0), img_w - 1) - else: - current_box[i] = min(max(current_box[i], 0), img_h - 1) - score = 
check_edge(image, current_box, i, is_vertical) - if score < best_score: - best_score = score - best_box = copy.deepcopy(current_box) - if score <= threshold: - break - new_boxes.append(best_box) - return new_boxes - - -# Copied from https://github.com/bytedance/Dolphin/utils/utils.py -def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_box=None): - try: - x1, y1 = int(coords[0] * dims.padded_w), int(coords[1] * dims.padded_h) - x2, y2 = int(coords[2] * dims.padded_w), int(coords[3] * dims.padded_h) - x1, y1, x2, y2 = ( - max(0, min(x1, dims.padded_w - 1)), - max(0, min(y1, dims.padded_h - 1)), - max(0, min(x2, dims.padded_w)), - max(0, min(y2, dims.padded_h)), - ) - if x2 <= x1: - x2 = min(x1 + 1, dims.padded_w) - if y2 <= y1: - y2 = min(y1 + 1, dims.padded_h) - new_boxes = adjust_box_edges(padded_image, [[x1, y1, x2, y2]]) - x1, y1, x2, y2 = new_boxes[0] - x1, y1, x2, y2 = ( - max(0, min(x1, dims.padded_w - 1)), - max(0, min(y1, dims.padded_h - 1)), - max(0, min(x2, dims.padded_w)), - max(0, min(y2, dims.padded_h)), - ) - if x2 <= x1: - x2 = min(x1 + 1, dims.padded_w) - if y2 <= y1: - y2 = min(y1 + 1, dims.padded_h) - if previous_box is not None: - prev_x1, prev_y1, prev_x2, prev_y2 = previous_box - if (x1 < prev_x2 and x2 > prev_x1) and (y1 < prev_y2 and y2 > prev_y1): - y1 = prev_y2 - y1 = min(y1, dims.padded_h - 1) - if y2 <= y1: - y2 = min(y1 + 1, dims.padded_h) - new_previous_box = [x1, y1, x2, y2] - orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates( - x1, y1, x2, y2, dims - ) - return x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box - except Exception as e: - print(f"process_coordinates error: {str(e)}") - orig_x1, orig_y1, orig_x2, orig_y2 = ( - 0, - 0, - min(100, dims.original_w), - min(100, dims.original_h), - ) - return 0, 0, 100, 100, orig_x1, orig_y1, orig_x2, orig_y2, [0, 0, 100, 100] - - -# Copied from https://github.com/bytedance/Dolphin/utils/utils.py -def prepare_image(image) -> 
tuple[np.ndarray, ImageDimensions]: - try: - image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) - original_h, original_w = image_cv.shape[:2] - max_size = max(original_h, original_w) - top = (max_size - original_h) // 2 - bottom = max_size - original_h - top - left = (max_size - original_w) // 2 - right = max_size - original_w - left - padded_image = cv2.copyMakeBorder( - image_cv, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0) - ) - padded_h, padded_w = padded_image.shape[:2] - dimensions = ImageDimensions( - original_w=original_w, - original_h=original_h, - padded_w=padded_w, - padded_h=padded_h, - ) - return padded_image, dimensions - except Exception as e: - print(f"prepare_image error: {str(e)}") - h, w = image.height, image.width - dimensions = ImageDimensions(original_w=w, original_h=h, padded_w=w, padded_h=h) - return np.zeros((h, w, 3), dtype=np.uint8), dimensions - - -# Copied from https://github.com/bytedance/Dolphin/utils/utils.py -def parse_layout_string(bbox_str): - """Parse layout string using regular expressions""" - pattern = r"\[(\d*\.?\d+),\s*(\d*\.?\d+),\s*(\d*\.?\d+),\s*(\d*\.?\d+)\]\s*(\w+)" - matches = re.finditer(pattern, bbox_str) - - parsed_results = [] - for match in matches: - coords = [float(match.group(i)) for i in range(1, 5)] - label = match.group(5).strip() - parsed_results.append((coords, label)) - - return parsed_results - - -model_id = "ByteDance/Dolphin" - -# The input image size for Dolphin is 896 x 896, -# and the patch_size is 4 x 4. -# Therefore, the initial number of patches is: -# Height: 896 / 4 = 224 patches -# Width: 896 / 4 = 224 patches - -# The Dolphin model uses a staged downsampling approach, -# defined by the "depths": [2, 2, 14, 2] configuration. -# Before entering stages 2, 3, and 4, a "Patch Merging" operation is performed, -# which halves the feature map's dimensions (dividing both height and width by 2). 
-# Before Stage 2: The size changes from 224 x 224 to (224/2) x (224/2) = 112 x 112. -# Before Stage 3: The size changes from 112 x 112 to (112/2) x (112/2) = 56 x 56. -# Before Stage 4: The size changes from 56 x 56 to (56/2) x (56/2) = 28 x 28. - -# Because vLLM needs to fill the image features with an encoder_prompt, -# and the encoder_prompt will have `` tokens added when tokenized, -# we need to construct an encoder_prompt with a length of 28 x 28 - 1 = 783. -encoder_prompt = "".join(["0"] * 783) -sampling_params = SamplingParams( - temperature=0.0, - max_tokens=2048, -) - -processor = DonutProcessor.from_pretrained(model_id) -llm = LLM( - model=model_id, - dtype="float16", - max_num_seqs=8, - hf_overrides={"architectures": ["DonutForConditionalGeneration"]}, -) - -parser = argparse.ArgumentParser() -parser.add_argument( - "--image_path", type=str, default=None, help="Path to a local image file." -) -args = parser.parse_args() - -if args.image_path: - if not os.path.exists(args.image_path): - raise FileNotFoundError(f"Error: File not found at {args.image_path}") - image = Image.open(args.image_path).convert("RGB") -else: - image = fetch_image( - "https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/0.jpg" - ) - - -prompt = "Parse the reading order of this document. 
" -decoder_prompt = f"{prompt}" -decoder_prompt_tokens = TokensPrompt( - prompt_token_ids=processor.tokenizer(decoder_prompt, add_special_tokens=False)[ - "input_ids" - ] -) -enc_dec_prompt = ExplicitEncoderDecoderPrompt( - encoder_prompt=TextPrompt(prompt=encoder_prompt, multi_modal_data={"image": image}), - decoder_prompt=decoder_prompt_tokens, -) -layout_outputs = llm.generate(prompts=enc_dec_prompt, sampling_params=sampling_params) -layout_result_str = layout_outputs[0].outputs[0].text -print(f"Layout analysis output:\n{layout_result_str}") - -padded_image, dims = prepare_image(image) -layout_results = parse_layout_string(layout_result_str) -text_table_elements = [] -previous_box = None -reading_order = 0 -for bbox_coords, label in layout_results: - if label == "fig": - continue - try: - x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, previous_box = ( - process_coordinates(bbox_coords, padded_image, dims, previous_box) - ) - cropped = padded_image[y1:y2, x1:x2] - if cropped.size > 0 and cropped.shape[0] > 3 and cropped.shape[1] > 3: - pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)) - prompt_ocr = ( - "Parse the table in the image. " - if label == "tab" - else "Read text in the image. 
" - ) - text_table_elements.append( - { - "crop": pil_crop, - "prompt": prompt_ocr, - "reading_order": reading_order, - } - ) - reading_order += 1 - except Exception as e: - print(f"Error processing bbox (label: {label}): {str(e)}") - continue - -if text_table_elements: - batch_prompts = [] - for elem in text_table_elements: - decoder_prompt_str = f"{elem['prompt']}" - decoder_prompt_tokens = TokensPrompt( - prompt_token_ids=processor.tokenizer( - decoder_prompt_str, add_special_tokens=False - )["input_ids"] - ) - enc_dec_prompt = ExplicitEncoderDecoderPrompt( - encoder_prompt=TextPrompt( - prompt=encoder_prompt, multi_modal_data={"image": elem["crop"]} - ), - decoder_prompt=decoder_prompt_tokens, - ) - batch_prompts.append(enc_dec_prompt) - batch_outputs = llm.generate(prompts=batch_prompts, sampling_params=sampling_params) - for i, output in enumerate(batch_outputs): - text_table_elements[i]["text"] = output.outputs[0].text.strip() - -print("------" * 8) -text_table_elements.sort(key=lambda x: x["reading_order"]) -for elem in text_table_elements: - print(elem.get("text", "")) diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py deleted file mode 100644 index 957db3c23b863..0000000000000 --- a/examples/offline_inference/encoder_decoder.py +++ /dev/null @@ -1,195 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Demonstrate prompting of text-to-text -encoder/decoder models, specifically BART and mBART. - -This script is refactored to allow model selection via command-line arguments. - -NOTE: This example is not yet supported in V1. 
-""" - -import argparse -from typing import NamedTuple, Optional - -from vllm import LLM, SamplingParams -from vllm.inputs import ( - ExplicitEncoderDecoderPrompt, - TextPrompt, - TokensPrompt, - zip_enc_dec_prompts, -) - - -class ModelRequestData(NamedTuple): - """ - Holds the configuration for a specific model, including its - HuggingFace ID and the prompts to use for the demo. - """ - - model_id: str - encoder_prompts: list - decoder_prompts: list - hf_overrides: Optional[dict] = None - - -def get_bart_config() -> ModelRequestData: - """ - Returns the configuration for facebook/bart-large-cnn. - This uses the exact test cases from the original script. - """ - encoder_prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "An encoder prompt", - ] - decoder_prompts = [ - "A decoder prompt", - "Another decoder prompt", - ] - return ModelRequestData( - model_id="facebook/bart-large-cnn", - encoder_prompts=encoder_prompts, - decoder_prompts=decoder_prompts, - ) - - -def get_mbart_config() -> ModelRequestData: - """ - Returns the configuration for facebook/mbart-large-en-ro. - This uses prompts suitable for an English-to-Romanian translation task. - """ - encoder_prompts = [ - "The quick brown fox jumps over the lazy dog.", - "How are you today?", - ] - decoder_prompts = ["", ""] - hf_overrides = {"architectures": ["MBartForConditionalGeneration"]} - return ModelRequestData( - model_id="facebook/mbart-large-en-ro", - encoder_prompts=encoder_prompts, - decoder_prompts=decoder_prompts, - hf_overrides=hf_overrides, - ) - - -MODEL_GETTERS = { - "bart": get_bart_config, - "mbart": get_mbart_config, -} - - -def create_all_prompt_types( - encoder_prompts_raw: list, - decoder_prompts_raw: list, - tokenizer, -) -> list: - """ - Generates a list of diverse prompt types for demonstration. - This function is generic and uses the provided raw prompts - to create various vLLM input objects. 
- """ - text_prompt_raw = encoder_prompts_raw[0] - text_prompt = TextPrompt(prompt=encoder_prompts_raw[1 % len(encoder_prompts_raw)]) - tokens_prompt = TokensPrompt( - prompt_token_ids=tokenizer.encode( - encoder_prompts_raw[2 % len(encoder_prompts_raw)] - ) - ) - - decoder_tokens_prompt = TokensPrompt( - prompt_token_ids=tokenizer.encode(decoder_prompts_raw[0]) - ) - single_prompt_examples = [ - text_prompt_raw, - text_prompt, - tokens_prompt, - ] - explicit_pair_examples = [ - ExplicitEncoderDecoderPrompt( - encoder_prompt=text_prompt_raw, - decoder_prompt=decoder_tokens_prompt, - ), - ExplicitEncoderDecoderPrompt( - encoder_prompt=text_prompt, - decoder_prompt=decoder_prompts_raw[1 % len(decoder_prompts_raw)], - ), - ExplicitEncoderDecoderPrompt( - encoder_prompt=tokens_prompt, - decoder_prompt=text_prompt, - ), - ] - zipped_prompt_list = zip_enc_dec_prompts( - encoder_prompts_raw, - decoder_prompts_raw, - ) - return single_prompt_examples + explicit_pair_examples + zipped_prompt_list - - -def create_sampling_params() -> SamplingParams: - """Create a sampling params object.""" - return SamplingParams( - temperature=0, - top_p=1.0, - min_tokens=0, - max_tokens=30, - ) - - -def print_outputs(outputs: list): - """Formats and prints the generation outputs.""" - print("-" * 80) - for i, output in enumerate(outputs): - prompt = output.prompt - encoder_prompt = output.encoder_prompt - generated_text = output.outputs[0].text - print(f"Output {i + 1}:") - print(f"Encoder Prompt: {encoder_prompt!r}") - print(f"Decoder Prompt: {prompt!r}") - print(f"Generated Text: {generated_text!r}") - print("-" * 80) - - -def main(args): - """Main execution function.""" - model_key = args.model - if model_key not in MODEL_GETTERS: - raise ValueError( - f"Unknown model: {model_key}. 
" - f"Available models: {list(MODEL_GETTERS.keys())}" - ) - config_getter = MODEL_GETTERS[model_key] - model_config = config_getter() - - print(f"🚀 Running demo for model: {model_config.model_id}") - llm = LLM( - model=model_config.model_id, - dtype="float", - hf_overrides=model_config.hf_overrides, - ) - tokenizer = llm.llm_engine.get_tokenizer_group() - prompts = create_all_prompt_types( - encoder_prompts_raw=model_config.encoder_prompts, - decoder_prompts_raw=model_config.decoder_prompts, - tokenizer=tokenizer, - ) - sampling_params = create_sampling_params() - outputs = llm.generate(prompts, sampling_params) - print_outputs(outputs) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="A flexible demo for vLLM encoder-decoder models." - ) - parser.add_argument( - "--model", - "-m", - type=str, - default="bart", - choices=MODEL_GETTERS.keys(), - help="The short name of the model to run.", - ) - args = parser.parse_args() - main(args) diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index 35e9203d1caf0..4a1b0c40604b2 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -13,8 +13,6 @@ from typing import NamedTuple from vllm import LLM, EngineArgs, PromptType, SamplingParams from vllm.assets.audio import AudioAsset -from vllm.assets.image import ImageAsset -from vllm.multimodal.utils import fetch_image from vllm.utils import FlexibleArgumentParser @@ -23,113 +21,6 @@ class ModelRequestData(NamedTuple): prompts: Sequence[PromptType] -def run_donut(): - engine_args = EngineArgs( - model="naver-clova-ix/donut-base-finetuned-docvqa", - max_num_seqs=2, - limit_mm_per_prompt={"image": 1}, - dtype="float16", - hf_overrides={"architectures": ["DonutForConditionalGeneration"]}, - ) - - # The input image size for donut-base-finetuned-docvqa is 2560 x 1920, - # and the patch_size is 4 x 
4. - # Therefore, the initial number of patches is: - # Height: 1920 / 4 = 480 patches - # Width: 2560 / 4 = 640 patches - # The Swin model uses a staged downsampling approach, - # defined by the "depths": [2, 2, 14, 2] configuration. - # Before entering stages 2, 3, and 4, a "Patch Merging" operation is performed, - # which halves the feature map's dimensions (dividing both height and width by 2). - # Before Stage 2: The size changes from 480 x 640 to (480/2) x (640/2) = 240 x 320. - # Before Stage 3: The size changes from 240 x 320 to (240/2) x (320/2) = 120 x 160. - # Before Stage 4: The size changes from 120 x 160 to (120/2) x (160/2) = 60 x 80. - # Because vLLM needs to fill the image features with an encoder_prompt, - # and the encoder_prompt will have `` tokens added when tokenized, - # we need to construct an encoder_prompt with a length of 60 x 80 - 1 = 4799. - prompts = [ - { - "encoder_prompt": { - "prompt": "".join(["$"] * 4799), - "multi_modal_data": { - "image": fetch_image( - "https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/0.jpg" - ) # noqa: E501 - }, - }, - "decoder_prompt": "What time is the coffee break?", # noqa: E501 - }, - ] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - -def run_florence2(): - engine_args = EngineArgs( - model="microsoft/Florence-2-large", - tokenizer="Isotr0py/Florence-2-tokenizer", - max_num_seqs=8, - trust_remote_code=True, - limit_mm_per_prompt={"image": 1}, - dtype="half", - ) - - prompts = [ - { # implicit prompt with task token - "prompt": "", - "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image}, - }, - { # explicit encoder/decoder prompt - "encoder_prompt": { - "prompt": "Describe in detail what is shown in the image.", - "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image}, - }, - "decoder_prompt": "", - }, - ] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - -def 
run_mllama(): - engine_args = EngineArgs( - model="meta-llama/Llama-3.2-11B-Vision-Instruct", - max_model_len=8192, - max_num_seqs=2, - limit_mm_per_prompt={"image": 1}, - dtype="half", - ) - - prompts = [ - { # Implicit prompt - "prompt": "<|image|><|begin_of_text|>What is the content of this image?", # noqa: E501 - "multi_modal_data": { - "image": ImageAsset("stop_sign").pil_image, - }, - }, - { # Explicit prompt - "encoder_prompt": { - "prompt": "<|image|>", - "multi_modal_data": { - "image": ImageAsset("stop_sign").pil_image, - }, - }, - "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501 - }, - ] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - def run_whisper(): os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @@ -166,9 +57,6 @@ def run_whisper(): model_example_map = { - "donut": run_donut, - "florence2": run_florence2, - "mllama": run_mllama, "whisper": run_whisper, } @@ -182,7 +70,7 @@ def parse_args(): "--model-type", "-m", type=str, - default="mllama", + default="whisper", choices=model_example_map.keys(), help='Huggingface "model_type".', ) diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md new file mode 100644 index 0000000000000..8693f5e08e0ba --- /dev/null +++ b/examples/offline_inference/pooling/README.md @@ -0,0 +1,33 @@ +# Pooling models + +## Convert llm model to seq cls + +```bash +# for BAAI/bge-reranker-v2-gemma +# Caution: "Yes" and "yes" are two different tokens +python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls +# for mxbai-rerank-v2 +python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls +# 
for Qwen3-Reranker +python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls +``` + +## Embed jina_embeddings_v3 usage + +Only text matching task is supported for now. See + +```bash +python examples/offline_inference/pooling/embed_jina_embeddings_v3.py +``` + +## Embed matryoshka dimensions usage + +```bash +python examples/offline_inference/pooling/embed_matryoshka_fy.py +``` + +## Qwen3 reranker usage + +```bash +python qwen3_reranker.py +``` diff --git a/examples/offline_inference/convert_model_to_seq_cls.py b/examples/offline_inference/pooling/convert_model_to_seq_cls.py similarity index 100% rename from examples/offline_inference/convert_model_to_seq_cls.py rename to examples/offline_inference/pooling/convert_model_to_seq_cls.py diff --git a/examples/offline_inference/embed_jina_embeddings_v3.py b/examples/offline_inference/pooling/embed_jina_embeddings_v3.py similarity index 100% rename from examples/offline_inference/embed_jina_embeddings_v3.py rename to examples/offline_inference/pooling/embed_jina_embeddings_v3.py diff --git a/examples/offline_inference/embed_matryoshka_fy.py b/examples/offline_inference/pooling/embed_matryoshka_fy.py similarity index 100% rename from examples/offline_inference/embed_matryoshka_fy.py rename to examples/offline_inference/pooling/embed_matryoshka_fy.py diff --git a/examples/offline_inference/qwen3_reranker.py b/examples/offline_inference/pooling/qwen3_reranker.py similarity index 100% rename from examples/offline_inference/qwen3_reranker.py rename to examples/offline_inference/pooling/qwen3_reranker.py diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 4b75eb19fcf94..929df8d8bebd9 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -204,28 +204,6 
@@ def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData: ) -# Florence2 -def run_florence2(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - - engine_args = EngineArgs( - model="microsoft/Florence-2-large", - tokenizer="Isotr0py/Florence-2-tokenizer", - max_model_len=4096, - max_num_seqs=2, - trust_remote_code=True, - dtype="bfloat16", - limit_mm_per_prompt={modality: 1}, - ) - - prompts = ["" for _ in questions] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - # Fuyu def run_fuyu(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1008,44 +986,6 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData: ) -# LLama 3.2 -def run_mllama(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - - model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" - - # Note: The default setting of max_num_seqs (256) and - # max_model_len (131072) for this model may cause OOM. - # You may lower either to run this example on lower-end GPUs. - - # The configuration below has been confirmed to launch on a single L40 GPU. 
- engine_args = EngineArgs( - model=model_name, - max_model_len=8192, - max_num_seqs=2, - limit_mm_per_prompt={modality: 1}, - ) - - tokenizer = AutoTokenizer.from_pretrained(model_name) - messages = [ - [ - { - "role": "user", - "content": [{"type": "image"}, {"type": "text", "text": question}], - } - ] - for question in questions - ] - prompts = tokenizer.apply_chat_template( - messages, add_generation_prompt=True, tokenize=False - ) - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - # Molmo def run_molmo(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1665,7 +1605,6 @@ model_example_map = { "command_a_vision": run_command_a_vision, "deepseek_vl_v2": run_deepseek_vl2, "ernie45_vl": run_ernie45_vl, - "florence2": run_florence2, "fuyu": run_fuyu, "gemma3": run_gemma3, "gemma3n": run_gemma3n, @@ -1691,7 +1630,6 @@ model_example_map = { "minicpmv": run_minicpmv, "minimax_vl_01": run_minimax_vl_01, "mistral3": run_mistral3, - "mllama": run_mllama, "molmo": run_molmo, "nemotron_vl": run_nemotron_vl, "NVLM_D": run_nvlm_d, @@ -1716,6 +1654,13 @@ model_example_map = { } +MODELS_NEED_VIDEO_METADATA = [ + "glm4_1v", + "glm4_5v", + "glm4_5v_fp8", +] + + def get_multi_modal_input(args): """ return { @@ -1740,12 +1685,13 @@ def get_multi_modal_input(args): if args.modality == "video": # Input video and question + needs_metadata = args.model_type in MODELS_NEED_VIDEO_METADATA video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays metadata = VideoAsset(name="baby_reading", num_frames=args.num_frames).metadata vid_questions = ["Why is this video funny?"] return { - "data": [(video, metadata)] if args.model_type == "glm4_1v" else video, + "data": ([(video, metadata)] if needs_metadata else video), "questions": vid_questions, } diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 
01c2905cf26d8..51b41f34b2ff6 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -637,26 +637,6 @@ def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData: ) -def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData: - model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" - - # The configuration below has been confirmed to launch on a single L40 GPU. - engine_args = EngineArgs( - model=model_name, - max_model_len=8192, - max_num_seqs=2, - limit_mm_per_prompt={"image": len(image_urls)}, - ) - - img_prompt = "Given the first image <|image|> and the second image<|image|>" - prompt = f"<|begin_of_text|>{img_prompt}, {question}?" - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], - ) - - def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "nvidia/NVLM-D-72B" @@ -1253,7 +1233,6 @@ model_example_map = { "llava-next": load_llava_next, "llava-onevision": load_llava_onevision, "mistral3": load_mistral3, - "mllama": load_mllama, "NVLM_D": load_nvlm_d, "ovis": load_ovis, "ovis2_5": load_ovis2_5, diff --git a/examples/online_serving/openai_embedding_long_text/service.sh b/examples/online_serving/openai_embedding_long_text/service.sh index f356d7d4529ea..56888c8aa0e4c 100644 --- a/examples/online_serving/openai_embedding_long_text/service.sh +++ b/examples/online_serving/openai_embedding_long_text/service.sh @@ -120,7 +120,7 @@ echo " - API Key: $API_KEY" echo " - Native Pooling: $POOLING_TYPE | Cross-chunk: MEAN" echo "" echo "🧪 Test the server with:" -echo " python examples/online_serving/openai_embedding_long_text_client.py" +echo " python examples/online_serving/openai_embedding_long_text/client.py" echo "" echo "📚 Enhanced features enabled:" echo " ✅ Intelligent native pooling type detection" diff --git 
a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md new file mode 100644 index 0000000000000..f7926542202d6 --- /dev/null +++ b/examples/online_serving/pooling/README.md @@ -0,0 +1,43 @@ +# Pooling models + +## Cohere rerank usage + +```bash +python examples/online_serving/pooling/cohere_rerank_client.py +``` + +## Jinaai rerank usage + +```bash +python examples/online_serving/pooling/jinaai_rerank_client.py +``` + +## Openai chat embedding for multimodal usage + +```bash +python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py +``` + +## Openai classification usage + +```bash +python examples/online_serving/pooling/openai_classification_client.py +``` + +## Openai embedding usage + +```bash +python examples/online_serving/pooling/openai_embedding_client.py +``` + +## Openai embedding matryoshka dimensions usage + +```bash +python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py +``` + +## Openai pooling usage + +```bash +python examples/online_serving/pooling/openai_pooling_client.py +``` diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/pooling/cohere_rerank_client.py similarity index 100% rename from examples/online_serving/cohere_rerank_client.py rename to examples/online_serving/pooling/cohere_rerank_client.py diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/pooling/jinaai_rerank_client.py similarity index 100% rename from examples/online_serving/jinaai_rerank_client.py rename to examples/online_serving/pooling/jinaai_rerank_client.py diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py similarity index 92% rename from examples/online_serving/openai_chat_embedding_client_for_multimodal.py rename to examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py index 
771ad8511e972..30cb3325b9b18 100644 --- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py @@ -1,5 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 +"""Example Python client for multimodal embedding API using vLLM API server +NOTE: + start a supported multimodal embeddings model server with `vllm serve`, e.g. + vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling --trust_remote_code --max_model_len=1024 +""" import argparse import base64 diff --git a/examples/online_serving/openai_classification_client.py b/examples/online_serving/pooling/openai_classification_client.py similarity index 86% rename from examples/online_serving/openai_classification_client.py rename to examples/online_serving/pooling/openai_classification_client.py index b10e7acbd26c1..d8dc2ef001112 100644 --- a/examples/online_serving/openai_classification_client.py +++ b/examples/online_serving/pooling/openai_classification_client.py @@ -1,5 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Example Python client for classification API using vLLM API server +NOTE: + start a supported classification model server with `vllm serve`, e.g. 
+ vllm serve jason9693/Qwen2.5-1.5B-apeach +""" import argparse import pprint diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/pooling/openai_embedding_client.py similarity index 82% rename from examples/online_serving/openai_embedding_client.py rename to examples/online_serving/pooling/openai_embedding_client.py index 6bc390861e2ee..f5f6820d07d73 100644 --- a/examples/online_serving/openai_embedding_client.py +++ b/examples/online_serving/pooling/openai_embedding_client.py @@ -1,5 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Example Python client for embedding API using vLLM API server +NOTE: + start a supported embeddings model server with `vllm serve`, e.g. + vllm serve intfloat/e5-small +""" from openai import OpenAI diff --git a/examples/online_serving/openai_embedding_matryoshka_fy.py b/examples/online_serving/pooling/openai_embedding_matryoshka_fy.py similarity index 100% rename from examples/online_serving/openai_embedding_matryoshka_fy.py rename to examples/online_serving/pooling/openai_embedding_matryoshka_fy.py diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/pooling/openai_pooling_client.py similarity index 89% rename from examples/online_serving/openai_pooling_client.py rename to examples/online_serving/pooling/openai_pooling_client.py index 95555d41cbea5..569015746b128 100644 --- a/examples/online_serving/openai_pooling_client.py +++ b/examples/online_serving/pooling/openai_pooling_client.py @@ -4,7 +4,9 @@ Example online usage of Pooling API. Run `vllm serve --runner pooling` -to start up the server in vLLM. +to start up the server in vLLM. e.g. 
+ +vllm serve internlm/internlm2-1_8b-reward --trust-remote-code """ import argparse @@ -23,7 +25,7 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--model", type=str, default="jason9693/Qwen2.5-1.5B-apeach") + parser.add_argument("--model", type=str, default="internlm/internlm2-1_8b-reward") return parser.parse_args() diff --git a/requirements/test.in b/requirements/test.in index 744cfbe885278..451bd73879107 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -6,6 +6,7 @@ pytest-asyncio pytest-rerunfailures pytest-shard pytest-timeout +pytest-cov # testing utils backoff # required for phi4mm test diff --git a/requirements/test.txt b/requirements/test.txt index 5eebdc788aa3d..39040f210b2fd 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -135,6 +135,8 @@ colorful==0.5.6 # via ray contourpy==1.3.0 # via matplotlib +coverage==7.10.6 + # via pytest-cov cramjam==2.9.0 # via fastparquet cupy-cuda12x==13.6.0 @@ -686,7 +688,9 @@ platformdirs==4.3.6 plotly==5.24.1 # via genai-perf pluggy==1.5.0 - # via pytest + # via + # pytest + # pytest-cov polars==1.29.0 # via mteb pooch==1.8.2 @@ -786,6 +790,7 @@ pytest==8.3.5 # buildkite-test-collector # genai-perf # pytest-asyncio + # pytest-cov # pytest-forked # pytest-mock # pytest-rerunfailures @@ -796,6 +801,8 @@ pytest==8.3.5 # terratorch pytest-asyncio==0.24.0 # via -r requirements/test.in +pytest-cov==6.3.0 + # via -r requirements/test.in pytest-forked==1.6.0 # via -r requirements/test.in pytest-mock==3.14.0 diff --git a/setup.py b/setup.py index eb313b7d219c7..e4c40d22b928d 100644 --- a/setup.py +++ b/setup.py @@ -56,8 +56,6 @@ elif (sys.platform.startswith("linux") and torch.version.cuda is None # fallback to cpu VLLM_TARGET_DEVICE = "cpu" -MAIN_CUDA_VERSION = "12.8" - def is_sccache_available() -> bool: return which("sccache") is not None and \ @@ 
-507,7 +505,7 @@ def get_vllm_version() -> str: version += f"{sep}precompiled" else: cuda_version = str(get_nvcc_cuda_version()) - if cuda_version != MAIN_CUDA_VERSION: + if cuda_version != envs.VLLM_MAIN_CUDA_VERSION: cuda_version_str = cuda_version.replace(".", "")[:3] # skip this for source tarball, required for pypi if "sdist" not in sys.argv: @@ -515,7 +513,7 @@ def get_vllm_version() -> str: elif _is_hip(): # Get the Rocm Version rocm_version = get_rocm_version() or torch.version.hip - if rocm_version and rocm_version != MAIN_CUDA_VERSION: + if rocm_version and rocm_version != envs.VLLM_MAIN_CUDA_VERSION: version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}" elif _is_tpu(): version += f"{sep}tpu" @@ -664,7 +662,7 @@ setup( "mistral_common[audio]"], # Required for audio processing "video": [], # Kept for backwards compatibility # FlashInfer should be updated together with the Dockerfile - "flashinfer": ["flashinfer-python==0.3.0"], + "flashinfer": ["flashinfer-python==0.3.1"], # Optional deps for AMD FP4 quantization support "petit-kernel": ["petit-kernel"], }, diff --git a/tests/compile/backend.py b/tests/compile/backend.py index ace4d25534cdd..2c4287950dcfe 100644 --- a/tests/compile/backend.py +++ b/tests/compile/backend.py @@ -64,4 +64,8 @@ class TestBackend: num_pre = len(list(find_op_nodes(op, self.graph_pre_pass))) num_post = len(list(find_op_nodes(op, self.graph_post_pass))) assert num_pre == 0, f"Unexpected op {op.name()} in pre-pass graph" - assert num_post > 0, f"Op {op.name()} not found in post-pass graph" \ No newline at end of file + assert num_post > 0, f"Op {op.name()} not found in post-pass graph" + + def op_count(self, op: OpOverload, before=False) -> int: + graph = self.graph_pre_pass if before else self.graph_post_pass + return len(list(find_op_nodes(op, graph))) diff --git a/tests/compile/test_noop_elimination.py b/tests/compile/test_noop_elimination.py new file mode 100644 index 0000000000000..242d531312675 --- /dev/null +++ 
b/tests/compile/test_noop_elimination.py @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import torch + +import vllm +from vllm.compilation.noop_elimination import NoOpEliminationPass +from vllm.config import (CompilationConfig, CompilationLevel, PassConfig, + VllmConfig) + +from .backend import TestBackend + + +@pytest.mark.parametrize("dtype", + [torch.float16, torch.bfloat16, torch.float32]) +@pytest.mark.parametrize("num_tokens", [256, 1024]) +@pytest.mark.parametrize("hidden_size", [64, 4096]) +def test_noop_elimination(dtype, num_tokens, hidden_size): + torch.set_default_device("cuda") + torch.set_default_dtype(dtype) + torch.manual_seed(1) + + class Model(torch.nn.Module): + + def forward(self, x): + # Chain of reshapes + y = x.reshape(-1, 128, 32) + z = y.reshape(-1, 4096) + # No-op reshape + a = z.reshape(-1, 4096) + # Final reshape that should remain + b = a.reshape(-1, 128, 32) + # No-op slice + c = b[0:b.shape[0]] + # The pass should replace the result of this op with `c`. + d = torch.slice_scatter( + torch.ones_like(c), # Dummy tensor to be scattered into + c, # Source tensor + 0, # dim + 0, # start + c.shape[0], # end + ) + return d + + vllm_config = VllmConfig(compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + pass_config=PassConfig(enable_noop=True), + )) + with vllm.config.set_current_vllm_config(vllm_config): + noop_pass = NoOpEliminationPass(vllm_config) + + backend = TestBackend(noop_pass) + + model = Model() + # First dimension dynamic + x = torch.rand(num_tokens, hidden_size) + torch._dynamo.mark_dynamic(x, 0) + + result = model(x) + + model2 = torch.compile(model, backend=backend) + result2 = model2(x) + + ATOL, RTOL = (2e-3, 2e-3) + torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL) + + # The no-op reshape and slice should be eliminated. + # The chain of reshapes should be fused into a single reshape. 
+ assert backend.op_count(torch.ops.aten.reshape.default) == 1 + assert backend.op_count(torch.ops.aten.slice.Tensor) == 0 + assert backend.op_count(torch.ops.aten.slice_scatter.default) == 0 + + +def test_non_noop_slice_preserved(): + """Ensure that a slice with end=-1 (dropping last row) is NOT eliminated. + + Regression test for a bug where end=-1 was treated like an inferred + dimension (reshape semantics) leading to incorrect elimination. + """ + torch.set_default_device("cuda") + x = torch.randn(16, 16) + + class SliceModel(torch.nn.Module): + + def forward(self, x): + base = x.clone() + src = torch.ones(15, 16) + y = torch.slice_scatter(base, src, dim=0, start=0, end=-1) + return x[0:-1, :], y + + vllm_config = VllmConfig(compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + pass_config=PassConfig(enable_noop=True), + )) + with vllm.config.set_current_vllm_config(vllm_config): + noop_pass = NoOpEliminationPass(vllm_config) + backend = TestBackend(noop_pass) + model = SliceModel() + ref = model(x) + compiled = torch.compile(model, backend=backend) + out = compiled(x) + torch.testing.assert_close(ref, out) + # The slice should remain (not a no-op). 
+ assert backend.op_count(torch.ops.aten.slice.Tensor) == 1 + assert backend.op_count(torch.ops.aten.slice_scatter.default) == 1 diff --git a/tests/core/block/test_block_manager.py b/tests/core/block/test_block_manager.py index 9eed264fd7d43..24499b9ad4e9c 100644 --- a/tests/core/block/test_block_manager.py +++ b/tests/core/block/test_block_manager.py @@ -3,15 +3,12 @@ import pytest -from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, - STR_NOT_IMPL_ENC_DEC_SWA) from vllm.core.block_manager import SelfAttnBlockSpaceManager from vllm.core.interfaces import AllocStatus from vllm.sequence import Logprob, SequenceStatus from vllm.utils import chunk_list -from ..utils import (create_dummy_prompt, create_seq_group, - create_seq_group_encoder_decoder) +from ..utils import create_dummy_prompt, create_seq_group @pytest.mark.parametrize("block_size", [16]) @@ -58,156 +55,6 @@ def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int, assert can_allocate_result == AllocStatus.LATER -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("num_gpu_blocks", [16, 80, 160]) -@pytest.mark.parametrize("num_seqs_per_group", [1, 4]) -@pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_can_allocate_seq_group_encoder_decoder(block_size: int, - num_seqs_per_group: int, - num_gpu_blocks: int, - watermark: float): - block_manager = SelfAttnBlockSpaceManager( - block_size=block_size, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - watermark=watermark, - ) - num_watermark_blocks = int(watermark * num_gpu_blocks) - - num_output_blocks_per_seq = 1 - - # NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but - # the current implementation assumes all seqs are new prompts / don't have - # different output lens. 
- num_output_blocks = num_output_blocks_per_seq - - for bdx, num_prompt_blocks in enumerate( - range(1, num_gpu_blocks - num_output_blocks)): - num_cross_blocks_per_seq = num_prompt_blocks - - seq_group = create_seq_group_encoder_decoder( - seq_prompt_len=block_size * num_prompt_blocks, - seq_output_lens=[ - block_size * num_output_blocks_per_seq - for _ in range(num_seqs_per_group) - ], - request_id=str(bdx)) - - assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks - - can_allocate_result = block_manager.can_allocate(seq_group) - - num_required_blocks = num_prompt_blocks + \ - num_output_blocks + \ - num_cross_blocks_per_seq - - if num_gpu_blocks - num_required_blocks < num_watermark_blocks: - assert can_allocate_result == AllocStatus.NEVER - elif num_gpu_blocks >= num_required_blocks: - assert can_allocate_result == AllocStatus.OK - else: - assert can_allocate_result == AllocStatus.LATER - - -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("num_gpu_blocks", [16]) -@pytest.mark.parametrize("num_seqs_per_group", [1]) -@pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, - num_seqs_per_group: int, - num_gpu_blocks: int, - watermark: float): - ''' - SWA short for Sliding Window Attention. - - At time of writing block manager does not support SWA. - - However even when SWA is implemented for block manager, - there will still most likely be a separate workstream required - to enable SWA for encoder/decoder models. - - Therefore this test enforces that one of the following cases - hold true: - 1. Block manager does not support SWA at all (true at time of writing) - 2. Block manager fails with NotImplementError when SWA is enabled - AND a SequenceGroup with an encoder sequence (i.e. 
in support of an - encoder/decoder model) is passed into can_allocate() as an argument - - The setup for this test is stripped down version of - test_can_allocate_seq_group_encoder_decoder() - ''' - - with pytest.raises((NotImplementedError, AssertionError)) as exc_info: - block_manager = SelfAttnBlockSpaceManager( - block_size=block_size, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - watermark=watermark, - sliding_window=5 # SWA - ) - - num_output_blocks_per_seq = 1 - num_prompt_blocks = 1 - num_output_blocks = num_output_blocks_per_seq - seq_group = create_seq_group_encoder_decoder( - seq_prompt_len=block_size * num_prompt_blocks, - seq_output_lens=[ - block_size * num_output_blocks_per_seq - for _ in range(num_seqs_per_group) - ], - request_id="0") - - assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks - block_manager.can_allocate(seq_group) - - # Assert that either - # 1. Block manager constructor fails with assertion that sliding window - # is not yet supported (most likely near-term outcome at time of - # writing), or - # 2. 
can_allocate() fails with NotImplementedError due to combination of - # encoder/decoder and sliding window attention - if isinstance(exc_info.value, NotImplementedError): - assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA - elif isinstance(exc_info.value, AssertionError): - assert str(exc_info.value) == "Sliding window not yet supported" - - -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("num_gpu_blocks", [16]) -@pytest.mark.parametrize("num_seqs_per_group", [1]) -@pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_can_allocate_encoder_decoder_fails_with_prefix_cache( - block_size: int, num_seqs_per_group: int, num_gpu_blocks: int, - watermark: float): - - block_manager = SelfAttnBlockSpaceManager( - block_size=block_size, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - watermark=watermark, - enable_caching=True # Prefix cache - ) - - num_output_blocks_per_seq = 1 - num_prompt_blocks = 1 - num_output_blocks = num_output_blocks_per_seq - seq_group = create_seq_group_encoder_decoder( - seq_prompt_len=block_size * num_prompt_blocks, - seq_output_lens=[ - block_size * num_output_blocks_per_seq - for _ in range(num_seqs_per_group) - ], - request_id="0") - - assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks - - # Assert that either can_allocate() fails with NotImplementedError - # due to combination of encoder/decoder and prefix cache - with pytest.raises(NotImplementedError) as exc_info: - block_manager.can_allocate(seq_group) - assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE - - @pytest.mark.parametrize("block_size", [1, 8]) @pytest.mark.parametrize("prompt_len", [1, 7, 8]) @pytest.mark.parametrize("num_slots_to_append", [1, 8, 129]) diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py deleted file mode 100644 index 20cc083ec8db4..0000000000000 --- a/tests/core/test_scheduler_encoder_decoder.py +++ /dev/null @@ -1,105 +0,0 @@ -# 
SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest # noqa - -from vllm.config import CacheConfig, SchedulerConfig -from vllm.core.scheduler import Scheduler -from vllm.sequence import SequenceGroup - -from .utils import (append_new_token, create_dummy_prompt_encoder_decoder, - get_sequence_groups, schedule_and_update_computed_tokens) - - -def test_scheduler_schedule_simple_encoder_decoder(): - ''' - Test basic scheduler functionality in the context - of an encoder/decoder model. Focus on testing - enc/dec-specific functionality sense tests already - exist for decoder-only functionality - - Test behavior: - * Construct Scheduler - * Construct dummy encoder/decoder sequence groups - * Add dummy seq groups to scheduler backlog - * Schedule the next seq group & validate: - * Cross-attn block tables - * Updated states of seq groups - * Number of batched tokens - * Number of blocks to copy/swap-in/swap-out - * Number of scheduled seq groups - * Repeat for both prefill- and decode-phase - * Abort scheduled seq groups - * Assert that aborted seq groups no longer appear in - cross-attention block table - ''' - - block_size = 4 - num_seq_group = 4 - max_model_len = 16 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens=64, - max_num_seqs=num_seq_group, - max_model_len=max_model_len, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group - cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group - scheduler = Scheduler(scheduler_config, cache_config, None) - running: list[SequenceGroup] = [] - - # Add seq groups to scheduler. 
- req_id_list = [] - for i in range(num_seq_group): - req_id = str(i) - req_id_list.append(req_id) - _, _, seq_group = create_dummy_prompt_encoder_decoder( - req_id, block_size, block_size, block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - - # Schedule seq groups prefill. - num_tokens = block_size * num_seq_group - seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler) - # - Verify that sequence group cross-attention block tables are - # registered with the block manager - assert all([(req_id in scheduler.block_manager.cross_block_tables) - for req_id in req_id_list]) - # - Validate sequence-group status - assert set(get_sequence_groups(out)) == set(running) - # - Validate number of batched tokens - assert out.num_batched_tokens == num_tokens - # - Validate there are no remaining blocks to swap - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) - # - Validate all seq groups were scheduled - assert len(seq_group_meta_list) == num_seq_group - append_new_token(out, 1) - - # Schedule seq groups decode. 
- seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler) - # - Verify that sequence group metadata includes encoder attention - # and cross-attention metadata - assert all([ - not ((seq_group_meta.encoder_seq_data is None) or - (seq_group_meta.cross_block_table is None)) - for seq_group_meta in seq_group_meta_list - ]) - # - Validate sequence-group status - assert set(get_sequence_groups(out)) == set(running) - # - Validate there is one batched token per seq group - assert out.num_batched_tokens == num_seq_group - # - Validate there are no remaining blocks to swap - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) - # - Validate that all seq groups were scheduled - assert len(seq_group_meta_list) == num_seq_group - append_new_token(out, 1) - - # Abort sequences - for req_id in req_id_list: - scheduler.abort_seq_group(req_id) - # - Verify that sequence group cross-attention block tables are - # NO LONGER registered with the block manager - assert req_id not in scheduler.block_manager.cross_block_tables diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index fffab1a984c26..9da9672d95970 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -215,9 +215,7 @@ TEXT_GENERATION_MODELS = { EMBEDDING_MODELS = { # type: ignore[var-annotated] # [Text-only] "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"), - # TODO: re-enable when https://github.com/vllm-project/vllm/issues/23883 - # is fixed - #"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"), + "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"), "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast( load_format="dummy", runner="pooling" ), @@ -244,9 +242,6 @@ MULTIMODAL_MODELS = { "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(), 
"fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(), - # [Encoder-decoder] - # TODO: Implement PP - # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(), } # yapf: enable diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index 65c5e68968440..ded3d834faf00 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -235,7 +235,6 @@ def _compare_sp( 'level': 3, 'custom_ops': ["+rms_norm"], 'compile_sizes': [4, 8], - 'splitting_ops': [], 'pass_config': { 'enable_sequence_parallelism': True, 'enable_fusion': enable_fusion, @@ -251,6 +250,8 @@ def _compare_sp( *common_args, "--tensor-parallel-size", str(tp_size), + "--pipeline-parallel-size", + str(pp_size), "--distributed-executor-backend", distributed_backend, "--compilation_config", diff --git a/tests/encoder_decoder/__init__.py b/tests/encoder_decoder/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py deleted file mode 100644 index 3cf4c377fb581..0000000000000 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ /dev/null @@ -1,131 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""E2E tests to verify the correctness of the encoder-decoder framework - -Run `pytest tests/encoder_decoder/test_e2e_correctness.py`. 
-""" -from typing import Optional - -import pytest -from transformers import AutoModelForSeq2SeqLM - -from vllm.attention.selector import (_Backend, _cached_get_attn_backend, - global_force_attn_backend_context_manager) -from vllm.platforms import current_platform -from vllm.sequence import SampleLogprobs - -from ..conftest import DecoderPromptType -from ..models.utils import check_logprobs_close - -LIST_ENC_DEC_SUPPORTED_BACKENDS = [ - _Backend.XFORMERS, _Backend.FLASH_ATTN, None -] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -def vllm_to_hf_output( - vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], - decoder_prompt_type: DecoderPromptType, -): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - hf_output_str = output_str + "" - if decoder_prompt_type == DecoderPromptType.NONE: - hf_output_str = "" + hf_output_str - - return output_ids, hf_output_str, out_logprobs - - -@pytest.fixture(autouse=True) -def clear_cache(): - """Fixture to clear backend cache before each test.""" - _cached_get_attn_backend.cache_clear() # Clear the cache - yield # This allows the test to run - - -@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) -@pytest.mark.parametrize("enforce_eager", [True, False]) -@pytest.mark.skipif( - current_platform.is_cpu(), - reason="CPU backend is not currently supported with encoder/decoder models" -) -@pytest.mark.skip(reason="bart not supported in V1") -def test_encoder_decoder_e2e( - hf_runner, - 
vllm_runner, - example_encoder_decoder_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, - decoder_prompt_type: DecoderPromptType, - enforce_eager: bool, - attn_backend: _Backend, -) -> None: - ''' - End-to-End (E2E) test for the encoder-decoder framework. - This test evaluates the encoder-decoder functionality using the BART - model. We compare the outputs of the Hugging Face and vLLM - implementations to ensure that both implementations produce consistent - and correct results. - ''' - with global_force_attn_backend_context_manager(attn_backend): - if attn_backend == _Backend.FLASH_ATTN: - # Flash Attention works only with bfloat16 data-type - dtype = 'bfloat16' - test_case_prompts = example_encoder_decoder_prompts[ - decoder_prompt_type] - - # Configuration settings for HF baseline - hf_kwargs = { - "top_k": None, - "num_beams": 1, - "repetition_penalty": 1.0, - "top_p": 1.0, - "length_penalty": 1.0, - "early_stopping": False, - "no_repeat_ngram_size": None, - "min_length": 0 - } - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - hf_outputs = ( - hf_model.generate_encoder_decoder_greedy_logprobs_limit( - test_case_prompts, - max_tokens, - num_logprobs, - **hf_kwargs, - )) - with vllm_runner(model, dtype=dtype, - enforce_eager=enforce_eager) as vllm_model: - vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - test_case_prompts, max_tokens, num_logprobs) - - hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE - else 0) - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, decoder_prompt_type) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - num_outputs_0_skip_tokens=hf_skip_tokens, - ) diff --git a/tests/engine/test_stop_checker.py b/tests/engine/test_stop_checker.py new file mode 100644 index 0000000000000..3d1e1c8032a48 --- /dev/null +++ b/tests/engine/test_stop_checker.py @@ -0,0 +1,228 @@ 
+# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +from transformers import AutoTokenizer + +from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.reasoning import ReasoningParser +from vllm.sampling_params import SamplingParams +from vllm.sequence import Sequence, SequenceStatus + +REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" + + +class MockReasoningParser(ReasoningParser): + """Mock reasoning parser for testing purposes.""" + + def __init__(self, + tokenizer: AutoTokenizer, + reasoning_active: bool = False): + super().__init__(tokenizer) + self.reasoning_active = reasoning_active + + def is_reasoning_end(self, input_ids: list[int]) -> bool: + return not self.reasoning_active + + def extract_content_ids(self, input_ids: list[int]) -> list[int]: + return input_ids + + +class MockSequence(Sequence): + """Mock sequence for testing purposes.""" + + def __init__(self, token_ids, output_text="test_output", eos_token_id=0): + self.token_ids = token_ids + self.output_text = output_text + self.eos_token_id = eos_token_id + self.status = SequenceStatus.RUNNING + self.stop_reason = None + + def get_token_ids(self): + return self.token_ids + + def get_last_token_id(self): + return self.token_ids[-1] if self.token_ids else None + + def get_len(self): + return len(self.token_ids) + + def get_output_len(self): + return len(self.token_ids) - 1 # Simulating prompt + outputs + + +@pytest.fixture +def deepseek_r1_qwen_tokenizer(): + return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) + + +@pytest.fixture +def stop_checker(): + return StopChecker(max_model_len=10, + get_tokenizer_for_seq=deepseek_r1_qwen_tokenizer) + + +@pytest.fixture +def stop_checker_with_reasoner(): + reasoner = MockReasoningParser(deepseek_r1_qwen_tokenizer) + return StopChecker(max_model_len=10, + get_tokenizer_for_seq=deepseek_r1_qwen_tokenizer, + reasoner=reasoner) + + +def 
test_eos_token_stopping(stop_checker): + """Test sequence stopping when EOS token is encountered.""" + seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) + sampling_params = SamplingParams() + + stop_checker.maybe_stop_sequence(seq, + new_char_count=1, + sampling_params=sampling_params) + + assert seq.status == SequenceStatus.FINISHED_STOPPED + + +def test_ignore_eos(stop_checker): + """Test sequence continuing when EOS token is ignored.""" + seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) + sampling_params = SamplingParams(ignore_eos=True) + + stop_checker.maybe_stop_sequence(seq, + new_char_count=1, + sampling_params=sampling_params) + + assert seq.status == SequenceStatus.RUNNING + + +def test_min_tokens(stop_checker): + """Test min_tokens prevents early stopping.""" + seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) + sampling_params = SamplingParams(min_tokens=3) + + stop_checker.maybe_stop_sequence(seq, + new_char_count=1, + sampling_params=sampling_params) + + assert seq.status == SequenceStatus.RUNNING + + +def test_stop_token_ids(stop_checker): + """Test sequence stopping with custom stop token IDs.""" + seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) + sampling_params = SamplingParams(stop_token_ids=[3]) + + stop_checker.maybe_stop_sequence(seq, + new_char_count=1, + sampling_params=sampling_params) + + assert seq.status == SequenceStatus.FINISHED_STOPPED + assert seq.stop_reason == 3 + + +def test_stop_strings(stop_checker): + """Test sequence stopping with stop strings.""" + seq = MockSequence(token_ids=[1, 2, 3], + output_text="test output with STOP", + eos_token_id=0) + sampling_params = SamplingParams(stop=["STOP"]) + + stop_checker.maybe_stop_sequence(seq, + new_char_count=1, + sampling_params=sampling_params) + + assert seq.status == SequenceStatus.FINISHED_STOPPED + assert seq.stop_reason == "STOP" + assert "STOP" not in seq.output_text # Default behavior removes stop string + + +def 
test_include_stop_str_in_output(stop_checker): + """Test keeping stop strings in output.""" + seq = MockSequence(token_ids=[1, 2, 3], + output_text="test output with STOP", + eos_token_id=0) + sampling_params = SamplingParams(stop=["STOP"], + include_stop_str_in_output=True) + + stop_checker.maybe_stop_sequence(seq, + new_char_count=5, + sampling_params=sampling_params) + + assert seq.status == SequenceStatus.FINISHED_STOPPED + assert "STOP" in seq.output_text + + +def test_max_tokens(stop_checker): + """Test sequence stopping at max_tokens.""" + seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) + sampling_params = SamplingParams(max_tokens=2) + + stop_checker.maybe_stop_sequence(seq, + new_char_count=1, + sampling_params=sampling_params) + + assert seq.status == SequenceStatus.FINISHED_LENGTH_CAPPED + + +def test_max_model_len(stop_checker): + """Test sequence stopping at max_model_len.""" + seq = MockSequence(token_ids=list(range(11)), + eos_token_id=0) # 11 tokens, max is 10 + sampling_params = SamplingParams() + + stop_checker.maybe_stop_sequence(seq, + new_char_count=1, + sampling_params=sampling_params) + + assert seq.status == SequenceStatus.FINISHED_LENGTH_CAPPED + + +def test_reasoning_skip_stops(stop_checker_with_reasoner): + """Test that stop tokens and strings are ignored during reasoning.""" + # Set reasoning_active to True to simulate being in reasoning mode + stop_checker_with_reasoner.reasoner.reasoning_active = True + + # Test with stop token + seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) + sampling_params = SamplingParams(stop_token_ids=[3]) + + stop_checker_with_reasoner.maybe_stop_sequence( + seq, new_char_count=1, sampling_params=sampling_params) + assert seq.status == SequenceStatus.RUNNING + + # Test with stop string + seq = MockSequence(token_ids=[1, 2, 3], output_text="test STOP") + sampling_params = SamplingParams(stop=["STOP"]) + + stop_checker_with_reasoner.maybe_stop_sequence( + seq, new_char_count=4, 
sampling_params=sampling_params) + assert seq.status == SequenceStatus.RUNNING + + # But EOS token still stops the sequence + seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0) + sampling_params = SamplingParams() + + stop_checker_with_reasoner.maybe_stop_sequence( + seq, new_char_count=1, sampling_params=sampling_params) + assert seq.status == SequenceStatus.FINISHED_STOPPED + + +def test_reasoning_end_enables_stops(stop_checker_with_reasoner): + """Test that stop tokens work after reasoning ends.""" + # Set reasoning_active to False to simulate being out of reasoning mode + stop_checker_with_reasoner.reasoner.reasoning_active = False + + # Test with stop token + seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0) + sampling_params = SamplingParams(stop_token_ids=[3]) + + stop_checker_with_reasoner.maybe_stop_sequence( + seq, new_char_count=1, sampling_params=sampling_params) + assert seq.status == SequenceStatus.FINISHED_STOPPED + + # Test with stop string + seq = MockSequence(token_ids=[1, 2, 3], output_text="test STOP") + sampling_params = SamplingParams(stop=["STOP"]) + + stop_checker_with_reasoner.maybe_stop_sequence( + seq, new_char_count=4, sampling_params=sampling_params) + assert seq.status == SequenceStatus.FINISHED_STOPPED diff --git a/tests/entrypoints/openai/test_encoder_decoder.py b/tests/entrypoints/openai/test_encoder_decoder.py deleted file mode 100644 index 75612962c95f7..0000000000000 --- a/tests/entrypoints/openai/test_encoder_decoder.py +++ /dev/null @@ -1,56 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import openai -import pytest -import pytest_asyncio - -from ...utils import RemoteOpenAIServer - -MODEL_NAME = "facebook/bart-base" - - -@pytest.fixture(scope="module") -def server(): - args = [ - "--dtype", - "bfloat16", - "--enforce-eager", - ] - - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server - - -@pytest_asyncio.fixture 
-async def client(server): - async with server.get_async_client() as async_client: - yield async_client - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.skip(reason="bart is not yet supported in V1") -async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): - completion = await client.completions.create(model=model_name, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) - - assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - - choice = completion.choices[0] - assert len(choice.text) >= 5 - assert choice.finish_reason == "length" - assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=2, total_tokens=7) - - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert len(completion.choices[0].text) >= 1 diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index 818efd825640c..2bf29ecf087f3 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -9,7 +9,7 @@ from unittest.mock import MagicMock import pytest -from vllm.config import MultiModalConfig +from vllm.config.multimodal import MultiModalConfig from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index a4e1aca8bcac2..0c9e0f3a51429 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -232,6 +232,9 @@ EXPECTED_METRICS_V1 = [ "vllm:gpu_cache_usage_perc", "vllm:gpu_prefix_cache_queries", 
"vllm:gpu_prefix_cache_hits", + "vllm:kv_cache_usage_perc", + "vllm:prefix_cache_queries", + "vllm:prefix_cache_hits", "vllm:num_preemptions_total", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", @@ -277,6 +280,9 @@ EXPECTED_METRICS_V1 = [ ] HIDDEN_DEPRECATED_METRICS: list[str] = [ + "vllm:gpu_cache_usage_perc", + "vllm:gpu_prefix_cache_queries", + "vllm:gpu_prefix_cache_hits", "vllm:time_per_output_token_seconds_sum", "vllm:time_per_output_token_seconds_bucket", "vllm:time_per_output_token_seconds_count", @@ -307,7 +313,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool): running_requests, waiting_requests, kv_cache_usage = ( - _get_running_metrics_from_api(server)) + _get_running_metrics_from_api(server, use_v1)) # Expect no running requests or kvcache usage assert running_requests == 0 @@ -330,7 +336,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer, # Check that we have running requests running_requests, waiting_requests, kv_cache_usage = ( - _get_running_metrics_from_api(server)) + _get_running_metrics_from_api(server, use_v1)) # Expect running requests and kvcache usage assert running_requests > 0 @@ -349,7 +355,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer, # Verify running and waiting requests counts and KV cache usage are zero running_requests_after, waiting_requests_after, kv_cache_usage_after = ( - _get_running_metrics_from_api(server)) + _get_running_metrics_from_api(server, use_v1)) assert running_requests_after == 0,\ (f"Expected 0 running requests after abort, got " @@ -362,7 +368,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer, f"{kv_cache_usage_after}") -def _get_running_metrics_from_api(server: RemoteOpenAIServer): +def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool): """Return (running_count, waiting_count, kv_cache_usage)""" response = requests.get(server.url_for("metrics")) @@ -371,6 +377,9 @@ 
def _get_running_metrics_from_api(server: RemoteOpenAIServer): # Verify running and waiting requests counts and KV cache usage are zero running_requests, waiting_requests, kv_cache_usage = None, None, None + kv_cache_usage_metric = ("vllm:kv_cache_usage_perc" + if use_v1 else "vllm:gpu_cache_usage_perc") + for family in text_string_to_metric_families(response.text): if family.name == "vllm:num_requests_running": for sample in family.samples: @@ -382,9 +391,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer): if sample.name == "vllm:num_requests_waiting": waiting_requests = sample.value break - elif family.name == "vllm:gpu_cache_usage_perc": + elif family.name == kv_cache_usage_metric: for sample in family.samples: - if sample.name == "vllm:gpu_cache_usage_perc": + if sample.name == kv_cache_usage_metric: kv_cache_usage = sample.value break diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 0d5836fab5a7c..88b3795abe73e 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -74,6 +74,20 @@ async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str): assert response.status == "completed" +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_max_tokens(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="What is the first paragraph of Moby Dick?", + reasoning={"effort": "low"}, + max_output_tokens=30, + ) + assert response is not None + assert response.status == "incomplete" + assert response.incomplete_details.reason == "max_output_tokens" + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_chat(client: OpenAI, model_name: str): diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py 
index d219a1f311f15..502704c9bbdff 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -12,7 +12,7 @@ from unittest.mock import MagicMock import pytest import pytest_asyncio -from vllm.config import MultiModalConfig +from vllm.config.multimodal import MultiModalConfig from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.serving_chat import OpenAIServingChat diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index dd33f5c8c1d8e..84dab737ece26 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -20,7 +20,6 @@ from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template, parse_chat_messages_futures, resolve_chat_template_content_format, resolve_hf_chat_template) -from vllm.entrypoints.llm import apply_hf_chat_template from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64, encode_video_base64) @@ -38,7 +37,6 @@ QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct" QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct" QWEN25VL_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct" QWEN25OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B" -MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B" HERMES_MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B" MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" @@ -125,27 +123,6 @@ def qwen25omni_tokenizer(): ) -@pytest.fixture(scope="module") -def mllama_model_config(): - return ModelConfig( - MLLAMA_MODEL_ID, - runner="generate", - limit_mm_per_prompt={ - "image": 2, - }, - ) - - -@pytest.fixture(scope="module") -def mllama_tokenizer(): - return TokenizerGroup( - MLLAMA_MODEL_ID, - enable_lora=False, - max_num_seqs=5, - max_input_length=None, 
- ) - - @pytest.fixture(scope="function") def mistral_model_config(): return ModelConfig( @@ -2249,180 +2226,6 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders( ) -### Mllama currently wraps images / texts as interleaved dictionaries -def test_mllama_single_image( - mllama_model_config, - mllama_tokenizer, - image_url, -): - """Ensures that a single image is parsed correctly mllama.""" - conversation, mm_data, mm_uuids = parse_chat_messages( - [{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "The content of this image is:" - }, - { - "image_url": image_url - }, - ], - }], - mllama_model_config, - mllama_tokenizer, - content_format="openai", - ) - _assert_mm_data_is_image_input(mm_data, 1) - _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) - assert conversation == [{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "The content of this image is:" - }, - { - "type": "image" - }, - ], - }] - - -def test_mllama_interleaved_images( - mllama_model_config, - mllama_tokenizer, - image_url, -): - """Ensures that multiple image are parsed as interleaved dicts.""" - conversation, mm_data, mm_uuids = parse_chat_messages( - [{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "The content of the first image is:", - }, - { - "image_url": image_url - }, - { - "type": "text", - "text": "The content of the second image is:", - }, - { - "image_url": image_url - }, - ], - }], - mllama_model_config, - mllama_tokenizer, - content_format="openai", - ) - _assert_mm_data_is_image_input(mm_data, 2) - _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) - assert conversation == [{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "The content of the first image is:" - }, - { - "type": "image" - }, - { - "type": "text", - "text": "The content of the second image is:" - }, - { - "type": "image" - }, - ], - }] - - -@pytest.mark.parametrize("model", [MLLAMA_MODEL_ID]) -def 
test_multimodal_image_parsing_matches_hf(model, image_url): - """Checks end to end hf alignment for multimodal [image] parsing.""" - - def get_conversation(is_hf: bool): - img_part = {"type": "image_url", "image_url": {"url": image_url}} - if is_hf: - img_part = {"type": "image"} - return [{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "The content of the first image is:", - }, - img_part, - { - "type": "text", - "text": "The content of the second image is:", - }, - img_part, - { - "type": "text", - "text": "What animal is in the first image?", - }, - ], - }] - - # Build a config for the model - model_config = ModelConfig( - model, - runner="generate", - limit_mm_per_prompt={ - "image": 2, - }, - ) - - # Build the tokenizer group and grab the underlying tokenizer - tokenizer_group = TokenizerGroup( - model, - enable_lora=False, - max_num_seqs=5, - max_input_length=None, - trust_remote_code=model_config.trust_remote_code, - ) - tokenizer = tokenizer_group.tokenizer - - # Build and parse a conversation with {"type": "image"} using the tokenizer - hf_conversation = get_conversation(is_hf=True) - hf_result = tokenizer.apply_chat_template( - hf_conversation, - tokenize=False, - add_generation_prompt=True, - ) - - # Now parse with vLLMs chat utils & apply the template - vllm_conversation = get_conversation(is_hf=False) - conversation, _, _ = parse_chat_messages( - vllm_conversation, - model_config, - tokenizer_group, - content_format="openai", - ) - - vllm_result = apply_hf_chat_template( - tokenizer=tokenizer, - conversation=conversation, - chat_template=None, - model_config=model_config, - tools=None, - add_generation_prompt=True, - ) - - assert hf_result == vllm_result - - @pytest.mark.parametrize( "model", [ @@ -2486,7 +2289,6 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): (QWEN25VL_MODEL_ID, "openai"), (ULTRAVOX_MODEL_ID, "string"), (QWEN2AUDIO_MODEL_ID, "openai"), - (MLLAMA_MODEL_ID, "openai"), 
(LLAMA_GUARD_MODEL_ID, "openai")], ) # yapf: enable @@ -2545,7 +2347,6 @@ def test_resolve_content_format_hf_defined(model, expected_format): [("Salesforce/blip2-opt-2.7b", "string"), ("facebook/chameleon-7b", "string"), ("deepseek-ai/deepseek-vl2-tiny", "string"), - ("microsoft/Florence-2-base", "string"), ("adept/fuyu-8b", "string"), ("google/paligemma-3b-mix-224", "string"), ("Qwen/Qwen-VL", "string"), diff --git a/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml b/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml new file mode 100644 index 0000000000000..7ec6a1e0be27f --- /dev/null +++ b/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml @@ -0,0 +1,6 @@ +model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8" +accuracy_threshold: 0.72 +num_questions: 1319 +num_fewshot: 5 +max_model_len: 4096 + diff --git a/tests/evals/gsm8k/configs/models-small.txt b/tests/evals/gsm8k/configs/models-small.txt index afd1065b9191b..7bce3f0004f7d 100644 --- a/tests/evals/gsm8k/configs/models-small.txt +++ b/tests/evals/gsm8k/configs/models-small.txt @@ -3,3 +3,4 @@ Llama-3.2-1B-Instruct-INT8-CT.yaml Llama-3-8B-Instruct-nonuniform-CT.yaml Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml Qwen1.5-MoE-W4A16-CT.yaml +DeepSeek-V2-Lite-Instruct-FP8.yaml diff --git a/tests/kernels/attention/test_encoder_decoder_attn.py b/tests/kernels/attention/test_encoder_decoder_attn.py deleted file mode 100644 index a2e6986460904..0000000000000 --- a/tests/kernels/attention/test_encoder_decoder_attn.py +++ /dev/null @@ -1,1105 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Tests: - -* E2E test of Encoder attention + Decoder self-attention + - Encoder/decoder cross-attention (collectively - "encoder/decoder attention") - -""" - -from typing import NamedTuple, Optional - -import pytest -import torch - -from tests.kernels.utils import * -from vllm.attention import Attention, AttentionMetadata, 
AttentionType -from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP -from vllm.attention.selector import (_Backend, _cached_get_attn_backend, - global_force_attn_backend_context_manager) -from vllm.config import VllmConfig, set_current_vllm_config -from vllm.forward_context import set_forward_context -from vllm.platforms import current_platform - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Encoder-decoder is only supported on V0, so set - VLLM_USE_V1=0 for all tests in the module. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -# List of support backends for encoder/decoder models -LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] -HEAD_SIZES = [64, 256] - -NUM_HEADS = [1, 16] - -BATCH_SIZES = [1, 16] -BLOCK_SIZES = [16] -CUDA_DEVICE = "cuda:0" - -MAX_DEC_SEQ_LENS = [128] -MAX_ENC_SEQ_LENS = [128] - -# Narrow test-cases for unsupported-scenario -# tests -HEAD_SIZES_FOR_UNSUPP = [HEAD_SIZES[0]] - - -class TestPoint(NamedTuple): - """ - Encapsulates the attributes which define a single invocation - of the test_e2e_enc_dec_attn() test - - Attributes: - num_heads: The number of heads in the model. - head_size: Head dimension - backend_name: Name of the backend framework used. - batch_size: Number of samples per batch. - block_size: Size of each block of data processed. - max_dec_seq_len: Maximum sequence length for the decoder. - max_enc_seq_len: Maximum sequence length for the encoder. - num_blocks: Number of blocks in the model. 
- """ - - num_heads: int - head_size: int - backend_name: str - batch_size: int - block_size: int - max_dec_seq_len: int - max_enc_seq_len: int - num_blocks: int - attn_type: AttentionType - - -class TestResources(NamedTuple): - ''' - Encapsulates key components for performing an - encoder/decoder attention test - - Note that - (1) attn automatically selects an attention backend - based on platform info & a set of canned - heuristics - (2) attn_backend is thus *not the same backend - instance* used by attn, but rather it is - intended to be a - *different instance* of the *same backend class*; - it is assumed that the user of TestResources - will leverage attn_backend for the purpose of - constructing backend-compatible attention - metadata instances - - Attributes: - - * scale: 1/sqrt(d) scale factor for attn - * attn_backend: implementations of abstraction - attention interface using - a particular kernel library - i.e. XFormers - * attn: Attention layer instance - * kv_cache: shared key/value cache for all attention - ''' - - scale: float - attn: Attention - kv_cache: torch.Tensor - - -def _make_test_resources(test_pt: TestPoint, ) -> TestResources: - ''' - Build key components for performing encoder/decoder attention test. - - Note that - (1) The Attention instance constructed here, automatically selects - an attention backend class based on platform info & a set of canned - heuristics, so - (2) The attention backend instance constructed here is thus *not - the same backend instance* used by attn, but rather it is - intended to be a *different instance* of the *same backend class*; - therefore, - (3) This function requires that test_pt.backend_name matches the backend - class that Attention will automatically select when it is constructed. - - - Arguments: - - * test_pt: TestPoint data structure; this function relies on the - following fields: num_heads, head_size, num_blocks, - block_size, backend_name - - Returns: - - * TestResources data structure. 
- ''' - - scale = float(1.0 / (test_pt.head_size**0.5)) - attn = Attention( - test_pt.num_heads, - test_pt.head_size, - scale=scale, - prefix=f"{test_pt.attn_type}", - attn_type=test_pt.attn_type, - ) - if test_pt.num_blocks is None or test_pt.num_heads is None: - # Caller does not require a KV cache - return TestResources( - scale, attn, - torch.tensor([], dtype=torch.float32, device=CUDA_DEVICE)) - - # Construct KV cache - if test_pt.attn_type in (AttentionType.DECODER, - AttentionType.ENCODER_DECODER): - kv_cache = make_kv_cache(test_pt.num_blocks, - test_pt.num_heads, - test_pt.head_size, - test_pt.block_size, - device=CUDA_DEVICE, - backend=test_pt.backend_name) - else: - kv_cache = torch.tensor([]) - - attn.kv_cache = [kv_cache] - return TestResources(scale, attn, kv_cache) - - -def _encoder_attn_setup( - test_pt: TestPoint, - test_rsrcs: TestResources, -) -> PhaseTestParameters: - ''' - Set up test vectors & data structures for encoder attention test. - - A triplet of synthetic query/key/value tensors are constructed. - Given this is an encoder attention test, the key & value - sequences will have the same length as the corresponding queries. - - The query/key/value tensors are passed to an ideal reference - self-attention implementation to generate an ideal output tensor. 
- - Encoder inference does not populate the KV cache, therefore - no KV cache memory mapping is constructed - - Arguments: - - * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, - block_size, max_q_seq_len - * test_rsrcs: TestResources data structure; this function relies on the - scale field - - - Returns: - - * PhaseTestParameters data structure comprising (1) packed query/key/value - tensors, (2) the ideal output of attention computed using a naive - implementation, and (3) KVCache field set to None - ''' - - ( - num_heads, - head_size, - _, - batch_size, - _, - _, - max_q_seq_len, - _, - _, - ) = test_pt - - scale = test_rsrcs.scale - - max_kv_seq_len = max_q_seq_len - - # Make test tensors - - qkv_in, _, _ = make_qkv(batch_size, - max_q_seq_len, - max_kv_seq_len, - num_heads, - head_size, - attn_type=AttentionType.ENCODER, - device=CUDA_DEVICE) - - # Compute correct answer using naive non-causal attention - # implementation - - ideal_output = ref_masked_attention(qkv_in.query, - qkv_in.key, - qkv_in.value, - scale=scale, - q_seq_lens=qkv_in.q_seq_lens, - kv_seq_lens=qkv_in.kv_seq_lens) - - packed_ideal_output, _ = pack_tensor(ideal_output, - qkv_in.q_seq_lens, - device=CUDA_DEVICE) - - packed_qkv = pack_qkv(qkv_in, device=CUDA_DEVICE) - - return PhaseTestParameters( - PackedQKVO(packed_qkv, packed_ideal_output), - None # No KV cache - ) - - -def _decoder_attn_setup( - test_pt: TestPoint, - test_rsrcs: TestResources, - block_base_addr: int = 0, -) -> tuple[QKVInputs, PhaseTestParameters, PhaseTestParameters, int]: - ''' - Set up test vectors & data structures for self-attention test. - - A triplet of synthetic query/key/value tensors are constructed ("baseline" - query/key/value). Given this is a self-attention test, the key & value - sequences will have the same length as the corresponding queries. 
- - "Prefill" query/key/value tensors are derived by masking out the last value - in each baseline query/key/value. These tensors are used to test prefill & - populate KV cache for a subsequent decode test. - - "Decode" query/key/value tensors are derived by extracting *only* the last - value from each baseline query/key/value (i.e. complement of the prefill - tensors.) These tensors are used to test decode, conditional on the kv cache - being populated during the prefill test. - - The baseline query/key/value tensors are passed to an ideal reference - self-attention implementation to generate a "Baseline" ideal output tensor. - This tensor is split into the "Prefill" ideal output tensor (all but the - last element of each output sequence) and the "Decode" ideal output tensor - (*only* the last element of each output sequence); the "Prefill" and - "Decode" ideal output tensors can be used to validate the prefill and decode - test results, respectively. - - This function also constructs the self-attention KV cache memory mapping - (slot mapping and block table), ensuring that the block table starts at - block_base_addr - - Arguments: - - * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, - block_size, max_q_seq_len - * test_rsrcs: TestResources data structure; this function relies on the - scale field - * block_base_addr: decoder self-attention block-table base address - - Returns: - * qkv: Unpacked (batch_size x padded_seq_len x num_heads x - head_size) query/key/value tensors - * Prefill-phase decoder self-attention PhaseTestParameters data structure, - including (1) packed (number_of_tokens x num_heads x head_size) - query/key/value tensors along with (2) ideal attention output - computed using a naive implementation, and (3) memory-mapping data - structures appropriate for prefill phase. 
- * Decode-phase decoder self-attention PhaseTestParameters data structure, - including (1) packed (number_of_tokens x num_heads x head_size) - query/key/value tensors along with (2) ideal attention output - computed using a naive implementation, and (3) memory-mapping data - structures appropriate for decode phase. - * max_block_idx: max physical address in decoder self-attention block-table - (intended to be used as the base address for the encoder/ - decoder cross-attention block-table, which is not - constructed in this function) - ''' - - ( - num_heads, - head_size, - _, - batch_size, - block_size, - max_q_seq_len, - _, - _, - _, - ) = test_pt - - scale = test_rsrcs.scale - - max_kv_seq_len = max_q_seq_len - - # Build test tensors - - ( - qkv, - prefill_qkv, - decode_qkv, - ) = make_qkv(batch_size, - max_q_seq_len, - max_kv_seq_len, - num_heads, - head_size, - attn_type=AttentionType.DECODER, - device=CUDA_DEVICE) - - # Compute correct answer using naive attention implementation - # with causal attention mask - - causal_mask = make_causal_mask(max_q_seq_len, - max_kv_seq_len).to(CUDA_DEVICE) - - ideal_output = ref_masked_attention(qkv.query, - qkv.key, - qkv.value, - scale=scale, - custom_mask=causal_mask, - q_seq_lens=qkv.q_seq_lens, - kv_seq_lens=qkv.kv_seq_lens) - - # Split out the prefill- & decode-phase ideal answers & pack them - - prefill_ideal_output = torch.zeros_like(ideal_output) - decode_ideal_output = torch.zeros_like(ideal_output[:, 0:1]) - for bdx, prefill_q_seq_len in enumerate(prefill_qkv.q_seq_lens): - prefill_ideal_output[bdx, :prefill_q_seq_len] = ideal_output[ - bdx, :prefill_q_seq_len] - decode_ideal_output[bdx, :] = ideal_output[bdx, prefill_q_seq_len:( - prefill_q_seq_len + 1)] - - prefill_packed_ideal_output, _ = pack_tensor(prefill_ideal_output, - prefill_qkv.q_seq_lens, - device=CUDA_DEVICE) - decode_packed_ideal_output, _ = pack_tensor(decode_ideal_output, - [1 for _ in range(batch_size)], - device=CUDA_DEVICE) - - # Build prefill- 
& decode-phase data structures - # for decoder self-attention. Block tables and - # slot mapping must be in a format compatible - # with KV caching & attention kernels - # - # Prefill-phase: - # - # * Empty block-tables tensor - # * Slot-mapping with entries for prompt tokens - # - # Decode-phase: - # * Block-tables tensor with minimum number of blocks - # required by total num. tokens in the entirety of all sequences - # (including both prefill & decode) - # * Slot-mapping with entries for tokens that will be decoded in the - # current decode iteration - # - # Note: the format described above is simply mirroring what ModelRunner - # produces - - prefill_block_tables = make_empty_block_tables_tensor(device=CUDA_DEVICE) - - ( - decode_block_tables, - slot_mapping_list, - max_block_idx, - ) = make_block_tables_slot_mapping(block_size, - qkv.q_seq_lens, - device=CUDA_DEVICE, - block_base_addr=block_base_addr) - - ( - prefill_slot_mapping, - decode_slot_mapping, - ) = split_slot_mapping(slot_mapping_list, - qkv.q_seq_lens, - device=CUDA_DEVICE) - - prefill_pckd_qkv = pack_qkv(prefill_qkv, device=CUDA_DEVICE) - - decode_pckd_qkv = pack_qkv(decode_qkv, device=CUDA_DEVICE) - - return ( - qkv, - PhaseTestParameters( # Prefill test params - PackedQKVO(prefill_pckd_qkv, prefill_packed_ideal_output), - KVMemoryMap(prefill_block_tables, prefill_slot_mapping)), - PhaseTestParameters( # Decode test params - PackedQKVO(decode_pckd_qkv, decode_packed_ideal_output), - KVMemoryMap(decode_block_tables, decode_slot_mapping)), - max_block_idx) - - -def _enc_dec_cross_attn_setup_reuses_query( - decoder_qkv: QKVInputs, - encoder_test_params: PhaseTestParameters, - prefill_decoder_phase_test_params: PhaseTestParameters, - test_pt: TestPoint, - test_rsrcs: TestResources, - block_base_addr: int = 0, -) -> tuple[PhaseTestParameters, PhaseTestParameters]: - ''' - Set up test vectors & data structures for cross-attention test. 
- - A triplet of synthetic cross-attention key/value tensors are constructed - ("baseline" key/value). Given this is a cross-attention test, we assume - query tensors were already synthesized for a prior self-attention test and - will be reused for cross-attention. The key & value sequences generated here - may have a different length than the corresponding queries (as is often - the case for cross-attention between decoder and encoder sequences.) - - Cross attention key & value tensors do not grow during autoregressive - inference; thus this function obtains a single key/value pair suitable for - both prefill and decode. - - The "baseline" query tensor is received as an argument. The "baseline" - query/key/value tensors are passed to an ideal reference cross-attention - implementation to generate a "baseline" ideal output tensor. This tensor is - split into the "Prefill" ideal output tensor (all but the last element of - each output sequence) and the "Decode" ideal output tensor (*only* the last - element of each output sequence); the "Prefill" and "Decode" ideal output - tensors can be used to validate the prefill and decode test results, - respectively. - - This function also constructs the cross-attention KV cache memory mapping - (slot mapping and block table), ensuring that the block table starts at - block_base_addr. 
- - Arguments: - - * decoder_qkv: pre-existing unpacked (batch_size x padded_seq_len x - num_heads x head_size) decoder self-attention inputs; - this function relies on the query and q_seq_lens - fields - * encoder_test_params: PhaseTestParameters data structure which was - used for encoder inference; KV cache field - is not used by this function - * prefill_decoder_phase_test_params: PhaseTestParameters data structure - used for prefill-phase decoder - self-attention; all fields - including KV cache required - * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, - block_size, max_q_seq_len - * test_rsrcs: TestResources data structure; this function relies on the - scale field - * block_base_addr: decoder self-attention block-table base address - - Returns: - - * Prefill-phase encoder/decoder cross-attention PhaseTestParameters data - structure, including (1) packed - (number_of_tokens x num_heads x head_size) query/key/value tensors - along with (2) ideal attention output computed using a - naive implementation, and (3) memory-mapping data structures appropriate - for prefill phase. - * Decode-phase encoder/decoder cross-attention PhaseTestParameters data - structure, including (1) packed - (number_of_tokens x num_heads x head_size) query/key/value tensors - along with (2) ideal attention output computed using a - naive implementation, and (3) memory-mapping data structures appropriate - for decode phase. 
- ''' - - assert encoder_test_params.packed_qkvo.packed_qkv is not None - assert prefill_decoder_phase_test_params.packed_qkvo.packed_qkv is not None - - ( - num_heads, - head_size, - _, - batch_size, - block_size, - max_decoder_seq_len, - max_encoder_seq_len, - _, - _, - ) = test_pt - - scale = test_rsrcs.scale - - decoder_query = decoder_qkv.query - decoder_seq_lens = decoder_qkv.q_seq_lens - encoder_seq_lens = encoder_test_params.packed_qkvo.packed_qkv.q_seq_lens - prefill_q_seq_lens = ( - prefill_decoder_phase_test_params.packed_qkvo.packed_qkv.q_seq_lens) - - assert prefill_q_seq_lens is not None - - ( - cross_kv, - _, - _, - ) = make_qkv(batch_size, - max_decoder_seq_len, - max_encoder_seq_len, - num_heads, - head_size, - force_kv_seq_lens=encoder_seq_lens, - attn_type=AttentionType.ENCODER_DECODER, - device=CUDA_DEVICE) - - ideal_output = ref_masked_attention(decoder_query, - cross_kv.key, - cross_kv.value, - scale=scale, - q_seq_lens=decoder_seq_lens, - kv_seq_lens=cross_kv.kv_seq_lens) - - prefill_ideal_output = torch.zeros_like(ideal_output) - decode_ideal_output = torch.zeros_like(ideal_output[:, 0:1]) - for bdx, prefill_q_seq_len in enumerate(prefill_q_seq_lens): - prefill_ideal_output[bdx, :prefill_q_seq_len] = ideal_output[ - bdx, :prefill_q_seq_len] - decode_ideal_output[bdx, :] = ideal_output[bdx, prefill_q_seq_len:( - prefill_q_seq_len + 1)] - - prefill_packed_ideal_output, _ = pack_tensor(prefill_ideal_output, - prefill_q_seq_lens, - device=CUDA_DEVICE) - decode_packed_ideal_output, _ = pack_tensor(decode_ideal_output, - [1 for _ in range(batch_size)], - device=CUDA_DEVICE) - - # Build prefill- & decode-phase data structures - # for encoder/decoder cross-attention. 
Block tables and - # slot mapping must be in a format compatible - # with KV caching & attention kernels - # - # Whereas decoder self-attention extracts relationships between - # equal-length Q/K/V sequences, which mutually grow in length - # with each decoded token, cross-attention relates the Q sequence - # - which grows with each new decoded token - to fixed-length - # K and V sequences derived from the encoder hidden states. - # - # Prefill-phase: - # - # * Empty block-tables tensor - # * Slot-mapping with as many entries as there are tokens in the encoder - # prompt. - # - # Decode-phase: - # * Block-tables tensor with minimum number of blocks to - # accommodate K & V tensors which are equal in lnegth - # to the encoder prompt length - # * Empty slot-mapping tensor (since K & V are fixed in size, - # new decoded tokens are not KV-cached and require no slot- - # mapping) - # - # Note: the format above is simply an extension of what ModelRunner - # produces for decoder-only models - - prefill_block_tables = make_empty_block_tables_tensor(device=CUDA_DEVICE) - decode_slot_mapping = make_empty_slot_mapping_tensor(device=CUDA_DEVICE) - - ( - decode_block_tables, - prefill_slot_mapping_list, - _, - ) = make_block_tables_slot_mapping(block_size, - cross_kv.kv_seq_lens, - block_base_addr=block_base_addr, - device=CUDA_DEVICE) - - prefill_slot_mapping = maybe_make_long_tensor(prefill_slot_mapping_list, - device=CUDA_DEVICE) - - # Packed key/value (query is already provided) - packed_cross_kv = pack_qkv(cross_kv, device=CUDA_DEVICE) - - return ( - PhaseTestParameters( # Prefill-phase test params - PackedQKVO(packed_cross_kv, prefill_packed_ideal_output), - KVMemoryMap(prefill_block_tables, prefill_slot_mapping)), - PhaseTestParameters( # Decode-phase test params - PackedQKVO(None, decode_packed_ideal_output), - KVMemoryMap(decode_block_tables, decode_slot_mapping))) - - -def _run_encoder_attention_test( - attn: Attention, - encoder_test_params: PhaseTestParameters, - 
attn_metadata: AttentionMetadata, - test_pt: TestPoint, - vllm_config: VllmConfig, -) -> torch.Tensor: - ''' - Run encoder attention. - - attn.forward() is passed attn_type=AttentionType.ENCODER in order - to configure the kernel invocation for encoder attention - - Requires attn_metadata.num_decode_tokens == 0 - (There is no encoder execution in the decode-phase) - - Arguments: - - * attn: Attention wrapper instance - * encoder_test_params: encoder PhaseTestParameters data structure; - this function relies on the packed - (number_of_tokens x num_heads x head_size) - query/key/value fields - * attn_metadata: attention metadata for encoder/decoder-self attention - * test_pt: The TestPoint object containing test details like number of - model heads, head size, name of the backend being used etc. - - Returns: - * Attention.forward() applied to packed {query,key,value} and - & attn_metadata - ''' - assert attn_metadata.num_decode_tokens == 0 - packed_qkv = encoder_test_params.packed_qkvo.packed_qkv - assert packed_qkv is not None - with set_forward_context(attn_metadata, vllm_config): - # In the test setup the shape of the query is - # [batch_size, seq_len, num_heads, head_size]. However - # the attention backend expect the shape to be - # [num_tokens, hidden_size]. Hence reshape the query before - # invoking the forward method. - # TODO - Update the way we construct the query so that it - # is shaped as [num_tokens, hidden_size] and we can skip the reshape. - reshaped_query = packed_qkv.query.view( - -1, test_pt.num_heads * test_pt.head_size) - return attn.forward(reshaped_query, packed_qkv.key, packed_qkv.value) - - -def _run_decoder_self_attention_test( - test_rsrcs: TestResources, - decoder_test_params: PhaseTestParameters, - attn_metadata: AttentionMetadata, - test_pt: TestPoint, - vllm_config: VllmConfig, -) -> torch.Tensor: - ''' - Run decoder self-attention test. 
- - attn.forward() is passed attn_type=AttentionType.DECODER - in order to configure the kernel invocation for decoder self-attention. - - Arguments: - - * test_rsrcs: TestResources instance; this function relies on the kv_cache - and attn (Attention wrapper instance) fields - * decoder_test_params: decoder PhaseTestParameters data structure; - this function relies on the packed - (number_of_tokens x num_heads x head_size) - query/key/value fields - * attn_metadata: attention metadata for decoder-self attention - (contains KV cache memory-mapping) - * test_pt: The TestPoint object containing test details like number of - model heads, head size, name of the backend being used etc. - - Returns: - * Attention.forward() applied to packed_{query,key,value}, kv_cache - & attn_metadata - ''' - attn = test_rsrcs.attn - packed_qkv = decoder_test_params.packed_qkvo.packed_qkv - assert packed_qkv is not None - with set_forward_context(attn_metadata, vllm_config): - # In the test setup the shape of the query is - # [batch_size, seq_len, num_heads, head_size]. However - # the attention backend expect the shape to be - # [num_tokens, hidden_size]. Hence reshape the query before - # invoking the forward method. - # TODO - Update the way we construct the query so that it - # is shaped as [num_tokens, hidden_size] and we can skip the reshape. - reshaped_query = packed_qkv.query.view( - -1, test_pt.num_heads * test_pt.head_size) - return attn.forward(reshaped_query, packed_qkv.key, packed_qkv.value) - - -def _run_encoder_decoder_cross_attention_test( - test_rsrcs: TestResources, - decoder_test_params: PhaseTestParameters, - cross_test_params: Optional[PhaseTestParameters], - attn_metadata: AttentionMetadata, - test_pt: TestPoint, - vllm_config: VllmConfig, -) -> torch.Tensor: - ''' - Run encoder/decoder cross-attention test. - - Via PhaseTestParameters data structures, consumes the same query utilized - for decoder self-attention, plus a key/value specific to cross-attention. 
- - if cross_test_params is None or cross_test_params.packed_qkvo.packed_qkv - is None, this reflects that in decode-phase cross attention there - is no growth in the key and value tensors. - - attn.forward() is passed attn_type=AttentionType.ENCODER_DECODER - in order to configure the kernel invocation for encoder/decoder cross- - attention. - - Arguments: - - * test_rsrcs: TestResources instance; this function relies on the kv_cache - and attn (Attention wrapper instance) fields - * decoder_test_params: decoder PhaseTestParameters data structure; - this function relies on the packed - (number_of_tokens x num_heads x head_size) - query field - * cross_test_params: encoder/decoder PhaseTestParameters data structure; - this function relies on the packed - (number_of_tokens x num_heads x head_size) - key/value fields - * attn_metadata: attention metadata for encoder/decoder-self attention - * test_pt: The TestPoint object containing test details like number of - model heads, head size, name of the backend being used etc. - - Returns: - * Attention.forward() applied to packed_{query,key,value}, kv_cache - & attn_metadata - ''' - assert decoder_test_params.packed_qkvo.packed_qkv is not None - - attn = test_rsrcs.attn - if cross_test_params is None: - key = None - value = None - else: - cross_pckd_qkv = cross_test_params.packed_qkvo.packed_qkv - key = (None if cross_pckd_qkv is None else cross_pckd_qkv.key) - value = (None if cross_pckd_qkv is None else cross_pckd_qkv.value) - with set_forward_context(attn_metadata, vllm_config): - # In the test setup the shape of the query is - # [batch_size, seq_len, num_heads, head_size]. However - # the attention backend expect the shape to be - # [num_tokens, hidden_size]. Hence reshape the query before - # invoking the forward method. - # TODO - Update the way we construct the query so that it - # is shaped as [num_tokens, hidden_size] and we can skip the reshape. 
- reshaped_query = decoder_test_params.packed_qkvo.packed_qkv.query.view( - -1, test_pt.num_heads * test_pt.head_size) - return attn.forward(reshaped_query, key, value) - - -@pytest.fixture(autouse=True) -def set_reset_environment(attn_backend): - # Set the default torch datatype to bfloat16 to enable - # testing of the Flash Attention backend. Also clear the - # cached value of the backend. - default_dtype = torch.get_default_dtype() - if attn_backend.name == 'FLASH_ATTN': - torch.set_default_dtype(torch.bfloat16) - _cached_get_attn_backend.cache_clear() - yield - # Reset the torch datatype to what it was before the test - # so as not to impact the remaining tests. - torch.set_default_dtype(default_dtype) - - -@pytest.mark.skipif(current_platform.is_rocm(), - reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) -@pytest.mark.parametrize("batch_size", BATCH_SIZES) -@pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("max_dec_seq_len", MAX_DEC_SEQ_LENS) -@pytest.mark.parametrize("max_enc_seq_len", MAX_ENC_SEQ_LENS) -def test_encoder_only( - num_heads: int, - head_size: int, - attn_backend: _Backend, - batch_size: int, - block_size: int, - max_dec_seq_len: int, - max_enc_seq_len: int, -): - ''' - End-to-end encoder-only attention test: - - * Construct fake test vectors for (1) encoder attention - * Construct (1) attention metadata structure with prefill-phase - encoder attention, and (2) an analogous attention metadata - structure but for decode-phase - * Test & validate encoder attention against ideal output - - No KV cache is required for encoder-only attention. - - Note on ROCm/HIP: currently encoder/decoder models are not supported on - AMD GPUs, therefore this test simply is skipped if - current_platform.is_rocm(). 
- - This test globally forces an override of the usual backend - auto-selection process, forcing the specific backend-under-test - to be utilized. - - Arguments: - - * num_heads - * head_size, - * attn_backend: The attention backend to employ for testing - * batch_size - * block_size: KV cache block size - * max_dec_seq_len: max length of decoder input sequences - * max_enc_seq_len: max length of encoder input sequences - ''' - # Force Attention wrapper backend - with global_force_attn_backend_context_manager(attn_backend): - # Note: KV cache size of 4096 is arbitrary & chosen intentionally - # to be more than necessary, since exceeding the kv cache size - # is not part of this test - test_pt = TestPoint(num_heads, head_size, attn_backend.name, - batch_size, block_size, max_dec_seq_len, - max_enc_seq_len, 4096, AttentionType.ENCODER) - - # Attention scale factor, attention backend instance, attention wrapper - # instance, KV cache init - vllm_config = VllmConfig() - with set_current_vllm_config(vllm_config): - test_rsrcs = _make_test_resources(test_pt) - - # Construct encoder attention test params (only used - # during prefill) - - enc_test_params = _encoder_attn_setup(test_pt, test_rsrcs) - - # Shared prefill metadata structure - - prephase_attn_metadata: AttentionMetadata = make_test_metadata( - attn_backend, - True, - None, - decoder_test_params=None, - encoder_test_params=enc_test_params, - cross_test_params=None, - device=CUDA_DEVICE) - - # PREFILL: encoder attention - - enc_pckd_act_out: torch.Tensor = (_run_encoder_attention_test( - test_rsrcs.attn, - enc_test_params, - prephase_attn_metadata, - test_pt=test_pt, - vllm_config=vllm_config)) - - # - Is encoder attention result correct? 
- assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out, - attn_backend.name) - - -@pytest.mark.skipif(current_platform.is_rocm(), - reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) -@pytest.mark.parametrize("batch_size", BATCH_SIZES) -@pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("max_dec_seq_len", MAX_DEC_SEQ_LENS) -@pytest.mark.parametrize("max_enc_seq_len", MAX_ENC_SEQ_LENS) -def test_e2e_enc_dec_attn( - num_heads: int, - head_size: int, - attn_backend: _Backend, - batch_size: int, - block_size: int, - max_dec_seq_len: int, - max_enc_seq_len: int, -) -> None: - ''' - End-to-end encoder/decoder test: - - * Construct fake test vectors for (1) encoder attention, - (2) decoder self-attention, and (3) encoder/decoder cross-attention - * Construct (1) attention metadata structure with self- and cross-attention - attributes for prefill-phase, and (2) an analogous attention metadata - structure but for decode-phase - * Test attention steps in the following order - - * Encoder attention - * Prefill self-attention - * Prefill cross-attention - * Decode self-attention - * Decode cross-attention - * Besides being reflective of realistic use-cases, this order would - exacerbate any accidental overlap in the self-/cross-attention - block tables, which one hopes to avoid - - - * Validate output correctness against ideal reference attention - implementation - - Block tables are constructed such that cross-attention KV cache is in a - higher, non-intersecting address-space than self-attention KV cache. - - Self- and cross-attention share the same query tensor but not the K/V - tensors. Self-attention K/Vs must have the same seq len as Q while - cross-attention K/Vs are allowed to differ in seq len, as is often the case - for cross-attention. 
- - This test globally forces an override of the usual backend - auto-selection process, forcing the specific backend-under-test - to be utilized. - - Note on ROCm/HIP: currently encoder/decoder models are not supported on - AMD GPUs, therefore this test simply is skipped if - current_platform.is_rocm(). - - Note on metadata: there is a single attention metadata structure shared by - all prefill-phase attention operations (encoder, decoder, enc/dec cross), - and a single one shared by all decode-phase attention operations - (decoder & enc/dec cross.) This is intended to reflect the behavior - of EncoderDecoderModelRunner, which constructs a single attention metadata - structure for each prefill or decode run. A realistic scenario would rely - on the attention backend to utilize the appropriate attention metadata - fields according to the value of attn_metadata.attention_type. Thus, - this test is organized so as to confirm that the backend-under-test can - handle a shared prefill attention metadata structure & a shared decode\ - attention metadata structure. 
- - Arguments: - - * num_heads - * head_size, - * attn_backend: The attention backend to employ for testing - * batch_size - * block_size: KV cache block size - * max_dec_seq_len: max length of decoder input sequences - * max_enc_seq_len: max length of encoder input sequences - ''' - # Force Attention wrapper backend - with global_force_attn_backend_context_manager(attn_backend): - # Note: KV cache size of 4096 is arbitrary & chosen intentionally - # to be more than necessary, since exceeding the kv cache size - # is not part of this test - enc_test_pt = TestPoint(num_heads, head_size, attn_backend.name, - batch_size, block_size, max_dec_seq_len, - max_enc_seq_len, 4096, AttentionType.ENCODER) - enc_dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name, - batch_size, block_size, max_dec_seq_len, - max_enc_seq_len, 4096, - AttentionType.ENCODER_DECODER) - dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name, - batch_size, block_size, max_dec_seq_len, - max_enc_seq_len, 4096, AttentionType.DECODER) - - # Attention scale factor, attention backend instance, attention wrapper - # instance, KV cache init - vllm_config = VllmConfig() - with set_current_vllm_config(vllm_config): - enc_test_rsrcs = _make_test_resources(enc_test_pt) - enc_dec_test_rsrcs = _make_test_resources(enc_dec_test_pt) - dec_test_rsrcs = _make_test_resources(dec_test_pt) - - # Construct encoder attention test params (only used - # during prefill) - - enc_test_params = _encoder_attn_setup(enc_test_pt, enc_test_rsrcs) - - # Construct Decoder self-attention prefill-phase & decode-phase - # test params, including query/key/value tensors, decoder self-attention - # memory-mapping. cross_block_base_addr is the uppermost address in the - # decoder self-attention block-table, i.e. a base address which the - # encoder/decoder cross-attention block-table may build downward toward. 
- - ( - dec_qkv, - prephase_dec_test_params, - decphase_dec_test_params, - cross_block_base_addr, - ) = _decoder_attn_setup(dec_test_pt, dec_test_rsrcs) - - # Construct encoder/decoder cross-attention prefill-phase - # & decode-phase test params, including key/value tensors, - # cross-attention memory-mapping - - ( - prephase_cross_test_params, - decphase_cross_test_params, - ) = _enc_dec_cross_attn_setup_reuses_query( - dec_qkv, - enc_test_params, - prephase_dec_test_params, - enc_dec_test_pt, - enc_dec_test_rsrcs, - block_base_addr=cross_block_base_addr) - - # Shared prefill metadata structure - assert prephase_dec_test_params.packed_qkvo.packed_qkv is not None - prephase_attn_metadata: AttentionMetadata = make_test_metadata( - attn_backend, - True, - prephase_dec_test_params.packed_qkvo.packed_qkv.q_seq_lens, - decoder_test_params=prephase_dec_test_params, - encoder_test_params=enc_test_params, - cross_test_params=prephase_cross_test_params, - device=CUDA_DEVICE) - - # PREFILL: encoder attention - - enc_pckd_act_out = _run_encoder_attention_test(enc_test_rsrcs.attn, - enc_test_params, - prephase_attn_metadata, - test_pt=enc_test_pt, - vllm_config=vllm_config) - - # - Is encoder attention result correct? - assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out, - attn_backend.name) - - # PREFILL: decoder self-attention test - - prephase_dec_pckd_act_out = _run_decoder_self_attention_test( - dec_test_rsrcs, - prephase_dec_test_params, - prephase_attn_metadata, - test_pt=dec_test_pt, - vllm_config=vllm_config) - - # - Is prefill decoder self-attention correct? 
- assert_actual_matches_ideal(prephase_dec_test_params, - prephase_dec_pckd_act_out, - attn_backend.name) - - # PREFILL: encoder/decoder cross-attention test - - prephase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - enc_dec_test_rsrcs, - prephase_dec_test_params, - prephase_cross_test_params, - prephase_attn_metadata, - test_pt=enc_dec_test_pt, - vllm_config=vllm_config) - - # - Is prefill encoder/decoder cross-attention correct? - assert_actual_matches_ideal(prephase_cross_test_params, - prephase_cross_pckd_act_out, - attn_backend.name) - - # DECODE: build decode-phase attention metadata - - decphase_attn_metadata: AttentionMetadata = make_test_metadata( - attn_backend, - False, - dec_qkv.q_seq_lens, - decoder_test_params=decphase_dec_test_params, - encoder_test_params=enc_test_params, - cross_test_params=decphase_cross_test_params, - device=CUDA_DEVICE) - - # DECODE: decoder self-attention test - - decphase_dec_pckd_act_out = _run_decoder_self_attention_test( - dec_test_rsrcs, - decphase_dec_test_params, - decphase_attn_metadata, - test_pt=dec_test_pt, - vllm_config=vllm_config) - - # - Is decode-phase decoder self-attention correct? - assert_actual_matches_ideal(decphase_dec_test_params, - decphase_dec_pckd_act_out, - attn_backend.name) - - # DECODE: encoder/decoder cross-attention test - - decphase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - enc_dec_test_rsrcs, - decphase_dec_test_params, - None, - decphase_attn_metadata, - test_pt=enc_dec_test_pt, - vllm_config=vllm_config) - - # - Is decode-phase encoder/decoder cross-attention correct? 
- assert_actual_matches_ideal(decphase_cross_test_params, - decphase_cross_pckd_act_out, - attn_backend.name) diff --git a/tests/kernels/quantization/test_hadacore.py b/tests/kernels/quantization/test_hadacore.py new file mode 100644 index 0000000000000..127d68072e3f3 --- /dev/null +++ b/tests/kernels/quantization/test_hadacore.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math + +import pytest +import torch +from compressed_tensors.transform import deterministic_hadamard_matrix + +from vllm import _custom_ops as ops + + +@pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("hidden_dim", [2**n for n in range(10)]) +def test_hadacore(batch_size, hidden_dim, dtype=torch.bfloat16, device="cuda"): + x = torch.eye(hidden_dim, dtype=dtype, device=device) + hadamard = deterministic_hadamard_matrix( + hidden_dim, dtype=torch.float64, device="cuda") / math.sqrt(hidden_dim) + + y = ops.hadacore_transform(x.clone()) + y_true = (x.to(hadamard.dtype) @ hadamard.T).to(y.dtype) + assert torch.allclose(y, y_true) + + y = ops.hadacore_transform(y) + assert torch.allclose(y, x) diff --git a/tests/kernels/quantization/test_rocm_skinny_gemms.py b/tests/kernels/quantization/test_rocm_skinny_gemms.py index 03d5d98739c50..a9b1c71ef0718 100644 --- a/tests/kernels/quantization/test_rocm_skinny_gemms.py +++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py @@ -5,6 +5,8 @@ import torch import vllm._custom_ops as ops from tests.kernels.quant_utils import ref_dynamic_per_tensor_fp8_quant +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + rocm_per_tensor_w8a8_scaled_mm_impl) from vllm.platforms import current_platform DTYPES = [torch.bfloat16, torch.float16] @@ -116,3 +118,32 @@ def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed): current_platform.get_cu_count()) assert torch.allclose(out, ref_out, rtol=0.01) + + +@pytest.mark.parametrize("n,k,m", 
NKM_FACTORS_WVSPLITK_FP8) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("use_bias", [True, False]) +@pytest.mark.skipif( + not (current_platform.is_rocm() and current_platform.supports_fp8()), + reason="only test for rocm fp8") +def test_rocm_per_tensor_w8a8_scaled_mm_impl(n, k, m, dtype, seed, use_bias): + torch.manual_seed(seed) + + A = torch.rand(n, k, device="cuda") + B = torch.rand(m, k, device="cuda") + + A, scale_a = ref_dynamic_per_tensor_fp8_quant(A) + B, scale_b = ref_dynamic_per_tensor_fp8_quant(B) + + bias = torch.rand(1, m, dtype=dtype, device="cuda") if use_bias else None + + output = rocm_per_tensor_w8a8_scaled_mm_impl(A, B.t(), dtype, scale_a, + scale_b, bias) + ref_out = torch._scaled_mm(A, + B.t(), + out_dtype=dtype, + scale_a=scale_a, + scale_b=scale_b, + bias=bias) + assert torch.allclose(output, ref_out, rtol=0.01) diff --git a/tests/models/language/generation/test_bart.py b/tests/models/language/generation/test_bart.py deleted file mode 100644 index 22ceb27869ac4..0000000000000 --- a/tests/models/language/generation/test_bart.py +++ /dev/null @@ -1,222 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional - -import pytest -from transformers import AutoModelForSeq2SeqLM - -from vllm.sequence import SampleLogprobs - -from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt, - HfRunner, VllmRunner) -from ....utils import multi_gpu_test -from ...utils import check_logprobs_close - - -def vllm_to_hf_output( - vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], - decoder_prompt_type: DecoderPromptType, -): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - hf_output_str = output_str + "" - if decoder_prompt_type == DecoderPromptType.NONE: - hf_output_str = "" + hf_output_str - - return output_ids, 
hf_output_str, out_logprobs - - -def run_test( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - prompts: list[ExplicitEncoderDecoderPrompt[str, str]], - decoder_prompt_type: DecoderPromptType, - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -) -> None: - ''' - Test the vLLM BART model for a variety of encoder/decoder input prompts, - by validating it against HuggingFace (HF) BART. - - Arguments: - - * hf_runner: HuggingFace (HF) test model runner - * vllm_runner: vLLM test model runner - * example_encoder_decoder_prompts: test fixture which provides a - dictionary of dummy prompts - * model: the HF ID of the specific BART variant under test - * dtype: the tensor datatype to employ - * max_tokens - * num_logprobs - * decoder_prompt_type: key into the example_encoder_decoder_prompts - dictionary; selects specific encoder/decoder - prompt scenarios to test - - A note on using HF BART as a baseline for validating vLLM BART, - specifically when the decoder prompt is None. - - The HF GenerationMixin's default behavior is to force the first - decoded token to be if the prompt does not already contain - (this is accomplished using a logit - processor setting.) - - So when we use HF BART as our baseline for comparison, note that - when the user provides a request with a None decoder prompt - (i.e. a singleton encoder prompt, or else an explicit encoder/ - decoder prompt with the decoder sub-prompt set to None), HF and - vLLM handle this in different ways: - - * HF will (1) tokenize the None prompt as an empty token-list, - (2) append to the beginning, yielding - [], (3) pass this token list to the model, and - then (4) after computing logits during prefill, override the model - logits & force to be the first generated token. 
- - * vLLM will (1) tokenize the None prompt as [], (2) append decoder- - start-token to the beginning, yielding [], - (3) pass these tokens to the model & proceed with generation. - - The net effect is that compared to vLLM, the list of HF *decoded* tokens - will contain one more initial than the vLLM generated tokens, - because vLLM's token is injected into the prompt rather than into - the generated output. This is in spite of the fact that overall, the - complete sequences (prompt + decoded tokens) produced by vLLM will match - HF. - - So when we use HF decoded token output to validate vLLM's decoded token - output, the testing process must account for the difference in decoded - token sequences between vLLM and HF specifically in the - decoder-prompt-is-None case. - - One option is to disable the logit processor feature that forces the - token to be decoded (forced_bos_token_id = None), eliminating - the problem entirely. However this is not "normal" BART usage. - - The other option is - only in the decoder-prompt-is-None case - to - discard the first decoded token from the HF output before comparing it - to vLLM. - - To that end, when testing the scenario where the decoder prompt is None - (and only in that one scenario), this test skips the first HF decoded - token during the process of validating the vLLM decoded output. - ''' - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default). - - # Note: currently encoder/decoder models are only compatible with - # enforce_eager=True. Normally this is not a problem because - # for encoder/decoder models vLLM will - # default to enforce_eager=True if enforce_eager - # is left unspecified. 
However, the - # VllmRunner test fixture (which wraps around the LLM class) defaults to - # enforce_eager=False (a behavior which a number of already-existing - # decoder-only unit tests expect), so when testing an encoder/decoder - # model we must explicitly specify enforce_eager=True in the VllmRunner - # constructor. - with vllm_runner(model, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - prompts, max_tokens, num_logprobs) - - # Configuration settings for HF baseline - hf_kwargs = { - "top_k": None, - "num_beams": 1, - "repetition_penalty": 1.0, - "top_p": 1.0, - "length_penalty": 1.0, - "early_stopping": False, - "no_repeat_ngram_size": None, - "min_length": 0 - } - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit( - prompts, - max_tokens, - num_logprobs, - **hf_kwargs, - )) - - hf_skip_tokens = (1 - if decoder_prompt_type == DecoderPromptType.NONE else 0) - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, decoder_prompt_type) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - num_outputs_0_skip_tokens=hf_skip_tokens, - ) - - -@pytest.mark.parametrize( - "model", - [ - pytest.param("facebook/bart-base", - marks=[pytest.mark.core_model, pytest.mark.cpu_model]), - pytest.param("facebook/bart-large-cnn"), - ], -) -@pytest.mark.parametrize("dtype", ["float", "bfloat16"]) -@pytest.mark.parametrize("max_tokens", [64]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) -@pytest.mark.skip(reason="bart not supported in V1") -def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model, - dtype, max_tokens, num_logprobs, 
decoder_prompt_type) -> None: - - run_test( - hf_runner, - vllm_runner, - example_encoder_decoder_prompts[decoder_prompt_type], - decoder_prompt_type, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) -@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [64]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM]) -@pytest.mark.skip(reason="bart not supported in V1") -def test_models_distributed(hf_runner, vllm_runner, - example_encoder_decoder_prompts, - distributed_executor_backend, model, dtype, - max_tokens, num_logprobs, - decoder_prompt_type) -> None: - run_test( - hf_runner, - vllm_runner, - example_encoder_decoder_prompts[decoder_prompt_type], - decoder_prompt_type, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=2, - distributed_executor_backend=distributed_executor_backend, - ) diff --git a/tests/models/language/generation/test_mbart.py b/tests/models/language/generation/test_mbart.py deleted file mode 100644 index 854a72713943b..0000000000000 --- a/tests/models/language/generation/test_mbart.py +++ /dev/null @@ -1,123 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional - -import pytest -from transformers import AutoModelForSeq2SeqLM - -from vllm.sequence import SampleLogprobs - -from ....conftest import DecoderPromptType, HfRunner, VllmRunner -from ...utils import check_logprobs_close - - -def vllm_to_hf_output( - vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], - decoder_prompt_type: DecoderPromptType, -): - """Sanitize vllm output to be comparable with hf output.""" - 
output_ids, output_str, out_logprobs = vllm_output - hf_output_str = output_str + "" - return output_ids, hf_output_str, out_logprobs - - -def run_test( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - prompts: list[dict[str, str]], - decoder_prompt_type: DecoderPromptType, - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -) -> None: - ''' - Test the vLLM mBART model by validating it against HuggingFace (HF). - (Docstring content is omitted for brevity) - ''' - - vllm_prompts = prompts - if decoder_prompt_type == DecoderPromptType.NONE: - vllm_prompts = [{ - "encoder_prompt": p['encoder_prompt'], - "decoder_prompt": "" - } for p in prompts] - - vllm_kwargs = { - "hf_overrides": { - "architectures": ["MBartForConditionalGeneration"] - } - } - - with vllm_runner(model, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True, - **vllm_kwargs) as vllm_model: # type: ignore - vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - vllm_prompts, max_tokens, num_logprobs) - - hf_kwargs = { - "top_k": None, - "num_beams": 1, - "repetition_penalty": 1.0, - "top_p": 1.0, - "length_penalty": 1.0, - "early_stopping": False, - "no_repeat_ngram_size": None, - "min_length": 0 - } - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - hf_kwargs["decoder_start_token_id"] = ( - hf_model.tokenizer.lang_code_to_id["ro_RO"]) - - hf_outputs = ( - hf_model.generate_encoder_decoder_greedy_logprobs_limit( - prompts, # HF runner still uses the original prompts - max_tokens, - num_logprobs, - **hf_kwargs, - )) - - hf_skip_tokens = 0 - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, decoder_prompt_type) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - 
num_outputs_0_skip_tokens=hf_skip_tokens, - ) - - -@pytest.mark.parametrize( - "model", - [pytest.param("facebook/mbart-large-en-ro")], -) -@pytest.mark.parametrize("dtype", ["float", "bfloat16"]) -@pytest.mark.parametrize("max_tokens", [64]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) -def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model, - dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None: - - run_test( - hf_runner, - vllm_runner, - example_encoder_decoder_prompts[decoder_prompt_type], - decoder_prompt_type, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) diff --git a/tests/models/multimodal/generation/test_florence2.py b/tests/models/multimodal/generation/test_florence2.py deleted file mode 100644 index a622957f96f69..0000000000000 --- a/tests/models/multimodal/generation/test_florence2.py +++ /dev/null @@ -1,147 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Optional - -import pytest -from PIL import Image - -from vllm.inputs.data import ExplicitEncoderDecoderPrompt, TextPrompt -from vllm.multimodal.image import rescale_image_size -from vllm.sequence import SampleLogprobs - -from ....conftest import IMAGE_ASSETS, HfRunner, ImageTestAssets, VllmRunner -from ...utils import check_logprobs_close - -MODELS = ["microsoft/Florence-2-base"] -# Florence-2 model repo's tokenizer config is missing some special tokens. 
-# Therefore, we use a converted tokenizer from a forked repo -TOKENIZER = "Isotr0py/Florence-2-tokenizer" -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "", # special task token which will output special tokens - "cherry_blossom": - "Describe in detail what is shown in the image.", -}) - - -def get_hf_images_prompts( - prompts_: list[ExplicitEncoderDecoderPrompt[str, TextPrompt]], -) -> tuple[list[ExplicitEncoderDecoderPrompt[str, str]], list[Image.Image]]: - prompts, images = [], [] - for prompt in prompts_: - encoder_prompt = prompt["encoder_prompt"] - prompts.append( - ExplicitEncoderDecoderPrompt( - encoder_prompt=encoder_prompt["prompt"], - decoder_prompt=None, - )) - images.append(encoder_prompt["multi_modal_data"]["image"]) - return prompts, images - - -def hf_to_vllm_output(hf_output: tuple[list[int], str, - Optional[SampleLogprobs]]): - """Sanitize hf output to be comparable with vllm output.""" - output_ids, output_str, out_logprobs = hf_output - - output_str = output_str.replace("", "").replace("", "") - - return output_ids, output_str, out_logprobs - - -def run_test( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - inputs: list[list[ExplicitEncoderDecoderPrompt]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -) -> None: - with vllm_runner(model, - max_num_seqs=8, - tokenizer_name=TOKENIZER, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs_per_case = [ - vllm_model.generate_encoder_decoder_greedy_logprobs( - prompts, - max_tokens, - num_logprobs=num_logprobs, - skip_special_tokens=False, - ) for prompts in inputs - ] - - hf_inputs = [get_hf_images_prompts(prompts) for prompts in inputs] - - with hf_runner(model, dtype=dtype, skip_tokenizer_init=True) as hf_model: - 
hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.language_model.lm_head - hf_outputs_per_case = [ - hf_model.generate_encoder_decoder_greedy_logprobs_limit( - prompts, max_tokens, num_logprobs=num_logprobs, images=images) - for prompts, images in hf_inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, - vllm_outputs_per_case): - check_logprobs_close( - outputs_0_lst=[hf_to_vllm_output(output) for output in hf_outputs], - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - num_outputs_0_skip_tokens=1, - ) - - -# FIXME: https://github.com/huggingface/transformers/issues/38358 -@pytest.mark.skip("Model initialization fails") -@pytest.mark.core_model -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [64]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, model: str, - size_factors: list[int], dtype: str, max_tokens: int, - num_logprobs: int) -> None: - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [[ - ExplicitEncoderDecoderPrompt( - encoder_prompt=TextPrompt( - prompt=prompt, - multi_modal_data={"image": rescale_image_size(image, factor)}), - decoder_prompt=None, - ) for factor in size_factors - ] for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - run_test( - hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) diff --git a/tests/models/multimodal/generation/test_mllama.py b/tests/models/multimodal/generation/test_mllama.py deleted file mode 100644 index 1c32cc6d71c04..0000000000000 --- 
a/tests/models/multimodal/generation/test_mllama.py +++ /dev/null @@ -1,768 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Optional, overload - -import pytest -import torch -from packaging.version import Version -from transformers import AutoConfig, AutoModelForImageTextToText, AutoTokenizer -from transformers import __version__ as TRANSFORMERS_VERSION - -from vllm import LLM, SamplingParams -from vllm.attention.backends.flash_attn import FlashAttentionMetadata -from vllm.attention.selector import (_Backend, _cached_get_attn_backend, - global_force_attn_backend_context_manager) -from vllm.model_executor.models.mllama import MllamaForConditionalGeneration -from vllm.multimodal.image import rescale_image_size -from vllm.sequence import SampleLogprobs - -from ....conftest import (IMAGE_ASSETS, HfRunner, ImageTestAssets, - PromptImageInput, VllmRunner) -from ....quantization.utils import is_quant_method_supported -from ....utils import (create_new_process_for_each_test, large_gpu_test, - multi_gpu_test) -from ...utils import check_logprobs_close - -_LIMIT_IMAGE_PER_PROMPT = 3 -MLLAMA_IMAGE_TOKEN_ID = 128256 - -LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "<|image|><|begin_of_text|>The meaning of the image is", - "cherry_blossom": - "<|image|><|begin_of_text|>The city is", -}) - -text_only_prompts = [ - "The color of the sky is blue but sometimes it can also be", -] - -models = [ - "meta-llama/Llama-3.2-11B-Vision-Instruct", -] - -# Indices for inputs -TEXT_ONLY = '0' -IMAGE_AT_BEG = '1' -IMAGE_AT_MIDDLE = '2' -TWO_IMAGES = '3' - -# Input tokenized -prompt_data = { - # Tell me a story - TEXT_ONLY: [41551, 757, 264, 3446], - # <|image|> What's the content of this image - IMAGE_AT_BEG: - [MLLAMA_IMAGE_TOKEN_ID, 3639, 596, 279, 2262, 315, 420, 2217, 220], - # Hello <|image|>What' the content 
of this image - IMAGE_AT_MIDDLE: - [9906, 220, MLLAMA_IMAGE_TOKEN_ID, 3923, 6, 279, 2262, 315, 420, 2217], - #<|image|>Is there a duck in this image?<|image|>What's the animal in this image? # noqa: E501 - TWO_IMAGES: [ - MLLAMA_IMAGE_TOKEN_ID, 3957, 1070, 264, 37085, 304, 420, 2217, 30, - MLLAMA_IMAGE_TOKEN_ID, 3923, 596, 279, 10065, 304, 420, 2217, 30 - ] -} - - -def vllm_to_hf_output(vllm_output: tuple[list[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - config = AutoConfig.from_pretrained(model) - image_token_id = config.image_token_index - - tokenizer = AutoTokenizer.from_pretrained(model) - eos_token_id = tokenizer.eos_token_id - - hf_output_ids = [ - token_id for idx, token_id in enumerate(output_ids) - if token_id != image_token_id or output_ids[idx - 1] != image_token_id - ] - - hf_output_str = output_str - if hf_output_ids[-1] == eos_token_id: - hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) - - return hf_output_ids, hf_output_str, out_logprobs - - -def _get_inputs( - image_assets: ImageTestAssets, - *, - size_factors: Optional[list[float]] = None, - sizes: Optional[list[tuple[int, int]]] = None, -) -> list[tuple[list[str], PromptImageInput]]: - images = [asset.pil_image for asset in image_assets] - - if size_factors is not None: - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - elif sizes is not None: - inputs_per_image = [( - [ - prompt if size is not None else text_only_prompts[0] - for size in sizes - ], - [ - image.resize(size) if size is not None else None - for size in sizes - ], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - if len(sizes) == 0: - inputs_per_image.append( - (text_only_prompts, [None] * len(text_only_prompts))) - else: - raise ValueError("You must 
provide either `size_factors` or `sizes`") - - return inputs_per_image - - -@overload -def run_test( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, - model: str, - *, - size_factors: list[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... - - -@overload -def run_test( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, - model: str, - *, - sizes: list[tuple[int, int]], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... - - -def run_test( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, - model: str, - *, - size_factors: Optional[list[float]] = None, - sizes: Optional[list[tuple[int, int]]] = None, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - _run_test( - hf_runner, - vllm_runner, - _get_inputs(image_assets, size_factors=size_factors, sizes=sizes), - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - ) - - -def _run_test( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - inputs: list[tuple[list[str], PromptImageInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. 
- Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - - # max_model_len should be greater than image_feature_size - with vllm_runner( - model, - dtype=dtype, - max_model_len=19212, # 3 max size images - max_num_seqs=3, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - limit_mm_per_prompt={"image": - _LIMIT_IMAGE_PER_PROMPT}) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - with hf_runner(model, - dtype=dtype, - model_kwargs={"device_map": "auto"}, - auto_cls=AutoModelForImageTextToText) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -@pytest.fixture(autouse=True) -def clear_cache(): - """Fixture to clear backend cache before each test.""" - _cached_get_attn_backend.cache_clear() # Clear the cache - yield # This allows the test to run - - -@large_gpu_test(min_gb=48) -@pytest.mark.core_model -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "sizes", - [ - # Text only - [], - # Single-size - [(512, 512)], - # Single-size, batched - [(512, 512), (512, 512), (512, 512)], - # Multi-size, 
batched - [(512, 512), (1024, 512), (1536, 512), (2048, 512), (512, 1024), - (1024, 1024), (512, 1536), (512, 2028)], - # Multi-size, batched, including text only - [(512, 512), (1024, 512), (1536, 512), (2048, 512), (512, 1024), - (1024, 1024), (512, 1536), (512, 2028), None], - # mllama has 8 possible aspect ratios, carefully set the sizes - # to cover all of them - ]) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) -@pytest.mark.skipif( - Version(TRANSFORMERS_VERSION) <= Version("4.55.2"), - reason="Transformers v4.55 has a regression issue on mllama, " - "see: https://github.com/huggingface/transformers/pull/40083") -def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, - model, sizes, dtype, max_tokens, - num_logprobs, - attn_backend: _Backend) -> None: - with global_force_attn_backend_context_manager(attn_backend): - if attn_backend == _Backend.FLASH_ATTN: - # Flash Attention works only with bfloat16 data-type - dtype = 'bfloat16' - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - sizes=sizes, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@large_gpu_test(min_gb=48) -@pytest.mark.core_model -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) -@pytest.mark.skipif( - Version(TRANSFORMERS_VERSION) <= Version("4.55.2"), - reason="Transformers v4.55 has a regression issue on mllama, " - "see: https://github.com/huggingface/transformers/pull/40083") -def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, - model, dtype, max_tokens, num_logprobs, - attn_backend: _Backend) -> None: - 
- stop_sign = image_assets[0].pil_image - cherry_blossom = image_assets[1].pil_image - - inputs = [( - [ - "<|image|><|image|><|begin_of_text|>Describe 2 images.", # noqa: E501 - "<|image|><|image|><|begin_of_text|>Describe 2 images.", # noqa: E501 - "<|image|><|image|><|image|><|begin_of_text|>Describe 3 images.", # noqa: E501 - ], - [ - [stop_sign, cherry_blossom], - # Images with different sizes. - [ - stop_sign.resize((512, 512)), - stop_sign, - ], - [ - stop_sign, - stop_sign.resize((512, 1536)), - cherry_blossom.resize((512, 1024)), - ], - ])] - with global_force_attn_backend_context_manager(attn_backend): - if attn_backend == _Backend.FLASH_ATTN: - # Flash Attention works only with bfloat16 data-type - dtype = 'bfloat16' - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@large_gpu_test(min_gb=48) -@pytest.mark.core_model -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) -@pytest.mark.skipif( - Version(TRANSFORMERS_VERSION) <= Version("4.55.2"), - reason="Transformers v4.55 has a regression issue on mllama, " - "see: https://github.com/huggingface/transformers/pull/40083") -def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, - dtype, max_tokens, num_logprobs, - attn_backend: _Backend) -> None: - - stop_sign = image_assets[0].pil_image - cherry_blossom = image_assets[1].pil_image - - inputs = [( - [ - "<|begin_of_text|>The content of the image <|image|> is", # noqa: E501 - "<|begin_of_text|>Between the first image <|image|> and the second image<|image|>, " # noqa: E501 - "which is a stop sign and which is a cherry blossom?", # noqa: E501 - ], - [ - [stop_sign], - [stop_sign, cherry_blossom], - ])] - with 
global_force_attn_backend_context_manager(attn_backend): - if attn_backend == _Backend.FLASH_ATTN: - # Flash Attention works only with bfloat16 data-type - dtype = 'bfloat16' - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@create_new_process_for_each_test() -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [64]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.skipif( - Version(TRANSFORMERS_VERSION) <= Version("4.55.2"), - reason="Transformers v4.55 has a regression issue on mllama, " - "see: https://github.com/huggingface/transformers/pull/40083") -def test_models_distributed( - hf_runner, - vllm_runner, - image_assets, - distributed_executor_backend, - model, - dtype, - max_tokens, - num_logprobs, -) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model=model, - size_factors=[0.25, 0.5, 1.0], - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=2, - distributed_executor_backend=distributed_executor_backend, - ) - - -@large_gpu_test(min_gb=48) -@pytest.mark.core_model -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["float16"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), - reason='bitsandbytes is not supported on this GPU type.') -def test_bnb_regression( - image_assets: ImageTestAssets, - model: str, - dtype: str, - max_tokens: int, -): - stop_sign = image_assets[0].pil_image - prompts = [ - { - "prompt": "<|begin_of_text|>The content of the image <|image|> is", - "multi_modal_data": { - "image": stop_sign - }, - }, - { - "prompt": - "The color of the sky is blue but sometimes it can also be", - }, - 
] - # Test regression about QKVCrossParallelLinear - llm = LLM( - model=model, - dtype=dtype, - max_model_len=8192, - max_num_seqs=2, - quantization="bitsandbytes", - ) - sampling_params = SamplingParams( - temperature=0, - max_tokens=max_tokens, - ) - outputs = llm.generate(prompts, sampling_params) - assert outputs - - -@large_gpu_test(min_gb=48) -@pytest.mark.core_model -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [32]) -def test_explicit_implicit_prompt( - image_assets: ImageTestAssets, - model: str, - dtype: str, - max_tokens: int, -): - stop_sign = image_assets[0].pil_image - # yapf: disable - prompts = [ - # explicit prompt - { - "encoder_prompt": { - "prompt": "<|image|>", - "multi_modal_data": {"image": stop_sign}, - }, - "decoder_prompt": { - "prompt_token_ids": [128000, 791, 2262, 315, 279, 2217, 220, 128256, 374], # noqa: E501 - } - }, - { - "encoder_prompt": "Not <|image|>", - "decoder_prompt": "The color of the sky is blue but sometimes it can also be", # noqa: E501 - }, - # implicit prompt - { - "prompt": "<|begin_of_text|>The content of the image <|image|> is", # noqa: E501 - "multi_modal_data": {"image": stop_sign}, - }, - { - "prompt": "The color of the sky is blue but sometimes it can also be", # noqa: E501 - }, - ] - # yapf: enable - llm = LLM( - model=model, - dtype=dtype, - max_model_len=8192, - max_num_seqs=2, - tensor_parallel_size=1, - ) - sampling_params = SamplingParams( - temperature=0, - max_tokens=max_tokens, - ) - outputs = llm.generate(prompts, sampling_params) - n_prompts = len(prompts) - explicit_outputs = outputs[:n_prompts // 2] - implicit_outputs = outputs[n_prompts // 2:] - for exp_output, imp_output in zip(explicit_outputs, implicit_outputs): - assert exp_output.outputs[0].text == imp_output.outputs[0].text - - -@large_gpu_test(min_gb=48) -@pytest.mark.core_model -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", 
["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) -def test_regression(vllm_runner, image_assets, model, dtype, max_tokens, - num_logprobs, attn_backend: _Backend) -> None: - - stop_sign = image_assets[0].pil_image - - with global_force_attn_backend_context_manager(attn_backend), vllm_runner( - model, - dtype=dtype, - max_model_len=8192, - max_num_seqs=4, - tensor_parallel_size=1, - limit_mm_per_prompt={"image": - _LIMIT_IMAGE_PER_PROMPT}) as vllm_model: - - # Regression tests for https://github.com/vllm-project/vllm/issues/10648 - - # Number of groups of image tokens is greater than the number of images - # provided (the whitespace between the tags is necessary) - prompt = "<|begin_of_text|><|image|> <|image|> Compare the two images" # noqa: E501 - image = stop_sign - with pytest.raises(ValueError): - vllm_model.generate_greedy_logprobs([prompt], - max_tokens, - num_logprobs, - images=[image]) - - # Batch of a text-only and image request that requires cross-attention - prompts = [ - "What is the capital of spain?", - "Text before the image...<|image|>What is in the image?", # noqa: E501 - ] - images = [ - None, - [stop_sign], - ] - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs, - images=images) - - # Test the reverse order too for good measure - prompts = [ - "<|begin_of_text|>Text before the image...<|image|>What is in the image?", # noqa: E501 - "<|begin_of_text|>Hello!", - ] - images = [ - [stop_sign], - None, - ] - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs, - images=images) - - # Mixed batch with text and images with different numbers of tiles - prompts = [ - "<|begin_of_text|>Hello!", - "<|begin_of_text|>Some text before.<|image|>What is in the image?", # noqa: E501 - "<|begin_of_text|>Some text before.<|image|>What is in the image?", # noqa: E501 - ] - images = [ - 
None, - [stop_sign], - # smaller image must be 2nd for the repro - [stop_sign.resize((448, 448))], - ] - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs, - images=images) - - -class DummyModel: - image_token_id = MLLAMA_IMAGE_TOKEN_ID - - -@pytest.mark.core_model -@pytest.mark.parametrize( - "input_indices_and_output", - # inputs, (cross_attention_mask, kv_range_for_decode) - [([TEXT_ONLY], (None, None)), ([IMAGE_AT_BEG], (None, None)), - ([TEXT_ONLY, IMAGE_AT_BEG], (None, None)), - ([IMAGE_AT_MIDDLE], ((10, 12), [[0, 6]])), - ([TEXT_ONLY, IMAGE_AT_MIDDLE], ((14, 12), [[0, 6]])), - ([TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE], - ((23, 24), [[0, 6], [6, 12]])), - ([IMAGE_AT_MIDDLE, TEXT_ONLY], ((14, 12), [[0, 6]])), - ([TWO_IMAGES], ((18, 12), [[6, 12]])), - ([TEXT_ONLY, TWO_IMAGES], ((22, 12), [[6, 12]]))]) -def test_get_cross_attention_mask(input_indices_and_output) -> None: - - input_indices, expected_output = input_indices_and_output - - sequences = [torch.tensor(prompt_data[i]) for i in input_indices] - num_tiles = [[2, 2] if i != TEXT_ONLY else [] for i in input_indices - if i != TEXT_ONLY] - input = torch.cat(sequences) - - seq_lens = [len(s) for s in sequences] - - attn_data = FlashAttentionMetadata( - seq_lens=seq_lens, - # Dummy values - enable_kv_scales_calculation=False, - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=0, - slot_mapping=0, - multi_modal_placeholder_index_maps=None, - seq_lens_tensor=0, - max_prefill_seq_len=0, - max_decode_seq_len=0, - context_lens_tensor=None, - block_tables=None, - use_cuda_graph=False, - ) - - dummy = DummyModel() - - cross_attention_mask, kv_range_for_decode = MllamaForConditionalGeneration\ - .get_cross_attention_mask(dummy, - input, - attn_data, - num_tiles=num_tiles, - num_tokens_per_tile=3, - dtype=torch.bfloat16) - - expected_cross_attention_mask, expected_kv_range_for_decode = \ - expected_output - - assert kv_range_for_decode == expected_kv_range_for_decode - if 
expected_cross_attention_mask is not None: - assert cross_attention_mask is not None - assert cross_attention_mask.shape == expected_cross_attention_mask - else: - assert cross_attention_mask is None - - -@pytest.mark.core_model -@pytest.mark.parametrize( - "input_indices", - [[TEXT_ONLY], [IMAGE_AT_BEG], [TEXT_ONLY, IMAGE_AT_BEG], [IMAGE_AT_MIDDLE], - [TEXT_ONLY, IMAGE_AT_MIDDLE], [TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE], - [IMAGE_AT_MIDDLE, TEXT_ONLY], [TWO_IMAGES], [TEXT_ONLY, TWO_IMAGES]]) -def test_get_full_text_row_masked_out_mask(input_indices) -> None: - - sequences = [torch.tensor(prompt_data[i]) for i in input_indices] - - seq_lens = [len(s) for s in sequences] - - num_prefill_tokens = sum(seq_lens) - - # TEXT_ONLY is zero, so it will be masked out, - # other instances should not be. - encoder_seq_lens = [int(i) for i in input_indices] - - attn_data = FlashAttentionMetadata( - seq_lens=seq_lens, - encoder_seq_lens=encoder_seq_lens, - num_prefill_tokens=num_prefill_tokens, - # Dummy values - enable_kv_scales_calculation=False, - num_prefills=0, - num_decode_tokens=0, - slot_mapping=0, - multi_modal_placeholder_index_maps=None, - seq_lens_tensor=0, - max_prefill_seq_len=0, - max_decode_seq_len=0, - context_lens_tensor=None, - block_tables=None, - use_cuda_graph=False, - ) - - dummy = DummyModel() - - full_text_row_masked_out_mask = MllamaForConditionalGeneration\ - .get_full_text_row_masked_out_mask(dummy, - attn_data, - torch.get_default_device()) - - full_text_row_masked_out_mask = full_text_row_masked_out_mask.squeeze() - full_text_row_masked_out_mask = full_text_row_masked_out_mask.tolist() - - idx = 0 - assert len(full_text_row_masked_out_mask) == num_prefill_tokens - for i, seq_len in enumerate(seq_lens): - must_be_masked = input_indices[i] != TEXT_ONLY - for _ in range(seq_len): - assert full_text_row_masked_out_mask[idx] == must_be_masked, \ - f"full_text_row_masked_out_mask[{idx}] must be " \ - f"'{must_be_masked}' " - idx += 1 - - 
-@pytest.mark.core_model -@pytest.mark.parametrize("encoder_seq_lens, num_tiles, expected", [ - ([6404], [[4]], [6404]), - ([0, 6404], [[4]], [6404]), - ([0, 1601, 8005], [[1], [4, 1]], [1601, 8005]), - ([0, 19212, 0, 3202], [[4, 4, 4], [2]], [19212, 3202]), -]) -def test_parse_and_validate_encoder_lens(encoder_seq_lens, num_tiles, - expected) -> None: - - dummy = DummyModel() - num_tokens_per_tile = 1601 - actual_encoder_seq_lens = MllamaForConditionalGeneration \ - ._get_and_validate_encoder_lens( - dummy, - encoder_seq_lens, - num_tiles, - num_tokens_per_tile, - ) - assert actual_encoder_seq_lens == expected, \ - f"Expected {expected} but got {actual_encoder_seq_lens}" diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index ced0ab3377a9e..a272c840f8dac 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -32,11 +32,14 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict: # Ensure video metadata is included if "video" in mm_data: video = mm_data["video"] + num_frames = len(video) mm_data["video"] = (video, { - "total_num_frames": len(video), - "fps": len(video), + "total_num_frames": num_frames, + "fps": num_frames, "duration": 1, - "video_backend": "opencv" + "frames_indices": [i for i in range(num_frames)], + "video_backend": "opencv", + "do_sample_frames": True, }) return mm_data @@ -164,8 +167,6 @@ def _test_processing_correctness( # incorrect token ids. So we need use `add_special_tokens=False` here # to leave bos_token to be added by the processor. 
_ADD_SPECIAL_TOKENS_OVERRIDES = { - "donut": False, - "mllama": False, "ovis": False, "ovis2_5": False, "paligemma": False, @@ -275,9 +276,7 @@ def _test_processing_correctness_one( "facebook/chameleon-7b", "CohereLabs/command-a-vision-07-2025", "deepseek-ai/deepseek-vl2-tiny", - "naver-clova-ix/donut-base-finetuned-docvqa", "baidu/ERNIE-4.5-VL-28B-A3B-PT", - "microsoft/Florence-2-base", "adept/fuyu-8b", "google/gemma-3-4b-it", "google/gemma-3n-E2B-it", @@ -302,7 +301,6 @@ def _test_processing_correctness_one( "llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/LLaVA-NeXT-Video-7B-hf", "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", - "meta-llama/Llama-3.2-11B-Vision-Instruct", "TIGER-Lab/Mantis-8B-siglip-llama3", "mispeech/midashenglm-7b", "openbmb/MiniCPM-Llama3-V-2_5", diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py index dfb8d9b2a038d..070ddcd89ee96 100644 --- a/tests/models/multimodal/processing/test_glm4_1v.py +++ b/tests/models/multimodal/processing/test_glm4_1v.py @@ -12,8 +12,19 @@ from ...utils import build_model_context @pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"]) @pytest.mark.parametrize("expected_toks_per_frame", [299]) -@pytest.mark.parametrize("num_frames", [32, 128]) -@pytest.mark.parametrize("fps, expected_grid_t", [(1, 5), (2, 10)]) +@pytest.mark.parametrize( + "num_frames, fps, expected_grid_t", + [ + # pre-sampled fixed frames (unexpected behavior, + # but we still expect it to work without errors) + (32, 1, 16), + (32, 2, 16), + (128, 1, 64), + (128, 2, 64), + # post-sampled frames (expected behavior) + (-1, 1, 5), + (-1, 2, 10), + ]) def test_processor_override( model_id: str, expected_toks_per_frame: int, @@ -80,7 +91,7 @@ def test_video_loader_consistency( static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes) dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes( - video_bytes, requested_fps=fps) + video_bytes, 
fps=fps) # pre-sampled loader shouldn't read all frames assert len(dynamic_video) < len(static_video) diff --git a/tests/models/multimodal/processing/test_mllama.py b/tests/models/multimodal/processing/test_mllama.py deleted file mode 100644 index b42d3f89f3cbf..0000000000000 --- a/tests/models/multimodal/processing/test_mllama.py +++ /dev/null @@ -1,72 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests for mllama's multimodal preprocessing and profiling.""" -import pytest -from transformers import MllamaConfig - -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.profiling import MultiModalProfiler - -from ...utils import build_model_context - - -@pytest.mark.parametrize("model_id", - ["meta-llama/Llama-3.2-11B-Vision-Instruct"]) -@pytest.mark.parametrize("max_model_len", [4096, 8192, 25600, 131072]) -@pytest.mark.parametrize("max_num_seqs", [1, 2, 8]) -def test_profiling( - model_id: str, - max_model_len: int, - max_num_seqs: int, -): - # regression test for https://github.com/vllm-project/vllm/issues/13929 - from vllm.model_executor.models.mllama import calc_token_per_chunk - - model_config_kwargs = { - "max_model_len": max_model_len, - } - ctx = build_model_context( - model_id, - model_config_kwargs=model_config_kwargs, - limit_mm_per_prompt={"image": 1}, - ) - - mm_config = ctx.get_mm_config() - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) - profiler = MultiModalProfiler(processor) - - dummy_encoder_data = profiler.get_encoder_dummy_data( - max_model_len, - mm_counts=mm_config.limit_per_prompt, - ) - dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs( - max_model_len, - mm_counts=mm_config.limit_per_prompt, - ) - - hf_config = ctx.get_hf_config(MllamaConfig) - image_size = hf_config.vision_config.image_size - encoder_seq_lens = [len(dummy_encoder_data.prompt_token_ids) - ] * max_num_seqs - - mm_data = processor.apply( - 
prompt=dummy_mm_data.prompt, - mm_data=dummy_mm_data.mm_data, - hf_processor_mm_kwargs=dict(), - )["mm_kwargs"].get_data() - - # Get the actual number of encoder tokens for each sample. - # Because attn_metadata.encoder_seq_lens only counts the last - # group of images for each sample, which is used to cheat the - # block manager to allocate blocks for those images only. - # See MllamaMultiModalProcessor for more details. - num_tiles = [[t] for t in mm_data.pop("num_tiles")] - num_tokens_per_tile = calc_token_per_chunk(image_size) - actual_encoder_seq_lens = [ - sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles - ] - - # simulate mllama image-present prefill. - for actual_len, last_group_len in zip(actual_encoder_seq_lens, - encoder_seq_lens): - assert actual_len >= last_group_len diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 3b87b669dbbe3..b678313752d65 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -31,7 +31,6 @@ from ...utils import dummy_hf_overrides ARCH_TO_SKIP = { "MolmoForCausalLM": "incompatible requirements", - "Florence2ForConditionalGeneration": "not supported in V1", } ARCH_NEEDS_EXTRAS = [ "InternVLChatModel", diff --git a/tests/models/registry.py b/tests/models/registry.py index b268bf12a3f30..9aef08769fb22 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -180,6 +180,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "BailingMoeForCausalLM": _HfExamplesInfo("inclusionAI/Ling-lite-1.5", trust_remote_code=True), + "BailingMoeV2ForCausalLM": _HfExamplesInfo("inclusionAI/Ling-mini-2.0", + trust_remote_code=True), "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B-v1", min_transformers_version="4.55.3", extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}), # noqa: E501 @@ -352,11 +354,6 @@ 
_TEXT_GENERATION_EXAMPLE_MODELS = { "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True), "Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"), - # [Encoder-decoder] - "BartModel": _HfExamplesInfo("facebook/bart-base"), - "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"), - "MBartForConditionalGeneration": _HfExamplesInfo("facebook/mbart-large-en-ro", # noqa: E501 - hf_overrides={"architectures": ["MBartForConditionalGeneration"]}), # noqa: E501 } _EMBEDDING_EXAMPLE_MODELS = { @@ -494,7 +491,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True), "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501 max_model_len=10240, - extras={"llama-guard-4": "meta-llama/Llama-Guard-4-12B"}, # noqa: E501 + extras={"llama-guard-4": "meta-llama/Llama-Guard-4-12B"}, # noqa: E501 ), "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf", extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501 @@ -581,15 +578,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { is_available_online=False, ), # [Encoder-decoder] - "DonutForConditionalGeneration": _HfExamplesInfo("naver-clova-ix/donut-base-finetuned-docvqa", # noqa: E501 - hf_overrides={"architectures": ["DonutForConditionalGeneration"], "model_type": "donut"}, # noqa: E501 - extras={"dolphin": "ByteDance/Dolphin"}), # noqa: E501 - # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer - # Therefore, we borrow the BartTokenizer from the original Bart model - "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base", # noqa: E501 - tokenizer="Isotr0py/Florence-2-tokenizer", # noqa: E501 - trust_remote_code=True), # noqa: E501 - "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501 # [Cross-encoder] 
"JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"), # noqa: E501 diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index c22d94948d249..56b5d32d16536 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -10,7 +10,7 @@ from vllm import LLM from vllm.config import ModelImpl from vllm.engine.llm_engine import LLMEngine as V0LLMEngine from vllm.utils import GiB_bytes -from vllm.v1.core.kv_cache_utils import get_kv_cache_config +from vllm.v1.core.kv_cache_utils import get_kv_cache_configs from vllm.v1.engine.core import EngineCore as V1EngineCore from ..utils import create_new_process_for_each_test @@ -68,11 +68,11 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, def _initialize_kv_caches_v1(self, vllm_config): kv_cache_specs = self.model_executor.get_kv_cache_specs() - scheduler_kv_cache_config = get_kv_cache_config( + scheduler_kv_cache_config = get_kv_cache_configs( vllm_config, - kv_cache_specs[0], - 10 * GiB_bytes, - ) + kv_cache_specs, + [10 * GiB_bytes], + )[0] # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config return 1, 0, scheduler_kv_cache_config @@ -92,10 +92,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when # L4 supports FA3. m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1") - if model_arch == "Florence2ForConditionalGeneration": - # An encoder-decoder model that's V0-only. Just skip it - # since V0 is about to be removed. 
- pytest.skip("Skipping Florence2ForConditionalGeneration") if model_arch == "WhisperForConditionalGeneration": m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") LLM( diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 36882aba5e941..f67d4017eeeec 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -50,7 +50,6 @@ def test_registry_imports(model_arch): @create_new_process_for_each_test() @pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [ ("LlamaForCausalLM", False, False, False), - ("MllamaForConditionalGeneration", True, False, False), ("LlavaForConditionalGeneration", True, True, False), ("BertForSequenceClassification", False, False, True), ("RobertaForSequenceClassification", False, False, True), diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index 3c61ee26e092e..3c737acfbfe28 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -7,6 +7,7 @@ import pytest import torch from vllm.config import ModelConfig, ParallelConfig, VllmConfig +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import (MultiModalCache, MultiModalProcessorCacheItem, MultiModalProcessorCacheItemMetadata, @@ -17,7 +18,6 @@ from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem, MultiModalKwargsItems, MultiModalSharedField) from vllm.multimodal.processing import PromptInsertion -from vllm.multimodal.registry import MultiModalRegistry def _dummy_elem( @@ -96,7 +96,9 @@ def _create_vllm_config( enable_ipc: bool, ): return VllmConfig( - model_config=ModelConfig(mm_processor_cache_gb=mm_processor_cache_gb), + model_config=ModelConfig( + model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + mm_processor_cache_gb=mm_processor_cache_gb), parallel_config=ParallelConfig( data_parallel_size=1 if enable_ipc else 2), ) @@ -113,15 +115,16 @@ def _compare_caches( n_iter: int = 100, seed: int = 0, ): - mm_registry = 
MultiModalRegistry() - cache_0_p0 = processor_cache_from_config(config_0, mm_registry) - cache_0_p1 = engine_receiver_cache_from_config(config_0, mm_registry) - cache_1_p0 = processor_cache_from_config(config_1, mm_registry) - cache_1_p1 = engine_receiver_cache_from_config(config_1, mm_registry) + cache_0_p0 = processor_cache_from_config(config_0, MULTIMODAL_REGISTRY) + cache_0_p1 = engine_receiver_cache_from_config(config_0, + MULTIMODAL_REGISTRY) + cache_1_p0 = processor_cache_from_config(config_1, MULTIMODAL_REGISTRY) + cache_1_p1 = engine_receiver_cache_from_config(config_1, + MULTIMODAL_REGISTRY) cache_size_gb = max( - config_0.model_config.mm_processor_cache_gb, - config_1.model_config.mm_processor_cache_gb, + config_0.model_config.multimodal_config.mm_processor_cache_gb, + config_1.model_config.multimodal_config.mm_processor_cache_gb, ) item_size_gb = int(cache_size_gb / item_capacity) diff --git a/tests/test_config.py b/tests/test_config.py index 373fbd267539a..6e37bdbee59eb 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -6,9 +6,9 @@ from dataclasses import MISSING, Field, asdict, dataclass, field import pytest from vllm.compilation.backends import VllmBackend -from vllm.config import (ModelConfig, PoolerConfig, VllmConfig, get_field, - update_config) +from vllm.config import ModelConfig, PoolerConfig, VllmConfig, update_config from vllm.config.load import LoadConfig +from vllm.config.utils import get_field from vllm.model_executor.layers.pooler import PoolingType from vllm.platforms import current_platform @@ -299,9 +299,8 @@ def test_rope_customization(): reason="Encoder Decoder models not supported on ROCm.") @pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [ ("facebook/opt-125m", False), - ("facebook/bart-base", True), + ("openai/whisper-tiny", True), ("meta-llama/Llama-3.2-1B-Instruct", False), - ("meta-llama/Llama-3.2-11B-Vision", True), ]) def test_is_encoder_decoder(model_id, is_encoder_decoder): config = 
ModelConfig(model_id) diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index 6dbba18b4dcfa..608f517f69145 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -501,34 +501,6 @@ def test_bind_kv_cache_non_attention(): assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1] -def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch): - # V1 TESTS: ENCODER_DECODER is not supported on V1 yet. - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - - from vllm.attention import Attention, AttentionType - - # example from bart - ctx = { - 'encoder.layers.0.self_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER), - 'decoder.layers.0.encoder_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER), - 'decoder.layers.0.self_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.DECODER), - } - - kv_cache = [ - torch.zeros((1, )), - ] - encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache - - bind_kv_cache(ctx, [kv_cache]) - assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache - assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0] - assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0] - - def test_bind_kv_cache_pp(): with patch("vllm.utils.cuda_device_count_stateless", lambda: 2): # this test runs with 1 GPU, but we simulate 2 GPUs diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 44e479098ad5d..319e6e84fba1e 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -18,19 +18,28 @@ from vllm.v1.core.kv_cache_manager import KVCacheManager from vllm.v1.core.kv_cache_utils import ( BlockHash, FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics, estimate_max_model_len, generate_block_hash_extra_keys, - get_kv_cache_config, get_max_concurrency_for_kv_cache_config, + get_kv_cache_configs, 
get_max_concurrency_for_kv_cache_config, get_request_block_hasher, hash_block_tokens, init_none_hash, - is_kv_cache_type_uniform, make_block_hash_with_group_id, - unify_kv_cache_configs) + is_kv_cache_type_uniform, make_block_hash_with_group_id) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, - KVCacheGroupSpec, KVCacheTensor, - SlidingWindowSpec) + KVCacheGroupSpec, KVCacheSpec, + KVCacheTensor, SlidingWindowSpec) from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request # yapf: enable +@pytest.fixture(autouse=True) +def _auto_init_hash_fn(request): + hash_fn: Callable + if "hash_fn" in request.fixturenames: + hash_fn = request.getfixturevalue("hash_fn") + else: + hash_fn = sha256 + init_none_hash(hash_fn) + + def make_request( request_id: str, prompt_token_ids: list[int], @@ -244,6 +253,18 @@ def test_free_kv_cache_block_queue_append_n(): assert blocks[3].next_free_block is queue.fake_free_list_tail assert queue.fake_free_list_tail.prev_free_block is blocks[3] + # Create an empty FreeKVCacheBlockQueue + invalid_queue = FreeKVCacheBlockQueue([]) + # set prev_free_block to None and this will cause an assertion failure in append_n + invalid_queue.fake_free_list_tail.prev_free_block = None + with pytest.raises(AssertionError): + # Append 1 block + # fake_head->fake_tail + invalid_queue.append_n(blocks[0:1]) + assert invalid_queue.num_free_blocks == 0 + assert (invalid_queue.fake_free_list_head.next_free_block == + invalid_queue.fake_free_list_tail) + def test_free_kv_cache_block_queue_popleft_n(): blocks = [KVCacheBlock(block_id=i) for i in range(6)] @@ -269,9 +290,11 @@ def test_free_kv_cache_block_queue_popleft_n(): # Pop 0 block # fake_head->b1->b3->b5->b4->b0->b2->fake_tail assert len(queue.popleft_n(0)) == 0 + assert queue.num_free_blocks == 6 # Pop 1 block # fake_head->b3->b5->b4->b0->b2->fake_tail result_blocks = queue.popleft_n(1) + assert queue.num_free_blocks == 5 assert len(result_blocks) == 1 
assert result_blocks[0] is blocks[1] for block in result_blocks: @@ -281,6 +304,7 @@ def test_free_kv_cache_block_queue_popleft_n(): # fake_head->b4->b0->b2->fake_tail result_blocks = queue.popleft_n(2) assert len(result_blocks) == 2 + assert queue.num_free_blocks == 3 assert result_blocks[0] is blocks[3] assert result_blocks[1] is blocks[5] for block in result_blocks: @@ -290,6 +314,7 @@ def test_free_kv_cache_block_queue_popleft_n(): # fake_head->fake_tail result_blocks = queue.popleft_n(3) assert len(result_blocks) == 3 + assert queue.num_free_blocks == 0 assert result_blocks[0] is blocks[4] assert result_blocks[1] is blocks[0] assert result_blocks[2] is blocks[2] @@ -409,7 +434,6 @@ def test_generate_block_hash_extra_keys_cache_salt(): @pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor]) def test_hash_block_tokens(hash_fn): - init_none_hash(hash_fn) parent_block_hash = BlockHash(b"123") curr_block_token_ids = (1, 2, 3) extra_keys = ("key1", "key2") @@ -422,8 +446,6 @@ def test_hash_block_tokens(hash_fn): @pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor]) def test_request_block_hasher(hash_fn): - kv_cache_utils.init_none_hash(hash_fn) - request = make_request( request_id="0", prompt_token_ids=[_ for _ in range(6)], @@ -446,8 +468,6 @@ def test_request_block_hasher(hash_fn): @pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor]) def test_hash_tokens_different_mm_input(hash_fn): - init_none_hash(hash_fn) - request1 = make_request( request_id="0", prompt_token_ids=[_ for _ in range(6)], @@ -476,8 +496,6 @@ def test_hash_tokens_different_mm_input(hash_fn): @pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor]) def test_hash_request_tokens_no_mm_inputs(hash_fn): - kv_cache_utils.init_none_hash(hash_fn) - request = make_request( request_id="0", prompt_token_ids=[_ for _ in range(6)], @@ -531,102 +549,288 @@ def test_metrics(): assert not metrics.query_queue -def test_unify_kv_cache_configs(): - same_kv_cache_config = [ - KVCacheConfig( - 
num_blocks=10, - kv_cache_tensors=[ - KVCacheTensor(size=100, shared_by=["layer1"]), - KVCacheTensor(size=100, shared_by=["layer2"]), - ], - kv_cache_groups=[ - KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), - KVCacheGroupSpec(["layer2"], - new_kv_cache_spec(num_kv_heads=4)), - ], - ), - KVCacheConfig( - num_blocks=20, - kv_cache_tensors=[ - KVCacheTensor(size=100, shared_by=["layer1"]), - KVCacheTensor(size=100, shared_by=["layer2"]), - ], - kv_cache_groups=[ - KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), - KVCacheGroupSpec(["layer2"], - new_kv_cache_spec(num_kv_heads=4)), - ], - ), - ] - unify_kv_cache_configs(same_kv_cache_config) - assert same_kv_cache_config[0].num_blocks == 10 - assert same_kv_cache_config[1].num_blocks == 10 +def test_get_kv_cache_configs_multiple_workers(): + model_config = ModelConfig(max_model_len=16) + vllm_config = VllmConfig(model_config=model_config) - need_sort_kv_cache_config = [ + ref_kv_cache_spec = new_kv_cache_spec() + same_kv_cache_specs = [{ + "layer1": new_kv_cache_spec(), + "layer2": new_kv_cache_spec(), + }, { + "layer1": new_kv_cache_spec(), + "layer2": new_kv_cache_spec(), + }] + + # Basic case. All things are the same. 
+ kv_cache_configs = get_kv_cache_configs(vllm_config, same_kv_cache_specs, [ + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ref_kv_cache_spec.page_size_bytes * 2 * 10 + ]) + assert kv_cache_configs == [ KVCacheConfig( num_blocks=10, kv_cache_tensors=[ - KVCacheTensor(size=100, shared_by=["layer1"]), - KVCacheTensor(size=100, shared_by=["layer2"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer1"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer2"]), ], kv_cache_groups=[ - KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), - KVCacheGroupSpec(["layer2"], - new_kv_cache_spec(num_kv_heads=4)), + KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec), ], ), KVCacheConfig( - num_blocks=20, + num_blocks=10, kv_cache_tensors=[ - KVCacheTensor(size=100, shared_by=["layer1"]), - KVCacheTensor(size=100, shared_by=["layer2"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer1"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer2"]), ], kv_cache_groups=[ - KVCacheGroupSpec(["layer2"], - new_kv_cache_spec(num_kv_heads=4)), - KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), + KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec), ], ), ] - unify_kv_cache_configs(need_sort_kv_cache_config) - sorted_kv_cache_groups = [ - KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), - KVCacheGroupSpec(["layer2"], new_kv_cache_spec(num_kv_heads=4)), - ] - assert ( - need_sort_kv_cache_config[0].kv_cache_groups == sorted_kv_cache_groups) - assert ( - need_sort_kv_cache_config[1].kv_cache_groups == sorted_kv_cache_groups) - - diff_kv_cache_config = [ + # Different available memory. This is the case for TP. + # Use the smallest memory available. 
+ kv_cache_configs = get_kv_cache_configs(vllm_config, same_kv_cache_specs, [ + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ref_kv_cache_spec.page_size_bytes * 2 * 20 + ]) + assert kv_cache_configs == [ KVCacheConfig( num_blocks=10, kv_cache_tensors=[ - KVCacheTensor(size=100, shared_by=["layer1"]), - KVCacheTensor(size=100, shared_by=["layer2"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer1"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer2"]), ], kv_cache_groups=[ - KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), - KVCacheGroupSpec(["layer2"], - new_kv_cache_spec(num_kv_heads=4)), + KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec), ], ), KVCacheConfig( - num_blocks=20, + num_blocks=10, kv_cache_tensors=[ - KVCacheTensor(size=100, shared_by=["layer1"]), - KVCacheTensor(size=100, shared_by=["layer2"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20, + shared_by=["layer1"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20, + shared_by=["layer2"]), ], kv_cache_groups=[ - KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), - KVCacheGroupSpec(["layer2"], - new_kv_cache_spec(num_kv_heads=8)), + KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec), ], ), ] + + # Different KV cache specs. This is the case for PP. + different_layer_specs = [{ + "layer1": new_kv_cache_spec(), + }, { + "layer2": new_kv_cache_spec(), + "layer3": new_kv_cache_spec(), + }] + + # Different workers have different layers. 
+ kv_cache_configs = get_kv_cache_configs( + vllm_config, different_layer_specs, [ + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ref_kv_cache_spec.page_size_bytes * 2 * 10 + ]) + assert kv_cache_configs == [ + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20, + shared_by=["layer1"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), + ], + ), + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer2"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer3"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer2", "layer3"], new_kv_cache_spec()), + ], + ), + ] + + # Some layers are the same, some are different. This is the case for TP+PP + tp_pp_kv_cache_specs = [{ + "layer1": new_kv_cache_spec(), + "layer2": new_kv_cache_spec(), + }, { + "layer1": new_kv_cache_spec(), + "layer2": new_kv_cache_spec(), + }, { + "layer3": new_kv_cache_spec(), + }, { + "layer3": new_kv_cache_spec(), + }] + + kv_cache_configs = get_kv_cache_configs( + vllm_config, tp_pp_kv_cache_specs, [ + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ]) + assert kv_cache_configs == [ + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer1"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer2"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec), + ], + ), + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer1"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer2"]), + ], + kv_cache_groups=[ + 
KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec), + ], + ), + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20, + shared_by=["layer3"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer3"], ref_kv_cache_spec), + ], + ), + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20, + shared_by=["layer3"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer3"], ref_kv_cache_spec), + ], + ), + ] + + # Different workers have different types of layers. This is the case for + # hybrid models + PP. + different_type_layer_specs = [{ + "layer1": new_kv_cache_spec(), + "layer2": new_kv_cache_spec(), + }, { + "layer3": new_sliding_window_spec(), + "layer4": new_sliding_window_spec(), + }] + kv_cache_configs = get_kv_cache_configs( + vllm_config, different_type_layer_specs, [ + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ]) + assert kv_cache_configs == [ + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer1"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer2"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec), + KVCacheGroupSpec([], new_sliding_window_spec()), + ], + ), + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer3"]), + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer4"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec([], ref_kv_cache_spec), + KVCacheGroupSpec(["layer3", "layer4"], + new_sliding_window_spec()), + ], + ), + ] + + # When divided into multiple KVCacheGroups, need to ensure the number of + # layers per group is similar. 
+ different_type_layer_specs = [{ + "layer1": new_kv_cache_spec(), + "layer2": new_sliding_window_spec(), + "layer3": new_sliding_window_spec(), + }, { + "layer4": new_kv_cache_spec(), + "layer5": new_sliding_window_spec(), + "layer6": new_sliding_window_spec(), + }] + kv_cache_configs = get_kv_cache_configs( + vllm_config, different_type_layer_specs, [ + ref_kv_cache_spec.page_size_bytes * 10, + ref_kv_cache_spec.page_size_bytes * 10, + ]) + assert kv_cache_configs == [ + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer1", "layer2", "layer3"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer1"], ref_kv_cache_spec), + KVCacheGroupSpec(["layer2"], new_sliding_window_spec()), + KVCacheGroupSpec(["layer3"], new_sliding_window_spec()), + ], + ), + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[ + KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10, + shared_by=["layer4", "layer5", "layer6"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer4"], ref_kv_cache_spec), + KVCacheGroupSpec(["layer5"], new_sliding_window_spec()), + KVCacheGroupSpec(["layer6"], new_sliding_window_spec()), + ], + ), + ] + + # Have conflicting layers. Need to raise an error. 
+ conflicting_layer_specs = [{ + "layer1": new_kv_cache_spec(), + }, { + "layer1": new_sliding_window_spec(), + }] with pytest.raises(AssertionError): - unify_kv_cache_configs(diff_kv_cache_config) + get_kv_cache_configs(vllm_config, conflicting_layer_specs, [ + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ref_kv_cache_spec.page_size_bytes * 2 * 10, + ]) def test_merge_kv_cache_spec(): @@ -890,7 +1094,7 @@ def test_allocate_with_lookahead(): assert len(blocks.get_block_ids()[0]) == 2 -def test_get_kv_cache_config(): +def test_get_kv_cache_config_one_worker(): # pass max_model_len to pass check_enough_kv_cache_memory model_config = ModelConfig(max_model_len=16) vllm_config = VllmConfig(model_config=model_config) @@ -901,8 +1105,10 @@ def test_get_kv_cache_config(): 'layer_1': new_kv_cache_spec(), 'layer_2': new_kv_cache_spec(), } - kv_cache_config_full = get_kv_cache_config( - vllm_config, kv_cache_specs_full, mem_per_block_per_layer * 2 * 32) + kv_cache_config_full = get_kv_cache_configs( + vllm_config, [kv_cache_specs_full], + [mem_per_block_per_layer * 2 * 32])[0] + print(kv_cache_config_full) assert kv_cache_config_full == KVCacheConfig( num_blocks=32, kv_cache_tensors=[ @@ -920,8 +1126,9 @@ def test_get_kv_cache_config(): 'layer_1': new_sliding_window_spec(), 'layer_2': new_sliding_window_spec(), } - kv_cache_config_sliding = get_kv_cache_config( - vllm_config, kv_cache_specs_sliding, mem_per_block_per_layer * 2 * 32) + kv_cache_config_sliding = get_kv_cache_configs( + vllm_config, [kv_cache_specs_sliding], + [mem_per_block_per_layer * 2 * 32])[0] assert kv_cache_config_sliding == KVCacheConfig( num_blocks=32, kv_cache_tensors=[ @@ -940,8 +1147,9 @@ def test_get_kv_cache_config(): 'layer_1': new_kv_cache_spec(), 'layer_2': new_sliding_window_spec(), } - kv_cache_config_hybrid = get_kv_cache_config( - vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 2 * 32) + kv_cache_config_hybrid = get_kv_cache_configs( + vllm_config, [kv_cache_specs_hybrid], + 
[mem_per_block_per_layer * 2 * 32])[0] assert kv_cache_config_hybrid == KVCacheConfig( num_blocks=32, kv_cache_tensors=[ @@ -962,8 +1170,9 @@ def test_get_kv_cache_config(): 'layer_1': new_kv_cache_spec(), 'layer_2': new_sliding_window_spec(), } - kv_cache_config_hybrid = get_kv_cache_config( - vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 2 * 32) + kv_cache_config_hybrid = get_kv_cache_configs( + vllm_config, [kv_cache_specs_hybrid], + [mem_per_block_per_layer * 2 * 32])[0] assert kv_cache_config_hybrid == KVCacheConfig( num_blocks=64, kv_cache_tensors=[ @@ -985,21 +1194,22 @@ def test_get_kv_cache_config(): 'layer_5': new_sliding_window_spec(), 'layer_6': new_sliding_window_spec(), } - kv_cache_config_hybrid = get_kv_cache_config( - vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 2 * 32) + kv_cache_config_hybrid = get_kv_cache_configs( + vllm_config, [kv_cache_specs_hybrid], + [mem_per_block_per_layer * 2 * 32])[0] assert kv_cache_config_hybrid == KVCacheConfig( num_blocks=32, kv_cache_tensors=[ KVCacheTensor(size=mem_per_block_per_layer * 32, - shared_by=["layer_1", "layer_3", "layer_5"]), + shared_by=["layer_1", "layer_3", "layer_4"]), KVCacheTensor(size=mem_per_block_per_layer * 32, - shared_by=["layer_2", "layer_4", "layer_6"]), + shared_by=["layer_2", "layer_5", "layer_6"]), ], kv_cache_groups=[ KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec()), - KVCacheGroupSpec(["layer_3", "layer_4"], + KVCacheGroupSpec(["layer_3", "layer_5"], new_sliding_window_spec()), - KVCacheGroupSpec(["layer_5", "layer_6"], + KVCacheGroupSpec(["layer_4", "layer_6"], new_sliding_window_spec()), ], ) @@ -1017,27 +1227,30 @@ def test_get_kv_cache_config(): 'layer_9': new_sliding_window_spec(), 'layer_10': new_sliding_window_spec(), } - kv_cache_config_hybrid = get_kv_cache_config( - vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 3 * 32) + kv_cache_config_hybrid = get_kv_cache_configs( + vllm_config, [kv_cache_specs_hybrid], + 
[mem_per_block_per_layer * 3 * 32])[0] assert kv_cache_config_hybrid == KVCacheConfig( num_blocks=32, kv_cache_tensors=[ KVCacheTensor( size=mem_per_block_per_layer * 32, - shared_by=["layer_1", "layer_4", "layer_7", "layer_10"]), + shared_by=["layer_1", "layer_4", "layer_5", "layer_6"]), + KVCacheTensor( + size=mem_per_block_per_layer * 32, + shared_by=["layer_2", "layer_7", "layer_8", "layer_9"]), KVCacheTensor(size=mem_per_block_per_layer * 32, - shared_by=["layer_2", "layer_5", "layer_8"]), - KVCacheTensor(size=mem_per_block_per_layer * 32, - shared_by=["layer_3", "layer_6", "layer_9"]), + shared_by=["layer_3", "layer_10"]), ], kv_cache_groups=[ KVCacheGroupSpec(["layer_1", "layer_2", "layer_3"], new_kv_cache_spec()), - KVCacheGroupSpec(["layer_4", "layer_5", "layer_6"], + KVCacheGroupSpec(["layer_4", "layer_7", "layer_10"], new_sliding_window_spec()), - KVCacheGroupSpec(["layer_7", "layer_8", "layer_9"], + KVCacheGroupSpec(["layer_5", "layer_8"], + new_sliding_window_spec()), + KVCacheGroupSpec(["layer_6", "layer_9"], new_sliding_window_spec()), - KVCacheGroupSpec(["layer_10"], new_sliding_window_spec()), ], ) @@ -1047,13 +1260,14 @@ def test_get_kv_cache_config(): 'layer_2': new_kv_cache_spec(), } with pytest.raises(NotImplementedError): - get_kv_cache_config(vllm_config, kv_cache_specs_hybrid, - mem_per_block_per_layer * 2 * 32) + get_kv_cache_configs(vllm_config, [kv_cache_specs_hybrid], + [mem_per_block_per_layer * 2 * 32])[0] # Test num_gpu_blocks_override vllm_config.cache_config.num_gpu_blocks_override = 16 - kv_cache_config_override_blocks = get_kv_cache_config( - vllm_config, kv_cache_specs_full, mem_per_block_per_layer * 2 * 32) + kv_cache_config_override_blocks = get_kv_cache_configs( + vllm_config, [kv_cache_specs_full], + [mem_per_block_per_layer * 2 * 32])[0] assert kv_cache_config_override_blocks == KVCacheConfig( num_blocks=16, kv_cache_tensors=[ @@ -1065,3 +1279,16 @@ def test_get_kv_cache_config(): kv_cache_groups=[ 
KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec()) ]) + + +def test_get_kv_cache_configs_attention_free(): + kv_cache_specs: dict[str, KVCacheSpec] = {} + vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16)) + kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [0]) + assert kv_cache_configs == [ + KVCacheConfig( + num_blocks=1, + kv_cache_tensors=[], + kv_cache_groups=[], + ) + ] diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 659d768bcf2e9..3cf9d93696767 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -25,6 +25,16 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, SlidingWindowSpec) +@pytest.fixture(autouse=True) +def _auto_init_hash_fn(request): + hash_fn: Callable + if "hash_fn" in request.fixturenames: + hash_fn = request.getfixturevalue("hash_fn") + else: + hash_fn = sha256 + init_none_hash(hash_fn) + + def make_request( request_id: str, prompt_token_ids: list[int], @@ -105,7 +115,6 @@ def make_kv_cache_config_hybrid_model(block_size: int, @pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor]) def test_prefill(hash_fn): - init_none_hash(hash_fn) block_size = 16 manager = KVCacheManager( @@ -736,7 +745,6 @@ def test_cache_blocks(hash_fn): This is a unit test that tests the correctness of the _cache_full_blocks function of KVCacheManager. """ - init_none_hash(hash_fn) block_size = 4 block_pool = BlockPool( @@ -849,7 +857,6 @@ def test_mm_prefix_caching(): """ This tests that the multi-modal prefix caching is correct. """ - kv_cache_utils.init_none_hash(sha256) block_size = 16 manager = KVCacheManager( @@ -942,8 +949,6 @@ def test_cache_key_salting(): This tests that cache salts are applied during hashing and the cache is separated cache as expected. 
""" - kv_cache_utils.init_none_hash(sha256) - block_size = 16 manager = KVCacheManager( make_kv_cache_config(block_size, 11), diff --git a/tests/v1/engine/test_processor_multi_modal_uuids.py b/tests/v1/engine/test_processor_multi_modal_uuids.py index 955c74d262a09..bdd41eece2317 100644 --- a/tests/v1/engine/test_processor_multi_modal_uuids.py +++ b/tests/v1/engine/test_processor_multi_modal_uuids.py @@ -31,7 +31,7 @@ def _mk_processor(monkeypatch, raising=True) monkeypatch.setattr(ModelConfig, "__post_init__", - lambda self: None, + lambda self, *args: None, raising=True) monkeypatch.setattr(UnspecifiedPlatform, "is_async_output_supported", diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index efa604dd6b5a8..794c1f68f1471 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -9,24 +9,9 @@ from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -UNSUPPORTED_MODELS_V1 = [ - "facebook/bart-large-cnn", # encoder decoder -] - MODEL = "meta-llama/Llama-3.2-1B-Instruct" -@pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1) -def test_reject_unsupported_models(monkeypatch, model): - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - args = AsyncEngineArgs(model=model) - - with pytest.raises(NotImplementedError): - _ = args.create_engine_config() - m.delenv("VLLM_USE_V1") - - def test_reject_bad_config(monkeypatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "0") @@ -77,12 +62,6 @@ def test_enable_by_default_fallback(monkeypatch): assert envs.VLLM_USE_V1 m.delenv("VLLM_USE_V1") - # Should fall back to V0 for supported model. 
- _ = AsyncEngineArgs( - model=UNSUPPORTED_MODELS_V1[0]).create_engine_config() - assert not envs.VLLM_USE_V1 - m.delenv("VLLM_USE_V1") - def test_v1_llm_by_default(monkeypatch): with monkeypatch.context() as m: diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index c719e44acc9c2..bd9b6131c2222 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -10,7 +10,7 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.utils import GiB_bytes from vllm.v1.core.kv_cache_utils import (estimate_max_model_len, - get_kv_cache_config) + get_kv_cache_configs) from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) from vllm.v1.worker.tpu_model_runner import ( @@ -477,8 +477,8 @@ def test_init_kv_cache_without_kv_sharing(): # 2 (non-MLA) * 8 (num_heads) * 128 (head_dim) # * 2 (bfloat16, kv_cache dtype) * 128 (block_size) = 512KB num_expected_blocks = 20480 # 20GB / 512KB / 2 (num layers) - kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, - available_memory) + kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec], + [available_memory])[0] assert kv_cache_config.num_blocks == num_expected_blocks assert len(kv_cache_config.kv_cache_tensors) == 2 assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2 @@ -550,8 +550,8 @@ def test_init_kv_cache_with_kv_sharing_valid(): # with KV sharing, we can allocate (available_mem//page_size//1) blocks # which is twice as many as without KV sharing num_expected_blocks = 2 * 20480 # 20GB / 512KB - kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, - available_memory) + kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec], + [available_memory])[0] assert kv_cache_config.num_blocks == num_expected_blocks assert len(kv_cache_config.kv_cache_tensors) == 1 # Each layer now has 
twice the available memory for KV cache diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 5ebc00d573030..4ad8df1ce3868 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -15,7 +15,7 @@ from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams from vllm.utils import GiB_bytes, update_environment_variables from vllm.v1.core.kv_cache_utils import (estimate_max_model_len, - get_kv_cache_config) + get_kv_cache_configs) from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, @@ -585,8 +585,8 @@ def test_init_kv_cache_without_kv_sharing(): available_memory = 20 * GiB_bytes # page size for layer 0's kv_cache_spec is 32KB num_expected_blocks = 327680 # 20GB / 32KB / 2 (num layers) - kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, - available_memory) + kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec], + [available_memory])[0] assert kv_cache_config.num_blocks == num_expected_blocks assert len(kv_cache_config.kv_cache_tensors) == 2 assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2 @@ -657,8 +657,8 @@ def test_init_kv_cache_with_kv_sharing_valid(): # with KV sharing, we can allocate (available_mem//page_size//1) blocks # which is twice as many as without KV sharing num_expected_blocks = 655360 # 20GB / 32KB - kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, - available_memory) + kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec], + [available_memory])[0] assert kv_cache_config.num_blocks == num_expected_blocks assert len(kv_cache_config.kv_cache_tensors) == 1 # Each layer now has twice the available memory for KV cache @@ -788,8 +788,8 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch): kv_cache_spec = runner.get_kv_cache_spec() 
available_memory = 5 * GiB_bytes - kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, - available_memory) + kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec], + [available_memory])[0] runner.initialize_kv_cache(kv_cache_config) # random partition of blocks diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py deleted file mode 100644 index 35ac90b38e840..0000000000000 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ /dev/null @@ -1,648 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import itertools - -import pytest -import torch - -from vllm.engine.arg_utils import EngineArgs -from vllm.platforms import current_platform -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import make_tensor_with_pad -from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner - -BATCH_SIZES = [1, 4, 16, 64, 256] - - -def _create_model_runner(model: str, *args, - **kwargs) -> EncoderDecoderModelRunner: - engine_args = EngineArgs(model, *args, **kwargs) - engine_config = engine_args.create_engine_config() - model_runner = EncoderDecoderModelRunner( - vllm_config=engine_config, - is_driver_worker=True, - ) - return model_runner - - -@pytest.mark.skipif(condition=current_platform.is_cpu(), - reason="CPU backend is currently " - "unsupported for encoder/ " - "decoder models") -def test_empty_seq_group(): - """Verify prepare prompt and decode returns empty output - for empty seq group list""" - - model_runner = _create_model_runner( - "facebook/bart-base", - seed=0, - dtype="float16", - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enforce_eager=True, - ) - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - ( - input_tokens, 
- input_positions, - encoder_input_tokens, - encoder_input_positions, - attn_metadata, - return_seq_lens, - ) = ( - model_input.input_tokens, - model_input.input_positions, - model_input.encoder_input_tokens, - model_input.encoder_input_positions, - model_input.attn_metadata, - model_input.seq_lens, - ) - assert input_tokens is None - assert input_positions is None - assert encoder_input_tokens is None - assert encoder_input_positions is None - assert attn_metadata is None - assert return_seq_lens is None - - -@pytest.mark.skipif(condition=current_platform.is_cpu(), - reason="CPU backend is currently " - "unsupported for encoder/ " - "decoder models") -@pytest.mark.parametrize("batch_size", BATCH_SIZES) -def test_prepare_prompt(batch_size): - ''' - Test the ability of the encoder/decoder model runner subclass to - produce prefill-phase model inputs & attention metadata. - - Test behavior: - - * Instantiate BART base model & enc/dec model runner - * Construct sequence-group metadata for dummy prompts - * Test that encoder attention, decoder self-attention, - and encoder/decoder cross-attention inputs are correct - - Arguments: - - * batch_size - * backend_name: The attention backend under test - * enforce_eager: Enforce eager mode if True (i.e. 
no CUDAGraph) - ''' - - model_runner = _create_model_runner( - "facebook/bart-base", - seed=0, - dtype="float16", - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enforce_eager=True, - ) - - seq_lens: list[int] = [] - encoder_seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - block_tables = {0: [1]} - cross_block_table = [2] - for i in range(batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_lens.append(seq_len) - seq_data = SequenceData.from_seqs(range(seq_len)) - encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1 - encoder_seq_lens.append(encoder_seq_len) - encoder_seq_data = SequenceData.from_seqs(range(encoder_seq_len)) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - encoder_seq_data=encoder_seq_data, - cross_block_table=cross_block_table, - ) - assert seq_group_metadata.token_chunk_size == seq_data.get_len() - seq_group_metadata_list.append(seq_group_metadata) - - # Build - # * Decoder model inputs - # * Decoder self-attention KV caching data structures - # * Encoder model inputs - # * Encoder/decoder cross-attention KV caching data structures - model_input = model_runner.prepare_model_input(seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - slot_mapping = attn_metadata.slot_mapping - encoder_input_tokens = model_input.encoder_input_tokens - encoder_input_positions = model_input.encoder_input_positions - cross_slot_mapping = attn_metadata.cross_slot_mapping - assert return_seq_lens == seq_lens - assert len(slot_mapping) == len(input_tokens) - assert len(cross_slot_mapping) == len(encoder_input_tokens) - - # Verify 
input metadata is correct for prompts. - # - Decoder attention metadata - device = model_runner.device - assert attn_metadata.num_prefills > 0 - assert attn_metadata.num_decode_tokens == 0 - assert torch.equal(attn_metadata.seq_lens_tensor, - torch.tensor(seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.seq_lens == seq_lens - assert attn_metadata.max_prefill_seq_len == max(seq_lens) - assert attn_metadata.max_decode_seq_len == 0 - # - Encoder attention metadata - assert attn_metadata.encoder_seq_lens == encoder_seq_lens - assert torch.equal( - attn_metadata.encoder_seq_lens_tensor, - torch.tensor(encoder_seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.max_encoder_seq_len == max(encoder_seq_lens) - assert attn_metadata.num_encoder_tokens == sum(encoder_seq_lens) - - # Test decoder subquery start locs. - start_idx = 0 - start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - start_loc.append(start_idx) - assert torch.equal( - attn_metadata.query_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device), - ) - - # Test decoder seq start locs & context lengths - - assert torch.equal( - attn_metadata.seq_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device), - ) - assert torch.equal( - attn_metadata.context_lens_tensor, - torch.zeros(attn_metadata.context_lens_tensor.shape[0], - dtype=torch.int, - device=device), - ) - - # Verify block tables are correct for prompts - # - Decoder self-attention - expected = torch.tensor( - [[] for _ in range(len(seq_group_metadata_list))], - dtype=torch.int32, - device=model_runner.device, - ) - assert torch.equal( - attn_metadata.block_tables, - expected, - ) - # - Encoder/decoder cross-attention - assert torch.equal( - attn_metadata.cross_block_tables, - expected, - ) - - # Cuda graph should not be used for prefill. 
- assert attn_metadata.use_cuda_graph is False - - # Verify the lengths of input tokens & positions - # - Decoder - assert len(input_tokens) == sum(seq_lens) - assert len(input_positions) == sum(seq_lens) - # -- An indirect check that model_input.input_tokens - # and model_input.input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - input_tokens, - input_positions, - ) - # - Encoder - assert len(encoder_input_tokens) == sum(encoder_seq_lens) - # -- An indirect check that model_input.encoder_input_tokens - # and model_input.encoder_input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - encoder_input_tokens, - encoder_input_positions, - ) - - # Test that vLLM sampling infrastructure chooses the correct - # sequence positions at which to sample (i.e. 
the end of - # each sequence) in the prefill phase - - expected_selected_token_indices = [] - selected_token_start_idx = 0 - for seq_len in seq_lens: - # Compute the index offset of the final token in each - # prompt (recall that the prompts are concatenated) - expected_selected_token_indices.append(selected_token_start_idx + - seq_len - 1) - selected_token_start_idx += seq_len - - sampling_metadata = model_input.sampling_metadata - actual = sampling_metadata.selected_token_indices - expected = torch.tensor( - expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype, - ) - assert torch.equal(actual, expected) - - -@pytest.mark.skipif(condition=current_platform.is_cpu(), - reason="CPU backend is currently " - "unsupported for encoder/ " - "decoder models") -@pytest.mark.parametrize("batch_size", BATCH_SIZES) -@pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False]) -def test_prepare_decode(batch_size, multiple_seqs_per_seq_group): - ''' - Test the ability of the encoder/decoder model runner subclass to - produce decode-phase model inputs & attention metadata. - - Test behavior: - - * Instantiate BART base model & enc/dec model runner - * Construct sequence-group metadata for dummy prompts - * Test that encoder attention, decoder self-attention, - and encoder/decoder cross-attention inputs are correct - - Arguments: - - * batch_size - * multiple_seqs_per_seq_group - * backend_name: The attention backend under test - * enforce_eager: Enforce eager mode if True (i.e. 
no CUDAGraph) - ''' - - model_runner = _create_model_runner( - "facebook/bart-base", - seed=0, - dtype="float16", - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enforce_eager=True, - ) - - seq_lens: list[int] = [] - encoder_seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - block_tables = { - 0: [1], - 1: [3] - } if multiple_seqs_per_seq_group else { - 0: [1] - } - cross_block_table = [2] - for i in range(batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_data = SequenceData.from_seqs(range(seq_len)) - encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1 - encoder_seq_data = SequenceData.from_seqs(range(encoder_seq_len)) - - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=False, - seq_data={ - 0: seq_data, - 1: seq_data - } if multiple_seqs_per_seq_group else {0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - encoder_seq_data=encoder_seq_data, - cross_block_table=cross_block_table, - ) - assert seq_group_metadata.token_chunk_size == 1 - seq_group_metadata_list.append(seq_group_metadata) - seq_lens.extend( - [seq_len for _ in range(len(seq_group_metadata.seq_data))]) - encoder_seq_lens.extend( - [encoder_seq_len for _ in range(len(seq_group_metadata.seq_data))]) - - # Build - # * Decoder model inputs - # * Decoder self-attention KV caching data structures - # * Encoder model inputs - # * Encoder/decoder cross-attention KV caching data structures - model_input = model_runner.prepare_model_input(seq_group_metadata_list) - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - slot_mapping = attn_metadata.slot_mapping - encoder_input_tokens = model_input.encoder_input_tokens - encoder_input_positions = 
model_input.encoder_input_positions - cross_slot_mapping = attn_metadata.cross_slot_mapping - assert return_seq_lens == seq_lens - assert len(slot_mapping) == len(input_tokens) - assert len(cross_slot_mapping) == len(encoder_input_tokens) - - # Verify input metadata is correct for decode phase. - # - Decoder attention metadata - device = model_runner.device - assert attn_metadata.num_prefills == 0 - assert attn_metadata.num_decode_tokens > 0 - assert torch.equal(attn_metadata.seq_lens_tensor, - torch.tensor(seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.seq_lens == seq_lens - assert attn_metadata.max_prefill_seq_len == 0 - assert attn_metadata.max_decode_seq_len == max(seq_lens) - # - Encoder attention metadata - assert attn_metadata.encoder_seq_lens == encoder_seq_lens - assert torch.equal( - attn_metadata.encoder_seq_lens_tensor, - torch.tensor(encoder_seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.max_encoder_seq_len == max(encoder_seq_lens) - assert attn_metadata.num_encoder_tokens == sum(encoder_seq_lens) - - # Test decoder subquery start locs. - start_idx = 0 - start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += 1 - start_loc.append(start_idx) - assert torch.equal( - attn_metadata.query_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device), - ) - - # Test decoder seq start locs. Note that for normal prefill it is - # equivalent to query_start_loc. 
- start_idx = 0 - seq_start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - seq_start_loc.append(start_idx) - - # Test seq_start_loc and context lengths - - assert torch.equal( - attn_metadata.seq_start_loc, - torch.tensor(seq_start_loc, dtype=torch.int32, device=device), - ) - assert torch.equal( - attn_metadata.context_lens_tensor, - torch.tensor([seq_len - 1 for seq_len in seq_lens], - dtype=torch.int, - device=device)) - - # Verify block tables are correct for prompts - # - Decoder self-attention - flattened_block_tables = [ - block_table for block_table in block_tables.values() - ] - expected = torch.tensor(flattened_block_tables * - len(seq_group_metadata_list), - dtype=torch.int32, - device=model_runner.device) - assert torch.equal( - attn_metadata.block_tables, - expected, - ) - # - Encoder/decoder cross-attention - expected = torch.tensor([ - cross_block_table for seq_group_metadata in seq_group_metadata_list - for _ in range(len(seq_group_metadata.seq_data)) - ], - dtype=torch.int32, - device=model_runner.device) - assert torch.equal( - attn_metadata.cross_block_tables, - expected, - ) - - # Model runner's CUDAGraph setting should be propagated to attention - # metadata. 
- assert attn_metadata.use_cuda_graph is False - - # Verify the lengths of input tokens & positions - # - Decoder - assert len(input_tokens) == len(seq_lens) - assert len(input_positions) == len(seq_lens) - # -- An indirect check that model_input.input_tokens - # and model_input.input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - input_tokens, - input_positions, - ) - # - Encoder - assert len(encoder_input_tokens) == 0 - assert len(encoder_input_tokens) == 0 - # -- An indirect check that model_input.encoder_input_tokens - # and model_input.encoder_input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - encoder_input_tokens, - encoder_input_positions, - ) - - # Test that vLLM sampling infrastructure chooses the correct - # sequence positions at which to sample (i.e. the end of - # each sequence) in the decode phase - - expected_selected_token_indices = [] - for selected_token_start_idx, seq_len in enumerate(seq_lens): - # Compute the index offset of the final token in each - # sequence's decoded outputs; since a single token is - # decoded per iteration per sequence, then the length - # of the decoded tokens for a given sequence is 1 and - # the final index offset into a given sequence's - # generated tokens is 0 (i.e. 
the expected sampling index - # for a given sequence is just `selected_token_start_idx`) - expected_selected_token_indices.append(selected_token_start_idx) - - sampling_metadata = model_input.sampling_metadata - actual = sampling_metadata.selected_token_indices - expected = torch.tensor( - expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype, - ) - assert torch.equal(actual, expected) - - -@pytest.mark.parametrize("batch_size", list(range(1, 257))) -@pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False]) -def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group): - """ - Tests that for encoder-decoder models with CUDA Graph capture and replay - enabled, the tensors used during the decode phase are correctly padded - for varying input batch sizes. - """ - model_runner = _create_model_runner( - "facebook/bart-base", - seed=0, - dtype="float16", - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enforce_eager=False, - ) - block_tables = { - 0: [1], - 1: [3] - } if multiple_seqs_per_seq_group else { - 0: [1] - } - seq_lens: list[int] = [] - encoder_seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - - cross_block_table = [2] - expanded_batch_size = 0 - for i in range(batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_data = SequenceData.from_seqs(range(seq_len)) - encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1 - encoder_seq_data = SequenceData.from_seqs(range(encoder_seq_len)) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=False, - seq_data={ - 0: seq_data, - 1: seq_data - } if multiple_seqs_per_seq_group else {0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - encoder_seq_data=encoder_seq_data, - cross_block_table=cross_block_table, - ) - assert seq_group_metadata.token_chunk_size == 
1 - seq_lens.extend( - [seq_len for _ in range(len(seq_group_metadata.seq_data))]) - encoder_seq_lens.extend( - [encoder_seq_len for _ in range(len(seq_group_metadata.seq_data))]) - expanded_batch_size = expanded_batch_size + len( - seq_group_metadata.seq_data) - seq_group_metadata_list.append(seq_group_metadata) - - model_input = model_runner.prepare_model_input(seq_group_metadata_list) - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - slot_mapping = attn_metadata.slot_mapping - encoder_input_tokens = model_input.encoder_input_tokens - encoder_input_positions = model_input.encoder_input_positions - cross_slot_mapping = attn_metadata.cross_slot_mapping - - # With CUDA Graph capture and replay enabled, the decoder and encoder - # input sequences will be padded. Create the expected padded tensors - # accordingly. - graph_batch_size = model_runner.vllm_config.pad_for_cudagraph( - expanded_batch_size) - cuda_graph_pad_size = graph_batch_size - expanded_batch_size - padded_seq_lens = seq_lens + list(itertools.repeat(1, cuda_graph_pad_size)) - padded_encoder_seq_lens = encoder_seq_lens + list( - itertools.repeat(1, cuda_graph_pad_size)) - - assert return_seq_lens == padded_seq_lens - assert len(slot_mapping) == len(input_tokens) - assert len(cross_slot_mapping) == len(encoder_input_tokens) - - # Verify attention metadata - device = model_runner.device - assert attn_metadata.num_prefills == 0 - assert attn_metadata.num_decode_tokens > 0 - assert torch.equal( - attn_metadata.seq_lens_tensor, - torch.tensor(padded_seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.seq_lens == padded_seq_lens - assert attn_metadata.max_prefill_seq_len == 0 - assert attn_metadata.max_decode_seq_len == max(seq_lens) - # - Encoder attention metadata - assert attn_metadata.encoder_seq_lens == padded_encoder_seq_lens - assert torch.equal( - 
attn_metadata.encoder_seq_lens_tensor, - torch.tensor(padded_encoder_seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.max_encoder_seq_len == max(padded_encoder_seq_lens) - assert attn_metadata.num_encoder_tokens == sum(padded_encoder_seq_lens) - - # Verify block tables are correct for prompts - # - Decoder self-attention. Pad the block tables as expected. - flattened_block_tables = [ - block_table for _ in range(len(seq_group_metadata_list)) - for block_table in block_tables.values() - ] - flattened_block_tables.extend([[] for _ in range(cuda_graph_pad_size)]) - expected = make_tensor_with_pad( - flattened_block_tables, - max_len=64, - pad=0, - dtype=torch.int32, - device=model_runner.device, - ) - assert torch.equal( - attn_metadata.block_tables, - expected, - ) - # - Encoder/decoder cross-attention. Pad the cross-attention block tables - # as expected. - expected = [ - cross_block_table for seq_group_metadata in seq_group_metadata_list - for _ in range(len(seq_group_metadata.seq_data)) - ] - expected.extend([[] for _ in range(cuda_graph_pad_size)]) - expected = make_tensor_with_pad( - expected, - max_len=64, - pad=0, - dtype=torch.int32, - device=model_runner.device, - ) - assert torch.equal( - attn_metadata.cross_block_tables, - expected, - ) - - # Model runner's CUDAGraph setting should be propagated to attention - # metadata. 
- assert attn_metadata.use_cuda_graph is True - - # Verify the lengths of input tokens & positions - # - Decoder - assert len(input_tokens) == len(padded_seq_lens) - assert len(input_positions) == len(padded_seq_lens) - # -- An indirect check that model_input.input_tokens - # and model_input.input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - input_tokens, - input_positions, - ) - # - Encoder - assert len(encoder_input_tokens) == 0 - assert len(encoder_input_tokens) == 0 - # -- An indirect check that model_input.encoder_input_tokens - # and model_input.encoder_input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - encoder_input_tokens, - encoder_input_positions, - ) diff --git a/use_existing_torch.py b/use_existing_torch.py index b5aafdde16c28..76480f3e58fee 100644 --- a/use_existing_torch.py +++ b/use_existing_torch.py @@ -1,5 +1,21 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -print("vLLM is now using 'uv' to disable build isolation for 'torch'.") -print("Please instead install vLLM with 'uv pip install -e .' 
(must use 'uv')") +import glob + +requires_files = glob.glob('requirements/*.txt') +requires_files += ["pyproject.toml"] +for file in requires_files: + print(f">>> cleaning {file}") + with open(file) as f: + lines = f.readlines() + if "torch" in "".join(lines).lower(): + print("removed:") + with open(file, 'w') as f: + for line in lines: + if 'torch' not in line.lower(): + f.write(line) + else: + print(line.strip()) + print(f"<<< done cleaning {file}") + print() \ No newline at end of file diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 93b4f87ed260c..456c6b3ba9234 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -117,13 +117,14 @@ def paged_attention_rocm( k_scale: torch.Tensor, v_scale: torch.Tensor, fp8_out_scale: Optional[torch.Tensor] = None, + mfma_type: str = "fp8" if envs.VLLM_ROCM_FP8_MFMA_PAGE_ATTN else "f16", ) -> None: torch.ops._rocm_C.paged_attention(out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, seq_lens, query_start_loc, block_size, max_seq_len, alibi_slopes, kv_cache_dtype, k_scale, - v_scale, fp8_out_scale) + v_scale, fp8_out_scale, mfma_type) def mla_decode_kvcache_cpu( @@ -2010,3 +2011,27 @@ def onednn_scaled_mm( input_zp_adj, bias, dnnl_handler.handler) return output + + +def hadacore_transform(x: torch.Tensor, inplace: bool = True) -> torch.Tensor: + """ + Perform Hadamard transforms using [Hadacore](https://arxiv.org/abs/2412.08832) + kernels. Note that these kernels exploit the recursive properties of + Sylvester Hadamards, and therefore do not require transform weight data + + Note that sylvester hadamard transforms are also symmetric, which means that + this function is also applies the (transpose <=> inverse) transform. 
+ + :param x: value to be transformed inplace + :param inplace: modify value in place + :return: value after transformation + """ + return torch.ops._C.hadacore_transform(x, inplace) + + +if hasattr(torch.ops._C, "hadacore_transform"): + + @register_fake("_C::hadacore_transform") + def _hadacore_transform_fake(x: torch.Tensor, + inplace: bool) -> torch.Tensor: + return torch.empty_like(x) if not inplace else x diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 59b0aed321502..9d2eda482fcf8 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -13,7 +13,7 @@ logger = init_logger(__name__) try: import intel_extension_for_pytorch as ipex except ImportError as e: - logger.warning("Import error msg: %s", e.msg) + logger.debug("Import error msg: %s", e.msg) class ipex_ops: diff --git a/vllm/assets/video.py b/vllm/assets/video.py index 983e9114cccfb..5c9e403c4b91f 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -76,7 +76,7 @@ def video_to_pil_images_list(path: str, return [Image.fromarray(frame) for frame in frames] -def video_get_metadata(path: str) -> dict[str, Any]: +def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]: cap = cv2.VideoCapture(path) if not cap.isOpened(): raise ValueError(f"Could not open video file {path}") @@ -85,11 +85,18 @@ def video_get_metadata(path: str) -> dict[str, Any]: fps = cap.get(cv2.CAP_PROP_FPS) duration = total_frames / fps if fps > 0 else 0 + if num_frames == -1 or num_frames > total_frames: + num_frames = total_frames + metadata = { - "total_num_frames": total_frames, + "total_num_frames": num_frames, "fps": fps, "duration": duration, - "video_backend": "opencv" + "video_backend": "opencv", + "frames_indices": list(range(num_frames)), + # extra field used to control hf processor's video + # sampling behavior + "do_sample_frames": num_frames == total_frames, } return metadata @@ -126,7 +133,7 @@ class VideoAsset: @property def metadata(self) -> dict[str, Any]: - ret = 
video_get_metadata(self.video_path) + ret = video_get_metadata(self.video_path, self.num_frames) return ret def get_audio(self, sampling_rate: Optional[float] = None) -> npt.NDArray: diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py index 411eb5413f53c..aeaa0ab631cfb 100644 --- a/vllm/attention/backends/flashmla.py +++ b/vllm/attention/backends/flashmla.py @@ -17,7 +17,6 @@ from vllm.attention.backends.mla.common import (MLACommonBackend, from vllm.attention.ops.flashmla import (flash_mla_with_kvcache, get_mla_metadata, is_flashmla_supported) -from vllm.platforms.cuda import CudaPlatform class FlashMLABackend(MLACommonBackend): @@ -179,18 +178,8 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): logits_soft_cap, attn_type, kv_sharing_target_layer_name, **mla_args) - assert is_flashmla_supported(), \ - "FlashMLA is not supported on this device" - - # disallow FlashMLA on NVIDIA Blackwell (SM 10.0+) GPUs - # context: - # https://github.com/deepseek-ai/FlashMLA/issues/83 - # https://github.com/vllm-project/vllm/issues/24513 - if CudaPlatform.has_device_capability(100): - raise NotImplementedError( - "FlashMLA is temporarily disabled on Blackwell (SM 10.0). " - "Please use CUTLASS_MLA or TRITON_MLA instead. 
" - "Example: `export VLLM_ATTENTION_BACKEND=CUTLASS_MLA`") + is_supported, reason = is_flashmla_supported() + assert is_supported, reason unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] if any(unsupported_features): diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 44cb2c7c6b642..22dc6dcbc8d62 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -25,7 +25,7 @@ from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.model_executor.models.vision import get_vit_attn_backend from vllm.platforms import _Backend, current_platform -from vllm.utils import direct_register_custom_op +from vllm.utils import GiB_bytes, direct_register_custom_op logger = init_logger(__name__) USE_XFORMERS_OPS = None @@ -225,9 +225,26 @@ class Attention(nn.Module, AttentionLayerBase): ).parallel_config.pipeline_parallel_size) ] - self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32) - self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32) - self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32) + try: + self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, + dtype=torch.float32) + self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, + dtype=torch.float32) + self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, + dtype=torch.float32) + except torch.cuda.OutOfMemoryError as e: + logger.error( + "Failed to initialize attention q/k/v range constants: %s", e) + if torch.cuda.is_available(): + logger.debug("CUDA device: %s", torch.cuda.current_device()) + logger.debug("Allocated: %.2f GiB", + torch.cuda.memory_allocated() / GiB_bytes) + logger.debug("Reserved: %.2f GiB", + torch.cuda.memory_reserved() / GiB_bytes) + raise RuntimeError( + "Failed to initialize q/k/v range constants. 
" + "This may be caused by insufficient memory to allocate " + "kv cache.") from e def forward( self, diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index bf9e87198bcf1..0a297479bcc00 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -11,6 +11,7 @@ generation. Supported dataset types include: - HuggingFace - VisionArena """ +import argparse import ast import base64 import io @@ -1019,6 +1020,25 @@ class ShareGPTDataset(BenchmarkDataset): return samples +class _ValidateDatasetArgs(argparse.Action): + """Argparse action to validate dataset name and path compatibility.""" + def __call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, values) + + # Get current values of both dataset_name and dataset_path + dataset_name = getattr(namespace, 'dataset_name', 'random') + dataset_path = getattr(namespace, 'dataset_path', None) + + # Validate the combination + if dataset_name == "random" and dataset_path is not None: + parser.error( + "Cannot use 'random' dataset with --dataset-path. " + "Please specify the appropriate --dataset-name (e.g., " + "'sharegpt', 'custom', 'sonnet') for your dataset file: " + f"{dataset_path}" + ) + + def add_dataset_parser(parser: FlexibleArgumentParser): parser.add_argument("--seed", type=int, default=0) parser.add_argument( @@ -1031,6 +1051,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "--dataset-name", type=str, default="random", + action=_ValidateDatasetArgs, choices=[ "sharegpt", "burstgpt", "sonnet", "random", "random-mm", "hf", "custom", "prefix_repetition", "spec_bench" @@ -1046,6 +1067,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "--dataset-path", type=str, default=None, + action=_ValidateDatasetArgs, help="Path to the sharegpt/sonnet dataset. 
" "Or the huggingface dataset ID if using HF dataset.", ) diff --git a/vllm/compilation/noop_elimination.py b/vllm/compilation/noop_elimination.py index 4888d4d1298e3..17e85e70218da 100644 --- a/vllm/compilation/noop_elimination.py +++ b/vllm/compilation/noop_elimination.py @@ -62,9 +62,6 @@ class NoOpEliminationPass(VllmInductorPass): scaled_mm: "f16[s0, 4096]" = ... at = auto_functionalized(fused_add_rms_norm, input = scaled_mm, ...) out: "f16[s0, 4096]" = at[1] - - TODO(luka): This is currently tested in test_fusion, - but separate tests could be good. """ def __call__(self, graph: torch.fx.Graph): @@ -96,17 +93,19 @@ class NoOpEliminationPass(VllmInductorPass): # Invalid reshape args, skip continue - if self.all_dims_equivalent(shape, input_shape): + if self.reshape_all_dims_equivalent(shape, input_shape): node.replace_all_uses_with(input) graph.erase_node(node) count += 1 elif is_func(node, torch.ops.aten.slice.Tensor): + # python slicing semantics are different from reshape + # Don't treat -1 as inferred dimension input, dim_index, start, end = node.args[:4] input_shape = input.meta["val"].shape - i_dim = input_shape[dim_index] + output_shape = node.meta["val"].shape - if start == 0 and self.dims_equivalent(end, i_dim): + if output_shape == input_shape: node.replace_all_uses_with(input) graph.erase_node(node) count += 1 @@ -116,14 +115,7 @@ class NoOpEliminationPass(VllmInductorPass): base_shape = base.meta["val"].shape view_shape = view.meta["val"].shape - view_dim = view_shape[dim_index] - - # Check that view fully covers base and the full view is used - # (if the view fully covered the base after slicing but was not - # fully used, we could replace slice_scatter with a simple slice - # but that's a niche case). 
- if (base_shape == view_shape and start == 0 - and self.dims_equivalent(end, view_dim)): + if base_shape == view_shape: node.replace_all_uses_with(view) graph.erase_node(node) count += 1 @@ -132,13 +124,9 @@ class NoOpEliminationPass(VllmInductorPass): self.dump_graph(graph, "after_noop_elimination") self.end_and_log() - def all_dims_equivalent(self, dims: Iterable[Union[int, torch.fx.Node]], - i_dims: Iterable[Union[int, SymInt]]): - return all( - self.dims_equivalent(s, i_s) for s, i_s in zip(dims, i_dims)) - - def dims_equivalent(self, dim: Union[int, torch.fx.Node], - i_dim: Union[int, SymInt]) -> bool: + # ---------------------- Reshape helpers ---------------------- + def reshape_dims_equivalent(self, dim: Union[int, torch.fx.Node], + i_dim: Union[int, SymInt]) -> bool: """ This function checks if two dimensions are equivalent. :param dim: The dimension arg to reshape/slice @@ -156,10 +144,18 @@ class NoOpEliminationPass(VllmInductorPass): In case 3, the reshape dimension is a torch.fx.Node, and its value is a SymInt. That value is equal to the input dimension. 
- """ # Case 1 and 2 if dim == i_dim or dim == -1: return True # Case 3 return isinstance(dim, torch.fx.Node) and dim.meta["val"] == i_dim + + def reshape_all_dims_equivalent( + self, + dims: Iterable[Union[int, torch.fx.Node]], + i_dims: Iterable[Union[int, SymInt]], + ) -> bool: + return all( + self.reshape_dims_equivalent(s, i_s) + for s, i_s in zip(dims, i_dims)) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 85e58c290b792..0847fba878aa2 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -11,13 +11,12 @@ import json import os import textwrap import warnings -from collections.abc import Mapping from contextlib import contextmanager -from dataclasses import MISSING, Field, field, fields, is_dataclass, replace +from dataclasses import InitVar, field, fields, is_dataclass, replace from functools import cached_property, lru_cache from importlib.util import find_spec -from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional, - Protocol, TypeVar, Union, cast, get_args) +from typing import (TYPE_CHECKING, Any, Callable, Literal, Optional, Protocol, + TypeVar, Union, cast, get_args) import regex as re import torch @@ -37,6 +36,8 @@ from vllm.config.kv_events import KVEventsConfig from vllm.config.kv_transfer import KVTransferConfig from vllm.config.load import LoadConfig from vllm.config.lora import LoRAConfig +from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode, + MultiModalConfig) from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, ParallelConfig) from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy @@ -238,31 +239,12 @@ def get_attr_docs(cls: type[Any]) -> dict[str, str]: return out -def get_field(cls: ConfigType, name: str) -> Field: - """Get the default factory field of a dataclass by name. 
Used for getting - default factory fields in `EngineArgs`.""" - if not is_dataclass(cls): - raise TypeError("The given class is not a dataclass.") - cls_fields = {f.name: f for f in fields(cls)} - if name not in cls_fields: - raise ValueError(f"Field '{name}' not found in {cls.__name__}.") - named_field: Field = cls_fields[name] - if (default_factory := named_field.default_factory) is not MISSING: - return field(default_factory=default_factory) - if (default := named_field.default) is not MISSING: - return field(default=default) - raise ValueError( - f"{cls.__name__}.{name} must have a default value or default factory.") - - def is_init_field(cls: ConfigType, name: str) -> bool: return next(f for f in fields(cls) if f.name == name).init TokenizerMode = Literal["auto", "slow", "mistral", "custom"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] -MMEncoderTPMode = Literal["weights", "data"] -MMCacheType = Literal["shm", "lru"] class LogprobsMode(enum.Enum): @@ -407,20 +389,6 @@ class ModelConfig: that this name(s) will also be used in `model_name` tag content of prometheus metrics, if multiple names provided, metrics tag will take the first one.""" - limit_mm_per_prompt: dict[str, int] = field(default_factory=dict) - """Maximum number of data items per modality per prompt. Only applicable - for multimodal models.""" - interleave_mm_strings: bool = False - """Enable fully interleaved support for multimodal prompts, while using - --chat-template-content-format=string. Defaults to False.""" - skip_mm_profiling: bool = False - """When enabled, skips multimodal memory profiling and only profiles with - language backbone model during engine initialization. - """ - media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - """Additional args passed to process media inputs, keyed by modalities. 
- For example, to set num_frames for video, set - `--media-io-kwargs '{"video": {"num_frames": 40} }'` """ use_async_output_proc: bool = True """Whether to use async output processor.""" config_format: Union[str, ConfigFormat] = "auto" @@ -436,41 +404,6 @@ class ModelConfig: hf_overrides: HfOverrides = field(default_factory=dict) """If a dictionary, contains arguments to be forwarded to the Hugging Face config. If a callable, it is called to update the HuggingFace config.""" - mm_processor_kwargs: Optional[dict[str, Any]] = None - """Arguments to be forwarded to the model's processor for multi-modal data, - e.g., image processor. Overrides for the multi-modal processor obtained - from `AutoProcessor.from_pretrained`. The available overrides depend on the - model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`. - """ - mm_processor_cache_gb: float = 4 - """The size (in GiB) of the multi-modal processor cache, which is used to - avoid re-processing past multi-modal inputs. - - This cache is duplicated for each API process and engine core process, - resulting in a total memory usage of - `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. - - Set to `0` to disable this cache completely (not recommended).""" - mm_processor_cache_type: MMCacheType = "lru" - """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`, - use shared memory FIFO cache. If `lru`, use mirrored LRU cache.""" - mm_shm_cache_max_object_size_mb: int = 128 - """Size limit (in MiB) for each object stored in the multi-modal processor - shared memory cache. Only effective when `mm_processor_cache_type` is - `"shm"`.""" - mm_encoder_tp_mode: MMEncoderTPMode = "weights" - """Indicates how to optimize multi-modal encoder inference using - tensor parallelism (TP). - - - `"weights"`: Within the same vLLM engine, split the weights of - each layer across TP ranks. 
(default TP behavior) - - `"data"`: Within the same vLLM engine, split the batched input data - across TP ranks to process the data in parallel, while hosting - the full weights on each TP rank. - This batch-level DP is not to be confused with API request-level - DP (which is controlled by `--data-parallel-size`). - This is only supported on a per-model basis and falls back to - `"weights"` if the encoder does not support DP.""" pooler_config: Optional["PoolerConfig"] = field(init=False) """Pooler config which controls the behaviour of output pooling in pooling models.""" @@ -513,6 +446,18 @@ class ModelConfig: io_processor_plugin: Optional[str] = None """IOProcessor plugin name to load at model startup""" + # Multimodal config and init vars + multimodal_config: Optional[MultiModalConfig] = None + limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None + media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None + mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None + mm_processor_cache_gb: InitVar[Optional[float]] = None + mm_processor_cache_type: InitVar[Optional[MMCacheType]] = None + mm_shm_cache_max_object_size_mb: InitVar[Optional[int]] = None + mm_encoder_tp_mode: InitVar[Optional[MMEncoderTPMode]] = None + interleave_mm_strings: InitVar[Optional[bool]] = None + skip_mm_profiling: InitVar[Optional[bool]] = None + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, @@ -546,7 +491,18 @@ class ModelConfig: assert_hashable(str_factors) return hashlib.sha256(str(factors).encode()).hexdigest() - def __post_init__(self) -> None: + def __post_init__( + self, + # Multimodal config init vars + limit_mm_per_prompt: Optional[dict[str, int]], + media_io_kwargs: Optional[dict[str, dict[str, Any]]], + mm_processor_kwargs: Optional[dict[str, Any]], + mm_processor_cache_gb: Optional[float], + mm_processor_cache_type: Optional[MMCacheType], + mm_shm_cache_max_object_size_mb: Optional[int], + mm_encoder_tp_mode: 
Optional[MMEncoderTPMode], + interleave_mm_strings: Optional[bool], + skip_mm_profiling: Optional[bool]) -> None: # Set the default seed to 0 in V1. # NOTE(woosuk): In V0, we set the default seed to None because the # driver worker shares the same process as the user process, and thus @@ -777,7 +733,33 @@ class ModelConfig: self.original_max_model_len = self.max_model_len self.max_model_len = self.get_and_verify_max_len(self.max_model_len) - self.multimodal_config = self._init_multimodal_config() + # Init multimodal config if needed + if self._model_info.supports_multimodal: + if (mm_encoder_tp_mode == "data" and + not self._model_info.supports_multimodal_encoder_tp_data): + logger.warning_once( + "This model does not support `--mm-encoder-tp-mode data`. " + "Falling back to `--mm-encoder-tp-mode weights`.") + mm_encoder_tp_mode = "weights" + + mm_config_kwargs = dict( + limit_per_prompt=limit_mm_per_prompt, + media_io_kwargs=media_io_kwargs, + mm_processor_kwargs=mm_processor_kwargs, + mm_processor_cache_gb=mm_processor_cache_gb, + mm_processor_cache_type=mm_processor_cache_type, + mm_shm_cache_max_object_size_mb=mm_shm_cache_max_object_size_mb, + mm_encoder_tp_mode=mm_encoder_tp_mode, + interleave_mm_strings=interleave_mm_strings, + skip_mm_profiling=skip_mm_profiling, + ) + + mm_config_kwargs = { + k: v + for k, v in mm_config_kwargs.items() if v is not None + } + + self.multimodal_config = MultiModalConfig(**mm_config_kwargs) if self.disable_sliding_window: # Set after get_and_verify_max_len to ensure that max_model_len @@ -875,30 +857,6 @@ class ModelConfig: ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors"]) self.tokenizer = object_storage_tokenizer.dir - def _init_multimodal_config(self) -> Optional["MultiModalConfig"]: - if self._model_info.supports_multimodal: - if (self.mm_encoder_tp_mode == "data" and - not self._model_info.supports_multimodal_encoder_tp_data): - logger.warning_once( - "This model does not support `--mm-encoder-tp-mode data`. 
" - "Falling back to `--mm-encoder-tp-mode weights`.") - self.mm_encoder_tp_mode = "weights" - - return MultiModalConfig( - limit_per_prompt=self.limit_mm_per_prompt, - media_io_kwargs=self.media_io_kwargs, - mm_processor_kwargs=self.mm_processor_kwargs, - mm_processor_cache_gb=self.mm_processor_cache_gb, - mm_processor_cache_type=self.mm_processor_cache_type, - mm_shm_cache_max_object_size_mb=self. - mm_shm_cache_max_object_size_mb, - mm_encoder_tp_mode=self.mm_encoder_tp_mode, - interleave_mm_strings=self.interleave_mm_strings, - skip_mm_profiling=self.skip_mm_profiling, - ) - - return None - def _get_encoder_config(self): return get_sentence_transformer_tokenizer_config( self.model, self.revision) @@ -1243,11 +1201,8 @@ class ModelConfig: getattr(self.hf_config, "max_source_positions", 0)) self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, effective_max_seq_len) - # CUDAGraph capture not supported for enc-dec models and mllama on ROCm - ROCM_UNSUPPORTED_MODELS = ['mllama'] - unsupported_rocm = (self.hf_config.model_type - in ROCM_UNSUPPORTED_MODELS - or self.is_encoder_decoder) + # CUDAGraph capture not supported for encoder-decoder models on ROCm + unsupported_rocm = self.is_encoder_decoder if (unsupported_rocm and not self.enforce_eager and current_platform.is_rocm()): @@ -1713,10 +1668,6 @@ class ModelConfig: @property def is_encoder_decoder(self) -> bool: """Extract the HF encoder/decoder model flag.""" - """ - For Mllama, VLLM overrides HF's is_encoder_decoder flag and sets it to - True to enable cross-attention - """ return is_encoder_decoder(self.hf_config) @property @@ -2417,129 +2368,6 @@ class SpeculativeConfig: return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})" -@config -@dataclass -class MultiModalConfig: - """Controls the behavior of multimodal models.""" - - limit_per_prompt: dict[str, int] = \ - cast(dict[str, int], get_field(ModelConfig, "limit_mm_per_prompt")) - """ - The maximum number of input items allowed per 
prompt for each modality. - Defaults to 1 (V0) or 999 (V1) for each modality. - - For example, to allow up to 16 images and 2 videos per prompt: - `{"image": 16, "video": 2}` - """ - - media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - """Additional args passed to process media inputs, keyed by modalities. - For example, to set num_frames for video, set - `--media-io-kwargs '{"video": {"num_frames": 40} }'` """ - - mm_processor_kwargs: Optional[dict[str, object]] = None - """ - Overrides for the multi-modal processor obtained from - `transformers.AutoProcessor.from_pretrained`. - - The available overrides depend on the model that is being run. - - For example, for Phi-3-Vision: - `{"num_crops": 4}`. - """ - - mm_processor_cache_gb: float = 4 - """ - The size (in GiB) of the multi-modal processor cache, which is used to - - This cache is duplicated for each API process and engine core process, - resulting in a total memory usage of - `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. - - Set to `0` to disable this cache completely (not recommended). - """ - - mm_processor_cache_type: MMCacheType = "lru" - """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`, - use shared memory FIFO cache. If `lru`, use mirrored LRU cache.""" - - mm_shm_cache_max_object_size_mb: int = 128 - """Size limit (in MiB) for each object stored in the multi-modal processor - shared memory cache. Only effective when `mm_processor_cache_type` is - `"shm"`.""" - - mm_encoder_tp_mode: MMEncoderTPMode = "weights" - """ - Indicates how to optimize multi-modal encoder inference using - tensor parallelism (TP). - - - `"weights"`: Within the same vLLM engine, split the weights of - each layer across TP ranks. (default TP behavior) - - `"data"`: Within the same vLLM engine, split the batched input data - across TP ranks to process the data in parallel, while hosting - the full weights on each TP rank. 
- This batch-level DP is not to be confused with API request-level - DP (which is controlled by `--data-parallel-size`). - This is only supported on a per-model basis and falls back to - `"weights"` if the encoder does not support DP. - """ - - interleave_mm_strings: bool = False - """ - Enable fully interleaved support for multimodal prompts. - """ - - skip_mm_profiling: bool = False - """ - When enabled, skips multimodal memory profiling and only profiles with - language backbone model during engine initialization. - - This reduces engine startup time but shifts the responsibility to users for - estimating the peak memory usage of the activation of multimodal encoder and - embedding cache. - """ - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - # no factors to consider. - # this config will not affect the computation graph. - factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() - return hash_str - - def get_limit_per_prompt(self, modality: str) -> int: - """ - Get the maximum number of input items allowed per prompt - for the given modality. - """ - return self.limit_per_prompt.get( - modality, - 999 if envs.VLLM_USE_V1 else 1, - ) - - def merge_mm_processor_kwargs( - self, - inference_kwargs: Mapping[str, object], - ) -> dict[str, object]: - """ - Get the keyword arguments to pass to the multi-modal processor - according to the extra arguments passed during inference. 
- """ - kwargs = self.mm_processor_kwargs or {} - return kwargs | dict(inference_kwargs) - - @config @dataclass class PoolerConfig: diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py new file mode 100644 index 0000000000000..1b93b520f33f9 --- /dev/null +++ b/vllm/config/multimodal.py @@ -0,0 +1,120 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from collections.abc import Mapping +from dataclasses import field +from typing import Any, Literal, Optional + +from pydantic.dataclasses import dataclass + +import vllm.envs as envs +from vllm.config.utils import config + +MMEncoderTPMode = Literal["weights", "data"] +MMCacheType = Literal["shm", "lru"] + + +@config +@dataclass +class MultiModalConfig: + """Controls the behavior of multimodal models.""" + + limit_per_prompt: dict[str, int] = field(default_factory=dict) + """The maximum number of input items allowed per prompt for each modality. + Defaults to 1 (V0) or 999 (V1) for each modality. + + For example, to allow up to 16 images and 2 videos per prompt: + `{"image": 16, "video": 2}`""" + media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) + """Additional args passed to process media inputs, keyed by modalities. + For example, to set num_frames for video, set + `--media-io-kwargs '{"video": {"num_frames": 40} }'`""" + mm_processor_kwargs: Optional[dict[str, object]] = None + """Arguments to be forwarded to the model's processor for multi-modal data, + e.g., image processor. Overrides for the multi-modal processor obtained + from `transformers.AutoProcessor.from_pretrained`. + + The available overrides depend on the model that is being run. + + For example, for Phi-3-Vision: + `{"num_crops": 4}`.""" + mm_processor_cache_gb: float = 4 + """The size (in GiB) of the multi-modal processor cache, which is used to + avoid re-processing past multi-modal inputs. 
+ + This cache is duplicated for each API process and engine core process, + resulting in a total memory usage of + `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. + + Set to `0` to disable this cache completely (not recommended).""" + mm_processor_cache_type: MMCacheType = "lru" + """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`, + use shared memory FIFO cache. If `lru`, use mirrored LRU cache.""" + mm_shm_cache_max_object_size_mb: int = 128 + """Size limit (in MiB) for each object stored in the multi-modal processor + shared memory cache. Only effective when `mm_processor_cache_type` is + `"shm"`.""" + mm_encoder_tp_mode: MMEncoderTPMode = "weights" + """Indicates how to optimize multi-modal encoder inference using tensor + parallelism (TP). + + - `"weights"`: Within the same vLLM engine, split the weights of + each layer across TP ranks. (default TP behavior)\n + - `"data"`: Within the same vLLM engine, split the batched input data + across TP ranks to process the data in parallel, while hosting + the full weights on each TP rank. + This batch-level DP is not to be confused with API request-level + DP (which is controlled by `--data-parallel-size`). + This is only supported on a per-model basis and falls back to + `"weights"` if the encoder does not support DP.""" + interleave_mm_strings: bool = False + """Enable fully interleaved support for multimodal prompts, while using + --chat-template-content-format=string.""" + skip_mm_profiling: bool = False + """When enabled, skips multimodal memory profiling and only profiles with + language backbone model during engine initialization. 
+ + This reduces engine startup time but shifts the responsibility to users for + estimating the peak memory usage of the activation of multimodal encoder and + embedding cache.""" + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: list[Any] = [] + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() + return hash_str + + def get_limit_per_prompt(self, modality: str) -> int: + """ + Get the maximum number of input items allowed per prompt + for the given modality. + """ + return self.limit_per_prompt.get( + modality, + 999 if envs.VLLM_USE_V1 else 1, + ) + + def merge_mm_processor_kwargs( + self, + inference_kwargs: Mapping[str, object], + ) -> dict[str, object]: + """ + Get the keyword arguments to pass to the multi-modal processor + according to the extra arguments passed during inference. + """ + kwargs = self.mm_processor_kwargs or {} + return kwargs | dict(inference_kwargs) diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 98fbeb1fa86aa..db8c05ef8be4a 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import MISSING, Field, field, fields, is_dataclass from typing import TYPE_CHECKING, TypeVar if TYPE_CHECKING: @@ -27,3 +28,20 @@ def config(cls: ConfigT) -> ConfigT: script, which is invoked during the pre-commit checks. 
""" return cls + + +def get_field(cls: ConfigType, name: str) -> Field: + """Get the default factory field of a dataclass by name. Used for getting + default factory fields in `EngineArgs`.""" + if not is_dataclass(cls): + raise TypeError("The given class is not a dataclass.") + cls_fields = {f.name: f for f in fields(cls)} + if name not in cls_fields: + raise ValueError(f"Field '{name}' not found in {cls.__name__}.") + named_field: Field = cls_fields[name] + if (default_factory := named_field.default_factory) is not MISSING: + return field(default_factory=default_factory) + if (default := named_field.default) is not MISSING: + return field(default=default) + raise ValueError( + f"{cls.__name__}.{name} must have a default value or default factory.") diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 7e0b927c5b78f..70c07eac6304b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -355,3 +355,14 @@ class KVConnectorBase_V1(ABC): raise TypeError("get_required_kvcache_layout should not be called " "on the abstract base class") return None + + def get_finished_count(self) -> Optional[int]: + """ + Get the count of requests expected to complete send/receive operations + via this connector. + + Returns: + int: expected sending or receiving completion count. + """ + + return None \ No newline at end of file diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index c306eeb5aa7ab..1ff1407aeb99b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -56,9 +56,9 @@ except ImportError: logger.warning("NIXL is not available") NixlWrapper = None -# Supported xPUs and types of kv transfer buffer. 
-# {xPU: tuple of supported kv buffer types} -_NIXL_SUPPORTED_XPUS = { +# Supported platforms and types of kv transfer buffer. +# {device: tuple of supported kv buffer types} +_NIXL_SUPPORTED_DEVICE = { "cuda": ("cuda", ), "tpu": ("cpu", ), "xpu": ("cpu", ), @@ -458,9 +458,9 @@ class NixlConnectorWorker: self.device_type = current_platform.device_type self.kv_buffer_device: str = \ vllm_config.kv_transfer_config.kv_buffer_device - if self.device_type not in _NIXL_SUPPORTED_XPUS: + if self.device_type not in _NIXL_SUPPORTED_DEVICE: raise RuntimeError(f"{self.device_type} is not supported.") - elif self.kv_buffer_device not in _NIXL_SUPPORTED_XPUS[ + elif self.kv_buffer_device not in _NIXL_SUPPORTED_DEVICE[ self.device_type]: raise RuntimeError( f"{self.device_type} with {self.kv_buffer_device} kv_buffer " @@ -468,7 +468,7 @@ class NixlConnectorWorker: self.device_kv_caches: dict[str, torch.Tensor] = {} # cpu kv buffer for xfer - # used when xPU memory can not be registered under nixl + # used when device memory can not be registered under nixl self.host_xfer_buffers: dict[str, torch.Tensor] = {} self.use_host_buffer = self.kv_buffer_device == "cpu" if self.kv_buffer_device == "cuda": @@ -927,6 +927,9 @@ class NixlConnectorWorker: if tp_ratio > 1: # Heterogeneous TP expects same kv_cache_layout. 
assert nixl_agent_meta.kv_cache_layout == self.kv_cache_layout + if self.device_type == "xpu": + raise ValueError( + "Heterogeneous TP is not supported on XPU") assert nixl_agent_meta.block_len == self.block_len * tp_ratio, ( "Remote P worker KV layer cache must be of shape [2, N, " diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index ef229299b6848..12571afaa4c13 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -663,14 +663,29 @@ class GroupCoordinator: tensor_dict: dict[str, Union[torch.Tensor, Any]], dst: Optional[int] = None, all_gather_group: Optional["GroupCoordinator"] = None, + all_gather_tensors: Optional[dict[str, bool]] = None, ) -> Optional[dict[str, Union[torch.Tensor, Any]]]: """Send the input tensor dictionary. NOTE: `dst` is the local rank of the source rank. + + all_gather_group: The group for the all-gather operation. If provided, + an optimization is enabled where each rank in the group sends a + slice of a tensor and the receiver reconstructs it using an + all-gather, which can improve performance. This is typically the + tensor-parallel group. + all_gather_tensors: A dictionary to specify which tensors should use + the all-gather optimization, which is only effective when + `all_gather_group` is provided. By default, this optimization is + on for any tensor whose size is divisible by the + `all_gather_group`'s world size. However, it should be disabled + for tensors that are not fully replicated across the group (e.g., + the residual tensor when sequence parallelism is enabled). This + dictionary allows overriding the default behavior on a per-tensor + basis. """ # Bypass the function if we are using only 1 GPU. 
if not torch.distributed.is_initialized() or self.world_size == 1: return tensor_dict - all_gather_size = (1 if all_gather_group is None else all_gather_group.world_size) all_gather_rank = (0 if all_gather_group is None else @@ -699,14 +714,23 @@ class GroupCoordinator: # `send_object_list` has serialization & deserialization, # all happening on CPU. Therefore, we can use the CPU group. self.send_object(metadata_list, dst=dst) - for tensor in tensor_list: + + tensor_keys = [ + k for k, v in tensor_dict.items() if isinstance(v, torch.Tensor) + ] + assert len(tensor_keys) == len(tensor_list) + + for key, tensor in zip(tensor_keys, tensor_list): if tensor.numel() == 0: # Skip sending empty tensors. continue # send-allgather: send only a slice, then do allgather. - if (all_gather_group is not None - and tensor.numel() % all_gather_size == 0): + use_all_gather = (all_gather_group is not None + and tensor.numel() % all_gather_size == 0) + use_all_gather = all_gather_tensors.get(key, use_all_gather) \ + if all_gather_tensors else use_all_gather + if use_all_gather: tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank] if tensor.is_cpu: @@ -725,14 +749,29 @@ class GroupCoordinator: self, src: Optional[int] = None, all_gather_group: Optional["GroupCoordinator"] = None, + all_gather_tensors: Optional[dict[str, bool]] = None, ) -> Optional[dict[str, Union[torch.Tensor, Any]]]: """Recv the input tensor dictionary. NOTE: `src` is the local rank of the source rank. + + all_gather_group: The group for the all-gather operation. If provided, + an optimization is enabled where each rank in the group sends a + slice of a tensor and the receiver reconstructs it using an + all-gather, which can improve performance. This is typically the + tensor-parallel group. + all_gather_tensors: A dictionary to specify which tensors should use + the all-gather optimization, which is only effective when + `all_gather_group` is provided. 
By default, this optimization is + on for any tensor whose size is divisible by the + `all_gather_group`'s world size. However, it should be disabled + for tensors that are not fully replicated across the group (e.g., + the residual tensor when sequence parallelism is enabled). This + dictionary allows overriding the default behavior on a per-tensor + basis. """ # Bypass the function if we are using only 1 GPU. if not torch.distributed.is_initialized() or self.world_size == 1: return None - all_gather_size = (1 if all_gather_group is None else all_gather_group.world_size) all_gather_rank = (0 if all_gather_group is None else @@ -766,6 +805,8 @@ class GroupCoordinator: # send-allgather: send only a slice, then do allgather. use_all_gather = (all_gather_group is not None and tensor.numel() % all_gather_size == 0) + use_all_gather = all_gather_tensors.get(key, use_all_gather) \ + if all_gather_tensors else use_all_gather if use_all_gather: orig_shape = tensor.shape diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ab43c0edc98d7..595d318fbaafe 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -27,12 +27,14 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, DistributedExecutorBackend, EPLBConfig, GuidedDecodingBackend, HfOverrides, KVEventsConfig, KVTransferConfig, LoadConfig, LogprobsMode, - LoRAConfig, MambaDType, MMCacheType, MMEncoderTPMode, - ModelConfig, ModelDType, ModelImpl, MultiModalConfig, - ObservabilityConfig, ParallelConfig, PoolerConfig, - PrefixCachingHashAlgo, RunnerOption, SchedulerConfig, - SchedulerPolicy, SpeculativeConfig, TaskOption, - TokenizerMode, VllmConfig, get_attr_docs, get_field) + LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig, + ModelDType, ModelImpl, ObservabilityConfig, + ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, + RunnerOption, SchedulerConfig, SchedulerPolicy, + SpeculativeConfig, TaskOption, TokenizerMode, + VllmConfig, get_attr_docs) +from 
vllm.config.multimodal import MMCacheType, MultiModalConfig +from vllm.config.utils import get_field from vllm.logger import init_logger from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c303d093f6324..0fdd651425b90 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -40,6 +40,7 @@ from vllm.multimodal.cache import processor_only_cache_from_config from vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.outputs import (PoolingRequestOutput, RequestOutput, RequestOutputFactory) +from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup, Sequence, SequenceGroup, SequenceGroupBase, @@ -372,6 +373,14 @@ class LLMEngine: "vllm.llm_engine", self.observability_config.otlp_traces_endpoint) + # Initialize reasoning parser if reasoning backend is set. + if self.decoding_config.reasoning_backend and \ + self.tokenizer: + reasoner_class = ReasoningParserManager.get_reasoning_parser( + self.decoding_config.reasoning_backend) + self.reasoner: ReasoningParser = reasoner_class( + self.tokenizer.get_lora_tokenizer()) + # Create sequence output processor, e.g. for beam search or # speculative decoding. 
self.output_processor = ( @@ -381,8 +390,12 @@ class LLMEngine: self.scheduler, self.seq_counter, get_tokenizer_for_seq, - stop_checker=StopChecker(self.scheduler_config.max_model_len, - get_tokenizer_for_seq), + stop_checker=StopChecker( + self.scheduler_config.max_model_len, + get_tokenizer_for_seq, + self.reasoner if self.decoding_config.reasoning_backend + and self.tokenizer else None, + ), )) self.seq_id_to_seq_group: Dict[str, SequenceGroupBase] = {} @@ -1776,7 +1789,7 @@ class LLMEngine: assert isinstance(mm_processor, EncDecMultiModalProcessor) if mm_processor.pad_dummy_encoder_prompt: - return # Skip encoder length check for Whisper and Donut + return # Skip encoder length check for Whisper if model_config.is_multimodal_model: suggestion = ( diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 3fb2f71b5e999..68a63044df05e 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -4,6 +4,7 @@ from typing import Callable, List, Optional, Tuple from vllm.lora.request import LoRARequest +from vllm.reasoning import ReasoningParser from vllm.sampling_params import SamplingParams from vllm.sequence import Sequence, SequenceStatus from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -16,11 +17,16 @@ class StopChecker: emitted, or if we have exceeded the max model len. """ - def __init__(self, max_model_len: int, - get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer]): + def __init__( + self, + max_model_len: int, + get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer], + reasoner: Optional[ReasoningParser] = None, + ): # Do not use it directly, but use `self._get_max_model_len`. 
self._max_model_len = max_model_len self.get_tokenizer_for_seq = get_tokenizer_for_seq + self.reasoner = reasoner def _get_max_model_len(self, lora_req: Optional[LoRARequest]): if lora_req and lora_req.long_lora_max_len: @@ -57,6 +63,11 @@ class StopChecker: seq.status = SequenceStatus.FINISHED_STOPPED return + # Skip stop string/token checks if in reasoning content generation + if self.reasoner is not None and \ + not self.reasoner.is_reasoning_end(seq.get_token_ids()): + return + # Check if a stop token was encountered. # This assumes a single token produced per step. last_token_id = seq.get_last_token_id() diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index aa231de93c0c3..00ef39f134653 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -800,9 +800,10 @@ class MultiModalContentParser(BaseMultiModalContentParser): super().__init__() self._tracker = tracker - + multimodal_config = self._tracker.model_config.multimodal_config + media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None) self._connector = MediaConnector( - media_io_kwargs=self._tracker._model_config.media_io_kwargs, + media_io_kwargs=media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, ) @@ -883,8 +884,10 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): super().__init__() self._tracker = tracker + multimodal_config = self._tracker.model_config.multimodal_config + media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None) self._connector = MediaConnector( - media_io_kwargs=self._tracker._model_config.media_io_kwargs, + media_io_kwargs=media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, ) diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 9012639457cad..6658f91595e51 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -112,6 +112,7 @@ class HarmonyContext(ConversationContext): available_tools: 
list[str], ): self._messages = messages + self.finish_reason: Optional[str] = None self.available_tools = available_tools self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {} self.called_tools: set[str] = set() @@ -135,7 +136,8 @@ class HarmonyContext(ConversationContext): if self.parser.current_channel in {"analysis", "commentary"}: self.num_reasoning_tokens += 1 - def append_output(self, output) -> None: + def append_output(self, output: Union[RequestOutput, + list[Message]]) -> None: if isinstance(output, RequestOutput): output_token_ids = output.outputs[0].token_ids self.parser = get_streamable_parser_for_assistant() @@ -150,6 +152,8 @@ class HarmonyContext(ConversationContext): # Move current turn to previous turn for next turn's calculations self.previous_turn = self.current_turn.copy() output_msgs = self.parser.messages + # The responses finish reason is set in the last message + self.finish_reason = output.outputs[0].finish_reason else: # Tool output. output_msgs = output @@ -157,18 +161,18 @@ class HarmonyContext(ConversationContext): def _update_prefill_token_usage(self, output: RequestOutput) -> None: """Update token usage statistics for the prefill phase of generation. - + The prefill phase processes the input prompt tokens. This method: 1. Counts the prompt tokens for this turn 2. Calculates tool output tokens for multi-turn conversations 3. Updates cached token counts 4. Tracks state for next turn calculations - + Tool output tokens are calculated as: - current_prompt_tokens - last_turn_prompt_tokens - + current_prompt_tokens - last_turn_prompt_tokens - last_turn_output_tokens This represents tokens added between turns (typically tool responses). - + Args: output: The RequestOutput containing prompt token information """ @@ -214,18 +218,18 @@ class HarmonyContext(ConversationContext): def _update_decode_token_usage(self, output: RequestOutput) -> int: """Update token usage statistics for the decode phase of generation. 
- + The decode phase processes the generated output tokens. This method: 1. Counts output tokens from all completion outputs 2. Updates the total output token count 3. Tracks tokens generated in the current turn - + In streaming mode, this is called for each token generated. In non-streaming mode, this is called once with all output tokens. - + Args: output: The RequestOutput containing generated token information - + Returns: int: Number of output tokens processed in this call """ @@ -385,7 +389,8 @@ class StreamingHarmonyContext(HarmonyContext): def messages(self) -> list: return self.parser.messages - def append_output(self, output) -> None: + def append_output(self, output: Union[RequestOutput, + list[Message]]) -> None: if isinstance(output, RequestOutput): # append_output is called for each output token in streaming case, # so we only want to add the prompt tokens once for each message. diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index f7528ba81dce5..1364a41be950d 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -387,7 +387,9 @@ def parse_remaining_state( id=f"msg_{random_uuid()}", content=[output_text], role="assistant", - status="completed", + # if the parser still has messages (ie if the generator got cut + # abruptly), this should be incomplete + status="incomplete", type="message", ) return [text_item] diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index c159bcee315f2..2e4aa7f3d5a6f 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -15,7 +15,7 @@ import socket import tempfile import uuid from argparse import Namespace -from collections.abc import AsyncIterator, Awaitable +from collections.abc import AsyncGenerator, AsyncIterator, Awaitable from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus @@ -29,6 +29,7 @@ from fastapi import 
APIRouter, Depends, FastAPI, Form, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse +from openai import BaseModel from prometheus_client import make_asgi_app from prometheus_fastapi_instrumentator import Instrumentator from starlette.concurrency import iterate_in_threadpool @@ -577,6 +578,18 @@ async def show_version(): return JSONResponse(content=ver) +async def _convert_stream_to_sse_events( + generator: AsyncGenerator[BaseModel, + None]) -> AsyncGenerator[str, None]: + """Convert the generator to a stream of events in SSE format""" + async for event in generator: + event_type = getattr(event, 'type', 'unknown') + # https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format + event_data = (f"event: {event_type}\n" + f"data: {event.model_dump_json(indent=None)}\n\n") + yield event_data + + @router.post("/v1/responses", dependencies=[Depends(validate_json_request)], responses={ @@ -612,7 +625,9 @@ async def create_responses(request: ResponsesRequest, raw_request: Request): status_code=generator.error.code) elif isinstance(generator, ResponsesResponse): return JSONResponse(content=generator.model_dump()) - return StreamingResponse(content=generator, media_type="text/event-stream") + + return StreamingResponse(content=_convert_stream_to_sse_events(generator), + media_type="text/event-stream") @router.get("/v1/responses/{response_id}") @@ -640,10 +655,10 @@ async def retrieve_responses( if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), status_code=response.error.code) - elif stream: - return StreamingResponse(content=response, - media_type="text/event-stream") - return JSONResponse(content=response.model_dump()) + elif isinstance(response, ResponsesResponse): + return JSONResponse(content=response.model_dump()) + return 
StreamingResponse(content=_convert_stream_to_sse_events(response), + media_type="text/event-stream") @router.post("/v1/responses/{response_id}/cancel") diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 4dcb1f3f1c89f..8ecb1a8239c35 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -30,7 +30,7 @@ except ImportError: # For newer openai versions (>= 1.100.0) from openai.types.responses import (ResponseFormatTextConfig as ResponseTextConfig) -from openai.types.responses.response import ToolChoice +from openai.types.responses.response import IncompleteDetails, ToolChoice from openai.types.responses.tool import Tool from openai.types.shared import Metadata, Reasoning from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter, @@ -1868,7 +1868,7 @@ class ResponsesResponse(OpenAIBaseModel): id: str = Field(default_factory=lambda: f"resp_{random_uuid()}") created_at: int = Field(default_factory=lambda: int(time.time())) # error: Optional[ResponseError] = None - # incomplete_details: Optional[IncompleteDetails] = None + incomplete_details: Optional[IncompleteDetails] = None instructions: Optional[str] = None metadata: Optional[Metadata] = None model: str @@ -1904,9 +1904,18 @@ class ResponsesResponse(OpenAIBaseModel): status: ResponseStatus, usage: Optional[ResponseUsage] = None, ) -> "ResponsesResponse": + + incomplete_details: Optional[IncompleteDetails] = None + if status == 'incomplete': + incomplete_details = IncompleteDetails(reason='max_output_tokens') + # TODO: implement the other reason for incomplete_details, + # which is content_filter + # incomplete_details = IncompleteDetails(reason='content_filter') + return cls( id=request.request_id, created_at=created_time, + incomplete_details=incomplete_details, instructions=request.instructions, metadata=request.metadata, model=model_name, @@ -2109,7 +2118,7 @@ class DetokenizeResponse(OpenAIBaseModel): class 
TokenizerInfoResponse(OpenAIBaseModel): """ - Response containing tokenizer configuration + Response containing tokenizer configuration equivalent to tokenizer_config.json """ @@ -2199,7 +2208,7 @@ class TranscriptionRequest(OpenAIBaseModel): to_language: Optional[str] = None """The language of the output audio we transcribe to. - Please note that this is not currently used by supported models at this + Please note that this is not currently used by supported models at this time, but it is a placeholder for future use, matching translation api. """ diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 401ba6c53331c..9e285e6e51756 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -10,7 +10,7 @@ from collections.abc import AsyncGenerator, AsyncIterator, Sequence from contextlib import AsyncExitStack from copy import copy from http import HTTPStatus -from typing import Callable, Final, Optional, Union +from typing import Callable, Final, Optional, TypeVar, Union import jinja2 import openai.types.responses as openai_responses_types @@ -27,7 +27,7 @@ from openai.types.responses import (ResponseCreatedEvent, ResponseReasoningItem, ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent, - response_text_delta_event) + ResponseStatus, response_text_delta_event) from openai.types.responses.response_output_text import (Logprob, LogprobTopLogprob) # yapf: enable @@ -175,7 +175,8 @@ class OpenAIServingResponses(OpenAIServing): # HACK(wuhang): This is a hack. We should use a better store. # FIXME: If enable_store=True, this may cause a memory leak since we # never remove events from the store. 
- self.event_store: dict[str, tuple[deque[str], asyncio.Event]] = {} + self.event_store: dict[str, tuple[deque[BaseModel], + asyncio.Event]] = {} self.background_tasks: dict[str, asyncio.Task] = {} @@ -185,7 +186,8 @@ class OpenAIServingResponses(OpenAIServing): self, request: ResponsesRequest, raw_request: Optional[Request] = None, - ) -> Union[AsyncGenerator[str, None], ResponsesResponse, ErrorResponse]: + ) -> Union[AsyncGenerator[BaseModel, None], ResponsesResponse, + ErrorResponse]: error_check_ret = await self._check_model(request) if error_check_ret is not None: logger.error("Error with model %s", error_check_ret) @@ -461,10 +463,22 @@ class OpenAIServingResponses(OpenAIServing): # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) + # NOTE: Implementation of stauts is still WIP, but for now + # we guarantee that if the status is not "completed", it is accurate. + # "completed" is implemented as the "catch-all" for now. + status: ResponseStatus = "completed" + if self.use_harmony: assert isinstance(context, HarmonyContext) output = self._make_response_output_items_with_harmony(context) num_tool_output_tokens = context.num_tool_output_tokens + if len(output) > 0: + if context.finish_reason == "length": + status = "incomplete" + elif context.finish_reason == "abort": + status = "cancelled" + else: + status = "incomplete" else: assert isinstance(context, SimpleContext) final_res = context.last_output @@ -501,7 +515,7 @@ class OpenAIServingResponses(OpenAIServing): model_name=model_name, created_time=created_time, output=output, - status="completed", + status=status, usage=usage, ) @@ -658,7 +672,7 @@ class OpenAIServingResponses(OpenAIServing): self, context: HarmonyContext, ) -> list[ResponseOutputItem]: - output_items = [] + output_items: list[ResponseOutputItem] = [] num_init_messages = context.num_init_messages for msg in context.messages[num_init_messages:]: output_items.extend(parse_output_message(msg)) @@ -800,7 +814,7 
@@ class OpenAIServingResponses(OpenAIServing): *args, **kwargs, ): - event_deque: deque[str] = deque() + event_deque: deque[BaseModel] = deque() new_event_signal = asyncio.Event() self.event_store[request.request_id] = (event_deque, new_event_signal) response = None @@ -815,8 +829,6 @@ class OpenAIServingResponses(OpenAIServing): request.request_id) response = self.create_error_response(str(e)) finally: - # Mark as finished with a special marker - event_deque.append("__STREAM_END__") new_event_signal.set() if response is not None and isinstance(response, ErrorResponse): @@ -855,7 +867,7 @@ class OpenAIServingResponses(OpenAIServing): self, response_id: str, starting_after: Optional[int] = None, - ): + ) -> AsyncGenerator[BaseModel, None]: if response_id not in self.event_store: raise ValueError(f"Unknown response_id: {response_id}") @@ -869,9 +881,9 @@ class OpenAIServingResponses(OpenAIServing): # Yield existing events from start_index while current_index < len(event_deque): event = event_deque[current_index] - if event == "__STREAM_END__": - return yield event + if getattr(event, 'type', 'unknown') == "response.completed": + return current_index += 1 await new_event_signal.wait() @@ -881,7 +893,8 @@ class OpenAIServingResponses(OpenAIServing): response_id: str, starting_after: Optional[int], stream: Optional[bool], - ) -> Union[ErrorResponse, ResponsesResponse]: + ) -> Union[ErrorResponse, ResponsesResponse, AsyncGenerator[BaseModel, + None]]: if not response_id.startswith("resp_"): return self._make_invalid_id_error(response_id) @@ -964,8 +977,9 @@ class OpenAIServingResponses(OpenAIServing): tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, created_time: int, - _send_event: Callable[[BaseModel], str], - ) -> AsyncGenerator[str, None]: + _increment_sequence_number_and_return: Callable[[BaseModel], + BaseModel], + ) -> AsyncGenerator[BaseModel, None]: current_content_index = 0 current_output_index = 0 current_item_id = "" @@ -1002,7 +1016,7 @@ 
class OpenAIServingResponses(OpenAIServing): if not first_delta_sent: current_item_id = str(uuid.uuid4()) if delta_message.reasoning_content: - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseOutputItemAddedEvent( type="response.output_item.added", @@ -1017,7 +1031,7 @@ class OpenAIServingResponses(OpenAIServing): ), )) else: - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseOutputItemAddedEvent( type="response.output_item.added", @@ -1032,7 +1046,7 @@ class OpenAIServingResponses(OpenAIServing): status="in_progress", ), )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseContentPartAddedEvent( type="response.content_part.added", sequence_number=-1, @@ -1060,7 +1074,7 @@ class OpenAIServingResponses(OpenAIServing): reason_content = ''.join( pm.reasoning_content for pm in previous_delta_messages if pm.reasoning_content is not None) - yield _send_event( + yield _increment_sequence_number_and_return( ResponseReasoningTextDoneEvent( type="response.reasoning_text.done", item_id=current_item_id, @@ -1082,14 +1096,14 @@ class OpenAIServingResponses(OpenAIServing): id=current_item_id, summary=[], ) - yield _send_event( + yield _increment_sequence_number_and_return( ResponseOutputItemDoneEvent( type="response.output_item.done", sequence_number=-1, output_index=current_output_index, item=reasoning_item, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseOutputItemAddedEvent( type="response.output_item.added", sequence_number=-1, @@ -1104,7 +1118,7 @@ class OpenAIServingResponses(OpenAIServing): )) current_output_index += 1 current_item_id = str(uuid.uuid4()) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseContentPartAddedEvent( type="response.content_part.added", sequence_number=-1, @@ -1123,7 +1137,7 @@ class 
OpenAIServingResponses(OpenAIServing): previous_delta_messages = [] if delta_message.reasoning_content is not None: - yield _send_event( + yield _increment_sequence_number_and_return( ResponseReasoningTextDeltaEvent( type="response.reasoning_text.delta", sequence_number=-1, @@ -1133,7 +1147,7 @@ class OpenAIServingResponses(OpenAIServing): delta=delta_message.reasoning_content, )) elif delta_message.content is not None: - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseTextDeltaEvent( type="response.output_text.delta", sequence_number=-1, @@ -1156,7 +1170,7 @@ class OpenAIServingResponses(OpenAIServing): reason_content = ''.join(pm.reasoning_content for pm in previous_delta_messages if pm.reasoning_content is not None) - yield _send_event( + yield _increment_sequence_number_and_return( ResponseReasoningTextDoneEvent( type="response.reasoning_text.done", item_id=current_item_id, @@ -1178,7 +1192,7 @@ class OpenAIServingResponses(OpenAIServing): id=current_item_id, summary=[], ) - yield _send_event( + yield _increment_sequence_number_and_return( ResponseOutputItemDoneEvent( type="response.output_item.done", sequence_number=-1, @@ -1189,7 +1203,7 @@ class OpenAIServingResponses(OpenAIServing): final_content = ''.join(pm.content for pm in previous_delta_messages if pm.content is not None) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseTextDoneEvent( type="response.output_text.done", sequence_number=-1, @@ -1205,7 +1219,7 @@ class OpenAIServingResponses(OpenAIServing): type="output_text", annotations=[], ) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseContentPartDoneEvent( type="response.content_part.done", sequence_number=-1, @@ -1225,7 +1239,7 @@ class OpenAIServingResponses(OpenAIServing): id=current_item_id, summary=[], ) - yield _send_event( + yield _increment_sequence_number_and_return( 
ResponseOutputItemDoneEvent( type="response.output_item.done", sequence_number=-1, @@ -1243,8 +1257,9 @@ class OpenAIServingResponses(OpenAIServing): tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, created_time: int, - _send_event: Callable[[BaseModel], str], - ) -> AsyncGenerator[str, None]: + _increment_sequence_number_and_return: Callable[[BaseModel], + BaseModel], + ) -> AsyncGenerator[BaseModel, None]: current_content_index = 0 # FIXME: this number is never changed current_output_index = 0 current_item_id = "" # FIXME: this number is never changed @@ -1276,7 +1291,7 @@ class OpenAIServingResponses(OpenAIServing): id=current_item_id, summary=[], ) - yield _send_event( + yield _increment_sequence_number_and_return( ResponseReasoningTextDoneEvent( type="response.reasoning_text.done", item_id=current_item_id, @@ -1285,7 +1300,7 @@ class OpenAIServingResponses(OpenAIServing): content_index=current_content_index, text=previous_item.content[0].text, )) - yield _send_event( + yield _increment_sequence_number_and_return( ResponseOutputItemDoneEvent( type="response.output_item.done", sequence_number=-1, @@ -1298,7 +1313,7 @@ class OpenAIServingResponses(OpenAIServing): text=previous_item.content[0].text, annotations=[], ) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseTextDoneEvent( type="response.output_text.done", sequence_number=-1, @@ -1308,7 +1323,7 @@ class OpenAIServingResponses(OpenAIServing): logprobs=[], item_id=current_item_id, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. 
ResponseContentPartDoneEvent( type="response.content_part.done", @@ -1318,7 +1333,7 @@ class OpenAIServingResponses(OpenAIServing): content_index=current_content_index, part=text_content, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseOutputItemDoneEvent( type="response.output_item.done", sequence_number=-1, @@ -1332,12 +1347,13 @@ class OpenAIServingResponses(OpenAIServing): ), )) + # stream the output of a harmony message if ctx.parser.last_content_delta: if (ctx.parser.current_channel == "final" and ctx.parser.current_recipient is None): if not sent_output_item_added: sent_output_item_added = True - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseOutputItemAddedEvent( type="response.output_item.added", @@ -1352,7 +1368,7 @@ class OpenAIServingResponses(OpenAIServing): status="in_progress", ), )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseContentPartAddedEvent( type="response.content_part.added", @@ -1367,7 +1383,7 @@ class OpenAIServingResponses(OpenAIServing): logprobs=[], ), )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseTextDeltaEvent( type="response.output_text.delta", sequence_number=-1, @@ -1382,7 +1398,7 @@ class OpenAIServingResponses(OpenAIServing): and ctx.parser.current_recipient is None): if not sent_output_item_added: sent_output_item_added = True - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseOutputItemAddedEvent( type="response.output_item.added", @@ -1396,7 +1412,7 @@ class OpenAIServingResponses(OpenAIServing): status="in_progress", ), )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. 
ResponseContentPartAddedEvent( type="response.content_part.added", @@ -1411,7 +1427,7 @@ class OpenAIServingResponses(OpenAIServing): logprobs=[], ), )) - yield _send_event( + yield _increment_sequence_number_and_return( ResponseReasoningTextDeltaEvent( type="response.reasoning_text.delta", item_id=current_item_id, @@ -1428,7 +1444,7 @@ class OpenAIServingResponses(OpenAIServing): ) and ctx.parser.current_recipient == "python": if not sent_output_item_added: sent_output_item_added = True - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseOutputItemAddedEvent( type="response.output_item.added", @@ -1444,7 +1460,7 @@ class OpenAIServingResponses(OpenAIServing): status="in_progress", ), )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseCodeInterpreterCallInProgressEvent( type= @@ -1453,7 +1469,7 @@ class OpenAIServingResponses(OpenAIServing): output_index=current_output_index, item_id=current_item_id, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseCodeInterpreterCallCodeDeltaEvent( type="response.code_interpreter_call_code.delta", @@ -1462,6 +1478,8 @@ class OpenAIServingResponses(OpenAIServing): item_id=current_item_id, delta=ctx.parser.last_content_delta, )) + + # stream tool call outputs if ctx.is_assistant_action_turn() and len(ctx.parser.messages) > 0: previous_item = ctx.parser.messages[-1] if (self.tool_server is not None @@ -1498,7 +1516,7 @@ class OpenAIServingResponses(OpenAIServing): raise ValueError( f"Unknown function name: {function_name}") - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseOutputItemAddedEvent( type="response.output_item.added", sequence_number=-1, @@ -1513,7 +1531,7 @@ class OpenAIServingResponses(OpenAIServing): status="in_progress", ), )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. 
ResponseWebSearchCallInProgressEvent( type="response.web_search_call.in_progress", @@ -1521,7 +1539,7 @@ class OpenAIServingResponses(OpenAIServing): output_index=current_output_index, item_id=current_item_id, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseWebSearchCallSearchingEvent( type="response.web_search_call.searching", @@ -1531,7 +1549,7 @@ class OpenAIServingResponses(OpenAIServing): )) # enqueue - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseWebSearchCallCompletedEvent( type="response.web_search_call.completed", @@ -1539,7 +1557,7 @@ class OpenAIServingResponses(OpenAIServing): output_index=current_output_index, item_id=current_item_id, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseOutputItemDoneEvent( type="response.output_item.done", sequence_number=-1, @@ -1557,7 +1575,7 @@ class OpenAIServingResponses(OpenAIServing): and self.tool_server.has_tool("python") and previous_item.recipient is not None and previous_item.recipient.startswith("python")): - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseCodeInterpreterCallCodeDoneEvent( type="response.code_interpreter_call_code.done", @@ -1566,7 +1584,7 @@ class OpenAIServingResponses(OpenAIServing): item_id=current_item_id, code=previous_item.content[0].text, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. ResponseCodeInterpreterCallInterpretingEvent( type="response.code_interpreter_call.interpreting", @@ -1574,7 +1592,7 @@ class OpenAIServingResponses(OpenAIServing): output_index=current_output_index, item_id=current_item_id, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types. 
ResponseCodeInterpreterCallCompletedEvent( type="response.code_interpreter_call.completed", @@ -1582,7 +1600,7 @@ class OpenAIServingResponses(OpenAIServing): output_index=current_output_index, item_id=current_item_id, )) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseOutputItemDoneEvent( type="response.output_item.done", sequence_number=-1, @@ -1609,7 +1627,7 @@ class OpenAIServingResponses(OpenAIServing): tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, created_time: Optional[int] = None, - ) -> AsyncGenerator[str, None]: + ) -> AsyncGenerator[BaseModel, None]: # TODO: # 1. Handle disconnect @@ -1617,16 +1635,15 @@ class OpenAIServingResponses(OpenAIServing): sequence_number = 0 - def _send_event(event: BaseModel): + T = TypeVar("T", bound=BaseModel) + + def _increment_sequence_number_and_return(event: T) -> T: nonlocal sequence_number # Set sequence_number if the event has this attribute if hasattr(event, 'sequence_number'): event.sequence_number = sequence_number sequence_number += 1 - # Get event type from the event's type field if it exists - event_type = getattr(event, 'type', 'unknown') - return (f"event: {event_type}\n" - f"data: {event.model_dump_json(indent=None)}\n\n") + return event async with AsyncExitStack() as exit_stack: processer = None @@ -1646,24 +1663,23 @@ class OpenAIServingResponses(OpenAIServing): status="in_progress", usage=None, ).model_dump() - yield _send_event( + yield _increment_sequence_number_and_return( ResponseCreatedEvent( type="response.created", sequence_number=-1, response=initial_response, )) - yield _send_event( + yield _increment_sequence_number_and_return( ResponseInProgressEvent( type="response.in_progress", sequence_number=-1, response=initial_response, )) - async for event_data in processer(request, sampling_params, - result_generator, context, - model_name, tokenizer, - request_metadata, created_time, - _send_event): + async for event_data in 
processer( + request, sampling_params, result_generator, context, + model_name, tokenizer, request_metadata, created_time, + _increment_sequence_number_and_return): yield event_data async def empty_async_generator(): @@ -1682,7 +1698,7 @@ class OpenAIServingResponses(OpenAIServing): request_metadata, created_time=created_time, ) - yield _send_event( + yield _increment_sequence_number_and_return( openai_responses_types.ResponseCompletedEvent( type="response.completed", sequence_number=-1, diff --git a/vllm/envs.py b/vllm/envs.py index bb10c7cc2ac27..d2006979ea81c 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -70,6 +70,7 @@ if TYPE_CHECKING: VLLM_VIDEO_LOADER_BACKEND: str = "opencv" VLLM_MM_INPUT_CACHE_GIB: int = 4 VLLM_TARGET_DEVICE: str = "cuda" + VLLM_MAIN_CUDA_VERSION: str = "12.8" MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None VLLM_USE_PRECOMPILED: bool = False @@ -92,6 +93,7 @@ if TYPE_CHECKING: VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_SKIP_P2P_CHECK: bool = False VLLM_DISABLED_KERNELS: list[str] = [] + VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION: bool = False VLLM_USE_V1: bool = True VLLM_ROCM_USE_AITER: bool = False VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False @@ -167,6 +169,7 @@ if TYPE_CHECKING: VLLM_HAS_FLASHINFER_CUBIN: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False + VLLM_ROCM_FP8_MFMA_PAGE_ATTN: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None @@ -247,6 +250,11 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_TARGET_DEVICE": lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(), + # Main CUDA version of vLLM, supporting [12.6, 12.8, 12.9], + # 12.8 is the default. This follows PyTorch but can be overridden. 
+ "VLLM_MAIN_CUDA_VERSION": + lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() or "12.8", + # Maximum number of compilation jobs to run in parallel. # By default this is the number of CPUs "MAX_JOBS": @@ -744,6 +752,13 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: [] if "VLLM_DISABLED_KERNELS" not in os.environ else os.environ[ "VLLM_DISABLED_KERNELS"].split(","), + # Swaps the all reduce backend that we use to coordinate the DP padding + # information from NCCL to gloo. + "VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION": + lambda: + (os.getenv("VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION", "False").lower() in + ("true", "1")), + # If set, use the V1 code path. "VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))), @@ -1219,6 +1234,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_ENABLE_RESPONSES_API_STORE": lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))), + # If set, use the fp8 mfma in rocm paged attention. + "VLLM_ROCM_FP8_MFMA_PAGE_ATTN": + lambda: bool(int(os.getenv("VLLM_ROCM_FP8_MFMA_PAGE_ATTN", "0"))), + # Whether to use pytorch symmetric memory for allreduce "VLLM_ALLREDUCE_USE_SYMM_MEM": lambda: bool(int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "0"))), @@ -1340,6 +1359,7 @@ def compute_hash() -> str: "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", + "VLLM_ROCM_FP8_MFMA_PAGE_ATTN", ] for key in environment_variables_to_hash: # if this goes out of sync with environment_variables, diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index a3c1d79a58b26..d18bef1256af5 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -13,6 +13,7 @@ from typing_extensions import TypeVar import vllm.platforms from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator from vllm.logger import init_logger from 
vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput @@ -54,6 +55,7 @@ class ExecutorBase(ABC): self._init_executor() self.is_sleeping = False self.sleeping_tags: set[str] = set() + self.kv_output_aggregator = None @abstractmethod def _init_executor(self) -> None: @@ -252,6 +254,11 @@ class ExecutorBase(ABC): exception.""" self.check_health() + def init_kv_output_aggregator(self, finished_count: Optional[int]) -> None: + """Init KVOutputAggregator""" + self.kv_output_aggregator = KVOutputAggregator( + finished_count or self.parallel_config.world_size) + class DistributedExecutorBase(ExecutorBase): """Abstract superclass of distributed executor implementations.""" diff --git a/vllm/forward_context.py b/vllm/forward_context.py index c57c51d289ac8..b3ddd7b9a7392 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -13,6 +13,7 @@ import torch.distributed as dist import vllm.envs as envs from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig from vllm.logger import init_logger +from vllm.platforms import current_platform if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata @@ -75,14 +76,26 @@ class DPMetadata: Gather the num_tokens across all DP ranks and return results in a CPU tensor of size dp_size. """ + from vllm.distributed.parallel_state import get_dp_group + device = current_platform.device_type + group = get_dp_group().device_group + + # Transfering this tensor from GPU to CPU will introduce a GPU sync + # point that could adversely affect performance of vllm with asynch + # scheduling. This environment variable exists to quickly disable + # this optimization if we run into this case. 
+ if envs.VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION: + logger.info_once( + "Using CPU all reduce to syncronize DP padding between ranks.") + device = "cpu" + group = get_dp_group().cpu_group num_tokens_across_dp = [0] * dp_size num_tokens_across_dp[dp_rank] = num_tokens num_tokens_tensor = torch.tensor(num_tokens_across_dp, - device="cpu", + device=device, dtype=torch.int32) - from vllm.distributed.parallel_state import get_dp_group - dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group) - return num_tokens_tensor + dist.all_reduce(num_tokens_tensor, group=group) + return num_tokens_tensor.cpu() @staticmethod def make( diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..22e3d09676d06 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json @@ -0,0 +1,147 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + } +} + diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..8bac7af0c2dac --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json @@ -0,0 
+1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..5910027e17f9b --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 06edfb0552e84..30e46ffa7b176 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -720,7 +720,10 @@ def get_moe_configs( logger.info("Using configuration from %s for MoE layer.", config_file_path) # If a configuration has been found, return it - return {int(key): val for key, val in json.load(f).items()} + 
tuned_config = json.load(f) + # Delete triton_version from tuned_config + tuned_config.pop("triton_version", None) + return {int(key): val for key, val in tuned_config.items()} # If no optimized configuration is available, we will use the default # configuration diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 773dfeae25d93..cd05136520977 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -29,6 +29,7 @@ from vllm.model_executor.parameter import (BasevLLMParameter, # yapf: enable from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +from vllm.utils import GiB_bytes logger = init_logger(__name__) @@ -190,10 +191,27 @@ class UnquantizedLinearMethod(LinearMethodBase): output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): - weight = Parameter(torch.empty(sum(output_partition_sizes), - input_size_per_partition, - dtype=params_dtype), - requires_grad=False) + # This method creates unquantized linear weights. + # The weights are not quantized, and they are not sharded. + # The amount of memory allocated for the weights is + # sum(output_partition_sizes) * input_size_per_partition. + try: + weight = Parameter(torch.empty(sum(output_partition_sizes), + input_size_per_partition, + dtype=params_dtype), + requires_grad=False) + except torch.cuda.OutOfMemoryError as e: + logger.error("Failed to create unquantized linear weights: %s", e) + if torch.cuda.is_available(): + logger.debug("CUDA device: %s", torch.cuda.current_device()) + logger.debug("Allocated: %.2f GiB", + torch.cuda.memory_allocated() / GiB_bytes) + logger.debug("Reserved: %.2f GiB", + torch.cuda.memory_reserved() / GiB_bytes) + raise RuntimeError( + "Failed to create unquantized linear weights. 
" + "This may be caused by insufficient memory to allocate " + "the weight.") from e set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) layer.register_parameter("weight", weight) set_weight_attrs(weight, extra_weight_attrs) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 97041a5a050f1..b56a691311774 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -129,7 +129,7 @@ class CompressedTensorsConfig(QuantizationConfig): # choose transform method if any((input_tfms, output_tfms)): return CompressedTensorsLinearTransformMethod.from_schemes( - quant_method, input_tfms, output_tfms) + quant_method, quant_scheme, input_tfms, output_tfms) else: return quant_method diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py index 2fc94b3c257e6..d098185146e41 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py @@ -12,6 +12,8 @@ from compressed_tensors.utils import is_match from vllm.model_executor.layers.linear import (WEIGHT_LOADER_V2_SUPPORTED, LinearMethodBase, QKVCrossParallelLinear) +from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 + CompressedTensorsScheme) from vllm.model_executor.layers.quantization.compressed_tensors.transform.module import ( # noqa: E501 HadamardTransform) from vllm.model_executor.layers.quantization.compressed_tensors.transform.utils import ( # noqa: E501 @@ -26,14 +28,22 @@ class CompressedTensorsLinearTransformMethod(LinearMethodBase): @classmethod def from_schemes( - cls, quant_method: 
LinearMethodBase, input_tfms: dict[int, - TransformTuple], - output_tfms: dict[int, TransformTuple] + cls, + quant_method: LinearMethodBase, + quant_scheme: Optional[CompressedTensorsScheme], + input_tfms: dict[int, TransformTuple], + output_tfms: dict[int, TransformTuple], ) -> "CompressedTensorsLinearTransformMethod": + from vllm.model_executor.layers.quantization.compressed_tensors.transform.schemes.linear_qutlass_nvfp4 import ( # noqa: E501 + QutlassNvFP4LinearMethod, is_qutlass_fp4_scheme) + assert input_tfms or output_tfms - # TODO (@ksayers): implement QutlassLinearMethodNvFP4 - # hadacore and fwht can be selected by Transform module + if is_qutlass_fp4_scheme(quant_scheme, input_tfms): + return QutlassNvFP4LinearMethod(quant_method, input_tfms, + output_tfms) + + # hadacore or dense gemm is selected by Transform module return cls(quant_method, input_tfms, output_tfms) @@ -129,11 +139,12 @@ class CompressedTensorsLinearTransformMethod(LinearMethodBase): assert bias is None x = self.quant_method.apply(layer, x, bias) - # TODO (@ksayers): Write a triton kernel to do this in parallel + # In most cases, input transforms are preferred over output transforms + # (@ksayers): confirm that this is done concurrently if self.output_transform is not None: for part_id, (start, length) in enumerate(self.partition_ranges): x[:, start:start + length] = self.output_transform( - x[:, start:start + length], part_id=part_id) + x[:, start:start + length].contiguous(), part_id=part_id) return x diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py index 48ab2582a3b26..5e863354715e4 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py @@ -2,12 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from 
collections.abc import Hashable -from typing import Callable, Optional +from typing import Callable import torch -from compressed_tensors.transform import TransformLocation, TransformScheme +from compressed_tensors.transform import (TransformArgs, TransformLocation, + TransformScheme) from torch import Tensor +import vllm._custom_ops as ops from vllm.distributed.parallel_state import ( get_tensor_model_parallel_world_size) from vllm.model_executor.layers.linear import LinearBase @@ -28,16 +30,12 @@ class HadamardTransform(torch.nn.Module): transforms: dict[int, TransformTuple] # info parsed from transforms config weight: SharedWeightParameter # container for shared tensors - kernel: Callable # function used during application scales: dict[int, float] # hadamard scale, usually sqrt(matrix.size(0)) - def __init__(self, - transforms: dict[int, TransformTuple], - layer: torch.nn.Module, - weight_loader: Callable, + def __init__(self, transforms: dict[int, TransformTuple], + layer: torch.nn.Module, weight_loader: Callable, input_size_per_partition: int, - output_partition_sizes: list[int], - kernel: Optional[Callable] = None): + output_partition_sizes: list[int]): super().__init__() self.transforms = transforms self.scales = {} @@ -55,7 +53,7 @@ class HadamardTransform(torch.nn.Module): for part_index, (_scheme_name, scheme, args) in self.transforms.items(): output_size = output_partition_sizes[part_index] - weight_size = self._get_weight_size(layer, args.location, + weight_size = self._get_weight_size(layer, scheme, args, input_size, output_size) data_key = self._get_data_key(scheme, weight_size) @@ -69,9 +67,6 @@ class HadamardTransform(torch.nn.Module): # validate that shared tensors and schemes are correct self._validate_input_transforms() - # select kernel based on transform schemes - self.kernel = self._infer_kernel() if kernel is None else kernel - def process_weights_after_loading(self): for part_id in self.weight.partitions: data = 
self.weight.partitions[part_id].data @@ -90,32 +85,59 @@ class HadamardTransform(torch.nn.Module): if part_id not in self.weight.partitions: return value - weight = self.weight.partitions[part_id] - weight = weight if self.transforms[ - part_id].args.inverse else weight.T # linear := x(W.T) - scale = self.scales[part_id] - return self.kernel(self, value.to(weight.dtype), weight, None).to( - value.dtype) * scale + # use hadacore if possible + if self.transforms[part_id].scheme.type == "hadamard": + if self.transforms[part_id].scheme.head_dim is not None: + weight_size = self.transforms[part_id].scheme.head_dim + value = value.unflatten(-1, (-1, weight_size)) + value = ops.hadacore_transform(value) + value = value.flatten(-2, -1) + + return value + + # sylvester transforms are symmetric, inv => transpose => original + return ops.hadacore_transform(value) + + # fall back to dense + else: + weight = self.weight.partitions[part_id] + weight = weight if self.transforms[ + part_id].args.inverse else weight.T # linear := x(W.T) + scale = self.scales[part_id] + + if self.transforms[part_id].scheme.head_dim is not None: + value = value.unflatten(-1, (-1, weight.size(0))) + value = dispatch_unquantized_gemm()(self, value.to( + weight.dtype), weight, None).to(value.dtype) * scale + value = value.flatten(-2, -1) + + return value + + return dispatch_unquantized_gemm()(self, value.to( + weight.dtype), weight, None).to(value.dtype) * scale def _get_data_key(self, scheme: TransformScheme, weight_size: int) -> Hashable: return (id(scheme), weight_size) - def _get_weight_size(self, layer: torch.nn.Module, - location: TransformLocation, input_size: int, + def _get_weight_size(self, layer: torch.nn.Module, scheme: TransformScheme, + args: TransformArgs, input_size: int, output_size: int) -> int: + if scheme.head_dim is not None: + return scheme.head_dim + if isinstance(layer, LinearBase): - if location == TransformLocation.INPUT: + if args.location == TransformLocation.INPUT: return 
input_size - elif location == TransformLocation.OUTPUT: + elif args.location == TransformLocation.OUTPUT: return output_size elif isinstance(layer, VocabParallelEmbedding): - if location == TransformLocation.INPUT: + if args.location == TransformLocation.INPUT: return output_size - elif location == TransformLocation.OUTPUT: + elif args.location == TransformLocation.OUTPUT: return input_size raise ValueError() @@ -129,7 +151,3 @@ class HadamardTransform(torch.nn.Module): for partition in self.weight.partitions.values(): if partition.data.data_ptr() != first_data.data_ptr(): raise ValueError("") - - def _infer_kernel(self) -> Callable: - # TODO (@ksayers): use fwht, hadacore - return dispatch_unquantized_gemm() diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py index f42258f9f9d7f..69b39f31eec1e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py @@ -4,18 +4,43 @@ from typing import Optional import torch +from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 + CompressedTensorsScheme, CompressedTensorsW4A4Fp4) from vllm.model_executor.layers.quantization.compressed_tensors.transform.linear import ( # noqa: E501 - CompressedTensorsLinearTransformMethod) + CompressedTensorsLinearTransformMethod, TransformTuple) + +__all__ = ["is_qutlass_fp4_scheme", "QutlassNvFP4LinearMethod"] -# Because qutlass fuses hadamard with quantization, it cannot automatically be -# composed with kernels in the way CompressedTensorsLinearTransformMethod does. 
-# Therefore, a separate scheme must be created for each quantized dtype -class QutlassLinearMethodNvFP4(CompressedTensorsLinearTransformMethod): +def is_qutlass_fp4_scheme(quant_scheme: Optional[CompressedTensorsScheme], + input_tfms: dict[int, TransformTuple]) -> bool: + return isinstance( + quant_scheme, + (CompressedTensorsW4A4Fp4, )) and len(input_tfms) == 1 and input_tfms[ + 0].scheme.head_dim == quant_scheme.group_size + + +class QutlassNvFP4LinearMethod(CompressedTensorsLinearTransformMethod): + + def create_weights(self, layer, input_size_per_partition, + output_partition_sizes, input_size, output_size, + params_dtype, **extra_weight_attrs): + # initializes fp4 qparams + assert isinstance(layer.scheme, (CompressedTensorsW4A4Fp4, )) + ret = super().create_weights(layer, input_size_per_partition, + output_partition_sizes, input_size, + output_size, params_dtype, + **extra_weight_attrs) + + assert self.input_transform is not None + assert len(self.input_transform.weight) == 1 + assert self.input_transform.weight[0].size( + 0) == layer.scheme.group_size + + return ret def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - # fused hadamard quant linear method raise NotImplementedError() diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 3d94626e5d8c6..49ff87df93c31 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -772,10 +772,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): if self.allow_deep_gemm and not is_deep_gemm_e8m0_used(): if _is_col_major(layer.w13_weight_scale_inv): layer.w13_weight_scale_inv = \ - get_col_major_tma_aligned_tensor(layer.w13_weight_scale_inv).contiguous() + get_col_major_tma_aligned_tensor(layer.w13_weight_scale_inv) if _is_col_major(layer.w2_weight_scale_inv): layer.w2_weight_scale_inv = \ - 
get_col_major_tma_aligned_tensor(layer.w2_weight_scale_inv).contiguous() + get_col_major_tma_aligned_tensor(layer.w2_weight_scale_inv) # If checkpoint is fp16, quantize in place. elif not self.quant_config.is_checkpoint_fp8_serialized: @@ -923,10 +923,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): # Ensure column-major TMA alignment expected by DeepGEMM. if _is_col_major(layer.w13_weight_scale_inv): layer.w13_weight_scale_inv = get_col_major_tma_aligned_tensor( - layer.w13_weight_scale_inv).contiguous() + layer.w13_weight_scale_inv) if _is_col_major(layer.w2_weight_scale_inv): layer.w2_weight_scale_inv = get_col_major_tma_aligned_tensor( - layer.w2_weight_scale_inv).contiguous() + layer.w2_weight_scale_inv) def select_gemm_impl( self, diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index d6d7ec9b15805..c25b3dd6080dc 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -190,7 +190,7 @@ class MoeWNA16Method(FusedMoEMethodBase): group_size = self.quant_config.group_size group_size_div_factor = 1 - # make intermediate_size and hidden_size diviable by group_size + # make intermediate_size and hidden_size divisible by group_size # we reduce the group size to ensure that # and we would repeat the loaded_weight later while intermediate_size_per_partition % group_size or \ diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py index b2c228c242532..f5acd03cc6622 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py @@ -19,7 +19,7 @@ class MarlinWorkspace: def __init__(self, out_features, min_thread_n, max_parallel): assert (out_features % min_thread_n == 0), ( - "out_features = {} is undivisible by min_thread_n = {}".format( + "out_features 
= {} is indivisible by min_thread_n = {}".format( out_features, min_thread_n)) max_workspace_size = ((out_features // min_thread_n) * max_parallel) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index e89a5e643b0e5..8cda1789e6c97 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -179,7 +179,7 @@ def rocm_per_tensor_w8a8_scaled_mm_impl(qinput: torch.Tensor, bias: torch.Tensor) -> torch.Tensor: from vllm.platforms.rocm import on_mi3xx if envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi3xx( - ) and qinput.shape[0] == 1 and qinput.shape[1] % 16 == 0: + ) and qinput.shape[0] == 1 and qinput.shape[1] % 16 == 0 and bias is None: output = ops.wvSplitKQ(weight.t(), qinput, out_dtype, scale_a, scale_b, current_platform.get_cu_count()) else: diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 829dd82b0bd4d..9d93cad2420ad 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -649,7 +649,7 @@ def _sample_with_torch( else: sampled_token_ids_tensor = None - # Counterintiutively, having two loops here is actually faster. + # Counterintuitively, having two loops here is actually faster. # The first loop can run without waiting on GPU<->CPU sync. 
for sampling_type in SamplingType: sample_indices = categorized_sample_indices[sampling_type] diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index a42640cef9d44..5f6025abf315c 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -43,7 +43,6 @@ from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( @@ -68,6 +67,7 @@ class BailingAttention(nn.Module): config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + reduce_results: bool = True, prefix: str = "", ): super().__init__() @@ -84,10 +84,11 @@ class BailingAttention(nn.Module): self.head_dim = config.head_dim or (self.hidden_size // self.total_num_heads) self.q_size_per_rank = self.head_dim * self.num_heads - self.num_kv_heads = self.total_kv_heads // tp_size self.kv_size_per_rank = self.num_kv_heads * self.head_dim self.scale = self.head_dim**-0.5 + self.use_qk_norm = getattr(config, "use_qk_norm", False) + self.use_rmsnorm = getattr(config, "use_rmsnorm", False) self.query_key_value = QKVParallelLinear( self.hidden_size, @@ -99,28 +100,45 @@ class BailingAttention(nn.Module): prefix=f"{prefix}.query_key_value", ) + if self.use_qk_norm: + self.query_layernorm = (RMSNorm( + self.head_dim, eps=config.rms_norm_eps) if self.use_rmsnorm + else nn.LayerNorm(self.head_dim, eps=1e-6)) + self.key_layernorm = (RMSNorm( + self.head_dim, eps=config.rms_norm_eps) if self.use_rmsnorm + else nn.LayerNorm(self.head_dim, eps=1e-6)) + self.dense = RowParallelLinear( self.total_num_heads * self.head_dim, self.hidden_size, 
bias=config.use_bias, quant_config=quant_config, + reduce_results=reduce_results, prefix=f"{prefix}.dense", ) - self.attn = Attention(self.num_heads, - self.head_dim, - self.scale, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - prefix=f"{prefix}.attn") + self.partial_rotary_factor = getattr(config, "partial_rotary_factor", + 1.0) + + self.rotary_dim = getattr(config, "rotary_dim", self.head_dim) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, + rotary_dim=self.rotary_dim, max_position=config.max_position_embeddings, base=config.rope_theta, is_neox_style=True, rope_scaling=config.rope_scaling, + partial_rotary_factor=self.partial_rotary_factor, + ) + + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scale, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + prefix=f"{prefix}.attn", ) def forward( @@ -135,6 +153,14 @@ class BailingAttention(nn.Module): ], dim=-1) + if self.use_qk_norm: + q = q.view(-1, self.num_heads, self.head_dim) + k = k.view(-1, self.num_kv_heads, self.head_dim) + q = self.query_layernorm(q) + k = self.key_layernorm(k) + q = q.view(-1, self.q_size_per_rank) + k = k.view(-1, self.kv_size_per_rank) + q, k = self.rotary_emb(position_ids, q, k) context_layer = self.attn(q, k, v) @@ -198,24 +224,72 @@ class BailingMoE(nn.Module): self.hidden_size = config.hidden_size self.quant_config = quant_config self.num_shared_experts = config.num_shared_experts - # Gate always runs at half / full precision for now. 
- self.gate = ReplicatedLinear(self.hidden_size, - self.num_experts, - bias=False, - quant_config=None) + self.score_function = getattr(config, "score_function", None) + self.n_group = getattr(config, "n_group", None) + self.topk_group = getattr(config, "topk_group", None) + self.use_grouped_topk = (self.n_group is not None + and self.topk_group is not None) + self.routed_scaling_factor = getattr(config, "routed_scaling_factor", + 1.0) - self.experts = FusedMoE(num_experts=self.num_experts, - top_k=self.top_k, - hidden_size=self.hidden_size, - intermediate_size=config.moe_intermediate_size, - reduce_results=False, - renormalize=self.norm_expert_prob, - quant_config=quant_config, - prefix=f"{prefix}.experts") + router_dtype = getattr(config, "router_dtype", None) + if router_dtype is None: + self.router_dtype = None + elif router_dtype == "fp32": + self.router_dtype = torch.float32 + else: + self.router_dtype = torch.bfloat16 + + self.gate = nn.Linear( + self.hidden_size, + self.num_experts, + bias=False, + dtype=self.router_dtype, + ) + + if getattr(config, "moe_router_enable_expert_bias", False): + self.gate.expert_bias = nn.Parameter( + torch.empty((config.num_experts, ), dtype=torch.float32)) + else: + self.gate.expert_bias = None + + self.correction_bias = (self.gate.expert_bias.data + if self.gate.expert_bias is not None else None) + + if self.score_function is not None: + assert ( + self.score_function == "softmax" + and self.correction_bias is None + ) or ( + self.score_function == "sigmoid" + and self.correction_bias is not None + ), "score_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)" # noqa: E501 + else: + # default value for scoring_func + self.score_function = "softmax" + + self.experts = FusedMoE( + num_experts=self.num_experts, + top_k=self.top_k, + hidden_size=self.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=self.norm_expert_prob, + 
quant_config=quant_config, + prefix=f"{prefix}.experts", + scoring_func=self.score_function, + e_score_correction_bias=self.gate.expert_bias, + num_expert_group=self.n_group, + topk_group=self.topk_group, + use_grouped_topk=self.use_grouped_topk, + ) if self.num_shared_experts > 0: - intermediate_size = (config.moe_intermediate_size * - self.num_shared_experts) + if hasattr(config, "moe_shared_expert_intermediate_size"): + intermediate_size = config.moe_shared_expert_intermediate_size + else: + intermediate_size = config.moe_intermediate_size + intermediate_size *= config.num_shared_experts self.shared_experts = BailingMLP( intermediate_size=intermediate_size, config=config, @@ -228,14 +302,18 @@ class BailingMoE(nn.Module): def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: num_tokens, hidden_size = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_size) - if self.num_shared_experts > 0: + if self.shared_experts: shared_output = self.shared_experts(hidden_states) # router_logits: (num_tokens, n_experts) - router_logits, _ = self.gate(hidden_states) + router_logits = self.gate(hidden_states.to(self.router_dtype)) + router_logits = router_logits.to(hidden_states.dtype) + final_hidden_states = self.experts(hidden_states=hidden_states, router_logits=router_logits) - if self.num_shared_experts > 0: + final_hidden_states *= self.routed_scaling_factor + + if self.shared_experts: final_hidden_states = final_hidden_states + shared_output if self.tp_size > 1: @@ -254,20 +332,30 @@ class BailingMoeBlock(nn.Module): prefix: str = "", ): super().__init__() + layer_idx = int(prefix.split('.')[-1]) + self.config = config hidden_size = config.hidden_size intermediate_size = config.intermediate_size + self.input_layernorm = RMSNorm(hidden_size, eps=config.rms_norm_eps) self.attention = BailingAttention(config, cache_config, quant_config, prefix=f"{prefix}.attention") + self.post_attention_layernorm = RMSNorm(hidden_size, eps=config.rms_norm_eps) - 
self.mlp = BailingMoE(intermediate_size, - config, - quant_config, - True, - prefix=f"{prefix}.mlp") + + # Choose MLP class based on the number of experts and layer index + if layer_idx < config.first_k_dense_replace: + mlp_class = BailingMLP + else: + mlp_class = BailingMoE + self.mlp = mlp_class(intermediate_size, + config, + quant_config, + True, + prefix=f"{prefix}.mlp") def forward( self, @@ -310,11 +398,17 @@ class BailingMoeModel(nn.Module): self.config = config self.vocab_size = config.vocab_size self.embed_dim = config.hidden_size + self.tie_word_embeddings = getattr(config, "tie_word_embeddings", + False) - if get_pp_group().is_first_rank or (config.tie_word_embeddings + if get_pp_group().is_first_rank or (self.tie_word_embeddings and get_pp_group().is_last_rank): self.word_embeddings = VocabParallelEmbedding( - self.vocab_size, self.embed_dim) + self.vocab_size, + self.embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.word_embeddings", + ) else: self.word_embeddings = PPMissingLayer() @@ -372,8 +466,11 @@ class BailingMoeModel(nn.Module): "hidden_states": hidden_states, "residual": residual }) - - hidden_states, _ = self.norm(hidden_states, residual) + else: + if residual is None: + hidden_states = self.norm(hidden_states) + else: + hidden_states, _ = self.norm(hidden_states, residual) return hidden_states def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: @@ -396,7 +493,8 @@ class BailingMoeModel(nn.Module): loaded_params: set[str] = set() expert_params_mapping = self.get_expert_mapping() for name, loaded_weight in weights: - if self.config.norm_head and "lm_head.weight" in name: + if (hasattr(self.config, "norm_head") and self.config.norm_head + and "lm_head.weight" in name): loaded_weight = F.normalize(loaded_weight, dim=0, p=2, @@ -430,13 +528,17 @@ class BailingMoeModel(nn.Module): if is_pp_missing_parameter(name, self): continue + if name not in params_dict: + continue param = params_dict[name] weight_loader = 
param.weight_loader - weight_loader(param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=expert_id) + weight_loader( + param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id, + ) break else: if name.endswith(".bias") and name not in params_dict: @@ -473,19 +575,30 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): ) -> None: super().__init__() - config = vllm_config.model_config.hf_config + config = vllm_config.model_config.hf_config.get_text_config() + vllm_config.model_config.hf_config = config quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config + self.lora_config = lora_config self.quant_config = quant_config self.max_position_embeddings = config.max_position_embeddings self.model = BailingMoeModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) + self.tie_word_embeddings = getattr(config, "tie_word_embeddings", + False) + if get_pp_group().is_last_rank: - self.lm_head = (self.word_embeddings if config.tie_word_embeddings - else ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config)) + if self.tie_word_embeddings: + self.lm_head = self.model.word_embeddings + else: + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.lm_head", + ) self.logits_processor = LogitsProcessor(config.vocab_size) else: self.lm_head = PPMissingLayer() @@ -520,10 +633,13 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, - skip_prefixes=(["lm_head."] - if self.config.tie_word_embeddings else None), + skip_prefixes=(["lm_head."] if self.tie_word_embeddings else None), ) return loader.load_weights(weights) def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: return self.model.get_expert_mapping() + + +class BailingMoeV2ForCausalLM(BailingMoeForCausalLM): + pass diff --git 
a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py deleted file mode 100644 index 242530817c642..0000000000000 --- a/vllm/model_executor/models/bart.py +++ /dev/null @@ -1,1319 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Derived from BART implementation posted on HuggingFace; license below: -# -# coding=utf-8 -# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""PyTorch BART model.""" -import math -from collections.abc import Iterable -from typing import Optional - -import torch -from torch import nn -from transformers import BartConfig -from transformers.utils import logging - -from vllm.attention import Attention, AttentionType -from vllm.config import CacheConfig, VllmConfig -from vllm.config.lora import LoRAConfig -from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - QKVCrossParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors - -from .interfaces import SupportsQuant, SupportsV0Only -from .utils import (AutoWeightsLoader, WeightsMapper, cast_overflow_tensors, - maybe_prefix) - -logger = logging.get_logger(__name__) - - -def get_bsz_seq_len(input_ids): - shp = input_ids.shape - ndim = len(shp) - if ndim == 1: - return 1, input_ids.numel() - else: - return shp[:2] - - -class BartLearnedPositionalEmbedding(VocabParallelEmbedding): - """ - This module learns positional embeddings up to a fixed maximum size. - """ - - def __init__(self, num_embeddings: int, embedding_dim: int): - # Bart is set up so that if padding_idx is - # specified then offset the embedding ids by 2 - # and adjust num_embeddings appropriately. 
- # Other models don't have this hack - self.offset = 2 - super().__init__(num_embeddings + self.offset, embedding_dim) - - def forward( - self, - positions: torch.Tensor, - ) -> torch.Tensor: - """`input_ids' shape is expected to be [bsz x seqlen].""" - return super().forward(positions + self.offset) - - -class BartScaledWordEmbedding(VocabParallelEmbedding): - """ - This module overrides VocabParallelEmbedding's - forward by multiplying with embeddings scale. - """ - - def __init__(self, - num_embeddings: int, - embedding_dim: int, - embed_scale: float = 1.0): - super().__init__(num_embeddings, embedding_dim) - self.embed_scale = embed_scale - - def forward(self, input_ids: torch.Tensor) -> torch.Tensor: - return super().forward(input_ids) * self.embed_scale - - -class BartParallelLMHead(ParallelLMHead): - """ - This module overrides ParallelLMHead's - forward by dividing by embeddings scale, - yielding effectively the inverse of - BartScaledWordEmbedding - """ - - def __init__(self, - num_embeddings: int, - embedding_dim: int, - embed_scale: float = 1.0): - super().__init__(num_embeddings, embedding_dim) - self.embed_scale = embed_scale - - def forward(self, input_ids: torch.Tensor) -> torch.Tensor: - return super().forward(input_ids) / self.embed_scale - - -class BartEncoderAttention(nn.Module): - - def __init__( - self, - embed_dim: int, - num_heads: int, - bias: bool = True, - config: Optional[BartConfig] = None, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.d_model = config.d_model - self.embed_dim = embed_dim - self.total_num_heads = num_heads - self.total_num_kv_heads = self.total_num_heads - self.head_dim = embed_dim // num_heads - self.config = config - - if (self.head_dim * num_heads) != self.embed_dim: - raise ValueError(f"embed_dim must be divisible by num_heads " - f"(got `embed_dim`: {self.embed_dim}" - f" and `num_heads`: {num_heads}).") - 
self.scaling = self.head_dim**-0.5 - - self.qkv_proj = QKVParallelLinear( - self.d_model, - self.d_model // self.total_num_heads, - self.total_num_heads, - self.total_num_kv_heads, - bias=bias, - quant_config=quant_config, - ) - - self.out_proj = RowParallelLinear( - embed_dim, - embed_dim, - bias=bias, - quant_config=quant_config, - ) - - tp_world_size = get_tensor_model_parallel_world_size() - assert self.total_num_heads % tp_world_size == 0 - self.num_heads = self.total_num_heads // tp_world_size - - if self.total_num_kv_heads >= tp_world_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_world_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. - assert tp_world_size % self.total_num_kv_heads == 0 - self.num_kv_heads = self.num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - attn_type=AttentionType.ENCODER) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - """Input shape: Batch x Time x Channel""" - - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - - attn_output = self.attn(q, k, v) - - output, _ = self.out_proj(attn_output) - return output - - -class BartDecoderSelfAttention(nn.Module): - - def __init__( - self, - embed_dim: int, - num_heads: int, - bias: bool = True, - config: Optional[BartConfig] = None, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.d_model = config.d_model - self.embed_dim = embed_dim - self.total_num_heads = 
num_heads - self.total_num_kv_heads = self.total_num_heads - self.head_dim = embed_dim // num_heads - self.config = config - - if (self.head_dim * num_heads) != self.embed_dim: - raise ValueError(f"embed_dim must be divisible by num_heads " - f"(got `embed_dim`: {self.embed_dim}" - f" and `num_heads`: {num_heads}).") - self.scaling = self.head_dim**-0.5 - - self.qkv_proj = QKVParallelLinear( - self.d_model, - self.d_model // self.total_num_heads, - self.total_num_heads, - self.total_num_kv_heads, - bias=bias, - quant_config=quant_config, - ) - - self.out_proj = RowParallelLinear( - embed_dim, - embed_dim, - bias=bias, - quant_config=quant_config, - ) - - tp_world_size = get_tensor_model_parallel_world_size() - assert self.total_num_heads % tp_world_size == 0 - self.num_heads = self.total_num_heads // tp_world_size - - if self.total_num_kv_heads >= tp_world_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_world_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
- assert tp_world_size % self.total_num_kv_heads == 0 - self.num_kv_heads = self.num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - attn_type=AttentionType.DECODER) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - """Input shape: Batch x Time x Channel""" - - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - - attn_output = self.attn(q, k, v) - - output, _ = self.out_proj(attn_output) - return output - - -class BartCrossAttention(nn.Module): - - def __init__( - self, - embed_dim: int, - num_heads: int, - bias: bool = True, - config: Optional[BartConfig] = None, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.d_model = config.d_model - self.embed_dim = embed_dim - self.total_num_heads = num_heads - self.total_num_kv_heads = self.total_num_heads - self.head_dim = embed_dim // num_heads - self.config = config - - if (self.head_dim * num_heads) != self.embed_dim: - raise ValueError(f"embed_dim must be divisible by num_heads " - f"(got `embed_dim`: {self.embed_dim}" - f" and `num_heads`: {num_heads}).") - self.scaling = self.head_dim**-0.5 - - # TP sharding sizes is accounted for within "*Parallel" layers. 
- self.qkv_proj = QKVCrossParallelLinear(self.d_model, - self.d_model // - self.total_num_heads, - self.total_num_heads, - self.total_num_kv_heads, - bias, - quant_config=quant_config) - - self.out_proj = RowParallelLinear( - embed_dim, - embed_dim, - bias=bias, - quant_config=quant_config, - ) - - tp_world_size = get_tensor_model_parallel_world_size() - assert self.total_num_heads % tp_world_size == 0 - self.num_heads = self.total_num_heads // tp_world_size - - if self.total_num_kv_heads >= tp_world_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_world_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. - assert tp_world_size % self.total_num_kv_heads == 0 - self.num_kv_heads = self.num_heads # No GQA in bart - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - attn_type=AttentionType.ENCODER_DECODER) - - def forward( - self, - decoder_hidden_states: torch.Tensor, - encoder_hidden_states: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Input shape: Batch x Time x Channel""" - - q, k, v = self.qkv_proj(decoder_hidden_states, encoder_hidden_states) - - attn_output = self.attn(q, k, v) - - output, _ = self.out_proj(attn_output) - return output - - -class BartEncoderLayer(nn.Module): - - def __init__( - self, - config: BartConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.embed_dim = config.d_model - - self.self_attn = BartEncoderAttention( - embed_dim=self.embed_dim, - num_heads=config.encoder_attention_heads, - config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) 
- self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - self.activation_fn = get_act_fn(config.activation_function) - - ffn_hidden_size = self.embed_dim - ffn_intermediate_size = config.encoder_ffn_dim - ffn_has_bias = True - self.fc1 = ColumnParallelLinear( - ffn_hidden_size, - ffn_intermediate_size, - bias=ffn_has_bias, - quant_config=quant_config, - ) - self.act = get_act_fn("gelu") - self.fc2 = RowParallelLinear( - ffn_intermediate_size, - ffn_hidden_size, - bias=ffn_has_bias, - quant_config=quant_config, - ) - - self.final_layer_norm = nn.LayerNorm(self.embed_dim) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - r""" - Args: - hidden_states: torch.Tensor of *encoder* input embeddings. - Returns: - Encoder layer output torch.Tensor - """ - residual = hidden_states - hidden_states = self.self_attn(hidden_states=hidden_states) - - hidden_states = residual + hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) - - residual = hidden_states - fc1_out, _ = self.fc1(hidden_states) - hidden_states = self.activation_fn(fc1_out) - - hidden_states, _ = self.fc2(hidden_states) - - hidden_states = residual + hidden_states - hidden_states = self.final_layer_norm(hidden_states) - - if hidden_states.dtype == torch.float16 and ( - torch.isinf(hidden_states).any() - or torch.isnan(hidden_states).any()): - hidden_states = cast_overflow_tensors(hidden_states) - - return hidden_states - - -class BartDecoderLayer(nn.Module): - - def __init__( - self, - config: BartConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.embed_dim = config.d_model - - self.self_attn = BartDecoderSelfAttention( - embed_dim=self.embed_dim, - num_heads=config.decoder_attention_heads, - config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) - self.activation_fn = get_act_fn(config.activation_function) - - 
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - ''' - afeldman-nm: personally I would call this "cross-attention", - however I left the name as "encoder_attn" to maintain consistency - with the name of the pretrained weights. - ''' - self.encoder_attn = BartCrossAttention( - self.embed_dim, - config.decoder_attention_heads, - config=config, - prefix=f"{prefix}.encoder_attn", - ) - self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) - - ffn_hidden_size = self.embed_dim - ffn_intermediate_size = config.encoder_ffn_dim - ffn_has_bias = True - self.fc1 = ColumnParallelLinear( - ffn_hidden_size, - ffn_intermediate_size, - bias=ffn_has_bias, - quant_config=quant_config, - ) - self.fc2 = RowParallelLinear( - ffn_intermediate_size, - ffn_hidden_size, - bias=ffn_has_bias, - quant_config=quant_config, - ) - - self.final_layer_norm = nn.LayerNorm(self.embed_dim) - - def forward( - self, - decoder_hidden_states: torch.Tensor, - encoder_hidden_states: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - r""" - Args: - decoder_hidden_states: torch.Tensor of *decoder* input embeddings. - encoder_hidden_states: torch.Tensor of *encoder* input embeddings. 
- Returns: - Decoder layer output torch.Tensor - """ - residual = decoder_hidden_states - - # Self Attention - hidden_states = self.self_attn(hidden_states=decoder_hidden_states) - - hidden_states = residual + hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Cross-Attention Block - - residual = hidden_states - - hidden_states = self.encoder_attn( - decoder_hidden_states=hidden_states, - encoder_hidden_states=encoder_hidden_states, - ) - - hidden_states = residual + hidden_states - hidden_states = self.encoder_attn_layer_norm(hidden_states) - - # Fully Connected - residual = hidden_states - fc1_out, _ = self.fc1(hidden_states) - hidden_states = self.activation_fn(fc1_out) - - hidden_states, _ = self.fc2(hidden_states) - - hidden_states = residual + hidden_states - hidden_states = self.final_layer_norm(hidden_states) - - return hidden_states - - -class BartEncoder(nn.Module): - """ - Transformer encoder consisting of *config.encoder_layers* - self attention layers. Each layer is a [`BartEncoderLayer`]. 
- Args: - config: BartConfig - embed_tokens (nn.Embedding): output embedding - """ - - def __init__(self, - config: BartConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - embed_tokens: Optional[nn.Embedding] = None, - prefix: str = ""): - super().__init__() - - self.cache_config = cache_config - self.quant_config = quant_config - self.lora_config = lora_config - embed_dim = config.d_model - self.max_source_positions = config.max_position_embeddings - embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 - - self.embed_tokens = BartScaledWordEmbedding(config.vocab_size, - embed_dim, - embed_scale=embed_scale) - - if embed_tokens is not None: - self.embed_tokens.weight = embed_tokens.weight - - self.embed_positions = BartLearnedPositionalEmbedding( - config.max_position_embeddings, - embed_dim, - ) - self.layers = nn.ModuleList([ - BartEncoderLayer(config, - cache_config, - quant_config, - prefix=f"{prefix}.layers.{layer_idx}") - for layer_idx in range(config.encoder_layers) - ]) - - self.layernorm_embedding = nn.LayerNorm(embed_dim) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - r""" - Args: - input_ids: Indices of *encoder* input sequence tokens in the - vocabulary. - Padding will be ignored by default should you provide it. - positions: Positions of *encoder* input sequence tokens. 
- Returns: - Decoder output torch.Tensor - """ - # retrieve input_ids and inputs_embeds - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - embed_pos = self.embed_positions(positions) - embed_pos = embed_pos.to(inputs_embeds.device) - - hidden_states = inputs_embeds + embed_pos - hidden_states = self.layernorm_embedding(hidden_states) - - for encoder_layer in self.layers: - hidden_states = encoder_layer(hidden_states=hidden_states) - - return hidden_states - - -class BartDecoder(nn.Module): - """ - Transformer decoder consisting of *config.decoder_layers* layers. - Each layer is a [`BartDecoderLayer`] - Args: - config: BartConfig - embed_tokens (nn.Embedding): output embedding - """ - - def __init__( - self, - config: BartConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - embed_tokens: Optional[nn.Embedding] = None, - prefix: str = "", - ): - super().__init__() - self.cache_config = cache_config - self.quant_config = quant_config - self.lora_config = lora_config - self.max_target_positions = config.max_position_embeddings - embed_scale = math.sqrt( - config.d_model) if config.scale_embedding else 1.0 - - self.embed_tokens = BartScaledWordEmbedding(config.vocab_size, - config.d_model, - embed_scale=embed_scale) - - if embed_tokens is not None: - self.embed_tokens.weight = embed_tokens.weight - - self.embed_positions = BartLearnedPositionalEmbedding( - config.max_position_embeddings, - config.d_model, - ) - - self.layers = nn.ModuleList( - [BartDecoderLayer(config,cache_config,quant_config, - prefix=f"{prefix}.layers.{layer_idx}") \ - for layer_idx in range(config.decoder_layers)]) - - self.layernorm_embedding = nn.LayerNorm(config.d_model) - - def forward( - self, - decoder_input_ids: torch.Tensor, - decoder_positions: torch.Tensor, - encoder_hidden_states: Optional[torch.Tensor], - inputs_embeds: Optional[torch.Tensor] = None, - ) -> 
torch.Tensor: - r""" - Args: - decoder_input_ids: Indices of *decoder* input sequence tokens - in the vocabulary. - Padding will be ignored by default should you provide it. - decoder_positions: Positions of *decoder* input sequence tokens. - encoder_hidden_states: Tensor of encoder output embeddings. - Returns: - Decoder output torch.Tensor - """ - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(decoder_input_ids) - else: - decoder_positions = inputs_embeds[:, -1] - - # embed positions - embed_pos = self.embed_positions(decoder_positions) - embed_pos = embed_pos.to(inputs_embeds.device) - - hidden_states = inputs_embeds + embed_pos - hidden_states = self.layernorm_embedding(hidden_states) - - # decoder layers - - for decoder_layer in self.layers: - hidden_states = decoder_layer( - decoder_hidden_states=hidden_states, - encoder_hidden_states=encoder_hidden_states, - ) - - return hidden_states - - -class BartModel(nn.Module, SupportsQuant): - _tied_weights_keys = [ - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", - ] - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - - self.config = config - - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 - self.vocab_size = config.vocab_size + lora_vocab - self.org_vocab_size = config.vocab_size - - self.encoder = BartEncoder(config, - cache_config, - quant_config=quant_config, - prefix=f"{prefix}.encoder") - self.decoder = BartDecoder(config, - cache_config, - quant_config=quant_config, - prefix=f"{prefix}.decoder") - - def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, - encoder_input_ids: torch.Tensor, - encoder_positions: torch.Tensor) -> torch.Tensor: - r""" - Args: - input_ids: Indices of *decoder* input 
sequence tokens - in the vocabulary. - Padding will be ignored by default should you provide it. - positions: Positions of *decoder* input sequence tokens. - encoder_input_ids: Indices of *encoder* input sequence tokens - in the vocabulary. - encoder_positions: Positions of *encoder* input sequence tokens. - Returns: - Model output torch.Tensor - """ - - encoder_hidden_states = None - - if encoder_input_ids.numel() > 0: - # Run encoder attention if a non-zero number of encoder tokens - # are provided as input - encoder_hidden_states = self.encoder(input_ids=encoder_input_ids, - positions=encoder_positions) - - # decoder outputs consists of - # (dec_features, past_key_value, dec_hidden, dec_attn) - decoder_outputs = self.decoder( - decoder_input_ids=input_ids, - decoder_positions=positions, - encoder_hidden_states=encoder_hidden_states) - - return decoder_outputs - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - - other_weights = [] - loaded_stacked_params = [] - model_params_dict = dict(self.named_parameters()) - - for name, loaded_weight in weights: - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - if name not in model_params_dict: - continue - param = model_params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - loaded_stacked_params.append(name) - break - else: - if name in model_params_dict: - other_weights.append((name, loaded_weight)) - - loader = AutoWeightsLoader(self) - loaded_params = loader.load_weights(other_weights) - loaded_params.update(loaded_stacked_params) - return loaded_params - - -class BartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant): - hf_to_vllm_mapper = 
WeightsMapper( - orig_to_new_prefix={ - "decoder.": "model.decoder.", - "encoder.": "model.encoder.", - "shared.": "model.shared." - }, - orig_to_new_substr={ - "beta": "bias", - "gamma": "weight", - "LayerNorm": "layernorm", - }, - ) - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - - super().__init__() - config = vllm_config.model_config.hf_config - lora_config = vllm_config.lora_config - # currently all existing BART models have `tie_word_embeddings` enabled - assert config.tie_word_embeddings - self.config = config - self.model = BartModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size - - embed_scale = math.sqrt( - config.d_model) if config.scale_embedding else 1.0 - - self.lm_head = BartParallelLMHead(config.vocab_size, - config.d_model, - embed_scale=embed_scale) - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - *, - encoder_input_ids: torch.Tensor, - encoder_positions: torch.Tensor, - **kwargs, - ) -> torch.Tensor: - r""" - Args: - input_ids: torch.Tensor of *decoder* input token ids. - positions: torch.Tensor of *decoder* position indices. - encoder_input_ids: torch.Tensor of *encoder* input token ids. - encoder_positions: torch.Tensor of *encoder* position indices. 
- Returns: - Output torch.Tensor - """ - return self.model(input_ids, positions, encoder_input_ids, - encoder_positions) - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - weights_tuple_list = list(weights) - - shared_embedding_weight = None - for name, loaded_weight in weights_tuple_list: - if ('shared.weight' in name - or 'encoder.embed_tokens.weight' in name - or 'decoder.embed_tokens.weight' in name - or 'lm_head.weight' in name): - assert shared_embedding_weight is None, ( - "Conflicting embedding weights.") - shared_embedding_weight = loaded_weight - - loader = AutoWeightsLoader( - self, - skip_prefixes=(["cls.", "pooler."]), - ) - loaded_params = loader.load_weights(weights_tuple_list, - mapper=self.hf_to_vllm_mapper) - - if shared_embedding_weight is not None: - weight_loader = getattr(self.lm_head.weight, "weight_loader", - default_weight_loader) - weight_loader(self.lm_head.weight, shared_embedding_weight) - - self.model.encoder.embed_tokens.weight = self.lm_head.weight - self.model.decoder.embed_tokens.weight = self.lm_head.weight - loaded_params.update({ - 'model.encoder.embed_tokens.weight', 'lm_head.weight', - 'model.decoder.embed_tokens.weight' - }) - - return loaded_params - - -class MBartEncoderLayer(BartEncoderLayer): - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - r""" - Args: - hidden_states: torch.Tensor of *encoder* input embeddings. 
- Returns: - Encoder layer output torch.Tensor - """ - residual = hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) - hidden_states = self.self_attn(hidden_states=hidden_states) - - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.final_layer_norm(hidden_states) - fc1_out, _ = self.fc1(hidden_states) - hidden_states = self.activation_fn(fc1_out) - - hidden_states, _ = self.fc2(hidden_states) - - hidden_states = residual + hidden_states - - if hidden_states.dtype == torch.float16 and ( - torch.isinf(hidden_states).any() - or torch.isnan(hidden_states).any()): - hidden_states = cast_overflow_tensors(hidden_states) - - return hidden_states - - -class MBartDecoderLayer(BartDecoderLayer): - - def forward( - self, - decoder_hidden_states: torch.Tensor, - encoder_hidden_states: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - residual = decoder_hidden_states - hidden_states = self.self_attn_layer_norm(decoder_hidden_states) - - # Self Attention - hidden_states = self.self_attn(hidden_states=hidden_states) - - hidden_states = residual + hidden_states - - # Cross-Attention Block - - residual = hidden_states - hidden_states = self.encoder_attn_layer_norm(hidden_states) - - hidden_states = self.encoder_attn( - decoder_hidden_states=hidden_states, - encoder_hidden_states=encoder_hidden_states, - ) - - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.final_layer_norm(hidden_states) - fc1_out, _ = self.fc1(hidden_states) - hidden_states = self.activation_fn(fc1_out) - - hidden_states, _ = self.fc2(hidden_states) - - hidden_states = residual + hidden_states - - return hidden_states - - -class MBartEncoder(nn.Module): - """ - Transformer encoder consisting of *config.encoder_layers* - self attention layers. Each layer is a [`BartEncoderLayer`]. 
- Args: - config: BartConfig - embed_tokens (nn.Embedding): output embedding - """ - - def __init__(self, - config: BartConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - embed_tokens: Optional[nn.Embedding] = None, - prefix: str = ""): - super().__init__() - - self.cache_config = cache_config - self.quant_config = quant_config - self.lora_config = lora_config - embed_dim = config.d_model - self.max_source_positions = config.max_position_embeddings - embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 - - self.embed_tokens = BartScaledWordEmbedding(config.vocab_size, - embed_dim, - embed_scale=embed_scale) - - if embed_tokens is not None: - self.embed_tokens.weight = embed_tokens.weight - - self.embed_positions = BartLearnedPositionalEmbedding( - config.max_position_embeddings, - embed_dim, - ) - self.layers = nn.ModuleList([ - MBartEncoderLayer(config, - cache_config, - quant_config, - prefix=f"{prefix}.layers.{layer_idx}") - for layer_idx in range(config.encoder_layers) - ]) - - self.layernorm_embedding = nn.LayerNorm(embed_dim) - self.layer_norm = nn.LayerNorm(config.d_model) # 改动 - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - r""" - Args: - input_ids: Indices of *encoder* input sequence tokens in the - vocabulary. - Padding will be ignored by default should you provide it. - positions: Positions of *encoder* input sequence tokens. 
- Returns: - Decoder output torch.Tensor - """ - # retrieve input_ids and inputs_embeds - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - embed_pos = self.embed_positions(positions) - embed_pos = embed_pos.to(inputs_embeds.device) - - hidden_states = inputs_embeds + embed_pos - hidden_states = self.layernorm_embedding(hidden_states) - - for encoder_layer in self.layers: - hidden_states = encoder_layer(hidden_states=hidden_states) - - hidden_states = self.layer_norm(hidden_states) - return hidden_states - - -class MBartDecoder(nn.Module): - """ - Transformer decoder consisting of *config.decoder_layers* layers. - Each layer is a [`BartDecoderLayer`] - Args: - config: BartConfig - embed_tokens (nn.Embedding): output embedding - """ - - def __init__( - self, - config: BartConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - embed_tokens: Optional[nn.Embedding] = None, - prefix: str = "", - ): - super().__init__() - self.cache_config = cache_config - self.quant_config = quant_config - self.lora_config = lora_config - self.max_target_positions = config.max_position_embeddings - embed_scale = math.sqrt( - config.d_model) if config.scale_embedding else 1.0 - - self.embed_tokens = BartScaledWordEmbedding(config.vocab_size, - config.d_model, - embed_scale=embed_scale) - - if embed_tokens is not None: - self.embed_tokens.weight = embed_tokens.weight - - self.embed_positions = BartLearnedPositionalEmbedding( - config.max_position_embeddings, - config.d_model, - ) - - self.layers = nn.ModuleList( - [MBartDecoderLayer(config, cache_config, quant_config, - prefix=f"{prefix}.layers.{layer_idx}") \ - for layer_idx in range(config.decoder_layers)]) - - self.layernorm_embedding = nn.LayerNorm(config.d_model) - self.layer_norm = nn.LayerNorm(config.d_model) - - def forward( - self, - decoder_input_ids: torch.Tensor, - decoder_positions: torch.Tensor, - 
encoder_hidden_states: Optional[torch.Tensor], - inputs_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - r""" - Args: - decoder_input_ids: Indices of *decoder* input sequence tokens - in the vocabulary. - Padding will be ignored by default should you provide it. - decoder_positions: Positions of *decoder* input sequence tokens. - encoder_hidden_states: Tensor of encoder output embeddings. - Returns: - Decoder output torch.Tensor - """ - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(decoder_input_ids) - else: - decoder_positions = inputs_embeds[:, -1] - - # embed positions - embed_pos = self.embed_positions(decoder_positions) - embed_pos = embed_pos.to(inputs_embeds.device) - - hidden_states = inputs_embeds + embed_pos - hidden_states = self.layernorm_embedding(hidden_states) - - # decoder layers - - for decoder_layer in self.layers: - hidden_states = decoder_layer( - decoder_hidden_states=hidden_states, - encoder_hidden_states=encoder_hidden_states, - ) - - hidden_states = self.layer_norm(hidden_states) - return hidden_states - - -class MBartModel(nn.Module, SupportsQuant): - _tied_weights_keys = [ - "encoder.embed_tokens.weight", "decoder.embed_tokens.weight" - ] - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - - self.config = config - - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 - self.vocab_size = config.vocab_size + lora_vocab - self.org_vocab_size = config.vocab_size - - self.encoder = MBartEncoder(config, - cache_config, - quant_config=quant_config, - prefix=f"{prefix}.encoder") - self.decoder = MBartDecoder(config, - cache_config, - quant_config=quant_config, - prefix=f"{prefix}.decoder") - - def forward(self, input_ids: torch.Tensor, positions: 
torch.Tensor, - encoder_input_ids: torch.Tensor, - encoder_positions: torch.Tensor) -> torch.Tensor: - r""" - Args: - input_ids: Indices of *decoder* input sequence tokens - in the vocabulary. - Padding will be ignored by default should you provide it. - positions: Positions of *decoder* input sequence tokens. - encoder_input_ids: Indices of *encoder* input sequence tokens - in the vocabulary. - encoder_positions: Positions of *encoder* input sequence tokens. - Returns: - Model output torch.Tensor - """ - - encoder_hidden_states = None - - if encoder_input_ids.numel() > 0: - # Run encoder attention if a non-zero number of encoder tokens - # are provided as input - encoder_hidden_states = self.encoder(input_ids=encoder_input_ids, - positions=encoder_positions) - - # decoder outputs consists of - # (dec_features, past_key_value, dec_hidden, dec_attn) - decoder_outputs = self.decoder( - decoder_input_ids=input_ids, - decoder_positions=positions, - encoder_hidden_states=encoder_hidden_states) - - return decoder_outputs - - -class MBartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant): - base_model_prefix = "model" - - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_prefix={ - "decoder.": "model.decoder.", - "encoder.": "model.encoder.", - "shared.": "model.shared." 
- }, - orig_to_new_substr={ - "beta": "bias", - "gamma": "weight", - "LayerNorm": "layernorm", - }, - ) - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - lora_config = vllm_config.lora_config - assert config.tie_word_embeddings - self.config = config - self.model = MBartModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size - - embed_scale = math.sqrt( - config.d_model) if config.scale_embedding else 1.0 - - self.lm_head = BartParallelLMHead(config.vocab_size, - config.d_model, - embed_scale=embed_scale) - - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - *, - encoder_input_ids: torch.Tensor, - encoder_positions: torch.Tensor, - **kwargs, - ) -> torch.Tensor: - return self.model(input_ids, positions, encoder_input_ids, - encoder_positions) - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - model_params_dict = dict(self.named_parameters()) - loaded_params = set() - remaining_weights = [] - shared_embedding_weight = None - - for name, loaded_weight in weights: - if any(skip in name - for skip in ["cls.", "pooler.", "final_logits_bias"]): - continue - if any(embed_name in name for embed_name in [ - 'shared.weight', 'encoder.embed_tokens.weight', - 
'decoder.embed_tokens.weight' - ]): - if shared_embedding_weight is None: - shared_embedding_weight = loaded_weight - continue - is_stacked = False - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - vllm_name = name - for src, dst in self.hf_to_vllm_mapper.orig_to_new_substr.items( - ): - vllm_name = vllm_name.replace(src, dst) - for src, dst in self.hf_to_vllm_mapper.orig_to_new_prefix.items( - ): - if vllm_name.startswith(src): - vllm_name = dst + vllm_name[len(src):] - break - vllm_name = vllm_name.replace(weight_name, param_name) - if vllm_name in model_params_dict: - param = model_params_dict[vllm_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight, shard_id) - loaded_params.add(vllm_name) - is_stacked = True - break - if not is_stacked: - remaining_weights.append((name, loaded_weight)) - loader = AutoWeightsLoader(self, skip_prefixes=["cls.", "pooler."]) - auto_loaded_params = loader.load_weights(remaining_weights, - mapper=self.hf_to_vllm_mapper) - loaded_params.update(auto_loaded_params) - if shared_embedding_weight is not None: - lm_head_param = self.lm_head.weight - weight_loader = getattr(lm_head_param, "weight_loader", - default_weight_loader) - weight_loader(lm_head_param, shared_embedding_weight) - self.model.encoder.embed_tokens.weight = self.lm_head.weight - self.model.decoder.embed_tokens.weight = self.lm_head.weight - loaded_params.update({ - 'model.encoder.embed_tokens.weight', 'lm_head.weight', - 'model.decoder.embed_tokens.weight' - }) - return loaded_params diff --git a/vllm/model_executor/models/donut.py b/vllm/model_executor/models/donut.py deleted file mode 100644 index 23f4c6a4f93fc..0000000000000 --- a/vllm/model_executor/models/donut.py +++ /dev/null @@ -1,381 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import math -from collections.abc 
import Iterable, Mapping, Sequence -from typing import Annotated, Literal, Optional, Union - -import torch -import torch.nn as nn -from transformers import BatchFeature, NougatProcessor - -from vllm.config import VllmConfig -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.bart import BartParallelLMHead, MBartDecoder -from vllm.model_executor.models.interfaces import (MultiModalEmbeddings, - SupportsMultiModal, - SupportsV0Only) -from vllm.model_executor.models.swin import SwinModel -from vllm.model_executor.models.utils import (AutoWeightsLoader, - _flatten_embeddings, flatten_bn) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargsItems) -from vllm.multimodal.parse import MultiModalDataItems -from vllm.multimodal.processing import (BaseProcessingInfo, - EncDecMultiModalProcessor, - PromptIndexTargets, PromptInsertion, - PromptUpdate) -from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.utils.tensor_schema import TensorSchema, TensorShape - - -class MBartDecoderWrapper(nn.Module): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - self.decoder = MBartDecoder(config, - cache_config, - quant_config=quant_config, - prefix=f"{prefix}.decoder") - - def forward(self, *args, **kwargs): - return self.decoder(*args, **kwargs) - - -class DonutLanguageForConditionalGeneration(nn.Module, SupportsV0Only): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - - self.config = config - self.model = 
MBartDecoderWrapper(vllm_config=vllm_config, - prefix=f"{prefix}.model") - embed_scale = math.sqrt( - config.d_model) if config.scale_embedding else 1.0 - - self.vocab_size = config.vocab_size - self.lm_head = BartParallelLMHead(self.vocab_size, - config.d_model, - embed_scale=embed_scale) - - self.logits_processor = LogitsProcessor(self.vocab_size, - config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - inputs_embeds: torch.Tensor, - **kwargs, - ) -> torch.Tensor: - r""" - Args: - input_ids: torch.Tensor of *decoder* input token ids. - positions: torch.Tensor of *decoder* position indices. - Returns: - Output torch.Tensor - """ - - return self.model(decoder_input_ids=input_ids, - decoder_positions=positions, - encoder_hidden_states=inputs_embeds) - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - if "final_logits_bias" in name: - continue - # if self.config.tie_word_embeddings and "embed_tokens" in name: - # continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return 
loaded_params - - -class DonutImagePixelInputs(TensorSchema): - """ - Dimensions: - - b: Batch size - - c: Number of channels (3) - - h: Height - - w: Width - """ - type: Literal["pixel_values"] - data: Annotated[torch.Tensor, TensorShape("b", 3, "h", "w")] - - -class DonutProcessingInfo(BaseProcessingInfo): - - def get_hf_config(self): - return self.ctx.get_hf_config() - - def get_hf_processor(self): - return self.ctx.get_hf_processor() - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": 1} - - def get_num_image_tokens(self) -> int: - return 1 - - -class DonutDummyInputsBuilder(BaseDummyInputsBuilder[DonutProcessingInfo]): - - def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: - return "" - - def get_dummy_mm_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> MultiModalDataDict: - num_images = mm_counts.get("image", 0) - - target_width, target_height = self.info.get_hf_config( - ).encoder.image_size - - return { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } - - -class DonutMultiModalProcessor(EncDecMultiModalProcessor[DonutProcessingInfo]): - - def _hf_processor_applies_updates( - self, - prompt_text: str, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - tokenization_kwargs: Mapping[str, object], - ) -> bool: - return False - - def create_encoder_prompt( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - ) -> Union[str, list[int]]: - return prompt - - def create_decoder_prompt( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - ) -> Union[str, list[int]]: - return prompt - - @property - def pad_dummy_encoder_prompt(self) -> bool: - return True - - def _call_hf_processor( - self, - prompt: str, - mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], - tok_kwargs: Mapping[str, object], - ) -> BatchFeature: - hf_processor = 
self.info.get_hf_processor() - if mm_data: - processed_outputs = super()._call_hf_processor( - prompt, mm_data, mm_kwargs, tok_kwargs) - if isinstance(hf_processor, NougatProcessor): - processed_outputs["input_ids"] = processed_outputs["labels"] - else: - tokenizer = hf_processor.tokenizer - processed_outputs = tokenizer(prompt, - add_special_tokens=False, - return_tensors="pt") - return processed_outputs - - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict(pixel_values=MultiModalFieldConfig.batched("image")) - - def _get_prompt_updates( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargsItems, - ) -> Sequence[PromptUpdate]: - hf_processor = self.info.get_hf_processor() - tokenizer = hf_processor.tokenizer - pad_token_id = tokenizer.pad_token_id - num_image_tokens = self.info.get_num_image_tokens() - image_tokens = [pad_token_id] * num_image_tokens - - return [ - PromptInsertion( - modality="image", - target=PromptIndexTargets.start(), - insertion=image_tokens, - ) - ] - - -@MULTIMODAL_REGISTRY.register_processor(DonutMultiModalProcessor, - info=DonutProcessingInfo, - dummy_inputs=DonutDummyInputsBuilder) -class DonutForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsV0Only): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - processor_config = vllm_config.model_config.hf_image_processor_config - - self.config = config - self.vision_config = config.encoder - self.processor_config = processor_config - self.encoder = SwinModel(config=config.encoder) - - self.decoder = DonutLanguageForConditionalGeneration( - vllm_config=vllm_config.with_hf_config(config.decoder), - prefix=f"{prefix}.decoder", - ) - self.pad_token_id = config.pad_token_id - - def 
_parse_and_validate_image_input(self, **kwargs: object): - pixel_values: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], - torch.Tensor]] = kwargs.pop( - "pixel_values", None) - image_embeds: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], - torch.Tensor]] = kwargs.pop( - "image_embeds", None) - - if pixel_values is None and image_embeds is None: - return None - - if pixel_values is not None and image_embeds is not None: - raise ValueError( - "Both pixel values and image embeds are provided.") - - if pixel_values is not None: - h, w = self.config.encoder.image_size - return DonutImagePixelInputs(type="pixel_values", - data=flatten_bn(pixel_values, - concat=True), - resolve_bindings={ - "h": h, - "w": w, - }) - - if image_embeds is not None: - raise NotImplementedError - - raise AssertionError("This line should be unreachable.") - - def _process_image_input( - self, image_input: DonutImagePixelInputs) -> torch.Tensor: - assert image_input["type"] == "pixel_values" - pixel_values = image_input["data"] - dtype = next(self.encoder.parameters()).dtype - pixel_values = pixel_values.to(dtype) - return self.encoder(pixel_values) - - def get_language_model(self) -> torch.nn.Module: - return self.decoder - - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is None: - return None - vision_embeddings = self._process_image_input(image_input) - return vision_embeddings - - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: MultiModalEmbeddings, - ) -> torch.Tensor: - return _flatten_embeddings(multimodal_embeddings) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - *, - encoder_input_ids: torch.Tensor, - encoder_positions: torch.Tensor, - **kwargs, - ) -> torch.Tensor: - r""" - Args: - input_ids: torch.Tensor of *decoder* input token ids. 
- positions: torch.Tensor of *decoder* position indices. - encoder_input_ids: torch.Tensor of *encoder* input token ids. - encoder_positions: torch.Tensor of *encoder* position indices - Returns: - Output torch.Tensor - """ - - inputs_embeds = None - if encoder_input_ids.numel() > 0: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(encoder_input_ids, - vision_embeddings) - - hidden_states = self.decoder(input_ids, - positions, - inputs_embeds=inputs_embeds) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - return self.decoder.compute_logits(hidden_states, sampling_metadata) - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) - return loader.load_weights(weights) diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py deleted file mode 100644 index 5e05e0c60f41c..0000000000000 --- a/vllm/model_executor/models/florence2.py +++ /dev/null @@ -1,1097 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import math -from collections import OrderedDict -from collections.abc import Iterable, Mapping, Sequence -from typing import Annotated, Literal, Optional, Union - -import torch -import torch.nn as nn -import torch.nn.functional as F -from einops import rearrange -from transformers import BartTokenizer, BatchFeature, PretrainedConfig - -from vllm.config import VllmConfig -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.bart import (BartDecoder, BartEncoder, - BartParallelLMHead, - BartScaledWordEmbedding) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import 
MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargsItems) -from vllm.multimodal.parse import MultiModalDataItems -from vllm.multimodal.processing import (BaseProcessingInfo, - EncDecMultiModalProcessor, - PromptIndexTargets, PromptInsertion, - PromptUpdate) -from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.sequence import IntermediateTensors -from vllm.utils.tensor_schema import TensorSchema, TensorShape - -from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, - SupportsV0Only) -from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings - - -class Florence2ImagePixelInputs(TensorSchema): - """ - Dimensions: - - b: Batch size - - c: Number of channels (3) - - h: Height of the image - - w: Width of the image - """ - - type: Literal["pixel_values"] - - data: Annotated[ - torch.Tensor, - TensorShape("b", 3, "h", "w"), - ] - - -# ViT implementation are all copied from -# https://huggingface.co/microsoft/Florence-2-base/blob/main/modeling_florence2.py -class LearnedAbsolutePositionEmbedding2D(nn.Module): - """ - This module learns positional embeddings up to a fixed maximum size. 
- """ - - def __init__(self, embedding_dim=256, num_pos=50): - super().__init__() - self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2) - self.column_embeddings = nn.Embedding( - num_pos, embedding_dim - (embedding_dim // 2)) - - def forward(self, pixel_values): - """ - pixel_values: (batch_size, height, width, num_channels) - returns: (batch_size, height, width, embedding_dim * 2) - """ - if len(pixel_values.shape) != 4: - raise ValueError('pixel_values must be a 4D tensor') - height, width = pixel_values.shape[1:3] - width_values = torch.arange(width, device=pixel_values.device) - height_values = torch.arange(height, device=pixel_values.device) - x_emb = self.column_embeddings(width_values) - y_emb = self.row_embeddings(height_values) - # (height, width, embedding_dim * 2) - pos = torch.cat([ - x_emb.unsqueeze(0).repeat(height, 1, 1), - y_emb.unsqueeze(1).repeat(1, width, 1) - ], - dim=-1) - # (embedding_dim * 2, height, width) - pos = pos.permute(2, 0, 1) - pos = pos.unsqueeze(0) - # (batch_size, embedding_dim * 2, height, width) - pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) - # (batch_size, height, width, embedding_dim * 2) - pos = pos.permute(0, 2, 3, 1) - return pos - - -class PositionalEmbeddingCosine1D(nn.Module): - """ - This class implements a very simple positional encoding. It follows closely - the encoder from the link below: - https://pytorch.org/tutorials/beginner/translation_transformer.html - Args: - embed_dim: The dimension of the embeddings. - dropout_prob: The dropout probability. - max_seq_len: The maximum length to precompute the positional encodings. - """ - - def __init__(self, embed_dim: int = 512, max_seq_len: int = 1024) -> None: - super().__init__() - self.embed_dim = embed_dim - self.max_seq_len = max_seq_len - # Generate the sinusoidal arrays. 
- factor = math.log(10000) - denominator = torch.exp(-factor * torch.arange(0, self.embed_dim, 2) / - self.embed_dim) - # Matrix where rows correspond to a positional embedding as a function - # of the position index (i.e., the row index). - frequencies = \ - torch.arange(0, self.max_seq_len) \ - .reshape(self.max_seq_len, 1) * denominator - pos_idx_to_embed = torch.zeros((self.max_seq_len, self.embed_dim)) - # Populate uneven entries. - pos_idx_to_embed[:, 0::2] = torch.sin(frequencies) - pos_idx_to_embed[:, 1::2] = torch.cos(frequencies) - # Save the positional embeddings in a constant buffer. - # self.register_buffer("pos_idx_to_embed", pos_idx_to_embed) - self.pos_idx_to_embed = nn.Parameter(pos_idx_to_embed, - requires_grad=False) - - def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor: - """ - Args: - seq_embeds: The sequence embeddings in order. Allowed size: - 1. [T, D], where T is the length of the sequence, and D is the - frame embedding dimension. - 2. [B, T, D], where B is the batch size and T and D are the - same as above. - Returns a tensor of with the same dimensions as the input: i.e., - [1, T, D] or [T, D]. - """ - shape_len = len(seq_embeds.shape) - assert 2 <= shape_len <= 3 - len_seq = seq_embeds.size(-2) - assert len_seq <= self.max_seq_len - pos_embeds = self.pos_idx_to_embed[0:seq_embeds.size(-2), :] - # Adapt pre-computed positional embeddings to the input. 
- if shape_len == 3: - pos_embeds = pos_embeds.view( - (1, pos_embeds.size(0), pos_embeds.size(1))) - return pos_embeds - - -class MySequential(nn.Sequential): - - def forward(self, *inputs): - for module in self._modules.values(): - if isinstance(inputs, tuple): - inputs = module(*inputs) - else: - inputs = module(inputs) - return inputs - - -class PreNorm(nn.Module): - - def __init__(self, norm, fn): - super().__init__() - self.norm = norm - self.fn = fn - - def forward(self, x, *args, **kwargs): - shortcut = x - if self.norm is not None: - x, size = self.fn(self.norm(x), *args, **kwargs) - else: - x, size = self.fn(x, *args, **kwargs) - - x = shortcut + x - - return x, size - - -class Mlp(nn.Module): - - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.net = nn.Sequential( - OrderedDict([("fc1", nn.Linear(in_features, hidden_features)), - ("act", act_layer()), - ("fc2", nn.Linear(hidden_features, out_features))])) - - def forward(self, x, size): - return self.net(x), size - - -class DepthWiseConv2d(nn.Module): - - def __init__( - self, - dim_in, - kernel_size, - padding, - stride, - bias=True, - ): - super().__init__() - self.dw = nn.Conv2d(dim_in, - dim_in, - kernel_size=kernel_size, - padding=padding, - groups=dim_in, - stride=stride, - bias=bias) - - def forward(self, x, size): - B, N, C = x.shape - H, W = size - assert N == H * W - - x = self.dw(x.transpose(1, 2).view(B, C, H, W)) - size = (x.size(-2), x.size(-1)) - x = x.flatten(2).transpose(1, 2) - return x, size - - -class ConvEmbed(nn.Module): - """ Image to Patch Embedding - """ - - def __init__(self, - patch_size=7, - in_chans=3, - embed_dim=64, - stride=4, - padding=2, - norm_layer=None, - pre_norm=True): - super().__init__() - self.patch_size = patch_size - - self.proj = nn.Conv2d(in_chans, - embed_dim, - 
kernel_size=patch_size, - stride=stride, - padding=padding) - - dim_norm = in_chans if pre_norm else embed_dim - self.norm = norm_layer(dim_norm) if norm_layer else None - - self.pre_norm = pre_norm - - def forward(self, x, size): - H, W = size - if len(x.size()) == 3: - if self.norm and self.pre_norm: - x = self.norm(x) - x = rearrange(x, 'b (h w) c -> b c h w', h=H, w=W) - - x = self.proj(x) - - _, _, H, W = x.shape - x = rearrange(x, 'b c h w -> b (h w) c') - if self.norm and not self.pre_norm: - x = self.norm(x) - - return x, (H, W) - - -class ChannelAttention(nn.Module): - - def __init__(self, dim, groups=8, qkv_bias=True): - super().__init__() - - self.groups = groups - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.proj = nn.Linear(dim, dim) - - def forward(self, x, size): - B, N, C = x.shape - - qkv = self.qkv(x).reshape(B, N, 3, self.groups, - C // self.groups).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[2] - - q = q * (float(N)**-0.5) - attention = q.transpose(-1, -2) @ k - attention = attention.softmax(dim=-1) - x = (attention @ v.transpose(-1, -2)).transpose(-1, -2) - x = x.transpose(1, 2).reshape(B, N, C) - x = self.proj(x) - return x, size - - -class ChannelBlock(nn.Module): - - def __init__(self, - dim, - groups, - mlp_ratio=4., - qkv_bias=True, - drop_path_rate=0., - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - conv_at_attn=True, - conv_at_ffn=True): - super().__init__() - - self.conv1 = PreNorm(None, DepthWiseConv2d( - dim, 3, 1, 1)) if conv_at_attn else None - self.channel_attn = PreNorm( - norm_layer(dim), - ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias), - ) - self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, - 1)) if conv_at_ffn else None - self.ffn = PreNorm( - norm_layer(dim), - Mlp(in_features=dim, - hidden_features=int(dim * mlp_ratio), - act_layer=act_layer), - ) - - def forward(self, x, size): - if self.conv1: - x, size = self.conv1(x, size) - x, size = self.channel_attn(x, size) - - if self.conv2: - 
x, size = self.conv2(x, size) - x, size = self.ffn(x, size) - - return x, size - - -def window_partition(x, window_size: int): - B, H, W, C = x.shape - x = x.view(B, H // window_size, window_size, W // window_size, window_size, - C) - windows = x.permute(0, 1, 3, 2, 4, - 5).contiguous().view(-1, window_size, window_size, C) - return windows - - -def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int): - B = batch_size - - x = windows.view(B, H // window_size, W // window_size, window_size, - window_size, -1) - x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) - return x - - -class WindowAttention(nn.Module): - - def __init__(self, dim, num_heads, window_size, qkv_bias=True): - - super().__init__() - self.dim = dim - self.window_size = window_size - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = float(head_dim)**-0.5 - - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.proj = nn.Linear(dim, dim) - - self.softmax = nn.Softmax(dim=-1) - - def forward(self, x, size): - - H, W = size - B, L, C = x.shape - assert L == H * W, "input feature has wrong size" - - x = x.view(B, H, W, C) - - pad_l = pad_t = 0 - pad_r = (self.window_size - W % self.window_size) % self.window_size - pad_b = (self.window_size - H % self.window_size) % self.window_size - x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) - _, Hp, Wp, _ = x.shape - - x = window_partition(x, self.window_size) - x = x.view(-1, self.window_size * self.window_size, C) - - # W-MSA/SW-MSA - # attn_windows = self.attn(x_windows) - - B_, N, C = x.shape - qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, - C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[2] - - q = q * self.scale - attn = (q @ k.transpose(-2, -1)) - attn = self.softmax(attn) - - x = (attn @ v).transpose(1, 2).reshape(B_, N, C) - x = self.proj(x) - - # merge windows - x = x.view(-1, self.window_size, self.window_size, C) - x = window_reverse(x, B, 
self.window_size, Hp, Wp) - - if pad_r > 0 or pad_b > 0: - x = x[:, :H, :W, :].contiguous() - - x = x.view(B, H * W, C) - - return x, size - - -class SpatialBlock(nn.Module): - - def __init__(self, - dim, - num_heads, - window_size, - mlp_ratio=4., - qkv_bias=True, - drop_path_rate=0., - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - conv_at_attn=True, - conv_at_ffn=True): - super().__init__() - - self.conv1 = PreNorm(None, DepthWiseConv2d( - dim, 3, 1, 1)) if conv_at_attn else None - self.window_attn = PreNorm( - norm_layer(dim), - WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias), - ) - self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, - 1)) if conv_at_ffn else None - self.ffn = PreNorm( - norm_layer(dim), - Mlp(in_features=dim, - hidden_features=int(dim * mlp_ratio), - act_layer=act_layer), - ) - - def forward(self, x, size): - if self.conv1: - x, size = self.conv1(x, size) - x, size = self.window_attn(x, size) - - if self.conv2: - x, size = self.conv2(x, size) - x, size = self.ffn(x, size) - return x, size - - -class DaViT(nn.Module): - - def __init__( - self, - in_chans=3, - num_classes=1000, - depths=(1, 1, 3, 1), - patch_size=(7, 2, 2, 2), - patch_stride=(4, 2, 2, 2), - patch_padding=(3, 0, 0, 0), - patch_prenorm=(False, False, False, False), - embed_dims=(64, 128, 192, 256), - num_heads=(3, 6, 12, 24), - num_groups=(3, 6, 12, 24), - window_size=7, - mlp_ratio=4., - qkv_bias=True, - drop_path_rate=0.1, - norm_layer=nn.LayerNorm, - enable_checkpoint=False, - conv_at_attn=True, - conv_at_ffn=True, - ): - super().__init__() - - self.num_classes = num_classes - self.embed_dims = embed_dims - self.num_heads = num_heads - self.num_groups = num_groups - self.num_stages = len(self.embed_dims) - self.enable_checkpoint = enable_checkpoint - assert self.num_stages == len(self.num_heads) == len(self.num_groups) - - num_stages = len(embed_dims) - dpr = [ - x.item() for x in torch.linspace(0, drop_path_rate, - sum(depths) * 2) - ] - - depth_offset = 0 - 
convs = [] - blocks = [] - for i in range(num_stages): - conv_embed = ConvEmbed( - patch_size=patch_size[i], - stride=patch_stride[i], - padding=patch_padding[i], - in_chans=in_chans if i == 0 else self.embed_dims[i - 1], - embed_dim=self.embed_dims[i], - norm_layer=norm_layer, - pre_norm=patch_prenorm[i]) - convs.append(conv_embed) - - block = MySequential(*[ - MySequential( - OrderedDict([('spatial_block', - SpatialBlock( - embed_dims[i], - num_heads[i], - window_size, - drop_path_rate=dpr[depth_offset + j * 2], - qkv_bias=qkv_bias, - mlp_ratio=mlp_ratio, - conv_at_attn=conv_at_attn, - conv_at_ffn=conv_at_ffn, - )), - ('channel_block', - ChannelBlock( - embed_dims[i], - num_groups[i], - drop_path_rate=dpr[depth_offset + j * 2 + - 1], - qkv_bias=qkv_bias, - mlp_ratio=mlp_ratio, - conv_at_attn=conv_at_attn, - conv_at_ffn=conv_at_ffn, - ))])) for j in range(depths[i]) - ]) - blocks.append(block) - depth_offset += depths[i] * 2 - - self.convs = nn.ModuleList(convs) - self.blocks = nn.ModuleList(blocks) - - self.avgpool = nn.AdaptiveAvgPool1d(1) - - @property - def dim_out(self): - return self.embed_dims[-1] - - def forward_features_unpool(self, x): - """ - forward until avg pooling - Args: - x (_type_): input image tensor - """ - input_size = (x.size(2), x.size(3)) - for conv, block in zip(self.convs, self.blocks): - x, input_size = conv(x, input_size) - x, input_size = block(x, input_size) - return x - - def forward_features(self, x): - x = self.forward_features_unpool(x) - - # (batch_size, num_tokens, token_dim) - x = self.avgpool(x.transpose(1, 2)) - # (batch_size, 1, num_tokens) - x = torch.flatten(x, 1) - x = self.norms(x) - - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - return x - - @classmethod - def from_config(cls, config): - return cls( - depths=config.depths, - embed_dims=config.dim_embed, - num_heads=config.num_heads, - num_groups=config.num_groups, - patch_size=config.patch_size, - 
patch_stride=config.patch_stride, - patch_padding=config.patch_padding, - patch_prenorm=config.patch_prenorm, - drop_path_rate=config.drop_path_rate, - window_size=config.window_size, - ) - - -# Language backbone and processor implementation -class Florence2LanguageModel(nn.Module): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - self.config = config - - self.vocab_size = config.vocab_size - - self.shared = BartScaledWordEmbedding(self.vocab_size, config.d_model) - self.encoder = BartEncoder(config, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.encoder") - self.decoder = BartDecoder(config, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.decoder") - - if self.config.tie_word_embeddings: - self.encoder.embed_tokens.weight = self.shared.weight - self.decoder.embed_tokens.weight = self.shared.weight - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - encoder_input_ids: torch.Tensor, - encoder_positions: torch.Tensor, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - r""" - Args: - input_ids: Indices of *decoder* input sequence tokens - in the vocabulary. - Padding will be ignored by default should you - provide it. - positions: Positions of *decoder* input sequence tokens. - encoder_input_ids: Indices of *encoder* input sequence tokens - in the vocabulary. - encoder_positions: Positions of *encoder* input sequence tokens. 
- Returns: - Model output torch.Tensor - """ - - encoder_hidden_states = None - - if ((inputs_embeds is not None and inputs_embeds.numel() > 0) - or encoder_input_ids.numel() > 0): - # Run encoder attention if a non-zero number of encoder tokens - # are provided as input - encoder_hidden_states = self.encoder(input_ids=encoder_input_ids, - positions=encoder_positions, - inputs_embeds=inputs_embeds) - - # decoder outputs consists of - # (dec_features, past_key_value, dec_hidden, dec_attn) - decoder_outputs = self.decoder( - decoder_input_ids=input_ids, - decoder_positions=positions, - encoder_hidden_states=encoder_hidden_states) - - return decoder_outputs - - -class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - - self.config = config - self.model = Florence2LanguageModel(vllm_config=vllm_config, - prefix=f"{prefix}.model") - embed_scale = math.sqrt( - config.d_model) if config.scale_embedding else 1.0 - - self.vocab_size = config.vocab_size - self.lm_head = BartParallelLMHead(self.vocab_size, - config.d_model, - embed_scale=embed_scale) - if self.config.tie_word_embeddings: - self.lm_head.tie_weights(self.model.shared) - - self.logits_processor = LogitsProcessor(self.vocab_size, - config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - encoder_input_ids: torch.Tensor, - encoder_positions: torch.Tensor, - inputs_embeds: Optional[torch.Tensor] = None, - **kwargs, - ) -> torch.Tensor: - r""" - Args: - input_ids: torch.Tensor of *decoder* input token ids. - positions: torch.Tensor of *decoder* position indices. - encoder_input_ids: torch.Tensor of *encoder* input token ids. 
- encoder_positions: torch.Tensor of *encoder* position indices - Returns: - Output torch.Tensor - """ - - return self.model(input_ids, - positions, - encoder_input_ids, - encoder_positions, - inputs_embeds=inputs_embeds) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.encoder.embed_tokens(input_ids) - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - if "final_logits_bias" in name: - continue - if self.config.tie_word_embeddings and ("embed_tokens" in name - or "lm_head" in name): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - - -class Florence2ProcessingInfo(BaseProcessingInfo): - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": 1} - - def get_num_image_tokens(self) -> int: - processor_config = self.ctx.get_hf_image_processor_config() - return processor_config["image_seq_length"] - - -class Florence2DummyInputsBuilder( - BaseDummyInputsBuilder[Florence2ProcessingInfo]): - - def get_dummy_text(self, 
mm_counts: Mapping[str, int]) -> str: - return "" - - def get_dummy_mm_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> MultiModalDataDict: - num_images = mm_counts.get("image", 0) - - target_width = target_height = self.info.get_hf_config().projection_dim - - return { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } - - -class Florence2MultiModalProcessor( - EncDecMultiModalProcessor[Florence2ProcessingInfo]): - - def _hf_processor_applies_updates( - self, - prompt_text: str, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - tokenization_kwargs: Mapping[str, object], - ) -> bool: - return False - - def create_encoder_prompt( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - ) -> Union[str, list[int]]: - return prompt - - def create_decoder_prompt( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - ) -> Union[str, list[int]]: - return [self.info.get_hf_config().eos_token_id] - - def _apply_hf_processor_tokens_only( - self, - prompt_tokens: list[int], - ) -> list[int]: - hf_processor = self.info.get_hf_processor() - tokenizer: BartTokenizer = hf_processor.tokenizer - prompt_text = tokenizer.decode(prompt_tokens) - # convert task tokens to prompt - prompt_text = hf_processor._construct_prompts([prompt_text])[0] - prompt_tokens = tokenizer.encode(prompt_text, add_special_tokens=False) - return prompt_tokens - - def _call_hf_processor( - self, - prompt: str, - mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], - tok_kwargs: Mapping[str, object], - ) -> BatchFeature: - if mm_data: - processed_outputs = super()._call_hf_processor( - prompt, mm_data, mm_kwargs, tok_kwargs) - else: - hf_processor = self.info.get_hf_processor() - tokenizer = hf_processor.tokenizer - prompt = hf_processor._construct_prompts([prompt])[0] - processed_outputs = tokenizer(prompt, - add_special_tokens=True, - 
return_tensors="pt") - return processed_outputs - - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict(pixel_values=MultiModalFieldConfig.batched("image")) - - def _get_prompt_updates( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargsItems, - ) -> Sequence[PromptUpdate]: - hf_config = self.info.get_hf_config() - pad_token_id = hf_config.pad_token_id - num_image_tokens = self.info.get_num_image_tokens() - image_tokens = [pad_token_id] * num_image_tokens - - return [ - PromptInsertion( - modality="image", - target=PromptIndexTargets.start(), - insertion=image_tokens, - ) - ] - - -@MULTIMODAL_REGISTRY.register_processor( - Florence2MultiModalProcessor, - info=Florence2ProcessingInfo, - dummy_inputs=Florence2DummyInputsBuilder) -class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsV0Only): - - @classmethod - def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: - if modality.startswith("image"): - return None - - raise ValueError("Only image modality is supported") - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - processor_config = vllm_config.model_config.hf_image_processor_config - - self.config = config - self.vision_config = config.vision_config - self.processor_config = processor_config - assert config.vision_config.model_type == 'davit', ( - 'only DaViT is supported for now') - self.vision_tower = DaViT.from_config(config=config.vision_config) - self._build_image_projection_layers(config) - self.language_model = Florence2LanguageForConditionalGeneration( - vllm_config=vllm_config.with_hf_config(config.text_config), - prefix=f"{prefix}.language_model", - ) - self.pad_token_id = config.pad_token_id - - def _build_image_projection_layers(self, 
config: PretrainedConfig): - image_dim_out = config.vision_config.dim_embed[-1] - dim_projection = config.vision_config.projection_dim - self.image_projection = nn.Parameter( - torch.empty(image_dim_out, dim_projection)) - self.image_proj_norm = nn.LayerNorm(dim_projection) - image_pos_embed_config = config.vision_config.image_pos_embed - if image_pos_embed_config['type'] == 'learned_abs_2d': - self.image_pos_embed = LearnedAbsolutePositionEmbedding2D( - embedding_dim=image_dim_out, - num_pos=image_pos_embed_config['max_pos_embeddings']) - else: - raise NotImplementedError("Florence2 only supports learned_abs_2d " - "as image position embedding.") - - self.image_feature_source = config.vision_config.image_feature_source - - # temporal embedding - visual_temporal_embedding_config = ( - self.vision_config.visual_temporal_embedding) - if visual_temporal_embedding_config['type'] == 'COSINE': - self.visual_temporal_embed = PositionalEmbeddingCosine1D( - embed_dim=image_dim_out, - max_seq_len=visual_temporal_embedding_config[ - 'max_temporal_embeddings']) - else: - raise NotImplementedError( - 'Florence2 only supports COSINE as temporal embedding.') - - def _parse_and_validate_image_input(self, **kwargs: object): - pixel_values: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], - torch.Tensor]] = kwargs.pop( - "pixel_values", None) - image_embeds: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], - torch.Tensor]] = kwargs.pop( - "image_embeds", None) - - if pixel_values is None and image_embeds is None: - return None - - if pixel_values is not None and image_embeds is not None: - raise ValueError( - "Both pixel values and image embeds are provided.") - - if pixel_values is not None: - size = self.processor_config["size"] - expected_h, expected_w = size["height"], size["width"] - - return Florence2ImagePixelInputs( - type="pixel_values", - data=flatten_bn(pixel_values, concat=True), - resolve_bindings={ - "h": expected_h, - "w": expected_w - }, - 
) - - if image_embeds is not None: - raise NotImplementedError - - raise AssertionError("This line should be unreachable.") - - def _encode_image(self, pixel_values: torch.Tensor) -> torch.Tensor: - dtype = next(self.vision_tower.parameters()).dtype - pixel_values = pixel_values.to(dtype) - - batch_size, T = pixel_values.size(0), 1 - x = self.vision_tower.forward_features_unpool(pixel_values) - if self.image_pos_embed is not None: - x = x.view(batch_size * T, -1, x.shape[-1]) - num_tokens = x.shape[-2] - h, w = int(num_tokens**0.5), int(num_tokens**0.5) - assert h * w == num_tokens, ( - 'only support square feature maps for now') - x = x.view(batch_size * T, h, w, x.shape[-1]) - pos_embed = self.image_pos_embed(x) - x = x + pos_embed - x = x.view(batch_size, T * h * w, x.shape[-1]) - - if self.visual_temporal_embed is not None: - visual_temporal_embed = self.visual_temporal_embed( - x.view(batch_size, T, -1, x.shape[-1])[:, :, 0]) - x = x.view(batch_size, T, -1, - x.shape[-1]) + visual_temporal_embed.view( - 1, T, 1, x.shape[-1]) - - x_feat_dict = {} - - spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2) - x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x - - temporal_avg_pool_x = x.view(batch_size, T, -1, - x.shape[-1]).mean(dim=1) - x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x - - x = x.view(batch_size, T, -1, x.shape[-1])[:, -1] - x_feat_dict['last_frame'] = x - - new_x = [] - for _image_feature_source in self.image_feature_source: - if _image_feature_source not in x_feat_dict: - raise ValueError('invalid image feature source: {}'.format( - _image_feature_source)) - new_x.append(x_feat_dict[_image_feature_source]) - - x = torch.cat(new_x, dim=1) - - x = x @ self.image_projection - x = self.image_proj_norm(x) - - return x - - def _process_image_input( - self, image_input: Florence2ImagePixelInputs) -> torch.Tensor: - assert image_input["type"] == "pixel_values" - pixel_values = image_input["data"] - return 
self._encode_image(pixel_values) - - def get_language_model(self) -> torch.nn.Module: - return self.language_model - - def get_multimodal_embeddings(self, - **kwargs: object) -> MultiModalEmbeddings: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is None: - return [] - vision_embeddings = self._process_image_input(image_input) - return vision_embeddings - - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - self.pad_token_id) - return inputs_embeds - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - *, - encoder_input_ids: torch.Tensor, - encoder_positions: torch.Tensor, - **kwargs, - ) -> torch.Tensor: - r""" - Args: - input_ids: torch.Tensor of *decoder* input token ids. - positions: torch.Tensor of *decoder* position indices. - encoder_input_ids: torch.Tensor of *encoder* input token ids. 
- encoder_positions: torch.Tensor of *encoder* position indices - Returns: - Output torch.Tensor - """ - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - if encoder_input_ids.numel() > 0 or vision_embeddings is not None: - inputs_embeds = self.get_input_embeddings(encoder_input_ids, - vision_embeddings) - else: - inputs_embeds = None - - hidden_states = self.language_model(input_ids, - positions, - encoder_input_ids, - encoder_positions, - inputs_embeds=inputs_embeds) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) - return loader.load_weights(weights) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 22386a5e819ab..cbf327ce02b6b 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -36,7 +36,9 @@ import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange +from packaging.version import Version from transformers import BatchFeature +from transformers import __version__ as TRANSFORMERS_VERSION from transformers.models.glm4v.configuration_glm4v import Glm4vVisionConfig from transformers.models.glm4v.image_processing_glm4v import ( Glm4vImageProcessor, smart_resize) @@ -1001,28 +1003,32 @@ class Glm4vProcessingInfo(BaseProcessingInfo): max_frame_idx = meta_frames - 1 duration = metadata.get("duration", round(max_frame_idx / video_fps) + 1) - if duration <= video_processor.max_duration: - n = int(math.floor(duration * video_processor.fps)) - frame_indices = [ - min( - max_frame_idx, - int(math.ceil(i * video_fps / video_processor.fps)), - ) for i in range(n) - ] + do_sample_frames = metadata["do_sample_frames"] + if not 
do_sample_frames: + frame_indices = metadata["frames_indices"] else: - num_samples = int(video_processor.max_duration * - video_processor.fps) - if num_samples >= meta_frames: - frame_indices = list(range(meta_frames)) - else: - target_seconds = np.linspace(0, - duration, - num_samples, - endpoint=True) + if duration <= video_processor.max_duration: + n = int(math.floor(duration * video_processor.fps)) frame_indices = [ - min(max_frame_idx, int(math.ceil(t * video_fps))) - for t in target_seconds + min( + max_frame_idx, + int(math.ceil(i * video_fps / video_processor.fps)), + ) for i in range(n) ] + else: + num_samples = int(video_processor.max_duration * + video_processor.fps) + if num_samples >= meta_frames: + frame_indices = list(range(meta_frames)) + else: + target_seconds = np.linspace(0, + duration, + num_samples, + endpoint=True) + frame_indices = [ + min(max_frame_idx, int(math.ceil(t * video_fps))) + for t in target_seconds + ] seen, uniq = set(), [] for idx in frame_indices: @@ -1139,7 +1145,9 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]): "fps": 2.0, "duration": num_frames / 2.0, "total_num_frames": num_frames, + "frames_indices": [i for i in range(num_frames)], "video_backend": "opencv", + "do_sample_frames": False, } video_item = (video.copy(), video_metadata) video_items.append(video_item) @@ -1172,34 +1180,37 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): for item in mm_data.pop("videos", []): video_array, metadata = item - if metadata["video_backend"] == "opencv_dynamic": - mm_kwargs["do_sample_frames"] = False - - elif metadata["total_num_frames"] != len(video_array): - logger.warning( - "Total frames in metadata " - "(%s) does not match the length of " - "video array %s. This can " - "be because the video is resampled " - "in advance. 
This may cause " - "a divergence with HF implementation.", - metadata["total_num_frames"], - len(video_array), - ) - metadata["total_num_frames"] = len(video_array) + # don't update mm_kwargs inplace + video_mm_kwargs = dict(**mm_kwargs) + video_mm_kwargs["do_sample_frames"] = metadata.get( + "do_sample_frames", True) video_mm_data = dict() video_mm_data["videos"] = [[video_array]] - video_mm_data["video_metadata"] = [[VideoMetadata(**metadata)]] + + # backward compatibility for Transformers 4.55 + unuse_metadata = ["do_sample_frames"] + if not hasattr( + VideoMetadata, + "frames_indices") and "frames_indices" in metadata: + unuse_metadata.append("frames_indices") + + video_mm_data["video_metadata"] = [[ + VideoMetadata( + **{ + k: metadata[k] + for k in metadata if k not in unuse_metadata + }) + ]] video_outputs = super()._call_hf_processor( prompt="<|begin_of_video|><|video|><|end_of_video|>", mm_data=video_mm_data, - mm_kwargs=mm_kwargs, + mm_kwargs=video_mm_kwargs, tok_kwargs=tok_kwargs, ) - if "do_sample_frames" in mm_kwargs and not mm_kwargs[ - "do_sample_frames"]: + if not video_mm_kwargs["do_sample_frames"] and Version( + TRANSFORMERS_VERSION) < Version("4.56.0"): # Transformers v4.55 has incorrect timestamps issue for # skip sampling. We construct the placeholder manually to # get placeholders with correct timestamps. @@ -1218,6 +1229,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): prompt = prompt.replace( "<|begin_of_video|><|video|><|end_of_video|>", video_placeholder, + 1, ) video_grid_thw_lst.append(video_outputs["video_grid_thw"]) @@ -1524,7 +1536,7 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, return None # The result multimodal_embeddings is tuple of tensors, with each - # tensor correspoending to a multimodal data item (image or video). + # tensor corresponding to a multimodal data item (image or video). multimodal_embeddings: tuple[torch.Tensor, ...] 
= () # NOTE: It is important to iterate over the keys in this dictionary diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index d998b8a0ab4f7..b59d1b88cf5ce 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -738,7 +738,7 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal, return [] # The result multimodal_embeddings is tuple of tensors, with each - # tensor correspoending to a multimodal data item (image or video). + # tensor corresponding to a multimodal data item (image or video). multimodal_embeddings: tuple[torch.Tensor, ...] = () # NOTE: It is important to iterate over the keys in this dictionary diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index bceb6cc42768e..99b77729b5018 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -9,7 +9,7 @@ import torch.nn as nn from transformers import LlamaConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import VllmConfig +from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import QKVParallelLinear @@ -33,10 +33,14 @@ class LlamaDecoderLayer(LlamaDecoderLayer): def __init__( self, config: LlamaConfig, + cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: - super().__init__(config, quant_config=quant_config, prefix=prefix) + super().__init__(config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix) # override qkv self.self_attn.qkv_proj = QKVParallelLinear( @@ -114,6 +118,8 @@ class LlamaModel(nn.Module): speculative_config.draft_model_config.hf_config self.vocab_size = self.config.vocab_size + current_vllm_config = 
get_current_vllm_config() + self.embed_tokens = VocabParallelEmbedding( self.config.vocab_size, self.config.hidden_size, @@ -123,6 +129,7 @@ class LlamaModel(nn.Module): self.layers = nn.ModuleList([ LlamaDecoderLayer( config=self.config, + cache_config=current_vllm_config.cache_config, prefix=maybe_prefix(prefix, f"layers.{start_layer_id}"), ) ]) diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py index e314ae357ecd4..140800dd41c76 100644 --- a/vllm/model_executor/models/midashenglm.py +++ b/vllm/model_executor/models/midashenglm.py @@ -497,8 +497,11 @@ class MiDashengLMDummyInputsBuilder( hf_processor = self.info.get_hf_processor() audio_token = hf_processor.audio_token + audio_bos_token = hf_processor.audio_bos_token + audio_eos_token = hf_processor.audio_eos_token - return audio_token * num_audios + single_audio_text = f"{audio_bos_token}{audio_token}{audio_eos_token}" + return single_audio_text * num_audios def get_dummy_mm_data( self, @@ -577,14 +580,7 @@ class MiDashengLMMultiModalProcessor( vocab = tokenizer.get_vocab() audio_token = getattr(processor, "audio_token", "<|AUDIO|>") - audio_bos_token = getattr(processor, "audio_bos_token", - "<|audio_bos|>") - audio_eos_token = getattr(processor, "audio_eos_token", - "<|audio_eos|>") - audio_token_id = vocab[audio_token] - audio_bos_id = vocab[audio_bos_token] - audio_eos_id = vocab[audio_eos_token] out_mm_data = out_mm_kwargs.get_data() audio_length = out_mm_data.get("audio_length") @@ -604,7 +600,7 @@ class MiDashengLMMultiModalProcessor( audio_tokens = [audio_token_id] * num_features return PromptUpdateDetails.select_token_id( - [audio_bos_id] + audio_tokens + [audio_eos_id], + audio_tokens, embed_token_id=audio_token_id, ) @@ -670,8 +666,18 @@ class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP): f"Got type: {type(mm_input)}") if isinstance(mm_input, torch.Tensor): return mm_input.reshape(-1, *mm_input.shape[2:]) - else: - return 
torch.concat(mm_input) + + if name == "input_values": + max_length = max(tensor.shape[1] for tensor in mm_input) + padded_mm_input = [ + torch.nn.functional.pad(tensor, + (0, max_length - tensor.shape[1])) + if tensor.shape[1] < max_length else tensor + for tensor in mm_input + ] + return torch.concat(padded_mm_input) + + return torch.concat(mm_input) def _parse_and_validate_audio_input( self, **kwargs: object) -> Optional[MiDashengLMAudioInputs]: diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py deleted file mode 100644 index 048894085b360..0000000000000 --- a/vllm/model_executor/models/mllama.py +++ /dev/null @@ -1,1697 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Copyright 2024 the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""PyTorch Mllama model.""" -import math -from collections.abc import Iterable, Mapping, Sequence -from typing import Annotated, Literal, Optional, Union - -import numpy as np -import torch -import torch.nn.functional as F -import transformers.models.mllama.configuration_mllama as config_mllama -from PIL.Image import Image -from torch import nn -from transformers import BatchFeature, MllamaConfig -from transformers.modeling_outputs import (BaseModelOutput, - CausalLMOutputWithPast) -from transformers.models.mllama.image_processing_mllama import ( - get_optimal_tiled_canvas) -from transformers.models.mllama.processing_mllama import ( - MllamaProcessor, get_cross_attention_token_mask) - -import vllm.distributed.parallel_state as ps -from vllm.attention import Attention, AttentionMetadata, AttentionType -from vllm.attention.layer import MultiHeadAttention -from vllm.attention.ops.paged_attn import PagedAttention -from vllm.attention.selector import _Backend -from vllm.config import VllmConfig -from vllm.distributed import get_pp_group, get_tp_group -from vllm.forward_context import get_forward_context -from vllm.logger import init_logger -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - QKVCrossParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, 
MultiModalEncDecInputs, - MultiModalFieldConfig, - MultiModalKwargsItems, MultiModalUUIDDict) -from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, - MultiModalDataItems) -from vllm.multimodal.processing import (BaseProcessingInfo, - EncDecMultiModalProcessor, - PromptReplacement, PromptUpdate) -from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.utils.tensor_schema import TensorSchema, TensorShape - -from .clip import CLIPMLP -from .interfaces import SupportsMultiModal, SupportsV0Only -from .llama import LlamaDecoderLayer, LlamaMLP -from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix - -logger = init_logger(__name__) - - -class MllamaImagePixelInputs(TensorSchema): - """ - Dimensions: - - batch_size: Batch size - - max_num_image: Max number of images - - max_num_chunk: Max number of chunks - - max_num_tiles: Max number of tiles per image - - num_channel: Number of channels - - height: Height - - width: Width - """ - - type: Literal["pixel_values"] = "pixel_values" - - data: Annotated[torch.Tensor, - TensorShape("batch_size", "max_num_image", "max_num_chunk", - "num_channel", "height", "width")] - - aspect_ratio_ids: Annotated[torch.Tensor, - TensorShape("batch_size", "max_num_image")] - - aspect_ratio_mask: Annotated[ - torch.Tensor, - TensorShape("batch_size", "max_num_image", "max_num_tiles")] - - -# TODO: support LlamaImageEmbeddingInputs - - -def calc_token_per_chunk(image_size: int) -> int: - assert image_size % 14 == 0, "chunk size should be multiple of 14" - token_per_chunk = (image_size // 14)**2 + 1 - return token_per_chunk - - -class MllamaProcessingInfo(BaseProcessingInfo): - - def get_hf_config(self) -> MllamaConfig: - return self.ctx.get_hf_config(MllamaConfig) - - def get_hf_processor(self, **kwargs: object) -> MllamaProcessor: - return self.ctx.get_hf_processor(MllamaProcessor, **kwargs) - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None} - - def 
get_token_per_chunk_from_config(self) -> int: - image_size = self.get_hf_config().vision_config.image_size - return calc_token_per_chunk(image_size) - - def get_num_tiles_per_image(self, image_height: int, - image_width: int) -> int: - vision_config = self.get_hf_config().vision_config - max_num_tiles = vision_config.max_num_tiles - image_size = vision_config.image_size - tiled_height, tiled_width = get_optimal_tiled_canvas( - image_height, - image_width, - max_num_tiles, - tile_size=image_size, - ) - num_tiles_height = tiled_height // image_size - num_tiles_width = tiled_width // image_size - return num_tiles_height * num_tiles_width - - def get_image_size_with_most_features(self) -> ImageSize: - vision_config = self.get_hf_config().vision_config - image_size = vision_config.image_size - max_num_tiles = vision_config.max_num_tiles - # Result in the max possible feature size (h:w = 16:1) - return ImageSize(height=max_num_tiles * image_size, width=image_size) - - -class MllamaDummyInputsBuilder(BaseDummyInputsBuilder[MllamaProcessingInfo]): - - def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: - num_images = mm_counts.get("image", 0) - - processor = self.info.get_hf_processor() - image_token = processor.image_token - - return image_token * num_images - - def get_dummy_mm_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> MultiModalDataDict: - num_images = mm_counts.get("image", 0) - - target_width, target_height = \ - self.info.get_image_size_with_most_features() - - return { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } - - -class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo] - ): - - def apply( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - tokenization_kwargs: Optional[Mapping[str, object]] = None, - mm_uuids: Optional[MultiModalUUIDDict] = None, - ) -> 
MultiModalEncDecInputs: - mm_inputs = super().apply(prompt, - mm_data, - hf_processor_mm_kwargs, - tokenization_kwargs, - mm_uuids=mm_uuids) - - image_token_id = self.info.get_hf_config().image_token_index - # Check that the number of image tokens in the decoder prompt matches - # the number of images provided in mm_data - num_image_tokens = mm_inputs['prompt_token_ids'].count(image_token_id) - image_data = mm_data.get("image", []) - num_images = 1 if isinstance(image_data, Image) else len(image_data) - if num_image_tokens != num_images: - raise ValueError( - f"The number of image tokens ({num_image_tokens}) must be" - f" the same as the number of images ({num_images})") - - # Given prompt: P0 P1 P3 P4 D5 D6...., (P-prefill, D-decode) # noqa: E501 - # P0 & P1 do cross attention with placeholder of - # P3 P4 D5 D6 do cross attention with placeholder of and - # Example input to encoder and decoder: - # { - # 'encoder': { - # 'type': 'token', - # 'prompt_token_ids': [128256, 128256, ..., 128256], - # 'prompt': '<|image|><|image|>...<|image|>', - # 'multi_modal_data': {'image': }, # noqa: E501 - # }, - # 'decoder': { - # 'type': 'token', - # 'prompt_token_ids': [128000, 128256, 128000, 3923, 374, 279, 2262, 315, 420, 2217, 30], # noqa: E501 - # 'prompt': '<|image|><|begin_of_text|>What is the content of this image?', # noqa: E501 - # 'multi_modal_data': {'image': }, # noqa: E501 - # }, - # } - - if mm_data: - hf_processor = self.info.get_hf_processor() - image_token: str = hf_processor.image_token - - # Since only the last group of consecutive images - # are attended by the decoded tokens, we only need to - # get the number of tokens for those images. - token_per_chunk = self.info.get_token_per_chunk_from_config() - num_decode_images = self._get_num_image_in_last_group( - mm_inputs["prompt_token_ids"]) - num_encode_images = num_images - num_decode_images - - # Set encoder prompt length based on the number of tiles. 
- # This tells the block manager to allocate correct number - # of slots for encoder tokens. - num_tiles = mm_inputs["mm_kwargs"].get_data()["num_tiles"] - decode_tiles = num_tiles[num_encode_images:num_images].sum().item() - num_tokens = decode_tiles * token_per_chunk - mm_inputs["encoder_prompt_token_ids"] = [image_token_id - ] * num_tokens - mm_inputs["encoder_prompt"] = image_token * num_tokens - - return mm_inputs - - def _get_num_image_in_last_group(self, prompt_token_ids: list[int]) -> int: - num_images = 0 - for token_id in prompt_token_ids[::-1]: - if token_id == self.info.get_hf_config().image_token_index: - num_images += 1 - elif num_images > 0: - break - return num_images - - def _call_hf_processor( - self, - prompt: str, - mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], - tok_kwargs: Mapping[str, object], - ) -> BatchFeature: - tokenizer = self.info.get_tokenizer() - if mm_data: - num_tiles = [ - self.info.get_num_tiles_per_image(img.height, img.width) - for img in mm_data["images"] - ] - processed_outputs = super()._call_hf_processor( - prompt, mm_data, mm_kwargs, tok_kwargs) - processed_outputs["num_tiles"] = torch.tensor(num_tiles) - for k in ('pixel_values', 'aspect_ratio_ids', "aspect_ratio_mask"): - processed_outputs[k] = processed_outputs[k].squeeze(0) - - processed_token_ids = processed_outputs.pop("input_ids") - start_idx, end_idx = 0, processed_token_ids.size(1) - processed_prompt_text = tokenizer.decode(processed_token_ids[0]) - - hf_processor = self.info.get_hf_processor() - bos_token = hf_processor.bos_token - # Remove the bos_token from the start of prompt, - # because we all know there would be image_token. - if processed_prompt_text.startswith(bos_token): - start_idx += 1 - # Remove the bos_token from the end of prompt, - # because text is empty in this case. 
- if processed_prompt_text.endswith(bos_token): - end_idx -= 1 - processed_outputs[ - "input_ids"] = processed_token_ids[:, start_idx:end_idx] - else: - processed_outputs = tokenizer(prompt, - add_special_tokens=False, - return_tensors="pt") - return processed_outputs - - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - aspect_ratio_ids=MultiModalFieldConfig.batched("image"), - aspect_ratio_mask=MultiModalFieldConfig.batched("image"), - num_tiles=MultiModalFieldConfig.batched("image"), - ) - - def create_encoder_prompt( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - ) -> Union[str, list[int]]: - data = mm_data.get("image", []) - num_images = 1 if isinstance(data, Image) else len(data) - image_token_id = self.info.get_hf_config().image_token_index - return [image_token_id] * num_images - - def _get_prompt_updates( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargsItems, - ) -> Sequence[PromptUpdate]: - token_per_chunk = self.info.get_token_per_chunk_from_config() - image_token_id = self.info.get_hf_config().image_token_index - - def get_replacement_mllama(item_idx): - images = mm_items.get_items("image", ImageProcessorItems) - image_size = images.get_image_size(item_idx) - num_tile = self.info.get_num_tiles_per_image( - image_height=image_size.height, - image_width=image_size.width, - ) - num_tokens = num_tile * token_per_chunk - return [image_token_id] * num_tokens - - return [ - PromptReplacement( - modality="image", - target=[image_token_id], - replacement=get_replacement_mllama, - ) - ] - - -def _prepare_aspect_ratio_attention_mask( - aspect_ratio_mask: torch.Tensor, - num_patches: int, - target_length: int, - dtype: torch.dtype, -) -> torch.Tensor: - # Expand aspect ratio mask to 
target_length - batch_size, max_num_tiles = aspect_ratio_mask.shape - attention_mask = aspect_ratio_mask.view(batch_size, max_num_tiles, 1, - 1).to(dtype) - attention_mask = attention_mask.repeat(1, 1, target_length, 1) - - # Mask padding patches - pad_patches = target_length - num_patches - attention_mask[:, :, -pad_patches:] = 0 - - # Invert the mask (0 -> 1, 1 -> 0) - attention_mask = 1 - attention_mask - - # Reshape to 2D and create 4D attention mask - # (batch_size, 1, max_num_tiles*target_length, max_num_tiles*target_length) - attention_mask = attention_mask.reshape(batch_size, - max_num_tiles * target_length, 1) - attention_mask = attention_mask @ attention_mask.transpose( - -1, -2) * torch.finfo(dtype).min - attention_mask = attention_mask.unsqueeze(1) - - return attention_mask - - -class ColumnParallelConv2dPatch(torch.nn.Module): - """Conv2D Patching layer with model parallelism. - Column parallel over unfolded input. - Arguments: - in_channels: Input channels. - out_channels: Output channels. - kernel_size: Size of convolution kernel. - stride (default 1): Stride for convolution. - bias (default False): Use bias in Conv2d. 
- Input: (bsz, in_channels, width, height) - Output: (bsz, num_tokens, out_channels) - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: Union[int, tuple[int, int]], - stride: Union[int, tuple[int, int]], - bias: bool = False, - ) -> None: - super().__init__() - if isinstance(kernel_size, int): - kernel_size = (kernel_size, kernel_size) - self._unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=stride) - self._linear = ColumnParallelLinear( - in_channels * kernel_size[0] * kernel_size[1], - out_channels, - bias=bias, - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self._unfold(x) - x = x.permute(0, 2, 1) - x, _ = self._linear(x) - return x - - -class MllamaPrecomputedAspectRatioEmbedding(nn.Module): - - def __init__(self, - config: config_mllama.MllamaVisionConfig, - is_gated: bool = True): - super().__init__() - self.max_num_tiles = config.max_num_tiles - self.hidden_size = config.hidden_size - self.max_aspect_ratio_id = config.max_aspect_ratio_id - self.is_gated = is_gated - - self.embedding = nn.Embedding(self.max_aspect_ratio_id + 1, - self.max_num_tiles * self.hidden_size) - if is_gated: - self.gate = nn.Parameter(torch.zeros(1)) - - def forward(self, hidden_state: torch.Tensor, - aspect_ratio_ids: torch.Tensor) -> torch.Tensor: - embeddings = self.embedding(aspect_ratio_ids) - embeddings = embeddings.reshape(-1, self.max_num_tiles, 1, - self.hidden_size) - - if self.is_gated: - embeddings = embeddings * self.gate.tanh() - - hidden_state = hidden_state + embeddings - return hidden_state - - -class MllamaPrecomputedPositionEmbedding(nn.Module): - - def __init__(self, config: config_mllama.MllamaVisionConfig): - super().__init__() - self.max_num_tiles = config.max_num_tiles - self.max_aspect_ratio_id = config.max_aspect_ratio_id - self.num_patches = (config.image_size // config.patch_size)**2 + 1 - self.hidden_size = config.hidden_size - self.scale = config.hidden_size**-0.5 - - self.gate = 
nn.Parameter(torch.zeros(1)) - - # position embedding - position_embedding = torch.randn(self.num_patches, self.hidden_size) - self.embedding = nn.Parameter(self.scale * position_embedding) - - # tile position embedding - self.tile_embedding = nn.Embedding( - self.max_aspect_ratio_id + 1, - self.max_num_tiles * self.num_patches * self.hidden_size) - - def forward(self, hidden_state: torch.Tensor, - aspect_ratio_ids: torch.Tensor) -> torch.Tensor: - # position embeddings - gated_position_embedding = (1 - self.gate.tanh()) * self.embedding - hidden_state = hidden_state + gated_position_embedding.view( - 1, 1, self.num_patches, self.hidden_size) - - # precomputed tile position embeddings - tile_position_embedding = self.tile_embedding(aspect_ratio_ids) - batch_size = hidden_state.shape[0] - tile_position_embedding = tile_position_embedding.reshape( - batch_size, self.max_num_tiles, self.num_patches, self.hidden_size) - gated_tile_position_embedding = self.gate.tanh( - ) * tile_position_embedding - hidden_state = hidden_state + gated_tile_position_embedding - - return hidden_state - - -# TODO: support other attention backends for attention in vision model -class MllamaVisionSdpaAttention(nn.Module): - - def __init__(self, - config: config_mllama.MllamaVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): - super().__init__() - - tensor_parallel_size = get_tp_group().world_size - self.embed_dim = config.hidden_size - self.num_heads = config.attention_heads - self.head_dim = config.hidden_size // config.attention_heads - self.num_local_heads = self.num_heads // tensor_parallel_size - self.q_size = self.num_local_heads * self.head_dim - self.kv_size = self.num_local_heads * self.head_dim - - self.qkv_proj = QKVParallelLinear( - self.embed_dim, - self.head_dim, - self.num_heads, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.qkv_proj", - ) - self.o_proj = RowParallelLinear( - self.num_heads * self.head_dim, - 
self.embed_dim, - bias=False, - input_is_parallel=True, - quant_config=quant_config, - prefix=f"{prefix}.o_proj", - ) - - # Use unified MultiHeadAttention with automatic backend selection - self.attn = MultiHeadAttention(self.num_local_heads, self.head_dim, - 1.0 / math.sqrt(self.head_dim)) - - def forward( - self, - hidden_state: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_state) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - - # Use unified MultiHeadAttention with automatic backend selection - attn_output = self.attn(q, k, v) - - attn_output = attn_output.reshape(attn_output.shape[0], - attn_output.shape[1], -1) - output, _ = self.o_proj(attn_output) - return output - - -class MllamaVisionEncoderLayer(nn.Module): - - def __init__( - self, - config: config_mllama.MllamaVisionConfig, - quant_config: Optional[QuantizationConfig], - prefix: str = "", - is_gated: bool = False, - ) -> None: - super().__init__() - - self.hidden_size = config.hidden_size - self.num_attention_heads = config.attention_heads - self.is_gated = is_gated - self.intermediate_size = config.intermediate_size - - self.self_attn = MllamaVisionSdpaAttention( - config, quant_config=quant_config, prefix=f"{prefix}.self_attn") - self.mlp = CLIPMLP(config, - quant_config=quant_config, - prefix=f"{prefix}.mlp") - - self.input_layernorm = nn.LayerNorm(self.hidden_size, - eps=config.norm_eps) - self.post_attention_layernorm = nn.LayerNorm(self.hidden_size, - eps=config.norm_eps) - - # there used to be an if else here, no code path - if is_gated: - self.gate_attn = nn.Parameter(torch.ones(1) * math.pi / 4) - self.gate_ffn = nn.Parameter(torch.ones(1) * math.pi / 4) - - def forward( - self, - hidden_state: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - ): - # Self Attention - residual = hidden_state - hidden_state = self.input_layernorm(hidden_state) - hidden_state = 
self.self_attn(hidden_state, - attention_mask=attention_mask) - gate_attn = 1 if not self.is_gated else self.gate_attn.tanh() - hidden_state = residual + gate_attn * hidden_state - - # Feed forward - residual = hidden_state - hidden_state = self.post_attention_layernorm(hidden_state) - hidden_state = self.mlp(hidden_state) - gate_ffn = 1 if not self.is_gated else self.gate_ffn.tanh() - hidden_state = residual + gate_ffn * hidden_state - - return hidden_state - - -class MllamaVisionEncoder(nn.Module): - - def __init__( - self, - config: config_mllama.MllamaVisionConfig, - quant_config: Optional[QuantizationConfig], - num_layers: int = 32, - is_gated: bool = False, - output_hidden_states=None, - prefix: str = "", - ) -> None: - super().__init__() - self.config = config - self.layers = nn.ModuleList([ - MllamaVisionEncoderLayer(config, - quant_config=quant_config, - is_gated=is_gated, - prefix=f"{prefix}.layers.{layer_idx}") - for layer_idx in range(num_layers) - ]) - self.output_hidden_states = output_hidden_states or [] - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - ) -> Union[BaseModelOutput]: - encoder_states = () - - for i, encoder_layer in enumerate(self.layers): - if i in self.output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) - hidden_states = encoder_layer( - hidden_states, - attention_mask, - ) - - if len(self.layers) - 1 in self.output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) - - return hidden_states, encoder_states - - -class MllamaVisionModel(nn.Module): - - def __init__( - self, - config: config_mllama.MllamaVisionConfig, - quant_config: Optional[QuantizationConfig], - prefix: str = "", - ) -> None: - super().__init__() - - self.image_size = config.image_size - self.patch_size = config.patch_size - self.max_num_tiles = config.max_num_tiles - self.hidden_size = config.hidden_size - self.in_channels = config.num_channels - 
self.intermediate_layers_indices = config.intermediate_layers_indices - - self.num_patches = (self.image_size // self.patch_size)**2 + 1 - self.scale = config.hidden_size**-0.5 - - self.patch_embedding = ColumnParallelConv2dPatch( - in_channels=config.num_channels, - out_channels=self.hidden_size, - kernel_size=self.patch_size, - stride=self.patch_size, - bias=False, - ) - - self.class_embedding = nn.Parameter(self.scale * - torch.randn(self.hidden_size)) - self.gated_positional_embedding = MllamaPrecomputedPositionEmbedding( - config) - - self.pre_tile_positional_embedding = \ - MllamaPrecomputedAspectRatioEmbedding(config, is_gated=True) - self.post_tile_positional_embedding = \ - MllamaPrecomputedAspectRatioEmbedding(config, is_gated=True) - - # layer norms - self.layernorm_pre = nn.LayerNorm(self.hidden_size) - self.layernorm_post = nn.LayerNorm(self.hidden_size) - - # encoders - self.transformer = MllamaVisionEncoder( - config, - quant_config, - config.num_hidden_layers, - is_gated=False, - output_hidden_states=config.intermediate_layers_indices, - prefix=f"{prefix}.transformer", - ) - self.global_transformer = MllamaVisionEncoder( - config, - quant_config, - config.num_global_layers, - is_gated=True, - prefix=f"{prefix}.global_transformer", - ) - - def apply_class_embedding(self, - hidden_state: torch.Tensor) -> torch.Tensor: - batch_size, _, hidden_size = hidden_state.shape - class_embedding = self.class_embedding.expand(batch_size, 1, - hidden_size) - hidden_state = torch.cat([class_embedding, hidden_state], dim=1) - return hidden_state - - def forward(self, pixel_values: torch.Tensor, - aspect_ratio_ids: torch.Tensor, - aspect_ratio_mask: torch.Tensor) -> torch.Tensor: - batch_size, num_concurrent_media, num_tiles, num_channels, \ - height, width = pixel_values.shape - - pixel_values = pixel_values.reshape( - batch_size * num_concurrent_media * num_tiles, num_channels, - height, width) - aspect_ratio_ids = aspect_ratio_ids.reshape( - batch_size * 
num_concurrent_media, -1) - - # patch embedding - patch_embeds = self.patch_embedding( - pixel_values.to(self.layernorm_pre.weight.dtype)) - hidden_state = patch_embeds - hidden_state = ps.get_tp_group().all_gather(hidden_state) - - # tile embeddings - _, num_patches, dim = hidden_state.shape - hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, - num_tiles, -1, dim) - hidden_state = self.pre_tile_positional_embedding( - hidden_state, aspect_ratio_ids) - - # apply cls token - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media * num_tiles, num_patches, dim) - hidden_state = self.apply_class_embedding(hidden_state) - num_patches += 1 - - # apply position embeddings - hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, - num_tiles, num_patches, dim) - hidden_state = self.gated_positional_embedding(hidden_state, - aspect_ratio_ids) - - # apply encoder - hidden_state = self.layernorm_pre(hidden_state) - - # Compute the number of tokens to pad - num_padding_patches = (8 - (hidden_state.shape[-2] % 8)) % 8 - # Compute padding tuple for pad function - padding = ( - 0, 0, 0, num_padding_patches - ) # (pad_left, pad_right, pad_left for dim -2, pad_right for dim -2) - # Pad the tensor - hidden_state = F.pad(hidden_state, padding, mode="constant", value=0) - slice_index = -num_padding_patches if num_padding_patches > 0 else None - - attention_mask = aspect_ratio_mask.reshape( - batch_size * num_concurrent_media, -1) - attention_mask = _prepare_aspect_ratio_attention_mask( - aspect_ratio_mask=attention_mask, - num_patches=self.num_patches, - target_length=hidden_state.shape[2], - dtype=self.layernorm_pre.weight.dtype, - ) - - hidden_state = hidden_state.view(batch_size * num_concurrent_media, -1, - dim) - output = self.transformer( - hidden_state, - attention_mask=attention_mask, - ) - hidden_state, intermediate_hidden_states = output[0], output[1] - intermediate_hidden_states = torch.stack(intermediate_hidden_states, 
- dim=-1) - - # apply global encoder - hidden_state = self.layernorm_post(hidden_state) - hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, - num_tiles, - num_patches + num_padding_patches, - dim) - hidden_state = self.post_tile_positional_embedding( - hidden_state, aspect_ratio_ids) - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media, - num_tiles * (num_patches + num_padding_patches), dim) - hidden_state = self.global_transformer( - hidden_state, attention_mask=attention_mask)[0] - hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, - num_tiles, - num_patches + num_padding_patches, - dim) - hidden_state = hidden_state[:, :, :slice_index] - - # adding intermediate layer outputs - hidden_state = hidden_state.reshape(batch_size, num_concurrent_media, - num_tiles, num_patches, dim) - intermediate_hidden_states = intermediate_hidden_states.reshape( - batch_size * num_concurrent_media, num_tiles, - num_patches + num_padding_patches, -1) - intermediate_hidden_states = intermediate_hidden_states[:, :, : - slice_index] - intermediate_hidden_states = intermediate_hidden_states.reshape( - batch_size, num_concurrent_media, num_tiles, num_patches, -1) - hidden_state = torch.cat([hidden_state, intermediate_hidden_states], - dim=-1) - return hidden_state - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - ] - params_dict = dict(self.named_parameters()) - updated_params: set[str] = set() - for name, loaded_weight in weights: - if 'patch_embedding._linear.weight' in name: - loaded_weight = loaded_weight.view(loaded_weight.shape[0], -1) - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - param = params_dict[name] - 
updated_params.add(name) - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - param = params_dict.pop(name) - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - updated_params.add(name) - return updated_params - - -class MllamaTextRMSNorm(nn.Module): - - def __init__(self, hidden_size, eps=1e-6): - """ - MllamaTextRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - def extra_repr(self): - return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" - - -class MllamaTextCrossAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__( - self, - config: Optional[config_mllama.MllamaTextConfig] = None, - layer_idx: Optional[int] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.config = config - self.pipeline_parallel_rank = get_pp_group().rank_in_group - self.tensor_parallel_size = get_tp_group().world_size - self.num_heads = config.num_attention_heads - self.num_key_value_heads = config.num_key_value_heads - - self.num_local_heads = self.num_heads // self.tensor_parallel_size - self.num_local_key_value_heads = \ - self.num_key_value_heads // self.tensor_parallel_size - self.hidden_size = config.hidden_size - self.head_dim = config.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - - self.layer_idx = layer_idx - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - 
self.q_local_size = self.num_local_heads * self.head_dim - self.kv_local_size = self.num_local_key_value_heads * self.head_dim - - self.qkv_proj = QKVCrossParallelLinear( - self.hidden_size, - self.head_dim, - self.num_heads, - self.num_key_value_heads, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.qkv_proj", - ) - - self.o_proj = RowParallelLinear( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - input_is_parallel=True, - quant_config=quant_config, - prefix=f"{prefix}.o_proj", - ) - # vllm.model_executor.layers.layernorm.RMSNorm has precision issue, - # use huggingface's instead - self.q_norm = MllamaTextRMSNorm(self.head_dim, eps=config.rms_norm_eps) - self.k_norm = MllamaTextRMSNorm(self.head_dim, eps=config.rms_norm_eps) - self.scaling = self.head_dim**-0.5 - - self.attn = Attention( - self.num_local_heads, - self.head_dim, - self.scaling, - self.num_local_key_value_heads, - prefix=f"{prefix}.attn", - attn_type=AttentionType.ENCODER_DECODER, - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor], - kv_range_for_decode: Optional[list[tuple[int, int]]], - cross_attention_states: Optional[torch.Tensor], - ) -> torch.Tensor: - q, k, v = self.qkv_proj(hidden_states, cross_attention_states) - if cross_attention_states is not None: - k = k.view(-1, self.num_local_key_value_heads, self.head_dim) - v = v.view(-1, self.num_local_key_value_heads, self.head_dim) - k = self.k_norm(k) - - q = q.view(-1, self.num_local_heads, self.head_dim) - q = self.q_norm(q) - - if attention_mask is not None: - output = self._attention_with_mask(q, k, v, attention_mask, - kv_range_for_decode) - else: - output = self.attn( - q.view(-1, self.num_local_heads * self.head_dim), k, v) - out, _ = self.o_proj(output) - return out - - def _attention_with_mask( - self, - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - attention_mask: torch.Tensor, - kv_range_for_decode: list[tuple[int, int]], - ) -> 
torch.Tensor: - kv_cache = self.attn.kv_cache[self.pipeline_parallel_rank] - attn_metadata: AttentionMetadata = get_forward_context().attn_metadata - # Skip writing kv-cache for the initial profiling run. - # TODO (NickLucche) replace with custom attn bias and use standard attn - if len(kv_cache.shape) > 1: - i = torch.ones(1, dtype=torch.float32) - if self.attn.backend in (_Backend.FLASH_ATTN, - _Backend.FLASH_ATTN_VLLM_V1): - cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode]) - cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode]) - torch.ops._C_cache_ops.reshape_and_cache_flash( - cached_k, - cached_v, - kv_cache[0], - kv_cache[1], - attn_metadata. - cross_slot_mapping, # type: ignore[union-attr] - "auto", - i, - i, - ) - elif self.attn.backend in (_Backend.XFORMERS, _Backend.ROCM_FLASH, - _Backend.TORCH_SDPA): - key_cache, value_cache = PagedAttention.split_kv_cache( - kv_cache, self.num_local_key_value_heads, self.head_dim) - cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode]) - cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode]) - PagedAttention.write_to_paged_cache( - cached_k, cached_v, key_cache, value_cache, - attn_metadata.cross_slot_mapping, "auto", i, i) - else: - raise ValueError( - f"Unsupported Attention backend {self.attn.backend} " - "enum found. Expected the Attention backend to be " - "FLASH_ATTN, FLASH_ATTN_VLLM_V1, " - "XFORMERS or TORCH_SDPA.") - - # We have to call torch.sdpa for prefill when using a - # custom cross-attention mask. Because the mask is not a - # standard causal mask, neither a block diagonal mask which - # can be optimized by xformers.BlockDiagonalMask. - # The mask is specially calculated for supporting multi - # images and interleaved images. 
- q_len = q.shape[0] - kv_len = k.shape[0] - q = q.transpose(0, 1).view(self.num_local_key_value_heads, - self.num_key_value_groups, q_len, - self.head_dim).contiguous() - k = k.transpose(0, - 1)[:, - None, :, :].expand(self.num_local_key_value_heads, - self.num_key_value_groups, - kv_len, - self.head_dim).contiguous() - v = v.transpose(0, - 1)[:, - None, :, :].expand(self.num_local_key_value_heads, - self.num_key_value_groups, - kv_len, - self.head_dim).contiguous() - attention_mask = attention_mask.view(1, 1, q_len, kv_len) - output = F.scaled_dot_product_attention(q, - k, - v, - attn_mask=attention_mask, - is_causal=False) - output = output.permute(2, 0, 1, 3).reshape( - q_len, self.num_local_heads * self.head_dim) - return output - - -class MllamaCrossAttentionDecoderLayer(torch.nn.Module): - """Cross-attention transformer block with tanh-gated attention - and feedforward.""" - - def __init__( - self, - config: config_mllama.MllamaTextConfig, - layer_idx: int, - quant_config: Optional[QuantizationConfig], - prefix: str = "", - ) -> None: - super().__init__() - - self.layer_idx = layer_idx - self.cross_attn = MllamaTextCrossAttention( - config=config, - layer_idx=layer_idx, - quant_config=quant_config, - prefix=f"{prefix}.cross_attn", - ) - - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.cross_attn_attn_gate = torch.nn.Parameter(torch.zeros(1)) - - self.mlp = LlamaMLP( - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - prefix=f"{prefix}.mlp", - ) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.cross_attn_mlp_gate = torch.nn.Parameter(torch.zeros(1)) - - def forward( - self, - hidden_states: torch.Tensor, - cross_attention_states: torch.Tensor, - cross_attention_mask: torch.Tensor, - kv_range_for_decode: Optional[list[tuple[int, int]]], - full_text_row_masked_out_mask: torch.Tensor, - ) 
-> torch.Tensor: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - - hidden_states = self.cross_attn( - hidden_states=hidden_states, - attention_mask=cross_attention_mask, - kv_range_for_decode=kv_range_for_decode, - cross_attention_states=cross_attention_states, - ) - hidden_states = full_text_row_masked_out_mask * hidden_states - hidden_states = residual + self.cross_attn_attn_gate.tanh( - ) * hidden_states - - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = full_text_row_masked_out_mask * hidden_states - hidden_states = residual + self.cross_attn_mlp_gate.tanh( - ) * hidden_states - return hidden_states - - -class MllamaTextModel(nn.Module): - config_class = config_mllama.MllamaTextConfig - base_model_prefix = "model" - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config.text_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - self.vocab_size = config.vocab_size - self.embed_tokens = VocabParallelEmbedding(config.vocab_size + 8, - config.hidden_size) - self.cross_attention_layers = config.cross_attention_layers - - layers = [] - for layer_idx in range(config.num_hidden_layers): - if layer_idx in self.cross_attention_layers: - layers.append( - MllamaCrossAttentionDecoderLayer( - config, - layer_idx, - quant_config=quant_config, - prefix=f"{prefix}.layers.{layer_idx}", - )) - else: - # TODO: force LlamaDecoderLayer to config.attention_bias=False - layers.append( - LlamaDecoderLayer( - config, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.layers.{layer_idx}", - )) - - self.layers = nn.ModuleList(layers) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.LongTensor, - positions: Optional[torch.LongTensor], - 
cross_attention_states: Optional[torch.LongTensor], - cross_attention_mask: Optional[torch.LongTensor], - kv_range_for_decode: Optional[list[tuple[int, int]]], - full_text_row_masked_out_mask: Optional[tuple[torch.Tensor, - torch.Tensor]], - skip_cross_attention: bool, - ) -> torch.Tensor: - inputs_embeds = self.embed_tokens(input_ids) - hidden_states = inputs_embeds - - for idx, decoder_layer in enumerate(self.layers): - if idx in self.cross_attention_layers: - if not skip_cross_attention: - hidden_states = decoder_layer( - hidden_states=hidden_states, - cross_attention_states=cross_attention_states, - cross_attention_mask=cross_attention_mask, - kv_range_for_decode=kv_range_for_decode, - full_text_row_masked_out_mask= - full_text_row_masked_out_mask, - ) - else: - hidden_states, residual = decoder_layer( - positions=positions, - hidden_states=hidden_states, - residual=None, - ) - hidden_states = hidden_states + residual - hidden_states = self.norm(hidden_states) - return hidden_states - - -class MllamaForCausalLM(nn.Module): - config_class = config_mllama.MllamaTextConfig - base_model_prefix = "language_model" - _no_split_modules = [ - "MllamaCrossAttentionDecoderLayer", "MllamaSelfAttentionDecoderLayer" - ] - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config.text_config - quant_config = vllm_config.quant_config - self.quant_config = quant_config - - self.vocab_size = config.vocab_size - self.model = MllamaTextModel(vllm_config=vllm_config, - prefix=f"{prefix}.model") - self.lm_head = ParallelLMHead( - config.vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE, - quant_config=quant_config, - prefix=f"{prefix}.lm_head", - ) - - def forward( - self, - input_ids: torch.LongTensor, - positions: Optional[torch.LongTensor], - cross_attention_states: Optional[torch.LongTensor], - cross_attention_mask: 
Optional[torch.LongTensor], - kv_range_for_decode: Optional[list[tuple[int, int]]], - full_text_row_masked_out_mask: Optional[tuple[torch.Tensor, - torch.Tensor]], - skip_cross_attention: bool, - ) -> torch.Tensor: - hidden_states = self.model( - input_ids=input_ids, - positions=positions, - cross_attention_states=cross_attention_states, - cross_attention_mask=cross_attention_mask, - kv_range_for_decode=kv_range_for_decode, - full_text_row_masked_out_mask=full_text_row_masked_out_mask, - skip_cross_attention=skip_cross_attention, - ) - return hidden_states - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - (".gate_up_proj", ".gate_proj", 0), - (".gate_up_proj", ".up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - updated_params: set[str] = set() - for name, loaded_weight in weights: - if 'patch_embedding.weight' in name: - name = name.replace('patch_embedding.weight', - 'patch_embedding._linear.weight') - loaded_weight = loaded_weight.view(loaded_weight.shape[0], -1) - if (self.quant_config is not None and - (scale_name := self.quant_config.get_cache_scale(name))): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else - loaded_weight[0]) - weight_loader(param, loaded_weight) - updated_params.add(scale_name) - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - param = params_dict[name] - updated_params.add(name) - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - orig_name = name - name = maybe_remap_kv_scale_name(name, 
params_dict) - if name is None: - logger.debug("Missing name %s, orig name %s", name, - orig_name) - continue - - param = params_dict.pop(name) - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - updated_params.add(name) - return updated_params - - -@MULTIMODAL_REGISTRY.register_processor(MllamaMultiModalProcessor, - info=MllamaProcessingInfo, - dummy_inputs=MllamaDummyInputsBuilder) -class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsV0Only): - packed_modules_mapping = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": ["gate_proj", "up_proj"] - } - - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_prefix={ - # mapping for new names in checkpoint saved after transformers v4.52 - "model.vision_model.": "vision_model.", - "model.multi_modal_projector.": "multi_modal_projector.", - "model.language_model.": "language_model.model.", - "lm_head.": "language_model.lm_head.", - }, - orig_to_new_suffix={ - "patch_embedding.weight": "patch_embedding._linear.weight", - }, - ) - - @classmethod - def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: - if modality.startswith("image"): - return "<|image|>" - - raise ValueError("Only image modality is supported") - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config: MllamaConfig = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - self.vocab_size = config.text_config.vocab_size - self.hidden_size = config.text_config.hidden_size - self.max_num_tiles = config.vision_config.max_num_tiles - self.vision_output_dim = config.vision_config.vision_output_dim - self.pad_token_id = \ - config.pad_token_id if config.pad_token_id is not None else -1 - self.image_size = config.vision_config.image_size - self.image_token_id = config.image_token_index - - self.vision_model = 
MllamaVisionModel(config.vision_config, - quant_config, - prefix=maybe_prefix( - prefix, "vision_model")) - self.language_model = MllamaForCausalLM( - vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model"), - ) - self.multi_modal_projector = ColumnParallelLinear( - config.vision_config.vision_output_dim, - config.text_config.hidden_size, - bias=True, - quant_config=quant_config, - gather_output=True, - prefix=maybe_prefix(prefix, "multi_modal_projector"), - ) - self.logits_processor = LogitsProcessor(config.output_hidden_states, - config.text_config.vocab_size) - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.language_model.lm_head, - hidden_states, sampling_metadata) - return logits - - def unpack_data(self, - image_data: Union[list[torch.Tensor], torch.Tensor], - padding_value=0) -> torch.Tensor: - if isinstance(image_data, torch.Tensor): - # torch.Tensor - return image_data - else: - assert isinstance( - image_data[0], - torch.Tensor), "Image data is not properly batched." 
- # list[torch.Tensor] - bsz = len(image_data) - max_length = max(t.size(0) for t in image_data) - trailing_dims = image_data[0].shape[1:] - for data in image_data: - cur_trailing_dims = data.shape[1:] - assert cur_trailing_dims == trailing_dims - output_tensor = torch.full((bsz, max_length, *trailing_dims), - padding_value, - dtype=image_data[0].dtype, - device=image_data[0].device) - for i, t in enumerate(image_data): - output_tensor[i, :t.size(0)] = t - return output_tensor - - def _parse_and_validate_image_input( - self, **kwargs: object) -> Optional[MllamaImagePixelInputs]: - # tensor with the same shape will be batched together by - # MultiModalKwargs.batch, so pixel_values here can be: - # - list[torch.Tensor]: - # with shape (num_image, num_tiles, 3, image_res, image_res) - # - torch.Tensor: - # with shape (bs, num_image, num_tiles, 3, image_res, image_res) - pixel_values: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], - torch.Tensor]] = kwargs.pop( - "pixel_values", None) - image_embeds: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], - torch.Tensor]] = kwargs.pop( - "image_embeds", None) - aspect_ratio_ids: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], - torch.Tensor]] = kwargs.pop( - "aspect_ratio_ids", None) - aspect_ratio_mask: Optional[Union[list[list[torch.Tensor]], - list[torch.Tensor], - torch.Tensor]] = kwargs.pop( - "aspect_ratio_mask", None) - - if pixel_values is None and image_embeds is None: - return None - - if pixel_values is not None and image_embeds is not None: - raise ValueError( - "Both pixel values and image embeds are provided.") - - if pixel_values is not None: - assert aspect_ratio_ids is not None - assert aspect_ratio_mask is not None - - return MllamaImagePixelInputs( - type="pixel_values", - data=self.unpack_data(pixel_values), - aspect_ratio_ids=self.unpack_data(aspect_ratio_ids), - aspect_ratio_mask=self.unpack_data(aspect_ratio_mask)) - - if image_embeds is not None: - raise 
NotImplementedError - - raise AssertionError("This line should be unreachable.") - - def _get_and_validate_encoder_lens( - self, - encoder_seq_lens: list[int], - num_tiles: list[list[int]], - num_tokens_per_tile: int, - ) -> list[int]: - # Get the actual number of encoder tokens for each sample. - # Because attn_metadata.encoder_seq_lens only counts the last - # group of images for each sample, which is used to cheat the - # block manager to allocate blocks for those images only. - # See MllamaMultiModalProcessor for more details. - actual_encoder_seq_lens = [ - sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles - ] - - # remove 0 encoder len entries for text-only requests for these - # assertions - attn_metadata_lens = [x for x in encoder_seq_lens if x > 0] - assert len(actual_encoder_seq_lens) == len(attn_metadata_lens) - for actual_len, last_group_len in zip(actual_encoder_seq_lens, - attn_metadata_lens): - assert actual_len >= last_group_len - - return actual_encoder_seq_lens - - def flat_encoder_result(self, cross_attention_states: torch.Tensor, - attn_metadata: AttentionMetadata, - actual_encoder_seq_lens: list[int]): - - cross_attention_states_flat = torch.zeros( - sum(actual_encoder_seq_lens), - cross_attention_states.shape[-1], - device=cross_attention_states.device, - dtype=cross_attention_states.dtype) - start_pos = 0 - for seq_len, vision_token_in_batch in zip(actual_encoder_seq_lens, - cross_attention_states): - end_pos = start_pos + seq_len - cross_attention_states_flat[ - start_pos:end_pos] = vision_token_in_batch[:seq_len] - start_pos = end_pos - cross_attention_states = cross_attention_states_flat - return cross_attention_states - - def get_language_model(self) -> torch.nn.Module: - return self.language_model - - def get_cross_attention_states( - self, - image_inputs: MllamaImagePixelInputs, - attn_metadata: AttentionMetadata, - actual_encoder_seq_lens: list[int], - ) -> tuple[torch.Tensor]: - # NOTE: llama's reference implementation 
runs vision model on CPU - pixel_values = image_inputs['data'] - aspect_ratio_ids = image_inputs['aspect_ratio_ids'] - aspect_ratio_mask = image_inputs['aspect_ratio_mask'] - cross_attention_states = self.vision_model(pixel_values, - aspect_ratio_ids, - aspect_ratio_mask) - cross_attention_states, _ = self.multi_modal_projector( - cross_attention_states) - - bsz, _, _, _, image_token_dim = tuple(cross_attention_states.shape) - cross_attention_states = cross_attention_states.view( - bsz, -1, image_token_dim) - - cross_attention_states = self.flat_encoder_result( - cross_attention_states, attn_metadata, actual_encoder_seq_lens) - - return cross_attention_states - - def get_cross_attention_mask( - self, - input_ids: torch.Tensor, - attn_metadata: AttentionMetadata, - num_tiles: list[list[int]], - num_tokens_per_tile: int, - dtype: torch.dtype, - ) -> tuple[torch.Tensor, torch.Tensor]: - token_ids = input_ids.tolist() - start = 0 - batch_token_ids = [] - for seq_len in attn_metadata.seq_lens: - batch_token_ids.append(token_ids[start:start + seq_len]) - start += seq_len - sparse_mask = [ - get_cross_attention_token_mask(t, self.image_token_id) - for t in batch_token_ids - ] - - # Skip generating cross-attention mask if all samples - # are text-only or have only 1 leading image. 
- if skip_attention_mask(sparse_mask): - return None, None - - dense_mask, tile_range_for_decode = \ - convert_sparse_cross_attention_mask_to_dense( - sparse_mask, num_tiles, attn_metadata.seq_lens) - cross_attention_mask = \ - convert_dense_cross_attention_mask_to_tensor( - dense_mask, num_tokens_per_tile, input_ids.device, dtype) - kv_range_for_decode = [[ - t[0] * num_tokens_per_tile, t[1] * num_tokens_per_tile - ] for t in tile_range_for_decode] - - return cross_attention_mask, kv_range_for_decode - - def get_full_text_row_masked_out_mask( - self, - attn_metadata: AttentionMetadata, - device: torch.device, - ) -> torch.Tensor: - full_text_row_masked_out_mask = torch.ones( - (attn_metadata.num_prefill_tokens, 1), dtype=torch.bool) - start_pos = 0 - for seq_len, encoder_seq_len in zip(attn_metadata.seq_lens, - attn_metadata.encoder_seq_lens): - if encoder_seq_len == 0: - full_text_row_masked_out_mask[start_pos:start_pos + - seq_len] = False - start_pos += seq_len - full_text_row_masked_out_mask = full_text_row_masked_out_mask.to( - device) - return full_text_row_masked_out_mask - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - **kwargs: object, - ) -> Union[CausalLMOutputWithPast]: - attn_metadata = get_forward_context().attn_metadata - if attn_metadata.num_prefill_tokens > 0 and \ - attn_metadata.num_decode_tokens > 0: - raise ValueError("Chunk prefill not supported") - image_inputs = self._parse_and_validate_image_input(**kwargs) - cross_attention_states = None - cross_attention_mask = None - kv_range_for_decode = None - - # For 1) text-only prefill and decode, 2) image-present decode. - if image_inputs is None: - full_text_row_masked_out_mask = ( - attn_metadata.encoder_seq_lens_tensor - != 0).reshape(-1, 1).to(input_ids.device) - skip_cross_attention = attn_metadata.max_encoder_seq_len == 0 - - # For image-present prefill. 
- else: - skip_cross_attention = False - - num_tiles = [t.tolist() for t in kwargs.pop("num_tiles")] - num_tokens_per_tile = calc_token_per_chunk(self.image_size) - - actual_encoder_seq_lens = self._get_and_validate_encoder_lens( - attn_metadata.encoder_seq_lens, - num_tiles, - num_tokens_per_tile, - ) - - cross_attention_states = self.get_cross_attention_states( - image_inputs, attn_metadata, actual_encoder_seq_lens) - - full_text_row_masked_out_mask = \ - self.get_full_text_row_masked_out_mask( - attn_metadata, input_ids.device) - - cross_attention_mask, kv_range_for_decode = \ - self.get_cross_attention_mask( - input_ids, attn_metadata, num_tiles, - num_tokens_per_tile, cross_attention_states.dtype) - - outputs = self.language_model( - input_ids=input_ids, - positions=positions, - cross_attention_states=cross_attention_states, - cross_attention_mask=cross_attention_mask, - kv_range_for_decode=kv_range_for_decode, - full_text_row_masked_out_mask=full_text_row_masked_out_mask, - skip_cross_attention=skip_cross_attention, - ) - - return outputs - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) - return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) - - def get_mm_mapping(self) -> MultiModelKeys: - """ - Get the module prefix in multimodal models - """ - return MultiModelKeys.from_string_field( - language_model="language_model", - connector="multi_modal_projector", - tower_model="vision_model") - - -def skip_attention_mask(sparse_mask: list[list[int]]) -> bool: - for mask in sparse_mask: - # Skip text-only samples. - if len(mask) == 0: - continue - # If the sample contains more than 1 images, - # we can't skip mask. - if len(mask) != 1: - return False - # If the sample contains only 1 image, - # but the image is not the leading one, - # we can't skip mask. 
- if mask[0][0] != 0 or mask[0][1] != -1: - return False - return True - - -def convert_sparse_cross_attention_mask_to_dense( - sparse_mask: list[list[list[int]]], - num_tiles: list[list[int]], - lengths: list[int], -) -> tuple[np.ndarray, list[tuple[int, int]]]: - total_length = sum(lengths) - total_tiles = sum([sum(tiles) for tiles in num_tiles]) - dense_mask = np.zeros(shape=(total_length, total_tiles), dtype=np.int64) - # A list of ranges, range[i] = [start, end] means that the i-th image will - # use tiles[start, end] for cross-attention decoding. - tile_range_for_decode = [] - - seq_start = 0 - tile_start = 0 - - # sparse_mask has an [] entry for each sequence that does not have images, - # but num_tiles does not have these entries... - num_tiles_idx = 0 - for masks, length in zip(sparse_mask, lengths): - if len(masks) == 0: - # Text only - continue - - tiles = num_tiles[num_tiles_idx] - num_tiles_idx += 1 - ts, td = -1, 0 - for mask, tile in zip(masks, tiles): - if len(mask) != 2: - continue - start, end = mask - end = min(end, length) - if end == -1: - end = length - if end == length: - if ts == -1: - ts = tile_start - td += tile - dense_mask[seq_start + start:seq_start + end, - tile_start:tile_start + tile] = 1 - tile_start += tile - assert ts != -1 - assert td != 0 - tile_range_for_decode.append((ts, ts + td)) - seq_start += length - assert num_tiles_idx == len(num_tiles) - - return dense_mask, tile_range_for_decode - - -def convert_dense_cross_attention_mask_to_tensor( - cross_attention_token_mask: np.ndarray, - num_tokens_per_tile: int, - device: torch.device, - dtype: torch.dtype, -) -> torch.Tensor: - mask = torch.tensor(cross_attention_token_mask, dtype=dtype, device=device) - mask = mask.repeat_interleave(num_tokens_per_tile, dim=1) - - mask = 1.0 - mask - mask = mask.masked_fill(mask.to(torch.bool), torch.finfo(dtype).min) - - ninf = torch.finfo(dtype).min - full_text_mask = ((mask != ninf).any(dim=-1).type_as(mask)[..., None]) - mask *= 
full_text_mask - # (num_prompt_tokens, num_encoder_tokens) - return mask diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 85759df369850..6bb65ed6debc6 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -52,6 +52,7 @@ _TEXT_GENERATION_MODELS = { # baichuan-13b, lower case 'c' in the class name "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), "BailingMoeForCausalLM": ("bailing_moe", "BailingMoeForCausalLM"), + "BailingMoeV2ForCausalLM": ("bailing_moe", "BailingMoeV2ForCausalLM"), "BambaForCausalLM": ("bamba", "BambaForCausalLM"), "BloomForCausalLM": ("bloom", "BloomForCausalLM"), "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), @@ -146,10 +147,6 @@ _TEXT_GENERATION_MODELS = { "TeleFLMForCausalLM": ("teleflm", "TeleFLMForCausalLM"), "XverseForCausalLM": ("llama", "LlamaForCausalLM"), "Zamba2ForCausalLM": ("zamba2", "Zamba2ForCausalLM"), - # [Encoder-decoder] - "BartModel": ("bart", "BartForConditionalGeneration"), - "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"), - "MBartForConditionalGeneration": ("bart", "MBartForConditionalGeneration"), } _EMBEDDING_MODELS = { @@ -236,6 +233,7 @@ _MULTIMODAL_MODELS = { "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"), "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501 "Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"), + "Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"), # noqa: E501 "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501 @@ -262,16 +260,12 @@ _MULTIMODAL_MODELS = { "Qwen2_5OmniModel": ("qwen2_5_omni_thinker", 
"Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 "UltravoxModel": ("ultravox", "UltravoxModel"), + "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"), "Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"), # noqa: E501 "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501 "Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"), # noqa: E501 "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501 # [Encoder-decoder] - "DonutForConditionalGeneration": ("donut", "DonutForConditionalGeneration"), - "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"), # noqa: E501 - "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501 - "Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"), # noqa: E501 - "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"), "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501 } diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index a386f47e1929f..4f51441e28efa 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -229,7 +229,8 @@ class MultiModalProcessingInfo(BaseProcessingInfo): def get_max_image_tokens(self) -> int: width, height = self.get_max_image_size() processor = self.get_hf_processor() - mm_processor_kwargs = self.ctx.model_config.mm_processor_kwargs or {} + multimodal_config = self.ctx.model_config.multimodal_config + mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {} mm_tokens = processor._get_num_multimodal_tokens( image_sizes=([height, width], ), **mm_processor_kwargs) image_tokens = 
mm_tokens["num_image_tokens"][0] @@ -380,8 +381,8 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]): # Below tested on Llava. Prompts and `mm_token_type_ids` are always bs=1 mm_positions = torch.where(mm_token_type_ids == 1)[1] images = mm_items.get_items("image", ImageProcessorItems) - mm_processor_kwargs = (self.info.ctx.model_config.mm_processor_kwargs - or {}) + multimodal_config = self.info.ctx.model_config.multimodal_config + mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {} image_sizes = [] for item_idx in range(len(images)): image_size = images.get_image_size(item_idx) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index ad911ebedf895..371ca817d5f92 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -662,7 +662,7 @@ def pad_and_concat_to_dim3( max_len = max(f.shape[-1] for f in features) # Ensure all features have dim=3 features = [f.view(-1, *f.shape[-2:]) for f in features] - # Pad and oncatenate: + # Pad and concatenate: # [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)] features = [F.pad(f, (0, max_len - f.shape[-1])) for f in features] return torch.cat(features) diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index f3b273eb41e8f..d7e9d402a1f97 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -106,7 +106,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]: return librosa.load(filepath, sr=None) - def encode_base64(self, media: tuple[npt.NDArray, float]) -> str: + def encode_base64(self, media: tuple[npt.NDArray, int]) -> str: audio, sr = media with BytesIO() as buffer: diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index ffc69a2db60a4..bad6c0c3d9db2 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -209,7 +209,7 @@ class 
MultiModalProfiler(Generic[_I]): if processor.pad_dummy_encoder_prompt: num_tokens_to_pad = max(total_len, seq_len) - total_len encoder_prompt_token_ids.extend([0] * num_tokens_to_pad) - # NOTE: Whisper and Donut allows total_len > seq_len. + # NOTE: Whisper allows total_len > seq_len. elif total_len > seq_len and not envs.VLLM_USE_V1: # `max_num_batched_tokens` is defined by `SchedulerConfig` logger.warning_once( diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index e09c97de576ef..b308366fca282 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -310,7 +310,7 @@ class MediaConnector: def encode_audio_base64( audio: np.ndarray, - sampling_rate: float, + sampling_rate: int, ) -> str: """Encode audio as base64.""" audio_io = AudioMediaIO() diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index df6e19da82ca2..fb2dcac49ee93 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -121,14 +121,6 @@ class OpenCVVideoBackend(VideoLoader): original_fps = cap.get(cv2.CAP_PROP_FPS) duration = total_frames_num / original_fps if original_fps > 0 else 0 - # Use transformers transformers.video_utils.VideoMetadata format - metadata = { - "total_num_frames": total_frames_num, - "fps": original_fps, - "duration": duration, - "video_backend": "opencv" - } - # resample video to target num_frames full_read = num_frames == -1 or total_frames_num < num_frames if full_read: @@ -159,6 +151,20 @@ class OpenCVVideoBackend(VideoLoader): assert i == num_frames, (f"Expected reading {num_frames} frames, " f"but only loaded {i} frames from video.") + # Use transformers transformers.video_utils.VideoMetadata format + # NOTE(Isotr0py): For models like Qwen3-VL/GLM4.5V, this metadata + # can cause incorrect timestamp calculation without num_frames=-1. 
+ metadata = { + "total_num_frames": num_frames, + "fps": original_fps, + "duration": duration, + "video_backend": "opencv", + "frames_indices": list(range(num_frames)), + # extra field used to control hf processor's video + # sampling behavior + "do_sample_frames": num_frames == total_frames_num, + } + return frames, metadata @@ -170,7 +176,7 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend): cls, data: bytes, num_frames: int = -1, - requested_fps: int = 2, + fps: int = 2, max_duration: int = 300, **kwargs, ) -> tuple[npt.NDArray, dict[str, Any]]: @@ -185,14 +191,6 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend): original_fps = cap.get(cv2.CAP_PROP_FPS) duration = total_frames_num / original_fps if original_fps > 0 else 0 - # Use transformers transformers.video_utils.VideoMetadata format - metadata = { - "total_num_frames": total_frames_num, - "fps": original_fps, - "duration": duration, - "video_backend": "opencv_dynamic" - } - # resample video to target num_frames max_frame_idx = total_frames_num - 1 duration = duration or round(max_frame_idx / original_fps) + 1 @@ -201,14 +199,13 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend): # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140 frame_indices: Union[range, list[int]] if duration <= max_duration: - n = int(math.floor(duration * requested_fps)) + n = int(math.floor(duration * fps)) frame_indices = sorted({ - min(max_frame_idx, - int(math.ceil(i * original_fps / requested_fps))) + min(max_frame_idx, int(math.ceil(i * original_fps / fps))) for i in range(n) }) else: - num_samples = int(max_duration * requested_fps) + num_samples = int(max_duration * fps) if num_samples >= total_frames_num: frame_indices = range(total_frames_num) else: @@ -241,6 +238,16 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend): f"Expected reading {len(frame_indices)} frames, " f"but only loaded {i} frames from video.") + # Use transformers 
transformers.video_utils.VideoMetadata format + metadata = { + "total_num_frames": total_frames_num, + "fps": original_fps, + "duration": duration, + "video_backend": "opencv_dynamic", + "frames_indices": list(frame_indices), + "do_sample_frames": False, + } + return frames, metadata diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 32208e7fff018..67ef058df10f1 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -163,13 +163,15 @@ class XPUPlatform(Platform): vllm_config.scheduler_config.max_num_batched_tokens = max( vllm_config.scheduler_config.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) + from vllm.v1.attention.backends.utils import set_kv_cache_layout - if (envs.VLLM_KV_CACHE_LAYOUT is None - or envs.VLLM_KV_CACHE_LAYOUT != "NHD"): - os.environ["VLLM_KV_CACHE_LAYOUT"] = "NHD" - logger.info( - "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; " - "only NHD layout is supported by XPU attention kernels.") + set_kv_cache_layout("NHD") + logger.info("Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; " + "only NHD layout is supported by XPU attention kernels.") + + @classmethod + def support_hybrid_kv_cache(cls) -> bool: + return True @classmethod def is_pin_memory_available(cls): diff --git a/vllm/test_utils.py b/vllm/test_utils.py index 23679b8228d6f..91dcc2fd84e17 100644 --- a/vllm/test_utils.py +++ b/vllm/test_utils.py @@ -36,7 +36,6 @@ MODELS_ON_S3 = [ "llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/LLaVA-NeXT-Video-7B-hf", # "meta-llama/Llama-2-7b-hf", - "meta-llama/Llama-3.2-11B-Vision-Instruct", "meta-llama/Llama-3.2-1B", "meta-llama/Llama-3.2-1B-Instruct", "meta-llama/Meta-Llama-3-8B", diff --git a/vllm/transformers_utils/chat_templates/registry.py b/vllm/transformers_utils/chat_templates/registry.py index d09c5fa924fb0..3a97f2c056181 100644 --- a/vllm/transformers_utils/chat_templates/registry.py +++ b/vllm/transformers_utils/chat_templates/registry.py @@ -35,7 +35,6 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, 
ChatTemplatePath] = { "blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja", "chameleon": CHAT_TEMPLATES_DIR / "template_basic.jinja", "deepseek_vl_v2": CHAT_TEMPLATES_DIR / "template_deepseek_vl2.jinja", - "florence2": CHAT_TEMPLATES_DIR / "template_basic.jinja", "fuyu": CHAT_TEMPLATES_DIR / "template_fuyu.jinja", "minicpmv": _get_minicpmv_chat_template_fallback, "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja", diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index fd19d33ca0c89..cafc43f6b7673 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -90,11 +90,6 @@ _AUTO_CONFIG_KWARGS_OVERRIDES: dict[str, dict[str, Any]] = { "internvl_chat": { "has_no_defaults_at_init": True }, - # transformers regards mllama as is_encoder_decoder=False - # vllm needs is_encoder_decoder=True to enable cross-attention - "mllama": { - "is_encoder_decoder": True - }, "NVLM_D": { "has_no_defaults_at_init": True }, diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py index 372200027bf95..2a06a9b7d11e4 100644 --- a/vllm/triton_utils/importing.py +++ b/vllm/triton_utils/importing.py @@ -68,7 +68,7 @@ class TritonPlaceholder(types.ModuleType): def __init__(self): super().__init__("triton") - self.__version__ = "3.3.0" + self.__version__ = "3.4.0" self.jit = self._dummy_decorator("jit") self.autotune = self._dummy_decorator("autotune") self.heuristics = self._dummy_decorator("heuristics") diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 90cdd396209c7..38d92f01192b1 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -67,23 +67,6 @@ def _missing(*_: Any, **__: Any) -> NoReturn: "package to enable FP8 kernels.") -def _resolve_symbol(module, new: str, old: str) -> Callable[..., Any] | None: - """Return the *new* symbol if it exists, otherwise the *old* one.""" - if hasattr(module, new): - return getattr(module, new) - if hasattr(module, old): - # 
TODO(wentao): deprecate old symbol in the future. - logger.warning_once( - "Found legacy DeepGEMM symbol `%s`. Please upgrade the `deep_gemm` " - "package so that `%s` is available. Support for the legacy symbol " - "will be removed in a future vLLM release.", - old, - new, - ) - return getattr(module, old) - return None - - _fp8_gemm_nt_impl: Callable[..., Any] | None = None _grouped_impl: Callable[..., Any] | None = None _grouped_masked_impl: Callable[..., Any] | None = None @@ -109,14 +92,9 @@ def _lazy_init() -> None: _dg = importlib.import_module("deep_gemm") - _fp8_gemm_nt_impl = _resolve_symbol(_dg, "fp8_gemm_nt", - "gemm_fp8_fp8_bf16_nt") - _grouped_impl = _resolve_symbol( - _dg, "m_grouped_fp8_gemm_nt_contiguous", - "m_grouped_gemm_fp8_fp8_bf16_nt_contiguous") - _grouped_masked_impl = _resolve_symbol( - _dg, "fp8_m_grouped_gemm_nt_masked", - "m_grouped_gemm_fp8_fp8_bf16_nt_masked") + _fp8_gemm_nt_impl = getattr(_dg, "fp8_gemm_nt", None) + _grouped_impl = getattr(_dg, "m_grouped_fp8_gemm_nt_contiguous", None) + _grouped_masked_impl = getattr(_dg, "fp8_m_grouped_gemm_nt_masked", None) def fp8_gemm_nt(*args, **kwargs): diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index 6017445402eca..78af8d28f8892 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -210,9 +210,14 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): sm_scale, num_kv_splits, ) - returned_lse = lse[:, :H].contiguous( - ) if self.need_to_return_lse_for_decode else lse - return out[:, :H].contiguous(), returned_lse + + if H < MAX_HEADS: + # Extract the subsets of the outputs + returned_lse = lse[:, :H].contiguous( + ) if self.need_to_return_lse_for_decode else lse + out = out[:, :H] + + return out, returned_lse def _sm100_forward_decode( self, @@ -228,11 +233,6 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): self._workspace.ensure_size(attn_metadata, 
self._num_kv_splits) # Run MLA - # Clone q_nope and q_pe to make sure strides computation is correct. - # TODO: Check if we really need it - q_nope = q_nope.clone() - q_pe = q_pe.clone() - o, lse = self._sm100_cutlass_mla_decode( q_nope, q_pe, diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 549af1a062252..150e38553e4bb 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -12,7 +12,6 @@ from vllm.attention.ops.flashmla import (flash_mla_with_kvcache, is_flashmla_supported) from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.platforms.cuda import CudaPlatform from vllm.v1.attention.backends.mla.common import (MLACommonBackend, MLACommonDecodeMetadata, MLACommonImpl, @@ -156,18 +155,8 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): logits_soft_cap, attn_type, kv_sharing_target_layer_name, **mla_args) - assert is_flashmla_supported(), \ - "FlashMLA is not supported on this device" - - # disallow FlashMLA on NVIDIA Blackwell (SM 10.0+) GPUs - # context: - # https://github.com/deepseek-ai/FlashMLA/issues/83 - # https://github.com/vllm-project/vllm/issues/24513 - if CudaPlatform.has_device_capability(100): - raise NotImplementedError( - "FlashMLA is temporarily disabled on Blackwell (SM 10.0). " - "Please use CUTLASS_MLA or TRITON_MLA instead. 
" - "Example: `export VLLM_ATTENTION_BACKEND=CUTLASS_MLA`") + is_supported, reason = is_flashmla_supported() + assert is_supported, reason unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] if any(unsupported_features): diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index c1814b4ba27cc..ec1c5c8060ab3 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -5,8 +5,8 @@ import enum import functools from abc import abstractmethod from dataclasses import dataclass, fields, make_dataclass -from typing import (TYPE_CHECKING, Any, ClassVar, Generic, Optional, Protocol, - TypeVar) +from typing import (TYPE_CHECKING, Any, ClassVar, Generic, Literal, Optional, + Protocol, TypeVar, Union, get_args) import numpy as np import torch @@ -30,7 +30,12 @@ from vllm.logger import init_logger from vllm.v1.kv_cache_interface import AttentionSpec logger = init_logger(__name__) -_KV_CACHE_LAYOUT_OVERRIDE = None +KVCacheLayoutType = Literal["NHD", "HND"] +_KV_CACHE_LAYOUT_OVERRIDE: Union[KVCacheLayoutType, None] = None + + +def is_valid_kv_cache_layout(value: str) -> bool: + return value in get_args(KVCacheLayoutType) @dataclass @@ -296,12 +301,13 @@ def get_kv_cache_layout(): if cache_layout is None: cache_layout = get_kv_connector_cache_layout() else: + assert is_valid_kv_cache_layout(cache_layout) logger.info_once("`VLLM_KV_CACHE_LAYOUT` environment variable " \ "detected. 
Setting KV cache layout to %s.", cache_layout) return cache_layout -def set_kv_cache_layout(cache_layout: str): +def set_kv_cache_layout(cache_layout: KVCacheLayoutType): global _KV_CACHE_LAYOUT_OVERRIDE _KV_CACHE_LAYOUT_OVERRIDE = cache_layout diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index f939da8c5b5c3..f225b73264049 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -5,7 +5,7 @@ import os from collections import defaultdict, deque from collections.abc import Iterable, Sequence -from dataclasses import astuple, dataclass +from dataclasses import dataclass from typing import Any, Callable, NewType, Optional, Union from vllm import envs @@ -116,8 +116,8 @@ class PrefixCachingMetrics: This function is called with information gathered when new requests are being scheduled and are looking for computed blocks. - When there are more than `interval` requests, the oldest set of - requests are removed from the metrics. + When there are more than `max_recent_requests` requests, the oldest set + of requests are removed from the metrics. Args: stats: The prefix cache stats. @@ -370,7 +370,6 @@ class FreeKVCacheBlockQueue: """ if len(blocks) == 0: return - self.num_free_blocks += len(blocks) last_block = self.fake_free_list_tail.prev_free_block assert last_block is not None, ( @@ -385,6 +384,8 @@ class FreeKVCacheBlockQueue: last_block.next_free_block = self.fake_free_list_tail self.fake_free_list_tail.prev_free_block = last_block + self.num_free_blocks += len(blocks) + def get_all_free_blocks(self) -> list[KVCacheBlock]: """Get all free blocks in the free list. Mainly used for testing. 
@@ -811,59 +812,21 @@ def get_uniform_page_size(kv_cache_spec: dict[str, KVCacheSpec]) -> int: return page_sizes.pop() -def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, - kv_cache_spec: dict[str, KVCacheSpec], - available_memory: int) -> KVCacheConfig: +def _get_kv_cache_groups_uniform_type( + kv_cache_specs: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]: """ Generates the KV cache configuration for a model with one type of KV cache. Divide the available memory equally among all layers. Args: - vllm_config: The global VllmConfig - kv_cache_spec: The kv cache spec of each attention layer in the model - available_memory: Memory available for KV cache in bytes. + kv_cache_specs: The kv cache spec of each attention layer in the model Returns: - The generated KVCacheConfig + The generated KVCacheGroupSpecs """ - page_size = get_uniform_page_size(kv_cache_spec) - num_blocks = get_num_blocks(vllm_config, len(kv_cache_spec), - available_memory, page_size) - - per_layer_size = page_size * num_blocks - # All layers have the same KV cache spec, so we create one kv cache group - # for all layers. - grouped_layer_names = [list(kv_cache_spec.keys())] - - # Each layer uses a separate Tensor to store its KV cache. 
- kv_cache_tensors = [ - KVCacheTensor(size=per_layer_size, shared_by=[layer_name]) - for layer_name in kv_cache_spec - ] - - kv_cache_config = KVCacheConfig( - num_blocks=num_blocks, - kv_cache_tensors=kv_cache_tensors, - kv_cache_groups=create_kv_cache_group_specs(kv_cache_spec, - grouped_layer_names), - ) - - num_tokens = num_blocks * vllm_config.cache_config.block_size - if vllm_config.parallel_config.decode_context_parallel_size > 1: - num_tokens *= vllm_config.parallel_config.decode_context_parallel_size - logger.info( - "Multiplying the GPU KV cache size by the dcp_world_size %d.", - vllm_config.parallel_config.decode_context_parallel_size) - - num_tokens_str = f"{num_tokens:,}" - logger.info("GPU KV cache size: %s tokens", num_tokens_str) - max_model_len_str = f"{vllm_config.model_config.max_model_len:,}" - max_concurrency = get_max_concurrency_for_kv_cache_config( - vllm_config, kv_cache_config) - logger.info("Maximum concurrency for %s tokens per request: %.2fx", - max_model_len_str, max_concurrency) - return kv_cache_config + return create_kv_cache_group_specs(kv_cache_specs, + [list(kv_cache_specs.keys())]) def is_kv_cache_page_size_uniform( @@ -888,11 +851,10 @@ def is_kv_cache_type_attention_free( return not kv_cache_spec -def _get_kv_cache_config_uniform_page_size( - vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec], - available_memory: int) -> KVCacheConfig: +def _get_kv_cache_groups_uniform_page_size( + kv_cache_spec: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]: """ - Generates the KV cache configuration for hybrid models with multiple + Generates the KV cache groups for hybrid models with multiple attention types but still with a uniform page size (physical memory per block per layer) for all layers. @@ -949,11 +911,9 @@ def _get_kv_cache_config_uniform_page_size( memory per block is the same for all groups. 
Args: - vllm_config: The global VllmConfig kv_cache_spec: The KVCacheSpec of each attention layer in the model - available_memory: Memory available for KV cache in bytes. Returns: - The generated KVCacheConfig + The generated KVCacheGroupSpecs """ # Group all layers by kv_cache_spec. # E.g., 2 full attention layers and 3 sliding window attention layers, @@ -966,7 +926,7 @@ def _get_kv_cache_config_uniform_page_size( # group identical. Add padding to the last group of each type if necessary. # E.g., (full.0, full.1), (sw.0, sw.1, sw.2) # split to 3 groups with 2 layers each: - # (full.0, full.1), (sw.0, sw.1), (sw.2, padding). + # (full.0, full.1), (sw.0, sw.2), (sw.1, padding). # FIXME(Chen): At the moment of writing this code (2025-06-02), all # open-source hybrid model follows a n:1 pattern between different attention # types (e.g., Gemma3 5:1 between sw and full, LLaMA4 3:1 between local and @@ -984,19 +944,60 @@ def _get_kv_cache_config_uniform_page_size( num_padding_layers, num_padding_layers / len(layers) * 100, ) - for i in range(0, len(layers), group_size): - grouped_layers.append(layers[i:i + group_size]) - kv_cache_groups = create_kv_cache_group_specs(kv_cache_spec, - grouped_layers) + num_groups = cdiv(len(layers), group_size) + # In PP case, say if we have + # - stage 0: full.0, sw.0, sw.1 + # - stage 1: full.1, sw.2, sw.3 + # We should have 3 groups: (full.0, full.1), (sw.0, sw.2), (sw.1, sw.3) + # It can't be (full.0, full.1), (sw.0, sw.1), (sw.2, sw.3) because + # the 3 groups in stage 0 will be (full.0), (sw.0, sw.1), (empty group) + # and it will be padded to (full.0, padding), (sw.0, sw.1), + # (padding, padding) to ensure the number of layers in each group is + # the same and will cause memory waste. 
+ # To avoid this, we assign layers[i::num_groups] to the i-th group + # instead of layers[i * group_size: (i + 1) * group_size] + for i in range(num_groups): + grouped_layers.append(layers[i::num_groups]) + return create_kv_cache_group_specs(kv_cache_spec, grouped_layers) + + +def get_kv_cache_config_from_groups(vllm_config: VllmConfig, + kv_cache_groups: list[KVCacheGroupSpec], + kv_cache_specs: dict[str, KVCacheSpec], + available_memory: int) -> KVCacheConfig: + """ + Generate the KV cache configuration from the KV cache groups and spec + of each layer. + + Args: + vllm_config: The global VllmConfig + kv_cache_groups: The KV cache groups + kv_cache_specs: The KV cache spec of each attention layer in the model + available_memory: Memory available for KV cache in bytes + Returns: + The generated KVCacheConfig + """ + if len(kv_cache_groups) == 0: + # Attention free models do not have KV cache. + # Return num_blocks=1 as BlockPool always needs a null_block. + return KVCacheConfig( + num_blocks=1, + kv_cache_tensors=[], + kv_cache_groups=kv_cache_groups, + ) # Determine how model runners should initialize the KV cache tensors. # We will have group_size memory pools, each is shared by one layer from # each group. As layers of different groups have different block table, # they will use different parts of the shared Tensor. 
- # The memory layout in the example will be: - # full.0, sw.0, sw.2: share a Tensor with size=available_memory//2 - # full.1, sw.1: share another Tensor with size=available_memory//2 - page_size = get_uniform_page_size(kv_cache_spec) + # The memory layout for 3 groups (full.0, full.1), (sw.0, sw.2), + # (sw.1, padding) will be: (group_size = 2) + # full.0, sw.0, sw.1: share a Tensor with size=available_memory//2 + # full.1, sw.2: share another Tensor with size=available_memory//2 + group_size = max(len(group.layer_names) for group in kv_cache_groups) + + page_size = get_uniform_page_size(kv_cache_specs) + assert group_size > 0, "group_size must be greater than 0" num_blocks = get_num_blocks(vllm_config, group_size, available_memory, page_size) per_memory_pool_size = page_size * num_blocks @@ -1004,8 +1005,8 @@ def _get_kv_cache_config_uniform_page_size( for i in range(group_size): shared_by = [] for j in range(len(kv_cache_groups)): - if i < len(grouped_layers[j]): - shared_by.append(grouped_layers[j][i]) + if i < len(kv_cache_groups[j].layer_names): + shared_by.append(kv_cache_groups[j].layer_names[i]) kv_cache_tensors.append( KVCacheTensor(size=per_memory_pool_size, shared_by=shared_by)) @@ -1019,7 +1020,12 @@ def _get_kv_cache_config_uniform_page_size( [group.kv_cache_spec.block_size for group in kv_cache_groups]) # Print the KV cache size and maximum concurrency. 
- num_tokens = num_blocks // len(grouped_layers) * min_block_size + num_tokens = num_blocks // len(kv_cache_groups) * min_block_size + if vllm_config.parallel_config.decode_context_parallel_size > 1: + num_tokens *= vllm_config.parallel_config.decode_context_parallel_size + logger.info( + "Multiplying the GPU KV cache size by the dcp_world_size %d.", + vllm_config.parallel_config.decode_context_parallel_size) num_tokens_str = f"{num_tokens:,}" logger.info("GPU KV cache size: %s tokens", num_tokens_str) max_model_len_str = f"{vllm_config.model_config.max_model_len:,}" @@ -1030,10 +1036,6 @@ def _get_kv_cache_config_uniform_page_size( return kv_cache_config -def _get_kv_cache_config_attention_free() -> KVCacheConfig: - return KVCacheConfig(num_blocks=1, kv_cache_tensors=[], kv_cache_groups=[]) - - def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): """ This function tries to convert the KV cache specs to one type if the model @@ -1087,72 +1089,112 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): "convert the KV cache specs to one unified type.") -def get_kv_cache_config( - vllm_config: VllmConfig, - kv_cache_spec: dict[str, KVCacheSpec], - available_memory: int, -) -> KVCacheConfig: +def get_kv_cache_groups( + vllm_config: VllmConfig, + kv_cache_spec: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]: """ - Generates the KV cache configuration for a model. + Split the layers in the model into groups with the same KV cache spec. Args: vllm_config: The global VllmConfig kv_cache_spec: The kv cache spec of each attention layer in the model - available_memory: Memory available for KV cache in bytes. 
Returns: - The generated KVCacheConfigs + The generated KVCacheGroups """ - check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory) if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager: unify_hybrid_kv_cache_specs(kv_cache_spec) if is_kv_cache_type_attention_free(kv_cache_spec): - # This returns a kv_cache config with 0 kv_cache groups and 1 block - # to allow for the KVCache manager to handle attention free models. - return _get_kv_cache_config_attention_free() + # This returns an empty list to allow for the KVCacheManager to handle + # attention free models. + return [] elif is_kv_cache_type_uniform(kv_cache_spec): # KV cache of all layers are the same, which is true for # most models. Allocate the same amount of memory for # each layer. - return _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec, - available_memory) + return _get_kv_cache_groups_uniform_type(kv_cache_spec) elif is_kv_cache_page_size_uniform(kv_cache_spec): # Model contains multiple attention types, but KV cache of all layers # have the same physical memory per block per layer. Split the layers # into groups with the same number of layers, and thus same total page # size. - return _get_kv_cache_config_uniform_page_size(vllm_config, - kv_cache_spec, - available_memory) + return _get_kv_cache_groups_uniform_page_size(kv_cache_spec) raise NotImplementedError -def unify_kv_cache_configs(kv_cache_configs: list[KVCacheConfig]): +def get_kv_cache_configs(vllm_config: VllmConfig, + kv_cache_specs: list[dict[str, KVCacheSpec]], + available_memory: list[int]) -> list[KVCacheConfig]: """ - Make the KV cache configurations for each worker consistent, so that all - workers can be controlled by the same KVCacheManager. - This function verifies that the layer group of each worker are the same, - and changes the num_blocks of each worker to the smallest among all workers. + Generates the KV cache configurations for a model. 
+ Since we use a shared centralized controller for all workers, we need the + `kv_cache_config` to be consistent across all workers to make sure + the KV cache allocation can be applied to all workers. However, different + workers may have different memory available, and different type of layers + (when pipeline parallel is enabled). To handle the difference between + workers, the current implementation is: + 1. Merge the KV cache specs of all workers to get the KVCacheSpecs for + the whole model. + 2. Generate the KV cache groups based on the layer ratio of the whole model. + 3. Generate the KV cache configs for each worker based on the KV cache + grouping strategy. (This is reasonable because the layer ratio of + different PP stages are similar.) + 4. Change the num_blocks of each worker to the smallest among all workers. Args: - kv_cache_configs: The KV cache configurations for each worker. Will be - in-place modified to make them consistent. + vllm_config: The global VllmConfig + kv_cache_specs: List of dict[layer_name, KVCacheSpec] for each worker. + available_memory: Memory available for KV cache in bytes for each + worker. + + Returns: + The generated KVCacheConfigs for each worker. """ - # Sort the kv cache groups by their KV cache spec. - # This can avoid the inconsistency caused by the order of groups. - for kv_cache_config in kv_cache_configs: - kv_cache_config.kv_cache_groups.sort(key=lambda x: (type( - x.kv_cache_spec).__name__, astuple(x.kv_cache_spec))) + # Check if the available memory is enough for each worker. + for kv_cache_spec_one_worker, available_memory_one_worker in zip( + kv_cache_specs, available_memory): + check_enough_kv_cache_memory(vllm_config, kv_cache_spec_one_worker, + available_memory_one_worker) - # Verify that the groups of each rank are the same. 
- for kv_cache_config in kv_cache_configs[1:]: - for group_rank_0, group_rank_i in zip( - kv_cache_configs[0].kv_cache_groups, - kv_cache_config.kv_cache_groups): - assert group_rank_0.kv_cache_spec == group_rank_i.kv_cache_spec + # Merge the KV cache specs of all workers. Different PP stages may have + # different layer names, and different TP ranks of the same PP stage should + # have the same KV cache spec. + merged_kv_cache_specs: dict[str, KVCacheSpec] = {} + for kv_cache_spec_one_worker in kv_cache_specs: + for layer_name, layer_spec in kv_cache_spec_one_worker.items(): + if layer_name not in merged_kv_cache_specs: + merged_kv_cache_specs[layer_name] = layer_spec + else: + assert merged_kv_cache_specs[layer_name] == layer_spec, ( + "The KV cache specs for the same layer are different " + "across workers. This is not supported yet.") + global_kv_cache_groups = get_kv_cache_groups(vllm_config, + merged_kv_cache_specs) + + kv_cache_configs: list[KVCacheConfig] = [] + for kv_cache_spec_one_worker, available_memory_one_worker in zip( + kv_cache_specs, available_memory): + kv_cache_groups_one_worker: list[KVCacheGroupSpec] = [] + for group in global_kv_cache_groups: + group_layer_names_one_worker = [ + layer_name for layer_name in group.layer_names + if layer_name in kv_cache_spec_one_worker + ] + kv_cache_groups_one_worker.append( + KVCacheGroupSpec(group_layer_names_one_worker, + group.kv_cache_spec)) + assert sum( + len(group.layer_names) for group in + kv_cache_groups_one_worker) == len(kv_cache_spec_one_worker), ( + "Some layers are not assigned to any group.") + kv_cache_configs.append( + get_kv_cache_config_from_groups(vllm_config, + kv_cache_groups_one_worker, + kv_cache_spec_one_worker, + available_memory_one_worker)) # Change the num_blocks of each rank to the smallest among all ranks. 
We # do not need to shrink the tensor size because it is valid to only use the diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py index 42d3e5c68b4c8..c431843de6baa 100644 --- a/vllm/v1/core/sched/utils.py +++ b/vllm/v1/core/sched/utils.py @@ -10,19 +10,19 @@ from vllm.v1.request import Request, RequestStatus def remove_all(lst: list, items_to_remove: set) -> list: """Remove all items from a list that are in the items_to_remove set. - + This method optimizes for the common case of removing a single item, falling back to list comprehension for multiple items. - + Args: lst: The list to remove items from items_to_remove: Set of items to remove - + Returns: Either the modified original list (for single item removal) or a new list (for multiple item removal). Callers should use the returned value. - + Note: For single item removal, this modifies the original list in-place and returns it. For multiple items, it creates and returns a new list. diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 995e70385be89..a022e9c0d7058 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -29,10 +29,9 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.utils import (decorate_logs, get_hash_fn_by_name, make_zmq_socket, resolve_obj_by_qualname, set_process_title) -from vllm.v1.core.kv_cache_utils import (BlockHash, get_kv_cache_config, +from vllm.v1.core.kv_cache_utils import (BlockHash, get_kv_cache_configs, get_request_block_hasher, - init_none_hash, - unify_kv_cache_configs) + init_none_hash) from vllm.v1.core.sched.interface import SchedulerInterface from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler @@ -129,6 +128,9 @@ class EngineCore: log_stats=self.log_stats, ) self.use_spec_decode = vllm_config.speculative_config is not None + if self.scheduler.connector is not None: # type: ignore + 
self.model_executor.init_kv_output_aggregator( + self.scheduler.connector.get_finished_count()) # type: ignore self.mm_registry = mm_registry = MULTIMODAL_REGISTRY self.mm_receiver_cache = engine_receiver_cache_from_config( @@ -191,18 +193,9 @@ class EngineCore: available_gpu_memory = [0] * len(kv_cache_specs) assert len(kv_cache_specs) == len(available_gpu_memory) - # Get the kv cache tensor size - kv_cache_configs = [ - get_kv_cache_config(vllm_config, kv_cache_spec_one_worker, - available_gpu_memory_one_worker) - for kv_cache_spec_one_worker, available_gpu_memory_one_worker in - zip(kv_cache_specs, available_gpu_memory) - ] - # Since we use a shared centralized controller, we need the - # `kv_cache_config` to be consistent across all workers to make sure - # all the memory operators can be applied to all workers. - unify_kv_cache_configs(kv_cache_configs) + kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs, + available_gpu_memory) # All workers have the same kv_cache_config except layer names, so use # an arbitrary one to initialize the scheduler. diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 02c8c61cb9093..14ac1e3e5afa8 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -373,17 +373,17 @@ class OutputProcessor: 1) Compute stats for logging 2) Detokenize 3) Create and handle RequestOutput objects: - * If there is a queue (for usage with AsyncLLM), + * If there is a queue (for usage with AsyncLLM), put the RequestOutput objects into the queue for handling by the per-request generate() tasks. - * If there is no queue (for usage with LLMEngine), + * If there is no queue (for usage with LLMEngine), return a list of RequestOutput objects. NOTE FOR DEVELOPERS vLLM V1 minimizes the number of python loops over the full - batch to ensure system overheads are minimized. This is the + batch to ensure system overheads are minimized. 
This is the only function that should loop over EngineCoreOutputs. If you need to touch every element of the batch, do it from diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index f3fad15b750ad..327b4e2705485 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -498,7 +498,7 @@ class Processor: assert isinstance(mm_processor, EncDecMultiModalProcessor) if mm_processor.pad_dummy_encoder_prompt: - return # Skip encoder length check for Whisper and Donut + return # Skip encoder length check for Whisper if model_config.is_multimodal_model: suggestion = ( diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index f566c9aee0c54..3aa373f12b609 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -26,7 +26,6 @@ from vllm.distributed import (destroy_distributed_environment, destroy_model_parallel) from vllm.distributed.device_communicators.shm_broadcast import (Handle, MessageQueue) -from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator from vllm.distributed.parallel_state import (get_dp_group, get_ep_group, get_pp_group, get_tp_group) from vllm.executor.multiproc_worker_utils import ( @@ -135,8 +134,6 @@ class MultiprocExecutor(Executor): self.output_rank = self._get_output_rank() self.has_connector = self.vllm_config.kv_transfer_config is not None - self.kv_output_aggregator = KVOutputAggregator( - self.parallel_config.world_size) def start_worker_monitor(self): workers = self.workers diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py index 59c9b56625a95..aadb5fd1dddd5 100644 --- a/vllm/v1/executor/ray_distributed_executor.py +++ b/vllm/v1/executor/ray_distributed_executor.py @@ -51,8 +51,6 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor): # KV connector setup self.has_connector = self.vllm_config.kv_transfer_config is not None - 
self.kv_output_aggregator = KVOutputAggregator( - self.parallel_config.world_size) @property def max_concurrent_batches(self) -> int: diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 347185d8341ee..b30036a6f8e80 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -169,15 +169,11 @@ class PrometheusStatLogger(StatLoggerBase): model_name = vllm_config.model_config.served_model_name max_model_len = vllm_config.model_config.max_model_len - if (len(self.engine_indexes) > 1 - and vllm_config.speculative_config is not None): - raise NotImplementedError("Prometheus metrics with Spec Decoding " - "with >1 EngineCore per AsyncLLM is not " - "supported yet.") - spec_decode_labelvalues = [ - vllm_config.model_config.served_model_name, - str(self.engine_indexes[0]) - ] + spec_decode_labelvalues: dict[int, list[str]] = { + idx: [model_name, str(idx)] + for idx in engine_indexes + } + self.spec_decoding_prom = self._spec_decoding_cls( vllm_config.speculative_config, labelnames, spec_decode_labelvalues) @@ -206,40 +202,46 @@ class PrometheusStatLogger(StatLoggerBase): # # GPU cache # - # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc - # TODO: in 0.10, only enable if show_hidden_metrics=True - gauge_gpu_cache_usage = self._gauge_cls( - name="vllm:gpu_cache_usage_perc", - documentation=( - "GPU KV-cache usage. 1 means 100 percent usage." - "DEPRECATED: Use vllm:kv_cache_usage_perc instead."), - multiprocess_mode="mostrecent", - labelnames=labelnames) - self.gauge_gpu_cache_usage = make_per_engine(gauge_gpu_cache_usage, - engine_indexes, - model_name) + # Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc + # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10 + # TODO: remove in 0.12.0 + if self.show_hidden_metrics: + gauge_gpu_cache_usage = self._gauge_cls( + name="vllm:gpu_cache_usage_perc", + documentation=( + "GPU KV-cache usage. 1 means 100 percent usage." 
+ "DEPRECATED: Use vllm:kv_cache_usage_perc instead."), + multiprocess_mode="mostrecent", + labelnames=labelnames) + self.gauge_gpu_cache_usage = make_per_engine( + gauge_gpu_cache_usage, engine_indexes, model_name) - # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries - # TODO: in 0.10, only enable if show_hidden_metrics=True - counter_gpu_prefix_cache_queries = self._counter_cls( - name="vllm:gpu_prefix_cache_queries", - documentation=( - "GPU prefix cache queries, in terms of number of queried" - "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."), - labelnames=labelnames) - self.counter_gpu_prefix_cache_queries = make_per_engine( - counter_gpu_prefix_cache_queries, engine_indexes, model_name) + # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries + # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10 + # TODO: remove in 0.12.0 + if self.show_hidden_metrics: + counter_gpu_prefix_cache_queries = self._counter_cls( + name="vllm:gpu_prefix_cache_queries", + documentation=( + "GPU prefix cache queries, in terms of number of queried" + "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead." + ), + labelnames=labelnames) + self.counter_gpu_prefix_cache_queries = make_per_engine( + counter_gpu_prefix_cache_queries, engine_indexes, model_name) - # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits - # TODO: in 0.10, only enable if show_hidden_metrics=True - counter_gpu_prefix_cache_hits = self._counter_cls( - name="vllm:gpu_prefix_cache_hits", - documentation=( - "GPU prefix cache hits, in terms of number of cached " - "tokens. 
DEPRECATED: Use vllm:prefix_cache_hits instead."), - labelnames=labelnames) - self.counter_gpu_prefix_cache_hits = make_per_engine( - counter_gpu_prefix_cache_hits, engine_indexes, model_name) + # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits + # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10 + # TODO: remove in 0.12.0 + if self.show_hidden_metrics: + counter_gpu_prefix_cache_hits = self._counter_cls( + name="vllm:gpu_prefix_cache_hits", + documentation=( + "GPU prefix cache hits, in terms of number of cached " + "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."), + labelnames=labelnames) + self.counter_gpu_prefix_cache_hits = make_per_engine( + counter_gpu_prefix_cache_hits, engine_indexes, model_name) gauge_kv_cache_usage = self._gauge_cls( name="vllm:kv_cache_usage_perc", @@ -513,15 +515,17 @@ class PrometheusStatLogger(StatLoggerBase): self.gauge_scheduler_waiting[engine_idx].set( scheduler_stats.num_waiting_reqs) - self.gauge_gpu_cache_usage[engine_idx].set( - scheduler_stats.kv_cache_usage) + if self.show_hidden_metrics: + self.gauge_gpu_cache_usage[engine_idx].set( + scheduler_stats.kv_cache_usage) self.gauge_kv_cache_usage[engine_idx].set( scheduler_stats.kv_cache_usage) - self.counter_gpu_prefix_cache_queries[engine_idx].inc( - scheduler_stats.prefix_cache_stats.queries) - self.counter_gpu_prefix_cache_hits[engine_idx].inc( - scheduler_stats.prefix_cache_stats.hits) + if self.show_hidden_metrics: + self.counter_gpu_prefix_cache_queries[engine_idx].inc( + scheduler_stats.prefix_cache_stats.queries) + self.counter_gpu_prefix_cache_hits[engine_idx].inc( + scheduler_stats.prefix_cache_stats.hits) self.counter_prefix_cache_queries[engine_idx].inc( scheduler_stats.prefix_cache_stats.queries) @@ -530,7 +534,7 @@ class PrometheusStatLogger(StatLoggerBase): if scheduler_stats.spec_decoding_stats is not None: self.spec_decoding_prom.observe( - scheduler_stats.spec_decoding_stats) + scheduler_stats.spec_decoding_stats, 
engine_idx) if iteration_stats is None: return diff --git a/vllm/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py index 2aa8962f5739c..282e6f65e7abe 100644 --- a/vllm/v1/spec_decode/metrics.py +++ b/vllm/v1/spec_decode/metrics.py @@ -140,27 +140,32 @@ class SpecDecodingProm: self, speculative_config: Optional[SpeculativeConfig], labelnames: list[str], - labelvalues: list[str], + per_engine_labelvalues: dict[int, list[str]], ): self.spec_decoding_enabled = speculative_config is not None if not self.spec_decoding_enabled: return - self.counter_spec_decode_num_drafts = \ - self._counter_cls( - name="vllm:spec_decode_num_drafts", - documentation="Number of spec decoding drafts.", - labelnames=labelnames).labels(*labelvalues) - self.counter_spec_decode_num_draft_tokens = \ - self._counter_cls( - name="vllm:spec_decode_num_draft_tokens", - documentation="Number of draft tokens.", - labelnames=labelnames,).labels(*labelvalues) - self.counter_spec_decode_num_accepted_tokens = \ - self._counter_cls( - name="vllm:spec_decode_num_accepted_tokens", - documentation="Number of accepted tokens.", - labelnames=labelnames).labels(*labelvalues) + counter_drafts = self._counter_cls( + name="vllm:spec_decode_num_drafts", + documentation="Number of spec decoding drafts.", + labelnames=labelnames) + self.counter_spec_decode_num_drafts = make_per_engine( + counter_drafts, per_engine_labelvalues) + + counter_draft_tokens = self._counter_cls( + name="vllm:spec_decode_num_draft_tokens", + documentation="Number of draft tokens.", + labelnames=labelnames) + self.counter_spec_decode_num_draft_tokens = make_per_engine( + counter_draft_tokens, per_engine_labelvalues) + + counter_accepted_tokens = self._counter_cls( + name="vllm:spec_decode_num_accepted_tokens", + documentation="Number of accepted tokens.", + labelnames=labelnames) + self.counter_spec_decode_num_accepted_tokens = make_per_engine( + counter_accepted_tokens, per_engine_labelvalues) assert speculative_config is not None 
num_spec_tokens = (speculative_config.num_speculative_tokens @@ -171,21 +176,36 @@ class SpecDecodingProm: documentation="Accepted tokens per draft position.", labelnames=pos_labelnames, ) - self.counter_spec_decode_num_accepted_tokens_per_pos: list[ - prometheus_client.Counter] = [] - for pos in range(num_spec_tokens): - pos_labelvalues = labelvalues + [str(pos)] - self.counter_spec_decode_num_accepted_tokens_per_pos.append( - base_counter.labels(*pos_labelvalues)) + self.counter_spec_decode_num_accepted_tokens_per_pos: dict[ + int, list[prometheus_client.Counter]] = { + idx: [ + base_counter.labels(*lv, str(pos)) + for pos in range(num_spec_tokens) + ] + for idx, lv in per_engine_labelvalues.items() + } - def observe(self, spec_decoding_stats: SpecDecodingStats): + def observe(self, + spec_decoding_stats: SpecDecodingStats, + engine_idx: int = 0): if not self.spec_decoding_enabled: return - self.counter_spec_decode_num_drafts.inc(spec_decoding_stats.num_drafts) - self.counter_spec_decode_num_draft_tokens.inc( + self.counter_spec_decode_num_drafts[engine_idx].inc( + spec_decoding_stats.num_drafts) + self.counter_spec_decode_num_draft_tokens[engine_idx].inc( spec_decoding_stats.num_draft_tokens) - self.counter_spec_decode_num_accepted_tokens.inc( + self.counter_spec_decode_num_accepted_tokens[engine_idx].inc( spec_decoding_stats.num_accepted_tokens) for pos, counter in enumerate( - self.counter_spec_decode_num_accepted_tokens_per_pos): + self. 
+ counter_spec_decode_num_accepted_tokens_per_pos[engine_idx]): counter.inc(spec_decoding_stats.num_accepted_tokens_per_pos[pos]) + + +def make_per_engine(counter: prometheus_client.Counter, + per_engine_labelvalues: dict[int, list[str]]): + """Create a counter for each label value.""" + return { + idx: counter.labels(*labelvalues) + for idx, labelvalues in per_engine_labelvalues.items() + } diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index b87c4fe09bb90..daee91ec404fe 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -19,6 +19,7 @@ from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.worker.cpu_model_runner import CPUModelRunner from vllm.v1.worker.gpu_worker import (Worker, init_worker_distributed_environment) +from vllm.v1.worker.utils import is_residual_scattered_for_sp logger = init_logger(__name__) @@ -107,18 +108,29 @@ class CPUWorker(Worker): scheduler_output: "SchedulerOutput", ) -> Optional[ModelRunnerOutput]: intermediate_tensors = None + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + num_input_tokens = self.model_runner._get_num_input_tokens( + num_scheduled_tokens) + all_gather_tensors = { + "residual": + not is_residual_scattered_for_sp(self.vllm_config, + num_input_tokens) + } if not get_pp_group().is_first_rank: intermediate_tensors = IntermediateTensors( get_pp_group().recv_tensor_dict( - all_gather_group=get_tp_group())) + all_gather_group=get_tp_group(), + all_gather_tensors=all_gather_tensors)) output = self.model_runner.execute_model(scheduler_output, intermediate_tensors) if not get_pp_group().is_last_rank: assert isinstance(output, IntermediateTensors) - get_pp_group().send_tensor_dict(output.tensors, - all_gather_group=get_tp_group()) + get_pp_group().send_tensor_dict( + output.tensors, + all_gather_group=get_tp_group(), + all_gather_tensors=all_gather_tensors) return None assert isinstance(output, ModelRunnerOutput) diff --git 
a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9c629c784bfff..f58f26c3d7995 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -60,6 +60,7 @@ from vllm.v1.worker.gpu_worker_states import RequestState from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorModelRunnerMixin) from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin +from vllm.v1.worker.utils import is_residual_scattered_for_sp from .utils import AttentionGroup, MultiModalBudget, bind_kv_cache @@ -563,6 +564,25 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logits_indices=logits_indices, ) + def _get_num_input_tokens(self, num_scheduled_tokens: int) -> int: + if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and not envs.VLLM_DISABLE_PAD_FOR_CUDAGRAPH + and hasattr(self, "cudagraph_batch_sizes") + and self.cudagraph_batch_sizes + and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): + # Use CUDA graphs. + # Add padding to the batch size. + return self.vllm_config.pad_for_cudagraph(num_scheduled_tokens) + + # Eager mode. + # Pad tokens to multiple of tensor_parallel_size when + # enabled collective fusion for SP + tp_size = self.vllm_config.parallel_config.tensor_parallel_size + if (self.compilation_config.pass_config.enable_sequence_parallelism + and tp_size > 1): + return round_up(num_scheduled_tokens, tp_size) + return num_scheduled_tokens + def _preprocess( self, scheduler_output: "SchedulerOutput", @@ -573,24 +593,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): Optional[IntermediateTensors], dict[str, Any]]: num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens - if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE - and not envs.VLLM_DISABLE_PAD_FOR_CUDAGRAPH - and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): - # Use CUDA graphs. - # Add padding to the batch size. 
- num_input_tokens = self.vllm_config.pad_for_cudagraph( - num_scheduled_tokens) - else: - # Eager mode. - # Pad tokens to multiple of tensor_parallel_size when - # enabled collective fusion for SP - tp_size = self.vllm_config.parallel_config.tensor_parallel_size - if self.compilation_config.pass_config. \ - enable_sequence_parallelism and tp_size > 1: - num_input_tokens = round_up(num_scheduled_tokens, tp_size) - else: - num_input_tokens = num_scheduled_tokens - + num_input_tokens = self._get_num_input_tokens(num_scheduled_tokens) # Padding for DP num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens) num_input_tokens += num_pad @@ -937,8 +940,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert not self.is_pooling_model if not get_pp_group().is_last_rank: + all_gather_tensors = { + "residual": + not is_residual_scattered_for_sp( + self.vllm_config, num_input_tokens) + } get_pp_group().send_tensor_dict( - hidden_states.tensors, all_gather_group=get_tp_group()) + hidden_states.tensors, + all_gather_group=get_tp_group(), + all_gather_tensors=all_gather_tensors) logits = None else: sample_hidden_states = hidden_states[logits_indices] diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 37dd431fd68f8..6855526583f04 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -32,6 +32,7 @@ from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput) from vllm.v1.utils import report_usage_stats from vllm.v1.worker.gpu_model_runner import GPUModelRunner +from vllm.v1.worker.utils import is_residual_scattered_for_sp from vllm.v1.worker.worker_base import WorkerBase logger = init_logger(__name__) @@ -428,10 +429,19 @@ class Worker(WorkerBase): ) -> Optional[Union[ModelRunnerOutput, AsyncModelRunnerOutput]]: intermediate_tensors = None forward_pass = scheduler_output.total_num_scheduled_tokens > 0 + num_scheduled_tokens = 
scheduler_output.total_num_scheduled_tokens + num_input_tokens = self.model_runner._get_num_input_tokens( + num_scheduled_tokens) + all_gather_tensors = { + "residual": + not is_residual_scattered_for_sp(self.vllm_config, + num_input_tokens) + } if forward_pass and not get_pp_group().is_first_rank: intermediate_tensors = IntermediateTensors( get_pp_group().recv_tensor_dict( - all_gather_group=get_tp_group())) + all_gather_group=get_tp_group(), + all_gather_tensors=all_gather_tensors)) output = self.model_runner.execute_model(scheduler_output, intermediate_tensors) @@ -444,7 +454,8 @@ class Worker(WorkerBase): "external_launcher") and not get_pp_group().is_last_rank get_pp_group().send_tensor_dict(output.tensors, - all_gather_group=get_tp_group()) + all_gather_group=get_tp_group(), + all_gather_tensors=all_gather_tensors) kv_connector_output = output.kv_connector_output if not kv_connector_output: diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index be05d02ff29fe..5ac7470c1ac90 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Optional import torch from vllm.attention.backends.abstract import AttentionBackend -from vllm.config import ModelConfig, SchedulerConfig +from vllm.config import ModelConfig, SchedulerConfig, VllmConfig from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index from vllm.multimodal.cache import processor_only_cache_from_config @@ -288,3 +288,28 @@ def bind_kv_cache( for layer_name, kv_cache in kv_caches.items(): # NOTE: Use list because of v0 PP virtual engine. forward_context[layer_name].kv_cache = [kv_cache] + + +def is_residual_scattered_for_sp(vllm_config: VllmConfig, + num_input_tokens: int) -> bool: + """Check if the residual tensor is scattered for sequence parallelism. 
+ + The residual tensor is scattered across tensor parallel ranks when sequence + parallelism and tensor parallelism is enabled, and the number of + input tokens is one of the compilation sizes. + """ + if not vllm_config.compilation_config.pass_config.\ + enable_sequence_parallelism: + return False + + tp = vllm_config.parallel_config.tensor_parallel_size + + if tp == 1: + return False + + # When sequence parallelism is enabled, we always pad num_input_tokens + # to be a multiple of tensor_parallel_size (tp) earlier. + assert num_input_tokens % tp == 0 + + # Currently, SP is only enabled for static size fx graphs. + return (num_input_tokens in vllm_config.compilation_config.compile_sizes) diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py deleted file mode 100644 index 12fd25f4de2ad..0000000000000 --- a/vllm/worker/enc_dec_model_runner.py +++ /dev/null @@ -1,553 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -import itertools -from typing import Any, Dict, List, Optional, Tuple, Type, cast - -import torch -import torch.distributed - -from vllm.attention.backends.abstract import (AttentionBackend, - AttentionMetadata) -from vllm.attention.backends.utils import PAD_SLOT_ID -from vllm.attention.selector import (get_env_variable_attn_backend, - get_global_forced_attn_backend) -from vllm.config import VllmConfig -from vllm.forward_context import set_forward_context -from vllm.inputs import INPUT_REGISTRY, InputRegistry -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, - MultiModalRegistry) -from vllm.platforms import _Backend -from vllm.sampling_params import SamplingParams -from vllm.sequence import IntermediateTensors, 
SequenceGroupMetadata -from vllm.utils import STR_NOT_IMPL_ENC_DEC_BACKEND, make_tensor_with_pad -from vllm.worker.model_runner import (GPUModelRunnerBase, - ModelInputForGPUBuilder, - ModelInputForGPUWithSamplingMetadata) -from vllm.worker.model_runner_base import ( - _add_attn_metadata_broadcastable_dict, - _add_sampling_metadata_broadcastable_dict) -from vllm.worker.utils import assert_enc_dec_mr_supported_scenario - -logger = init_logger(__name__) -LORA_WARMUP_RANK = 8 - - -@dataclasses.dataclass(frozen=True) -class EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata): - """ - Used by the EncoderDecoderModelRunner. - """ - encoder_input_tokens: Optional[torch.Tensor] = None - encoder_input_positions: Optional[torch.Tensor] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "inputs_embeds": self.inputs_embeds, - "input_positions": self.input_positions, - "encoder_input_tokens": self.encoder_input_tokens, - "encoder_input_positions": self.encoder_input_positions, - "virtual_engine": self.virtual_engine, - "request_ids_to_seq_ids": self.request_ids_to_seq_ids, - "finished_requests_ids": self.finished_requests_ids, - "multi_modal_kwargs": self.multi_modal_kwargs, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - _add_sampling_metadata_broadcastable_dict(tensor_dict, - self.sampling_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "EncoderDecoderModelInput": - return cast( - EncoderDecoderModelInput, - super().from_broadcasted_tensor_dict(tensor_dict, attn_backend)) - - -class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): - _model_input_cls: Type[EncoderDecoderModelInput] = ( - EncoderDecoderModelInput) - _builder_cls: Type[ModelInputForGPUBuilder] = (ModelInputForGPUBuilder) - - def __init__( - 
self, - vllm_config: VllmConfig, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - input_registry: InputRegistry = INPUT_REGISTRY, - mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, - ): - ''' - EncoderDecoderModelRunner constructor. - - `lora_config` is unused (since these features are not yet supported - for encoder/decoder models) but these arguments are present here for - compatibility with the base-class constructor. - ''' - self._maybe_force_supported_attention_backend() - - super().__init__( - vllm_config=vllm_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=is_driver_worker, - input_registry=input_registry, - mm_registry=mm_registry, - ) - - # Crash for unsupported encoder/scenarios - assert_enc_dec_mr_supported_scenario(self) - - def _maybe_force_supported_attention_backend(self): - ''' - Force vLLM to use the XFormers attention backend, - which is currently the only supported option. - ''' - - def raise_backend_err(): - # The user has specified an attention backend override - # which is invalid for encoder/decoder models - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_BACKEND) - - maybe_env_var_forced_backend = get_env_variable_attn_backend() - maybe_global_forced_backend = get_global_forced_attn_backend() - is_forced_by_global = maybe_global_forced_backend is not None - is_forced_by_env_var = maybe_env_var_forced_backend is not None - if is_forced_by_global: # noqa: SIM102 - # Backend override enforced by global variable takes - # precedence over vLLM backend environment variable. 
- if maybe_global_forced_backend not in\ - [_Backend.XFORMERS, _Backend.FLASH_ATTN]: - raise_backend_err() - elif is_forced_by_env_var: # noqa: SIM102 - # Backend override enforced by vLLM backend - # environment variable - if maybe_env_var_forced_backend not in\ - [_Backend.XFORMERS, _Backend.FLASH_ATTN]: - raise_backend_err() - - def _list_to_int32_tensor( - self, - _list: List[int], - ) -> torch.Tensor: - return torch.tensor(_list, dtype=torch.int32, device=self.device) - - def _list_to_long_tensor( - self, - _list: List[int], - ) -> torch.Tensor: - return torch.tensor(_list, dtype=torch.long, device=self.device) - - def _empty_int32_tensor(self) -> torch.Tensor: - return self._list_to_int32_tensor([]) - - def _empty_long_tensor(self) -> torch.Tensor: - return self._list_to_long_tensor([]) - - @torch.inference_mode() - def execute_model( - self, - model_input: EncoderDecoderModelInput, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - if num_steps > 1: - raise ValueError("num_steps > 1 is not supported in " - "EncoderDecoderModelRunner") - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - if (model_input.attn_metadata is not None - and model_input.attn_metadata.prefill_metadata is None - and model_input.attn_metadata.decode_metadata.use_cuda_graph): - if model_input.inputs_embeds is None: - assert model_input.input_tokens is not None - graph_batch_size = model_input.input_tokens.shape[0] - model_executable = ( - self.graph_runners[model_input.virtual_engine][( - graph_batch_size, False)]) - else: - graph_batch_size = model_input.inputs_embeds.shape[0] - model_executable = ( - self.graph_runners[model_input.virtual_engine][( - graph_batch_size, True)]) - else: - model_executable = self.model - - 
seqlen_agnostic_kwargs = { - "finished_requests_ids": model_input.finished_requests_ids, - "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, - } if self.has_inner_state else {} - - multi_modal_kwargs = model_input.multi_modal_kwargs or {} - with set_forward_context(model_input.attn_metadata, self.vllm_config, - model_input.virtual_engine): - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - inputs_embeds=model_input.inputs_embeds, - positions=model_input.input_positions, - encoder_input_ids=model_input.encoder_input_tokens, - encoder_positions=model_input.encoder_input_positions, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs( - multi_modal_kwargs, - device=self.device, - ), - **seqlen_agnostic_kwargs, - ) - - logits = self.model.compute_logits(hidden_or_intermediate_states, - model_input.sampling_metadata) - - if not self.is_driver_worker: - return [] - - if model_input.async_callback is not None: - model_input.async_callback() - - # Sample the next token. - output: SamplerOutput = self.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - - return [output] - - def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> EncoderDecoderModelInput: - return EncoderDecoderModelInput.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - ) - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> EncoderDecoderModelInput: - """Prepare the model input based on a given sequence group, including - metadata for the sampling step. - - Since chunked prefill is not supported for encoder/decoder models, - `input_tokens` is assumed to be either entirely prefill tokens or - entirely decode tokens. 
- - """ - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - ( - attn_metadata, - encoder_input_tokens_tensor, - encoder_input_positions_tensor, - ) = (self._prepare_encoder_model_input_tensors(seq_group_metadata_list, - model_input)) - # Inject attn_metadata encoder/cross-attention fields & - # encoder input tokens/positions into model_input. - # Frozen dataclass fields cannot be modified, so use - # dataclasses.replace to construct a new model input - # instance. - model_input = dataclasses.replace( - model_input, - attn_metadata=attn_metadata, - encoder_input_tokens=encoder_input_tokens_tensor, - encoder_input_positions=encoder_input_positions_tensor, - ) - - generators = self.get_generators(finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, - model_input.seq_lens, - model_input.query_lens, - self.device, - self.pin_memory, - generators=generators) - is_prompt = (seq_group_metadata_list[0].is_prompt - if seq_group_metadata_list else None) - return dataclasses.replace(model_input, - sampling_metadata=sampling_metadata, - is_prompt=is_prompt, - virtual_engine=virtual_engine) - - @torch.inference_mode() - def profile_run(self) -> None: - # Enable top-k sampling to reflect the accurate memory usage. - sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) - max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs - - # This represents the maximum number of different requests - # that will have unique loras, and therefore the max amount of - # memory consumption. Create dummy lora request copies from the - # lora request passed in, which contains a lora from the lora - # warmup path. 
- dummy_lora_requests: List[LoRARequest] = [] - dummy_lora_requests_per_seq: List[LoRARequest] = [] - if self.lora_config: - dummy_lora_requests = self._add_dummy_loras( - self.lora_config.max_loras) - assert len(dummy_lora_requests) == self.lora_config.max_loras - dummy_lora_requests_per_seq = [ - dummy_lora_requests[idx % len(dummy_lora_requests)] - for idx in range(max_num_seqs) - ] - - # Profile memory usage with max_num_sequences sequences and the total - # number of tokens equal to max_num_batched_tokens. - seqs: List[SequenceGroupMetadata] = [] - - max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( - self.model_config) - if max_mm_tokens > 0: - logger.info("Starting profile run for multi-modal models.") - - batch_size = 0 - for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) - batch_size += seq_len - - decoder_dummy_data = self.input_registry \ - .dummy_data_for_profiling(self.model_config, - seq_len, - self.mm_registry, - is_encoder_data=False) - encoder_dummy_data = self.input_registry \ - .dummy_data_for_profiling(self.model_config, - seq_len, - self.mm_registry, - is_encoder_data=True) - - # Having more tokens is over-conservative but otherwise fine - assert len( - decoder_dummy_data.seq_data.prompt_token_ids - ) >= seq_len, ( - f"Expected at least {seq_len} dummy tokens for profiling, " - f"but got: {len(decoder_dummy_data.seq_data.prompt_token_ids)}" - ) - - assert decoder_dummy_data.multi_modal_data is None or \ - encoder_dummy_data.multi_modal_data is None, ( - "Multi-modal data can't be provided in both encoder and decoder" - ) - - seq = SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=True, - seq_data={group_id: decoder_dummy_data.seq_data}, - sampling_params=sampling_params, - block_tables=None, - encoder_seq_data=encoder_dummy_data.seq_data, - cross_block_table=None, - lora_request=dummy_lora_requests_per_seq[group_id] - if 
dummy_lora_requests_per_seq else None, - multi_modal_data=decoder_dummy_data.multi_modal_data - or encoder_dummy_data.multi_modal_data, - multi_modal_placeholders=decoder_dummy_data. - multi_modal_placeholders - or encoder_dummy_data.multi_modal_placeholders) - seqs.append(seq) - - finished_requests_ids = [seq.request_id for seq in seqs] - model_input = self.prepare_model_input( - seqs, finished_requests_ids=finished_requests_ids) - intermediate_tensors = None - self.execute_model(model_input, None, intermediate_tensors) - torch.cuda.synchronize() - return - - def _prepare_encoder_model_input_tensors( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - model_input: EncoderDecoderModelInput, - ) -> Tuple[AttentionMetadata, Optional[torch.Tensor], - Optional[torch.Tensor]]: - """Helper method to prepare the encoder- and cross-attn-related - model inputs based on a given sequence group. These additional inputs - are used to augment an already-computed `EncoderDecoderModelInput` - data structure which already has decoder-related model inputs - populated. - - Sets the following attn_metadata fields: - * `num_encoder_tokens` - * `encoder_seq_lens` - * `encoder_seq_lens_tensor` - * `max_encoder_seq_len` - * `cross_slot_mapping` - * `cross_block_tables` - - Constructs a new model inputs data structure, based on - (1) the existing fields in the `model_inputs` argument, - and (2) the following additional fields which are - computed (or in the case of `attn_metadata`, updated) - by this function: - * attn_metadata - * encoder_input_tokens - * encoder_input_positions - - Arguments: - - * seq_group_metadata_list: list of sequence groups for which to - compute inputs - * model_inputs: model inputs data structure with decoder-oriented - fields already computed. 
- - Return: - - * Updated model inputs data structure - """ - - if len(seq_group_metadata_list) == 0: - return (model_input.attn_metadata, None, None) - - # Since we are not supporting chunked prefill either the entire - # batch is prefill or it is decode - is_prompt = seq_group_metadata_list[0].is_prompt - - # Build encoder inputs - encoder_seq_lens: List[int] = [] - if is_prompt: - # Prefill phase. - cross_block_tables = self._empty_int32_tensor().view( - len(seq_group_metadata_list), -1) - - # Extract input tokens/positions, cross-attention slot-mapping, - # & seq len from each sequence group metadata - ( - encoder_input_tokens, - encoder_input_positions, - cross_slot_mapping, - ) = ( - [], - [], - [], - ) - for seq_group_metadata in seq_group_metadata_list: - # Build seq lens - seq_len = seq_group_metadata.encoder_seq_data.get_len() - token_ids = seq_group_metadata.encoder_seq_data.get_token_ids() - encoder_seq_lens.append(seq_len) - - # Build slot mapping - is_profile_run = (seq_group_metadata.block_tables is None) - if is_profile_run: - # During memory profiling, the block tables are not - # initialized yet. In this case, we just use a dummy - # slot mapping. - # In embeddings, the block tables are {seq_id: None}. 
- cross_slot_mapping.extend([PAD_SLOT_ID] * seq_len) - else: - for i in range(0, seq_len): - block_number = seq_group_metadata.cross_block_table[ - i // self.block_size] - block_offset = i % self.block_size - slot = block_number * self.block_size + block_offset - cross_slot_mapping.append(slot) - - # Build encoder input tokens - encoder_input_tokens.extend(token_ids) - encoder_input_positions.extend(list(range(0, seq_len))) - - # Convert tokens/positions & cross-attention - # slot-mapping to encoder input tensors - encoder_input_tokens_tensor = self._list_to_long_tensor( - encoder_input_tokens) - encoder_input_positions_tensor = self._list_to_long_tensor( - encoder_input_positions) - cross_slot_mapping_tensor = self._list_to_long_tensor( - cross_slot_mapping) - - else: - # Decode phase. - encoder_input_tokens_tensor = self._empty_long_tensor() - encoder_input_positions_tensor = self._empty_long_tensor() - cross_slot_mapping_tensor = self._empty_long_tensor() - # Extract cross-attention block tables & - # seq len from each sequence group metadata. - # Cross-attention block tables are empty - # during vLLM memory profiling. - cross_block_tables = [] - for seq_group_metadata in seq_group_metadata_list: - for _ in range(len(seq_group_metadata.seq_data)): - encoder_seq_lens.append( - seq_group_metadata.encoder_seq_data.get_len()) - cross_block_table = seq_group_metadata.cross_block_table - cross_block_tables.append([] if ( - cross_block_table is None) else cross_block_table) - - if (model_input.attn_metadata is not None - and model_input.attn_metadata.use_cuda_graph): - # We will be using CUDA graph replay for this decode. - max_len_of_block_table = self.get_max_block_per_batch() - batch_size = len(encoder_seq_lens) - graph_batch_size = self.vllm_config.pad_for_cudagraph( - batch_size) - assert graph_batch_size >= batch_size - cuda_graph_pad_size = graph_batch_size - batch_size - # extend the cross_block_tables and encoder_seq_lens to match - # the graph_batch_size. 
- cross_block_tables.extend([[] - for _ in range(cuda_graph_pad_size) - ]) - encoder_seq_lens.extend( - itertools.repeat(1, cuda_graph_pad_size)) - - else: - max_len_of_block_table = max( - len(block_table) for block_table in cross_block_tables) - - cross_block_tables = make_tensor_with_pad( - cross_block_tables, - max_len=max_len_of_block_table, - pad=0, - dtype=torch.int32, - device=self.device, - ) - - # Compute encoder sequence lengths & encoder - # sequence starting offset tensors - max_encoder_seq_len = max(encoder_seq_lens, default=0) - encoder_seq_lens_tensor = self._list_to_int32_tensor(encoder_seq_lens) - encoder_seq_start_loc = torch.zeros(encoder_seq_lens_tensor.shape[0] + - 1, - dtype=torch.int32, - device=self.device) - torch.cumsum(encoder_seq_lens_tensor, - dim=0, - dtype=encoder_seq_start_loc.dtype, - out=encoder_seq_start_loc[1:]) - - # Update attention metadata with encoder-oriented attributes - attn_metadata = model_input.attn_metadata - assert attn_metadata is not None - ( - attn_metadata.num_encoder_tokens, - attn_metadata.encoder_seq_lens, - attn_metadata.encoder_seq_lens_tensor, - attn_metadata.max_encoder_seq_len, - attn_metadata.encoder_seq_start_loc, - attn_metadata.cross_slot_mapping, - attn_metadata.cross_block_tables, - ) = ( - sum(encoder_seq_lens), - encoder_seq_lens, - encoder_seq_lens_tensor, - max_encoder_seq_len, - encoder_seq_start_loc, - cross_slot_mapping_tensor, - cross_block_tables, - ) - - return (attn_metadata, encoder_input_tokens_tensor, - encoder_input_positions_tensor) diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py deleted file mode 100644 index 512a1dca73701..0000000000000 --- a/vllm/worker/utils.py +++ /dev/null @@ -1,49 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -''' -Worker-related helper functions. 
-''' - -from vllm.utils import STR_NOT_IMPL_ENC_DEC_ERR_STRS -from vllm.worker.model_runner import GPUModelRunnerBase - - -def assert_enc_dec_mr_supported_scenario( - enc_dec_mr: GPUModelRunnerBase) -> None: - ''' - Asserted that the provided encoder/decoder model runner instance reflects - a supported scenario. - ''' - - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - - if enc_dec_mr.cache_config.enable_prefix_caching: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE']) - - if enc_dec_mr.sliding_window is not None: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_SWA']) - - if enc_dec_mr.scheduler_config.chunked_prefill_enabled: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_ERR_STRS[ - 'STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL']) - - if getattr(enc_dec_mr.model_config.hf_config, 'attn_logit_softcapping', - None) is not None: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP'] - ) - - if enc_dec_mr.lora_config is not None: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_LORA']) - - if enc_dec_mr.parallel_config.pipeline_parallel_size > 1: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PP']) - - if enc_dec_mr.scheduler_config.num_lookahead_slots > 0: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_SPEC_DEC']) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 670f256c0bf65..12047bc390737 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -28,7 +28,6 @@ from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache, memory_profiling) from vllm.worker.cache_engine import CacheEngine -from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner from vllm.worker.model_runner 
import GPUModelRunnerBase, ModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, WorkerInput) @@ -82,10 +81,7 @@ class Worker(LocalOrDistributedWorkerBase): "qwen3_next_mtp")) \ else {"return_hidden_states": True} - ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner - if self.model_config.is_encoder_decoder: - ModelRunnerClass = EncoderDecoderModelRunner - self.model_runner: GPUModelRunnerBase = ModelRunnerClass( + self.model_runner: GPUModelRunnerBase = ModelRunner( vllm_config=self.vllm_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker,