mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-14 15:57:03 +08:00
merge
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
commit
9f2becd3e6
@ -8,7 +8,7 @@ This benchmark aims to:
|
||||
|
||||
Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
|
||||
|
||||
Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
|
||||
Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
|
||||
|
||||
## Setup
|
||||
|
||||
|
||||
@ -1,24 +1,22 @@
|
||||
steps:
|
||||
# aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
|
||||
- label: "Build arm64 wheel - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-12-9
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build CUDA 12.8 wheel"
|
||||
key: block-build-cu128-wheel
|
||||
|
||||
- label: "Build wheel - CUDA 12.8"
|
||||
depends_on: block-build-cu128-wheel
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-12-8
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
@ -30,12 +28,8 @@ steps:
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build CUDA 12.6 wheel"
|
||||
key: block-build-cu126-wheel
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build wheel - CUDA 12.6"
|
||||
depends_on: block-build-cu126-wheel
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-12-6
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
@ -102,8 +96,6 @@ steps:
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
- build-wheel-cuda-12-8
|
||||
- build-wheel-cuda-12-6
|
||||
- build-wheel-cuda-12-9
|
||||
id: annotate-release-workflow
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
|
||||
@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
||||
To download the wheel:
|
||||
\`\`\`
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
|
||||
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
||||
\`\`\`
|
||||
|
||||
To download and upload the image:
|
||||
|
||||
\`\`\`
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
|
||||
docker tag vllm/vllm-openai vllm/vllm-openai:latest
|
||||
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
|
||||
docker push vllm/vllm-openai:latest
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
|
||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
|
||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
||||
docker push vllm/vllm-openai:latest-x86_64
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
|
||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
|
||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
docker push vllm/vllm-openai:latest-aarch64
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
|
||||
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
|
||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
|
||||
docker manifest push vllm/vllm-openai:latest
|
||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
|
||||
\`\`\`
|
||||
EOF
|
||||
@ -66,7 +66,6 @@ function cpu_tests() {
|
||||
|
||||
pytest -x -v -s tests/models/language/pooling -m cpu_model
|
||||
pytest -x -v -s tests/models/multimodal/generation \
|
||||
--ignore=tests/models/multimodal/generation/test_mllama.py \
|
||||
--ignore=tests/models/multimodal/generation/test_pixtral.py \
|
||||
-m cpu_model"
|
||||
|
||||
|
||||
@ -394,6 +394,7 @@ steps:
|
||||
- pytest -v -s compile/test_async_tp.py
|
||||
- pytest -v -s compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s compile/test_decorator.py
|
||||
- pytest -v -s compile/test_noop_elimination.py
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||
timeout_in_minutes: 30
|
||||
@ -548,15 +549,6 @@ steps:
|
||||
commands: # LMEval+Transcription WER check
|
||||
- pytest -s entrypoints/openai/correctness/
|
||||
|
||||
- label: Encoder Decoder tests # 12min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/encoder_decoder
|
||||
commands:
|
||||
- pytest -v -s encoder_decoder
|
||||
|
||||
- label: OpenAI-Compatible Tool Use # 23 min
|
||||
timeout_in_minutes: 35
|
||||
mirror_hardwares: [amdexperimental]
|
||||
|
||||
32
.coveragerc
Normal file
32
.coveragerc
Normal file
@ -0,0 +1,32 @@
|
||||
[run]
|
||||
source = vllm
|
||||
omit =
|
||||
*/tests/*
|
||||
*/test_*
|
||||
*/__pycache__/*
|
||||
*/build/*
|
||||
*/dist/*
|
||||
*/vllm.egg-info/*
|
||||
*/third_party/*
|
||||
*/examples/*
|
||||
*/benchmarks/*
|
||||
*/docs/*
|
||||
|
||||
[report]
|
||||
exclude_lines =
|
||||
pragma: no cover
|
||||
def __repr__
|
||||
if self.debug:
|
||||
if settings.DEBUG
|
||||
raise AssertionError
|
||||
raise NotImplementedError
|
||||
if 0:
|
||||
if __name__ == .__main__.:
|
||||
class .*\bProtocol\):
|
||||
@(abc\.)?abstractmethod
|
||||
|
||||
[html]
|
||||
directory = htmlcov
|
||||
|
||||
[xml]
|
||||
output = coverage.xml
|
||||
12
.github/CODEOWNERS
vendored
12
.github/CODEOWNERS
vendored
@ -2,17 +2,20 @@
|
||||
# for more info about CODEOWNERS file
|
||||
|
||||
# This list covers the "core" components of vLLM that require careful review
|
||||
/vllm/attention @LucasWilkinson
|
||||
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/model_executor/layers/fused_moe @mgoin
|
||||
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
|
||||
/vllm/model_executor/layers/mamba @tdoublep
|
||||
/vllm/model_executor/model_loader @22quinn
|
||||
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
|
||||
/vllm/v1/attention @LucasWilkinson
|
||||
/vllm/v1/sample @22quinn @houseroad
|
||||
/vllm/vllm_flash_attn @LucasWilkinson
|
||||
/vllm/lora @jeejeelee
|
||||
@ -30,6 +33,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
||||
/vllm/v1/spec_decode @benchislett @luccafong
|
||||
/vllm/v1/attention/backends/flashinfer.py @mgoin
|
||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||
/vllm/v1/core @heheda12345
|
||||
/vllm/v1/kv_cache_interface.py @heheda12345
|
||||
@ -41,7 +45,8 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
||||
/tests/distributed/test_same_node.py @youkaichao
|
||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
|
||||
/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
|
||||
/tests/evals @mgoin
|
||||
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
|
||||
/tests/models @DarkLight1337 @ywang96
|
||||
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
|
||||
/tests/prefix_caching @comaniac @KuntaiDu
|
||||
@ -101,4 +106,7 @@ mkdocs.yaml @hmellor
|
||||
/vllm/v1/worker/tpu* @NickLucche
|
||||
/vllm/platforms/tpu.py @NickLucche
|
||||
/vllm/v1/sample/tpu @NickLucche
|
||||
/vllm/tests/v1/tpu @NickLucche
|
||||
/vllm/tests/v1/tpu @NickLucche
|
||||
|
||||
# KVConnector installation files
|
||||
/requirements/kv_connectors.txt @NickLucche
|
||||
|
||||
@ -13,6 +13,10 @@ cmake_minimum_required(VERSION 3.26)
|
||||
# cmake --install . --component _C
|
||||
project(vllm_extensions LANGUAGES CXX)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
|
||||
|
||||
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
|
||||
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
|
||||
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
|
||||
@ -779,6 +783,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Hadacore kernels
# Adds the hadacore Hadamard-transform CUDA source to the extension sources,
# but only when the requested CUDA_ARCHS overlap the arch list given below
# (8.0, 8.9, 9.0).
# NOTE(review): cuda_archs_loose_intersection and set_gencode_flags_for_srcs are
# defined outside this excerpt; their exact matching rules are assumed from
# their names and call shape - confirm against their definitions.
|
||||
cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
|
||||
# HADACORE_ARCHS is empty (falsy) when none of the listed archs were requested,
# in which case the kernel is skipped entirely.
if(HADACORE_ARCHS)
|
||||
set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
|
||||
# Restrict code generation for this file to the matched architectures.
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
CUDA_ARCHS "${HADACORE_ARCHS}")
|
||||
# Append the source to the VLLM_EXT_SRC source list.
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||
message(STATUS "Building hadacore")
|
||||
endif()
|
||||
|
||||
# if CUDA endif
|
||||
endif()
|
||||
|
||||
|
||||
@ -560,7 +560,7 @@ def save_configs(
|
||||
filename = os.path.join(save_dir, filename)
|
||||
print(f"Writing best config to {filename}...")
|
||||
with open(filename, "w") as f:
|
||||
json.dump(configs, f, indent=4)
|
||||
json.dump({"triton_version": triton.__version__, **configs}, f, indent=4)
|
||||
f.write("\n")
|
||||
|
||||
|
||||
|
||||
@ -480,7 +480,6 @@ function (define_gpu_extension_target GPU_MOD_NAME)
|
||||
${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
|
||||
endif()
|
||||
|
||||
set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
|
||||
|
||||
target_compile_options(${GPU_MOD_NAME} PRIVATE
|
||||
$<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
|
||||
|
||||
@ -347,6 +347,8 @@ std::tuple<int64_t, torch::Tensor> allocate_shared_buffer_and_handle(
|
||||
int64_t open_mem_handle(torch::Tensor& mem_handle);
|
||||
void free_shared_buffer(int64_t buffer);
|
||||
|
||||
torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace);
|
||||
|
||||
#ifdef USE_ROCM
|
||||
fptr_t init_custom_qr(int64_t rank, int64_t world_size,
|
||||
std::optional<int64_t> qr_max_size = std::nullopt);
|
||||
|
||||
@ -146,6 +146,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
|
||||
|
||||
using ElementAB = typename Gemm::ElementAB;
|
||||
using ElementD = typename Gemm::ElementD;
|
||||
using ElementBlockScale = typename Gemm::ElementBlockScale;
|
||||
|
||||
int32_t m = a.size(0), n = b.size(1), k = a.size(1);
|
||||
|
||||
@ -166,26 +167,29 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
|
||||
ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
|
||||
ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
|
||||
|
||||
auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
|
||||
auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
|
||||
auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
|
||||
auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
|
||||
auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
|
||||
auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
|
||||
auto a_scales_ptr = static_cast<ElementBlockScale const*>(a_scales.data_ptr());
|
||||
auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr());
|
||||
|
||||
auto mainloop_args = [&](){
|
||||
// layout_SFA and layout_SFB cannot be swapped since they are deduced.
|
||||
if (swap_ab) {
|
||||
return typename GemmKernel::MainloopArguments{
|
||||
b_ptr, b_stride, a_ptr, a_stride,
|
||||
b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB
|
||||
};
|
||||
}
|
||||
else {
|
||||
return typename GemmKernel::MainloopArguments{
|
||||
a_ptr, a_stride, b_ptr, b_stride,
|
||||
a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
|
||||
};
|
||||
}
|
||||
}();
|
||||
typename GemmKernel::MainloopArguments mainloop_args{};
|
||||
mainloop_args.layout_SFA = layout_SFA;
|
||||
mainloop_args.layout_SFB = layout_SFB;
|
||||
if (swap_ab) {
|
||||
mainloop_args.ptr_A = b_ptr;
|
||||
mainloop_args.dA = b_stride;
|
||||
mainloop_args.ptr_B = a_ptr;
|
||||
mainloop_args.dB = a_stride;
|
||||
mainloop_args.ptr_SFA = b_scales_ptr;
|
||||
mainloop_args.ptr_SFB = a_scales_ptr;
|
||||
} else {
|
||||
mainloop_args.ptr_A = a_ptr;
|
||||
mainloop_args.dA = a_stride;
|
||||
mainloop_args.ptr_B = b_ptr;
|
||||
mainloop_args.dB = b_stride;
|
||||
mainloop_args.ptr_SFA = a_scales_ptr;
|
||||
mainloop_args.ptr_SFB = b_scales_ptr;
|
||||
}
|
||||
auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
|
||||
|
||||
auto c_ptr = static_cast<ElementD*>(out.data_ptr());
|
||||
|
||||
@ -125,6 +125,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
|
||||
|
||||
using ElementAB = typename Gemm::ElementAB;
|
||||
using ElementD = typename Gemm::ElementD;
|
||||
using ElementBlockScale = typename Gemm::ElementBlockScale;
|
||||
|
||||
int32_t m = a.size(0), n = b.size(1), k = a.size(1);
|
||||
|
||||
@ -143,17 +144,20 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
|
||||
LayoutSFB layout_SFB =
|
||||
ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
|
||||
|
||||
auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
|
||||
auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
|
||||
auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
|
||||
auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
|
||||
auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
|
||||
auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
|
||||
auto a_scales_ptr = static_cast<ElementBlockScale const*>(a_scales.data_ptr());
|
||||
auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr());
|
||||
|
||||
auto mainloop_args = [&](){
|
||||
return typename GemmKernel::MainloopArguments{
|
||||
a_ptr, a_stride, b_ptr, b_stride,
|
||||
a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
|
||||
};
|
||||
}();
|
||||
typename GemmKernel::MainloopArguments mainloop_args{};
|
||||
mainloop_args.ptr_A = a_ptr;
|
||||
mainloop_args.dA = a_stride;
|
||||
mainloop_args.ptr_B = b_ptr;
|
||||
mainloop_args.dB = b_stride;
|
||||
mainloop_args.ptr_SFA = a_scales_ptr;
|
||||
mainloop_args.layout_SFA = layout_SFA;
|
||||
mainloop_args.ptr_SFB = b_scales_ptr;
|
||||
mainloop_args.layout_SFB = layout_SFB;
|
||||
auto prob_shape = cute::make_shape(m, n, k, 1);
|
||||
|
||||
auto c_ptr = static_cast<ElementD*>(out.data_ptr());
|
||||
|
||||
@ -115,6 +115,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
|
||||
|
||||
using ElementAB = typename Gemm::ElementAB;
|
||||
using ElementD = typename Gemm::ElementD;
|
||||
using ElementBlockScale = typename Gemm::ElementBlockScale;
|
||||
|
||||
int32_t m = a.size(0), n = b.size(1), k = a.size(1);
|
||||
|
||||
@ -135,17 +136,20 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
|
||||
LayoutSFB layout_SFB =
|
||||
ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
|
||||
|
||||
auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
|
||||
auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
|
||||
auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
|
||||
auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
|
||||
auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
|
||||
auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
|
||||
auto a_scales_ptr = static_cast<ElementBlockScale const*>(a_scales.data_ptr());
|
||||
auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr());
|
||||
|
||||
auto mainloop_args = [&](){
|
||||
return typename GemmKernel::MainloopArguments{
|
||||
a_ptr, a_stride, b_ptr, b_stride,
|
||||
a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
|
||||
};
|
||||
}();
|
||||
typename GemmKernel::MainloopArguments mainloop_args{};
|
||||
mainloop_args.ptr_A = a_ptr;
|
||||
mainloop_args.dA = a_stride;
|
||||
mainloop_args.ptr_B = b_ptr;
|
||||
mainloop_args.dB = b_stride;
|
||||
mainloop_args.ptr_SFA = a_scales_ptr;
|
||||
mainloop_args.layout_SFA = layout_SFA;
|
||||
mainloop_args.ptr_SFB = b_scales_ptr;
|
||||
mainloop_args.layout_SFB = layout_SFB;
|
||||
auto prob_shape = cute::make_shape(m, n, k, 1);
|
||||
|
||||
auto c_ptr = static_cast<ElementD*>(out.data_ptr());
|
||||
|
||||
817
csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu
Normal file
817
csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu
Normal file
@ -0,0 +1,817 @@
|
||||
// clang-format off
|
||||
// Adapted from: https://github.com/meta-pytorch/applied-ai/blob/main/kernels/cuda/inference/hadamard_transform/hadamard_transform_cuda.cu
|
||||
|
||||
/***********
|
||||
Copyright 2024 Meta
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
||||
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
|
||||
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
***********/
|
||||
|
||||
#include <torch/all.h>
|
||||
#include <stdint.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <mma.h>
|
||||
#include <cuda/annotated_ptr>
|
||||
#include <c10/cuda/CUDAException.h>
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
|
||||
#include "core/registration.h"
|
||||
#include "dispatch_utils.h"
|
||||
|
||||
namespace hadacore {
|
||||
|
||||
#ifndef __CUDACC__
|
||||
#define __launch_bounds__(x,y)
|
||||
#endif
|
||||
|
||||
#define MAX_WARPS_PER_SM 48
|
||||
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
|
||||
using b16 = uint16_t;
|
||||
using b32 = uint32_t;
|
||||
|
||||
constexpr int launch_configs_big[7][3] = {
|
||||
// default
|
||||
{2, 1, 24},
|
||||
{2, 2, 16},
|
||||
{2, 4, 8},
|
||||
{2, 8, 4},
|
||||
{2, 16, 3},
|
||||
{4, 16, 2},
|
||||
{8, 16, 1}
|
||||
// // extra coalescing
|
||||
// {2, 1, 24},
|
||||
// {2, 2, 16},
|
||||
// {2, 4, 8},
|
||||
// {2, 8, 4},
|
||||
// {4, 8, 3},
|
||||
// {8, 8, 2},
|
||||
// {16, 8, 1}
|
||||
// // less coalescing
|
||||
// {2, 1, 24},
|
||||
// {2, 2, 16},
|
||||
// {2, 4, 8},
|
||||
// {2, 8, 4},
|
||||
// {1, 32, 1},
|
||||
// {2, 32, 1},
|
||||
// {4, 32, 1}
|
||||
};
|
||||
|
||||
// a 4x2, b 2x2, c 2x2
|
||||
// Tensor-core multiply with no accumulation ("noacc"): issues one PTX
// mma.sync.aligned.m16n8k16 instruction computing D = A * B + 0.
//
// Template parameter:
//   dtype  - operand element type; only Half and BFloat16 compile (see the
//            static_assert below).
// Parameters:
//   a0..a3 - the calling thread's four 32-bit registers of the A fragment
//   b0, b1 - the calling thread's two 32-bit registers of the B fragment
//   c0, c1 - outputs; overwritten with the calling thread's two 32-bit
//            registers of the result fragment
// Each b32 is passed to the instruction as a packed pair of 16-bit elements
// (f16x2 or bf16x2, per the instruction's type suffixes).
// NOTE(review): mma.sync.aligned is a warp-collective instruction in the PTX
// ISA, so presumably all 32 lanes must reach this call together - confirm
// against callers.
template <torch::ScalarType dtype>
|
||||
__device__ __forceinline__ void mma_m16_n8_k16_b16_b16_b16_noacc(b32 a0, b32 a1, b32 a2, b32 a3, b32 b0, b32 b1, b32& c0, b32& c1){
|
||||
static_assert(dtype == torch::ScalarType::Half || dtype == torch::ScalarType::BFloat16);
|
||||
// PTX operand order is d, a, b, c; the c (accumulator input) registers are
// all bound to `zero` so the result is the plain product.
// d, a, b, c
|
||||
b32 zero = 0;
|
||||
if constexpr(dtype == torch::ScalarType::Half) {
|
||||
// fp16 path: f16 inputs with f16 accumulators, so the two output registers
// are written straight into c0/c1 with no conversion step.
asm (
|
||||
"mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
|
||||
"{%0, %1}, {%2, %3, %4, %5}, {%6, %7}, {%8, %9};\n\t"
|
||||
: "=r"(c0), "=r"(c1) : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1), "r"(zero), "r"(zero)
|
||||
);
|
||||
} else {
|
||||
// bf16 path: this instruction form accumulates in f32, producing four
// 32-bit results that are held as raw bit patterns in temp0..temp3.
b32 temp0, temp1, temp2, temp3;
|
||||
asm (
|
||||
"mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
|
||||
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n\t"
|
||||
: "=r"(temp0), "=r"(temp1), "=r"(temp2), "=r"(temp3) : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(b0), "r"(b1), "r"(zero), "r"(zero), "r"(zero), "r"(zero)
|
||||
);
|
||||
// Round-to-nearest pack of two f32 values into one bf16x2 register. Per the
// PTX ISA the first source operand goes to the upper 16 bits, so the
// (temp1, temp0) order leaves temp0 in the low half of c0.
asm ("cvt.rn.bf16x2.f32 %0, %1, %2;\n\t" : "=r"(c0) : "r"(temp1), "r"(temp0));
|
||||
// Same packing for the second output register: temp2 low, temp3 high.
asm ("cvt.rn.bf16x2.f32 %0, %1, %2;\n\t" : "=r"(c1) : "r"(temp3), "r"(temp2));
|
||||
}
|
||||
}
|
||||
|
||||
// a 4x2, b 4x2, c 4x2
|
||||
// 16x16x16 multiply with no accumulation, built from two m16n8k16 calls that
// share the same A fragment.
//
// Template parameter:
//   dtype  - forwarded unchanged to mma_m16_n8_k16_b16_b16_b16_noacc
// Parameters:
//   a0..a3 - A fragment registers, reused by both calls
//   b0..b3 - B fragment registers; (b0, b1) feed the first call and
//            (b2, b3) feed the second
//   c0..c3 - outputs; (c0, c1) receive the first call's result and
//            (c2, c3) the second's
template <torch::ScalarType dtype>
|
||||
__device__ __forceinline__ void mma_m16_n16_k16_b16_b16_b16_noacc(b32 a0, b32 a1, b32 a2, b32 a3, b32 b0, b32 b1, b32 b2, b32 b3, b32& c0, b32& c1, b32& c2, b32& c3){
|
||||
// First n8 half of the output: uses B registers b0/b1.
mma_m16_n8_k16_b16_b16_b16_noacc<dtype>(a0, a1, a2, a3, b0, b1, c0, c1);
|
||||
// Second n8 half of the output: uses B registers b2/b3.
mma_m16_n8_k16_b16_b16_b16_noacc<dtype>(a0, a1, a2, a3, b2, b3, c2, c3);
|
||||
}
|
||||
|
||||
// Transposes an 8x8 matrix of 16-bit elements using the PTX instruction
// movmatrix.sync.aligned.m8n8.trans.b16.
//
// Parameters:
//   a0 - in/out; the calling thread's 32-bit register of the matrix on entry,
//        overwritten with the instruction's result on return
// NOTE(review): movmatrix is a .sync.aligned warp-level instruction in the PTX
// ISA, so presumably the whole warp must execute this together and the matrix
// is spread across the lanes' registers - confirm against the PTX ISA.
__device__ __forceinline__ void matrix_transpose_m8_n8_b16_inplace(b32& a0) {
|
||||
asm (
|
||||
"movmatrix.sync.aligned.m8n8.trans.b16 "
|
||||
"%0, %1;\n\t"
|
||||
// The same variable is bound as output (%0) and input (%1), which is what
// makes the transpose in-place from the caller's point of view.
: "=r"(a0) : "r"(a0)
|
||||
);
|
||||
}
|
||||
|
||||
#define p_p(i) ((val_1p[i] & 0x0000FFFF) | val_1p[i] << 16)
|
||||
#define p_n(i) ((val_1p[i] & 0x0000FFFF) | val_1n[i] << 16)
|
||||
#define n_p(i) ((val_1n[i] & 0x0000FFFF) | val_1p[i] << 16)
|
||||
#define n_n(i) ((val_1n[i] & 0x0000FFFF) | val_1n[i] << 16)
|
||||
|
||||
template<int64_t num_chunks, int64_t warps_per_block, int64_t log_had_size, int64_t blocks_per_sm, bool enable_mask, torch::ScalarType dtype>
|
||||
__global__ void __launch_bounds__(32 * warps_per_block, blocks_per_sm)
|
||||
// a is column major, b is row major
|
||||
hadamard_transform_kernel(b16* a, b16* out, int total_num_chunks) {
|
||||
static_assert(dtype == torch::ScalarType::Half || dtype == torch::ScalarType::BFloat16, "Only fp16 and bf16 supported currently");
|
||||
|
||||
b32 b_frag_all[num_chunks][4]; // for all chunks, holds matrix fragment (which takes 4 regs of b16x2 * 32 threads)
|
||||
|
||||
int64_t blockid = blockIdx.x * warps_per_block + threadIdx.x / 32;
|
||||
int64_t threadid = threadIdx.x % 32;
|
||||
extern __shared__ b32 bfrag_arr[]; // num_chunks * warps_per_block * 128
|
||||
int64_t real_num_chunks = ((blockid + 1) * num_chunks) > total_num_chunks ? (total_num_chunks - (blockid * num_chunks)) : num_chunks;
|
||||
int64_t diff_num_chunks = real_num_chunks - num_chunks;
|
||||
|
||||
b32* a_start_ptr = (b32*) (a + blockid * num_chunks * 256); // offset a to where this warp starts
|
||||
b32* out_start_ptr = (b32*) (out + blockid * num_chunks * 256);
|
||||
b32* a_ptr = a_start_ptr + threadid * 4;
|
||||
b32* b_frag_ptr = bfrag_arr + (blockid % warps_per_block) * num_chunks * 128 + threadid * 4;
|
||||
|
||||
#if (__CUDA_ARCH__ < 900) // SM80, SM89
|
||||
uint64_t cache_policy;
|
||||
asm volatile(
|
||||
"createpolicy.fractional.L2::evict_first.b64 %0, 1.0;\n"
|
||||
: "=l"(cache_policy)
|
||||
);
|
||||
#endif
|
||||
|
||||
#pragma unroll
|
||||
for (int64_t k = 0; k < num_chunks; k++) {
|
||||
size_t shared_ptr = __cvta_generic_to_shared(b_frag_ptr);
|
||||
#if (__CUDA_ARCH__ >= 900) // SM90
|
||||
asm volatile(
|
||||
"cp.async.cg.shared.global [%0], [%1], 16;\n"
|
||||
"cp.async.commit_group;\n"
|
||||
:: "l"(shared_ptr), "l"(a_ptr)
|
||||
);
|
||||
#else // SM80, SM89
|
||||
asm volatile(
|
||||
"cp.async.cg.shared.global.L2::cache_hint.L2::256B [%0], [%1], 16, %2;\n"
|
||||
"cp.async.commit_group;\n"
|
||||
:: "l"(shared_ptr), "l"(a_ptr), "l"(cache_policy)
|
||||
);
|
||||
#endif
|
||||
|
||||
a_ptr += 128;
|
||||
b_frag_ptr += 128;
|
||||
}
|
||||
|
||||
// generate hadamard 16x16 (up to 2 of them)
|
||||
constexpr b16 fp16_1p[4] = {0b0011100110101000, 0b0011100000000000, 0b0011010110101000, 0b0011010000000000};
|
||||
constexpr b16 fp16_1n[4] = {0b1011100110101000, 0b1011100000000000, 0b1011010110101000, 0b1011010000000000};
|
||||
constexpr b16 bf16_1p[4] = {0b0011111100110101, 0b0011111100000000, 0b0011111010110101, 0b0011111010000000};
|
||||
constexpr b16 bf16_1n[4] = {0b1011111100110101, 0b1011111100000000, 0b1011111010110101, 0b1011111010000000};
|
||||
|
||||
#define val_type_1p(i) (((dtype) == torch::ScalarType::Half) ? (fp16_1p[i]) : (bf16_1p[i]))
|
||||
#define val_type_1n(i) (((dtype) == torch::ScalarType::Half) ? (fp16_1n[i]) : (bf16_1n[i]))
|
||||
constexpr b16 val_1p[4] = {val_type_1p(0), val_type_1p(1), val_type_1p(2), val_type_1p(3)};
|
||||
constexpr b16 val_1n[4] = {val_type_1n(0), val_type_1n(1), val_type_1n(2), val_type_1n(3)};
|
||||
|
||||
constexpr b32 p_p[4] = {p_p(0), p_p(1), p_p(2), p_p(3)};
|
||||
constexpr b32 p_n[4] = {p_n(0), p_n(1), p_n(2), p_n(3)};
|
||||
constexpr b32 n_p[4] = {n_p(0), n_p(1), n_p(2), n_p(3)};
|
||||
constexpr b32 n_n[4] = {n_n(0), n_n(1), n_n(2), n_n(3)};
|
||||
const b32 had_16_p1[4][4] = {
|
||||
{
|
||||
0b10001000010001000010001000010001,
|
||||
0b00000000000000000000000000000000,
|
||||
0b00000000000000000000000000000000,
|
||||
0b10001000010001000010001000010001
|
||||
},
|
||||
{
|
||||
0b11001100100010000011001100100010,
|
||||
0b00000000000000000000000000000000,
|
||||
0b00000000000000000000000000000000,
|
||||
0b11001100100010000011001100100010
|
||||
},
|
||||
{
|
||||
0b11111111101010101100110010011001,
|
||||
0b00000000000000000000000000000000,
|
||||
0b00000000000000000000000000000000,
|
||||
0b11111111101010101100110010011001
|
||||
},
|
||||
{
|
||||
0b11111111101010101100110010011001,
|
||||
0b11111111101010101100110010011001,
|
||||
0b11111111101010101100110010011001,
|
||||
0b00000000010101010011001101100110
|
||||
}
|
||||
};
|
||||
const b32 had_16_p2[4][4] = {
|
||||
{
|
||||
0b10000000010000000010000000010000,
|
||||
0b00000000000000000000000000000000,
|
||||
0b00000000000000000000000000000000,
|
||||
0b10000000010000000010000000010000
|
||||
},
|
||||
{
|
||||
0b11000000100001000011000000100001,
|
||||
0b00000000000000000000000000000000,
|
||||
0b00000000000000000000000000000000,
|
||||
0b11000000100001000011000000100001
|
||||
},
|
||||
{
|
||||
0b11110000101001011100001110010110,
|
||||
0b00000000000000000000000000000000,
|
||||
0b00000000000000000000000000000000,
|
||||
0b11110000101001011100001110010110
|
||||
},
|
||||
{
|
||||
0b11110000101001011100001110010110,
|
||||
0b11110000101001011100001110010110,
|
||||
0b11110000101001011100001110010110,
|
||||
0b00001111010110100011110001101001
|
||||
}
|
||||
};
|
||||
const b32 had_16_mask[3][4] = {
|
||||
{
|
||||
0b10001000010001000010001000010001,
|
||||
0b00000000000000000000000000000000,
|
||||
0b00000000000000000000000000000000,
|
||||
0b10001000010001000010001000010001
|
||||
},
|
||||
{
|
||||
0b11001100110011000011001100110011,
|
||||
0b00000000000000000000000000000000,
|
||||
0b00000000000000000000000000000000,
|
||||
0b11001100110011000011001100110011
|
||||
},
|
||||
{
|
||||
0b11111111111111111111111111111111,
|
||||
0b00000000000000000000000000000000,
|
||||
0b00000000000000000000000000000000,
|
||||
0b11111111111111111111111111111111
|
||||
}
|
||||
};
|
||||
b32 had_frag[8];
|
||||
#pragma unroll
|
||||
for (int64_t i = 0; i < 2; i++) {
|
||||
int64_t c_log_h = (i == 0) ? MIN(4, log_had_size) : log_had_size % 4;
|
||||
#pragma unroll
|
||||
for (int64_t j = 0; j < 4; j++) {
|
||||
if (c_log_h < 4) {
|
||||
bool mask = had_16_mask[c_log_h - 1][j] & (1 << (31 - threadid));
|
||||
if (!mask) {
|
||||
had_frag[i * 4 + j] = 0;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
bool pred1 = had_16_p1[c_log_h - 1][j] & (1 << (31 - threadid));
|
||||
bool pred2 = had_16_p2[c_log_h - 1][j] & (1 << (31 - threadid));
|
||||
b32 val = pred1 ? (pred2 ? p_p[c_log_h - 1] : p_n[c_log_h - 1]) : (pred2 ? n_p[c_log_h - 1] : n_n[c_log_h - 1]);
|
||||
had_frag[i * 4 + j] = val;
|
||||
}
|
||||
if constexpr(log_had_size <= 4 || log_had_size % 4 == 0) break;
|
||||
}
|
||||
|
||||
// log had size above 8, only used for above 2^8 = 256 size
|
||||
constexpr int64_t part8_log_had_size = log_had_size - 8;
|
||||
|
||||
b32* a_chunk_ptr = a_start_ptr; // first chunk starts at this warp's data starts
|
||||
b32* out_chunk_ptr = out_start_ptr;
|
||||
|
||||
#pragma unroll
|
||||
for (int64_t l = 0; l < 2; l++) {
|
||||
if constexpr(log_had_size <= 8) { // l == 0 guaranteed, redundant simplified version of else body, to help compiler warnings
|
||||
b_frag_ptr = bfrag_arr + (blockid % warps_per_block) * num_chunks * 128;
|
||||
} else {
|
||||
b_frag_ptr = bfrag_arr + (blockid % warps_per_block) * num_chunks * (l == 0 ? 128 : (128 >> part8_log_had_size));
|
||||
}
|
||||
|
||||
if (l == 1) {
|
||||
if constexpr(log_had_size > 8) {
|
||||
__syncthreads(); // sync between first and second iterations if above size 256
|
||||
|
||||
if constexpr(log_had_size >= 12) {
|
||||
// sizes 4k and above
|
||||
|
||||
// a + threadblock offset + warp offset
|
||||
// can then index into all chunks owned by this warp
|
||||
b32* store = bfrag_arr + (128 >> part8_log_had_size) * (num_chunks * (blockid % warps_per_block));
|
||||
|
||||
#pragma unroll
|
||||
for (int64_t j = 0; j < 4; j++) {
|
||||
#pragma unroll
|
||||
for (int64_t k = 0; k < num_chunks; k++) {
|
||||
// here, j represents register, and k represents 8-offset/chunk
|
||||
uint64_t real_chunk_num = (num_chunks - (threadid % num_chunks) + k) % num_chunks; // chunk at which you have target thread #'s data
|
||||
|
||||
int64_t real_thread_id = (threadid / num_chunks) * num_chunks + k; // target thread #
|
||||
int64_t chunk_idx = 128 * real_chunk_num; // index due to fetching from another chunk (chunk in which this thread has the target thread's original data)
|
||||
int64_t thread_group_idx = (real_thread_id / 4) * 16; // index due to fetching from another group of num_chunk threads (since shuffle is between num_chunk threads)
|
||||
int64_t thread_idx = (real_thread_id % 4) * 2; // index due to original thread's position within the group of num_chunk threads
|
||||
int64_t reg_idx = (j / 2) * 8 + (j % 2); // index due to target register
|
||||
int64_t idx = chunk_idx + thread_group_idx + thread_idx + reg_idx; // final index
|
||||
|
||||
// fix idx for majorness
|
||||
int64_t rowidx = idx % (1 << part8_log_had_size);
|
||||
int64_t colidx = idx >> part8_log_had_size;
|
||||
|
||||
// store[rowidx * 128 + colidx] = data;
|
||||
b32 data = store[rowidx * 128 + colidx];
|
||||
|
||||
// compiler generates excessive instructions, so we manually do the if statement
|
||||
#pragma unroll
|
||||
for (uint64_t i = 0; i < num_chunks; i++) {
|
||||
asm volatile (
|
||||
"{\n\t"
|
||||
" .reg .pred p0;\n\t"
|
||||
" setp.eq.s64 p0, %1, %2;\n\t"
|
||||
" @p0 mov.b32 %0, %3;\n\t"
|
||||
"}\n\t"
|
||||
: "+r"(b_frag_all[i][j]) // Output operand %0
|
||||
: "l"(real_chunk_num), "l"(i), "r"(data) // Input operands %1, %2, %3
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int64_t j = 0; j < 4; j++) {
|
||||
#pragma unroll
|
||||
for (int64_t k = 1; k < num_chunks; k++) {
|
||||
int64_t threadid_contig = threadid % num_chunks;
|
||||
int64_t threadid_mul = threadid / num_chunks;
|
||||
int64_t threadid2 = (threadid_contig + num_chunks - k) % num_chunks + threadid_mul * num_chunks; // thread to give your data to
|
||||
b_frag_all[k][j] = __shfl_sync(0xFFFFFFFF, b_frag_all[k][j], threadid2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int64_t k = 0; k < num_chunks; k++) {
|
||||
if constexpr(enable_mask) {
|
||||
if (k >= real_num_chunks)
|
||||
break;
|
||||
}
|
||||
if (l == 0) {
|
||||
// bad fix for k not being recognized as a constexpr by compiler
|
||||
// asm("cp.async.wait_group %0;\n" :: "n"(num_chunks - k - 1));
|
||||
#define SWITCH_WAIT_ASYNC_LOAD_GROUP(i) case i: asm volatile("cp.async.wait_group %0;\n" :: "n"(num_chunks - i - 1)); break;
|
||||
if constexpr(enable_mask) {
|
||||
switch(k + diff_num_chunks) {
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(0)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(1)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(2)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(3)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(4)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(5)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(6)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(7)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(8)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(9)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(10)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(11)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(12)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(13)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(14)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(15)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(16)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(17)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(18)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(19)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(20)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(21)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(22)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(23)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(24)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(25)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(26)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(27)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(28)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(29)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(30)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(31)
|
||||
}
|
||||
} else {
|
||||
switch(k) {
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(0)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(1)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(2)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(3)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(4)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(5)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(6)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(7)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(8)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(9)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(10)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(11)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(12)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(13)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(14)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(15)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(16)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(17)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(18)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(19)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(20)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(21)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(22)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(23)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(24)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(25)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(26)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(27)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(28)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(29)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(30)
|
||||
SWITCH_WAIT_ASYNC_LOAD_GROUP(31)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (l == 0) {
|
||||
// loading for the first iteration
|
||||
|
||||
// thread 0 loads [t0r0, t16r1, t0r2, t16r3]
|
||||
// thread 16 loads [t0r1, t16r0, t0r3, t16r2]
|
||||
// allows full coalescing, same for t1/t17, t2/t18, etc.
|
||||
#pragma unroll
|
||||
for (int64_t j = 0; j < 4; j++) {
|
||||
int64_t reg = ((threadid & 16) == 0) ? j : (j / 2 * 2 + (1 - j % 2));
|
||||
int64_t real_thread_id = (reg == 0 || reg == 2) ? threadid : (threadid ^ 16);
|
||||
int64_t real_row = real_thread_id % 4;
|
||||
int64_t real_col = real_thread_id / 4;
|
||||
b_frag_all[k][j] = b_frag_ptr[(real_row + (reg % 2) * 4) + (real_col + (j / 2) * 8) * 8];
|
||||
}
|
||||
|
||||
// for t16 swap r0/r1 and r2/r3 to have [t16r0, t0r1, t16r2, t0r3]
|
||||
// so registers are in right order, same for t17, t18, etc.
|
||||
if ((threadid & 16) != 0) {
|
||||
b32 temp = b_frag_all[k][0];
|
||||
b_frag_all[k][0] = b_frag_all[k][1];
|
||||
b_frag_all[k][1] = temp;
|
||||
|
||||
temp = b_frag_all[k][2];
|
||||
b_frag_all[k][2] = b_frag_all[k][3];
|
||||
b_frag_all[k][3] = temp;
|
||||
}
|
||||
|
||||
// t0 and t16 swap r1 and r3 to have their own data,
|
||||
// same for t1/t17, t2/18, etc.
|
||||
#pragma unroll
|
||||
for (int64_t j = 1; j < 4; j += 2) {
|
||||
b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], 16);
|
||||
}
|
||||
} else if constexpr(log_had_size > 8) { // condition is redundant to help compiler warnings
|
||||
if constexpr(log_had_size < 12) {
|
||||
// sizes 512, 1k, and 2k
|
||||
|
||||
// for 512:
|
||||
// thread 0 loads [t0r0, t0r1, t16r2, t16r3]
|
||||
// thread 16 loads [t0r2, t0r3, t16r0, t16r1]
|
||||
// same for t1/t17, t2/t18, etc.
|
||||
// for 1k and 2k:
|
||||
// thread 0 loads [t0r0, t0r1, t1r2, t1r3]
|
||||
// thread 1 loads [t0r2, t0r3, t1r0, t1r1]
|
||||
// same for t2/t3, t4/t5, etc.
|
||||
// allows full coalescing for 512 and 1k, 16x coalescing for 2k
|
||||
constexpr int64_t xor_val = log_had_size == 9 ? 16 : 1;
|
||||
|
||||
#pragma unroll
|
||||
for (int64_t j = 0; j < 4; j++) {
|
||||
int64_t reg = ((threadid & xor_val) == 0) ? j : (j + 2) % 4;
|
||||
int64_t real_thread_id = reg < 2 ? threadid : (threadid ^ xor_val);
|
||||
int64_t idx = (real_thread_id / 4 * 16) + (real_thread_id % 4 * 2) + (reg / 2 * 8) + (reg % 2);
|
||||
int64_t rowidx = idx % (1 << part8_log_had_size);
|
||||
int64_t colidx = idx >> part8_log_had_size;
|
||||
b_frag_all[k][j] = b_frag_ptr[rowidx * 128 + colidx];
|
||||
}
|
||||
|
||||
if ((threadid & xor_val) != 0) {
|
||||
b32 temp = b_frag_all[k][0];
|
||||
b_frag_all[k][0] = b_frag_all[k][2];
|
||||
b_frag_all[k][2] = temp;
|
||||
|
||||
temp = b_frag_all[k][1];
|
||||
b_frag_all[k][1] = b_frag_all[k][3];
|
||||
b_frag_all[k][3] = temp;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int64_t j = 2; j < 4; j++) {
|
||||
b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], xor_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (l == 1) {
|
||||
// for second iteration, we load 2 consecutive b16s (1 b32) per register,
|
||||
// but tensor core register layout requires 2 b16s that are in the
|
||||
// same column/consecutive rows to be in the same register, so do the swap
|
||||
b32 f0 = ((b_frag_all[k][1] & 0xFFFF) << 16) | (b_frag_all[k][0] & 0xFFFF);
|
||||
b32 f1 = ((b_frag_all[k][3] & 0xFFFF) << 16) | (b_frag_all[k][2] & 0xFFFF);
|
||||
b32 f2 = (b_frag_all[k][1] & 0xFFFF0000) | (b_frag_all[k][0] >> 16);
|
||||
b32 f3 = (b_frag_all[k][3] & 0xFFFF0000) | (b_frag_all[k][2] >> 16);
|
||||
b_frag_all[k][0] = f0;
|
||||
b_frag_all[k][1] = f1;
|
||||
b_frag_all[k][2] = f2;
|
||||
b_frag_all[k][3] = f3;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for(int64_t i = 0, remaining_log_had_size = log_had_size - l * 8; i < 2 && remaining_log_had_size > 0; i++) {
|
||||
int64_t had_off = ((remaining_log_had_size < 4) && !(log_had_size <= 4 || log_had_size % 4 == 0)) ? 4 : 0;
|
||||
mma_m16_n16_k16_b16_b16_b16_noacc<dtype>(had_frag[had_off + 0], had_frag[had_off + 1], had_frag[had_off + 2], had_frag[had_off + 3], b_frag_all[k][0], b_frag_all[k][1], b_frag_all[k][2], b_frag_all[k][3], b_frag_all[k][0], b_frag_all[k][1], b_frag_all[k][2], b_frag_all[k][3]);
|
||||
|
||||
remaining_log_had_size -= 4;
|
||||
if (remaining_log_had_size <= 0 && i == 0) {
|
||||
// TODO: consider different storing so no need for transpose
|
||||
matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][0]);
|
||||
matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][1]);
|
||||
matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][2]);
|
||||
matrix_transpose_m8_n8_b16_inplace(b_frag_all[k][3]);
|
||||
} else {
|
||||
// swap and use output directly as b_frag for next iteration as an actually free transpose
|
||||
b32 temp = b_frag_all[k][1];
|
||||
b_frag_all[k][1] = b_frag_all[k][2];
|
||||
b_frag_all[k][2] = temp;
|
||||
}
|
||||
}
|
||||
|
||||
if (l == 1) {
|
||||
// invert swap from above for second iteration
|
||||
b32 f0 = ((b_frag_all[k][2] & 0xFFFF) << 16) | (b_frag_all[k][0] & 0xFFFF);
|
||||
b32 f1 = (b_frag_all[k][2] & 0xFFFF0000) | (b_frag_all[k][0] >> 16);
|
||||
b32 f2 = ((b_frag_all[k][3] & 0xFFFF) << 16) | (b_frag_all[k][1] & 0xFFFF);
|
||||
b32 f3 = (b_frag_all[k][3] & 0xFFFF0000) | (b_frag_all[k][1] >> 16);
|
||||
b_frag_all[k][0] = f0;
|
||||
b_frag_all[k][1] = f1;
|
||||
b_frag_all[k][2] = f2;
|
||||
b_frag_all[k][3] = f3;
|
||||
}
|
||||
|
||||
if (l == 0) {
|
||||
// inverse of coalesced load for first iteration to store result
|
||||
#pragma unroll
|
||||
for (int64_t j = 1; j < 4; j += 2) {
|
||||
b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], 16);
|
||||
}
|
||||
|
||||
if ((threadid & 16) != 0) {
|
||||
b32 temp = b_frag_all[k][0];
|
||||
b_frag_all[k][0] = b_frag_all[k][1];
|
||||
b_frag_all[k][1] = temp;
|
||||
|
||||
temp = b_frag_all[k][2];
|
||||
b_frag_all[k][2] = b_frag_all[k][3];
|
||||
b_frag_all[k][3] = temp;
|
||||
}
|
||||
|
||||
// if only going up to 256 size, store directly back to global memory,
|
||||
// otherwise store back to shared memory for next iteration
|
||||
b32* store = (log_had_size <= 8) ? out_chunk_ptr : b_frag_ptr;
|
||||
|
||||
#pragma unroll
|
||||
for (int64_t j = 0; j < 4; j++) {
|
||||
int64_t reg = ((threadid & 16) == 0) ? j : (j / 2 * 2 + (1 - j % 2));
|
||||
int64_t real_thread_id = (reg == 0 || reg == 2) ? threadid : (threadid ^ 16);
|
||||
int64_t real_row = real_thread_id % 4;
|
||||
int64_t real_col = real_thread_id / 4;
|
||||
store[(real_row + (reg % 2) * 4) + (real_col + (reg / 2) * 8) * 8] = b_frag_all[k][j];
|
||||
}
|
||||
} else if constexpr(log_had_size > 8) { // condition is redundant to help compiler warnings
|
||||
if (log_had_size < 12) {
|
||||
// inverse of coalesced load for sizes 512, 1k and 2k to store result
|
||||
constexpr int xor_val = log_had_size == 9 ? 16 : 1;
|
||||
#pragma unroll
|
||||
for (int64_t j = 2; j < 4; j++) {
|
||||
b_frag_all[k][j] = __shfl_xor_sync(0xFFFFFFFF, b_frag_all[k][j], xor_val);
|
||||
}
|
||||
|
||||
if ((threadid & xor_val) != 0) {
|
||||
b32 temp = b_frag_all[k][0];
|
||||
b_frag_all[k][0] = b_frag_all[k][2];
|
||||
b_frag_all[k][2] = temp;
|
||||
|
||||
temp = b_frag_all[k][1];
|
||||
b_frag_all[k][1] = b_frag_all[k][3];
|
||||
b_frag_all[k][3] = temp;
|
||||
}
|
||||
|
||||
b32* store = (b32*)(out + (blockid / warps_per_block) * (num_chunks * warps_per_block) * 256 + (256 >> part8_log_had_size) * (num_chunks * (blockid % warps_per_block) + k));
|
||||
#pragma unroll
|
||||
for (int64_t j = 0; j < 4; j++) {
|
||||
int64_t reg = ((threadid & xor_val) == 0) ? j : (j + 2) % 4;
|
||||
b32 data = b_frag_all[k][j];
|
||||
int64_t real_thread_id = reg < 2 ? threadid : (threadid ^ xor_val);
|
||||
int64_t idx = (real_thread_id / 4 * 16) + (real_thread_id % 4 * 2) + (reg / 2 * 8) + (reg % 2);
|
||||
int64_t rowidx = idx % (1 << part8_log_had_size);
|
||||
int64_t colidx = idx >> part8_log_had_size;
|
||||
store[rowidx * 128 + colidx] = data;
|
||||
}
|
||||
}
|
||||
// for size 4k and above, wait to process all chunks so a final store can be performed coalesced
|
||||
}
|
||||
|
||||
a_chunk_ptr += 128; // (only affects first 256 size) move on to next chunk by skipping 256 elements in b16 (= 128 in b32)
|
||||
out_chunk_ptr += 128;
|
||||
if constexpr(log_had_size > 8) {
|
||||
b_frag_ptr += (l == 0 ? 128 : (128 >> part8_log_had_size));
|
||||
} else { // else is redundant, simplified version of if body, to help compiler warnings
|
||||
b_frag_ptr += 128;
|
||||
}
|
||||
}
|
||||
if (log_had_size <= 8)
|
||||
break;
|
||||
}
|
||||
|
||||
if constexpr(log_had_size >= 12) {
|
||||
// for sizes 4k and above, perform final coalesced store after processing all chunks
|
||||
#pragma unroll
|
||||
for (int64_t j = 0; j < 4; j++) {
|
||||
#pragma unroll
|
||||
for (int64_t k = 1; k < num_chunks; k++) {
|
||||
int64_t threadid_contig = threadid % num_chunks;
|
||||
int64_t threadid_mul = threadid / num_chunks;
|
||||
int64_t threadid2 = (threadid_contig + k) % num_chunks + threadid_mul * num_chunks; // thread to give your data to
|
||||
b_frag_all[k][j] = __shfl_sync(0xFFFFFFFF, b_frag_all[k][j], threadid2);
|
||||
}
|
||||
}
|
||||
|
||||
// a + threadblock offset + warp offset
|
||||
// can then index into all chunks owned by this warp
|
||||
b32* store = bfrag_arr + (128 >> part8_log_had_size) * (num_chunks * (blockid % warps_per_block));
|
||||
|
||||
#pragma unroll
|
||||
for (int64_t j = 0; j < 4; j++) {
|
||||
#pragma unroll
|
||||
for (int64_t k = 0; k < num_chunks; k++) {
|
||||
// here, j represents register, and k represents 8-offset/chunk
|
||||
int64_t real_chunk_num = (num_chunks - (threadid % num_chunks) + k) % num_chunks; // chunk at which you have target thread #'s data
|
||||
|
||||
// b32 data = b_frag_all[real_chunk_num][j]; // target thread data
|
||||
b32 data;
|
||||
#pragma unroll
|
||||
for (int64_t i = 0; i < num_chunks; i++) {
|
||||
if (real_chunk_num == i) data = b_frag_all[i][j];
|
||||
}
|
||||
|
||||
int64_t real_thread_id = (threadid / num_chunks) * num_chunks + k; // target thread #
|
||||
int64_t chunk_idx = 128 * real_chunk_num; // index due to fetching from another chunk (chunk in which this thread has the target thread's original data)
|
||||
int64_t thread_group_idx = (real_thread_id / 4) * 16; // index due to fetching from another group of num_chunk threads (since shuffle is between num_chunk threads)
|
||||
int64_t thread_idx = (real_thread_id % 4) * 2; // index due to original thread's position within the group of num_chunk threads
|
||||
int64_t reg_idx = (j / 2) * 8 + (j % 2); // index due to target register
|
||||
int64_t idx = chunk_idx + thread_group_idx + thread_idx + reg_idx; // final index
|
||||
|
||||
// fix idx for majorness
|
||||
int64_t rowidx = idx % (1 << part8_log_had_size);
|
||||
int64_t colidx = idx >> part8_log_had_size;
|
||||
|
||||
store[rowidx * 128 + colidx] = data;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
store = ((b32*) out) + (blockid / warps_per_block) * (num_chunks * warps_per_block) * 128;
|
||||
int4* store4 = (int4*) store;
|
||||
int4* bfrag_arr4 = (int4*) bfrag_arr;
|
||||
// flush smem, simply linearly write to store
|
||||
// always divisible by 128*32b, so (32*4)*32b is ok
|
||||
#pragma unroll
|
||||
for (int64_t warp_off = 0; warp_off < (num_chunks * warps_per_block * 128 / 4); warp_off += 32 * warps_per_block) {
|
||||
int64_t total_off = warp_off + threadid + (blockid % warps_per_block) * 32;
|
||||
store4[total_off] = bfrag_arr4[total_off];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Integer division rounded up: evaluates (a + b - 1) / b.
// Used below to size the launch grid when the chunk count is not an exact
// multiple of the per-block tile.
constexpr int64_t ceil_div(int64_t a, int64_t b) {
    const int64_t biased_numerator = a + b - 1;
    return biased_numerator / b;
}
|
||||
|
||||
template <torch::ScalarType dtype, int64_t chunks_per_warp, int64_t warps_per_block, int64_t log_had_size, int64_t blocks_per_sm, bool check_masking = false>
|
||||
void __forceinline__ run_kernel(b16* a_mat, b16* out, int64_t num_chunks, cudaStream_t stream) {
|
||||
int64_t shared_size = chunks_per_warp * warps_per_block * 128 * 4;
|
||||
dim3 block_size = 32 * warps_per_block;
|
||||
|
||||
#define CHECK_SHARED_LIM() { \
|
||||
if (shared_size > 48 * 1024) { \
|
||||
C10_CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536)); \
|
||||
} \
|
||||
} \
|
||||
|
||||
if constexpr(check_masking) {
|
||||
if (num_chunks % (chunks_per_warp * warps_per_block) != 0) {
|
||||
dim3 grid_size = ceil_div(ceil_div(num_chunks, chunks_per_warp), warps_per_block);
|
||||
auto kernel = hadamard_transform_kernel<chunks_per_warp, warps_per_block, log_had_size, blocks_per_sm, true, dtype>;
|
||||
CHECK_SHARED_LIM();
|
||||
kernel<<<dim3(grid_size), dim3(block_size), shared_size, stream>>>(a_mat, out, num_chunks);
|
||||
} else {
|
||||
dim3 grid_size = num_chunks / chunks_per_warp / warps_per_block;
|
||||
auto kernel = hadamard_transform_kernel<chunks_per_warp, warps_per_block, log_had_size, blocks_per_sm, false, dtype>;
|
||||
CHECK_SHARED_LIM();
|
||||
kernel<<<dim3(grid_size), dim3(block_size), shared_size, stream>>>(a_mat, out, num_chunks);
|
||||
}
|
||||
} else {
|
||||
dim3 grid_size = num_chunks / chunks_per_warp / warps_per_block;
|
||||
auto kernel = hadamard_transform_kernel<chunks_per_warp, warps_per_block, log_had_size, blocks_per_sm, false, dtype>;
|
||||
CHECK_SHARED_LIM();
|
||||
kernel<<<dim3(grid_size), dim3(block_size), shared_size, stream>>>(a_mat, out, num_chunks);
|
||||
}
|
||||
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}
|
||||
|
||||
template <torch::ScalarType dtype>
|
||||
void run_fht(void* a_mat_ptr, void* out_ptr, int64_t numel, int64_t had_size, cudaStream_t stream) {
|
||||
int64_t num_chunks = numel / 256; // caller required to ensure divisible by 256
|
||||
// for size 256, use (2, 1)
|
||||
// for size 32k use (8, 16)
|
||||
constexpr int64_t chunks_per_warp_small = 1;// 8;
|
||||
constexpr int64_t warps_per_block_small = 1;//2;//16;
|
||||
constexpr int64_t blocks_per_sm_small = 24;
|
||||
constexpr int64_t chunks_per_warp_large = 2;
|
||||
constexpr int64_t warps_per_block_large = 1;
|
||||
constexpr int64_t blocks_per_sm_large = 24;
|
||||
|
||||
b16* a_mat = (b16*) a_mat_ptr;
|
||||
b16* out = (b16*) out_ptr;
|
||||
|
||||
if (numel <= 256) {
|
||||
switch (had_size) {
|
||||
case (1<<1): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 1, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<2): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 2, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<3): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 3, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<4): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 4, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<5): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 5, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<6): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 6, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<7): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 7, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<8): run_kernel<dtype, chunks_per_warp_small, warps_per_block_small, 8, blocks_per_sm_small>(a_mat, out, num_chunks, stream); break;
|
||||
}
|
||||
} else {
|
||||
switch (had_size) {
|
||||
case (1<<1): run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 1, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<2): run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 2, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<3): run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 3, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<4): run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 4, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<5): run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 5, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<6): run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 6, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<7): run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 7, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<8): run_kernel<dtype, chunks_per_warp_large, warps_per_block_large, 8, blocks_per_sm_large, true>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<9): run_kernel<dtype, launch_configs_big[0][0], launch_configs_big[0][1], 9 , launch_configs_big[0][2]>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<10): run_kernel<dtype, launch_configs_big[1][0], launch_configs_big[1][1], 10, launch_configs_big[1][2]>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<11): run_kernel<dtype, launch_configs_big[2][0], launch_configs_big[2][1], 11, launch_configs_big[2][2]>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<12): run_kernel<dtype, launch_configs_big[3][0], launch_configs_big[3][1], 12, launch_configs_big[3][2]>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<13): run_kernel<dtype, launch_configs_big[4][0], launch_configs_big[4][1], 13, launch_configs_big[4][2]>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<14): run_kernel<dtype, launch_configs_big[5][0], launch_configs_big[5][1], 14, launch_configs_big[5][2]>(a_mat, out, num_chunks, stream); break;
|
||||
case (1<<15): run_kernel<dtype, launch_configs_big[6][0], launch_configs_big[6][1], 15, launch_configs_big[6][2]>(a_mat, out, num_chunks, stream); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Explicit instantiations of run_fht for the two element types the transform
// is dispatched on (fp16 and bf16).
template void run_fht<torch::ScalarType::Half>(void*, void*, int64_t, int64_t, cudaStream_t);
template void run_fht<torch::ScalarType::BFloat16>(void*, void*, int64_t, int64_t, cudaStream_t);
|
||||
|
||||
} // namespace hadacore
|
||||
|
||||
// Returns true iff x is a positive power of two (1, 2, 4, ...).
// The x > 0 test rejects zero and every negative value before x - 1 is
// evaluated, so INT_MIN no longer triggers signed overflow (which typically
// wrapped and made INT_MIN look like a power of two).
constexpr bool is_power_of_two(int x) { return x > 0 && (x & (x - 1)) == 0; }
|
||||
|
||||
// Applies the hadacore Hadamard transform along the last dimension of x.
//
//   x       - CUDA tensor of dtype fp16 or bf16 whose last dimension is a
//             power of two no larger than 2^15.
//   inplace - when true the result is written into x's storage and the
//             returned tensor aliases x; when false x is left untouched and a
//             freshly allocated tensor is returned.
//
// Returns a tensor with the same shape as x holding the transformed values.
// Raises (TORCH_CHECK) on an unsupported dtype, a non-CUDA tensor, or an
// unsupported last-dimension size.
torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace) {
    auto dtype = x.scalar_type();
    TORCH_CHECK(dtype == torch::ScalarType::Half || dtype == torch::ScalarType::BFloat16, "Only fp16 and bf16 supported currently");
    TORCH_CHECK(x.is_cuda());

    const int had_size = x.size(-1);
    TORCH_CHECK(is_power_of_two(had_size) && (had_size <= (1U << 15)),
                "Only power of two Hadamard sizes up to 2^15 are supported, got ", had_size);

    // Keep an owning copy of the shape: the working tensor below is rebound
    // several times and must not invalidate it.
    const auto res_shape = x.sizes().vec();
    const auto numel = x.numel();

    // Empty input has nothing to launch (a zero-block grid is an invalid
    // launch), and a size-1 transform is the identity, for which run_fht has
    // no kernel. Return the values unchanged in both cases.
    if (numel == 0 || had_size == 1) {
        return inplace ? x : x.clone();
    }

    // Work on a local handle so the caller's tensor object is never rebound.
    torch::Tensor work = x.reshape({-1, had_size});

    // The kernel processes 256-element chunks; pad with extra zero rows so the
    // total element count is a multiple of 256.
    if (numel % 256 != 0) {
        work = torch::nn::functional::pad(work, torch::nn::functional::PadFuncOptions({0, 0, 0, (256 - numel % 256) / had_size}));
    }

    // The kernel indexes the buffer as one dense array, so every dimension
    // (not only the last) has to be contiguous.
    if (!work.is_contiguous()) {
        work = work.contiguous();
    }
    torch::Tensor out = inplace ? work : torch::empty_like(work);

    at::cuda::CUDAGuard device_guard{(char)work.get_device()};
    auto stream = at::cuda::getCurrentCUDAStream().stream();

    VLLM_DISPATCH_HALF_TYPES(work.scalar_type(), "hadacore_transform_runfht", [&] {
        auto constexpr SCALAR_TYPE = c10::CppTypeToScalarType<scalar_t>::value;
        // Write to `out`: for the out-of-place case this is the new buffer,
        // for the in-place case it aliases `work`.
        hadacore::run_fht<SCALAR_TYPE>(work.data_ptr(), out.data_ptr(), work.numel(), had_size, stream);
    });

    // Drop the padding rows again.
    if (numel % 256 != 0) {
        out = out.index({torch::indexing::Slice(0, numel / had_size)});
    }

    // In-place request, but the kernel ran on a padded / contiguous copy:
    // copy the result back into the caller's storage.
    if (inplace && out.data_ptr() != x.data_ptr()) {
        x.copy_(out.view(res_shape));
        return x;
    }
    return out.reshape(res_shape);
}
|
||||
|
||||
// Registers hadacore_transform as the CUDA implementation of the
// "hadacore_transform" operator in this extension's library.
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||
m.impl("hadacore_transform", &hadacore_transform);
|
||||
}
|
||||
@ -30,6 +30,10 @@
|
||||
#define __HIP__GFX9__
|
||||
#endif
|
||||
|
||||
#if defined(__HIPCC__) && (defined(__gfx942__) || defined(__gfx950__))
|
||||
#define __HIP__FP8MFMA__
|
||||
#endif
|
||||
|
||||
#if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__))
|
||||
#define __HIP__GFX11__
|
||||
#endif
|
||||
@ -51,6 +55,12 @@
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
|
||||
|
||||
// Compile-time selector for the MFMA operand precision a kernel is built for.
enum class MFMAType : int {
  // 16-bit operands: the kernel converts the K cache to 16-bit and uses the
  // 16x16x16 MFMA wrapper.
  F16 = 0,
  // 8-bit floating-point operands: the kernel feeds packed fp8 values to the
  // 16x16x32 MFMA wrapper.
  Fp8 = 1,
  // NOTE(review): presumably 4-bit floating-point operands; confirm against
  // the kernels that use it.
  Fp4 = 2,
};
|
||||
|
||||
#if defined(__HIP__GFX9__)
|
||||
|
||||
#define GCN_MFMA_INSTR1 __builtin_amdgcn_mfma_f32_16x16x4f32
|
||||
@ -112,6 +122,21 @@ __device__ __forceinline__ floatx4 gcn_mfma16x16x16_instr(const _B16x4& inpA,
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, int absz, int cbid, int blgp>
|
||||
__device__ __forceinline__ floatx4 gcn_mfma16x16x32_instr(const long& inpA,
|
||||
const long& inpB,
|
||||
const floatx4& inpC) {
|
||||
if constexpr (std::is_same<T, __hip_fp8_e4m3>::value) {
|
||||
return __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(inpA, inpB, inpC, absz,
|
||||
cbid, blgp);
|
||||
} else if constexpr (std::is_same<T, __hip_fp8_e5m2>::value) {
|
||||
return __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(inpA, inpB, inpC, absz,
|
||||
cbid, blgp);
|
||||
} else {
|
||||
static_assert(false, "unsupported 8b dtype");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ float to_float(const T& inp) {
|
||||
if constexpr (std::is_same<T, _Float16>::value) {
|
||||
@ -256,12 +281,44 @@ __device__ __forceinline__ _B16x8 convert_b8x8_custom(const _B8x8 input) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
typedef union u64_cvt {
|
||||
half f16x4[4];
|
||||
int16_t b16x4[4];
|
||||
_B8x8 b8x8;
|
||||
_B16x4 b64;
|
||||
int64_t i64;
|
||||
} _T8x8;
|
||||
|
||||
// Converts a _B16x8 into a _B8x8 of packed fp8 values.
//
// For each of the two halves of `input`, a 16x16x16 _Float16 MFMA is issued
// with Mtemp.b64 as the first operand, the half as the second operand and a
// zero accumulator. The four fp32 results are then packed pairwise into fp8
// with __builtin_amdgcn_cvt_pk_fp8_f32 and stored as two 16-bit words.
//
//   input - source value; input.xy[0] and input.xy[1] are processed in turn.
//   Mtemp - only its b64 member is read here.
__device__ __forceinline__ _B8x8 convert_b16x8(const _B16x8& input,
                                               _T8x8& Mtemp) {
  _T8x8 packed;

  for (int half_idx = 0; half_idx != 2; ++half_idx) {
    floatx4 acc = {0, 0, 0, 0};
    acc = gcn_mfma16x16x16_instr<_Float16, 0, 0, 0>(Mtemp.b64,
                                                    input.xy[half_idx], acc);

    // Two fp8 pairs per half, stored in consecutive 16-bit slots.
    const int slot = 2 * half_idx;
    packed.b16x4[slot] =
        __builtin_amdgcn_cvt_pk_fp8_f32(acc[0], acc[1], 0, false);
    packed.b16x4[slot + 1] =
        __builtin_amdgcn_cvt_pk_fp8_f32(acc[2], acc[3], 0, false);
  }

  return packed.b8x8;
}
|
||||
|
||||
// Returns the maximum of `val` across the lanes of the calling warp.
//
// The exchange is a butterfly (__shfl_xor), so EVERY lane ends up holding the
// full maximum. A __shfl_down tree would leave the complete result only in
// lane 0, but the paged-attention kernel in this file uses the returned value
// on all lanes to derive its per-warp q_scale, so all lanes must agree.
__device__ float warpReduceMax(float val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    val = max(val, __shfl_xor(val, offset, WARP_SIZE));
  }
  return val;
}
|
||||
|
||||
// grid (num_seqs, num_partitions,num_kv_heads)
|
||||
// block (256)
|
||||
// clang-format off
|
||||
template <typename scalar_t, typename cache_t,
|
||||
vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE,
|
||||
int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO>
|
||||
int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO, MFMAType MFMA_TYPE>
|
||||
__global__
|
||||
__launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
|
||||
@ -367,6 +424,10 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq;
|
||||
|
||||
int kphysical_block_number[TLOOP];
|
||||
#if defined(__HIP__FP8MFMA__)
|
||||
float q_max = 0;
|
||||
float q_scale = 1.0;
|
||||
#endif
|
||||
|
||||
// fetch k physical block numbers
|
||||
for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
|
||||
@ -416,6 +477,15 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
Qlocal[qkhe_depth][qkratio].xy[i] =
|
||||
shared_logits[qkhe_depth][rowid][lane16id % GQA_RATIO]
|
||||
[2 * qkratio + i];
|
||||
#if defined(__HIP__FP8MFMA__)
|
||||
if constexpr (KV_DTYPE != vllm::Fp8KVCacheDataType::kAuto &&
|
||||
MFMA_TYPE == MFMAType::Fp8) {
|
||||
scalar_t* qptr =
|
||||
reinterpret_cast<scalar_t*>(&Qlocal[qkhe_depth][qkratio].xy[i]);
|
||||
for (int k = 0; k < 4; k++)
|
||||
q_max = fmax(fabs(to_float<scalar_t>(qptr[k])), q_max);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -515,6 +585,14 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
if constexpr (KV_DTYPE != vllm::Fp8KVCacheDataType::kAuto) {
|
||||
// multiply by k_scale if fp8 kv cache
|
||||
scale2 *= *k_scale;
|
||||
#if defined(__HIP__FP8MFMA__)
|
||||
q_max = warpReduceMax(q_max);
|
||||
constexpr float FP8_E4M3_SCALE_TARGET = 224.0f;
|
||||
if constexpr (MFMA_TYPE == MFMAType::Fp8) {
|
||||
q_scale = q_max > 0 ? FP8_E4M3_SCALE_TARGET / q_max : 1.0f;
|
||||
scale2 /= q_scale;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
floatx4 d_out[TLOOP];
|
||||
@ -534,12 +612,41 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
auto Ktmp = Klocal[token_depth][qkhe_depth];
|
||||
_B8x16 Ktmp8x16 = *reinterpret_cast<_B8x16*>(&Ktmp);
|
||||
for (int qkratio = 0; qkratio < QK_SIZE_RATIO; qkratio++) {
|
||||
_B8x8 Ktmp8x8 = Ktmp8x16.xy[qkratio];
|
||||
_B16x8 Klocaltmp = convert_b8x8_custom<scalar_t>(Ktmp8x8);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
d_out[token_depth] = gcn_mfma16x16x16_instr<scalar_t, 0, 0, 0>(
|
||||
Klocaltmp.xy[i], Qlocal[qkhe_depth][qkratio].xy[i],
|
||||
d_out[token_depth]);
|
||||
if constexpr (MFMA_TYPE == MFMAType::F16) {
|
||||
_B8x8 Ktmp8x8 = Ktmp8x16.xy[qkratio];
|
||||
_B16x8 Klocaltmp = convert_b8x8_custom<scalar_t>(Ktmp8x8);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
d_out[token_depth] = gcn_mfma16x16x16_instr<scalar_t, 0, 0, 0>(
|
||||
Klocaltmp.xy[i], Qlocal[qkhe_depth][qkratio].xy[i],
|
||||
d_out[token_depth]);
|
||||
}
|
||||
} else {
|
||||
#if defined(__HIP__FP8MFMA__)
|
||||
_T8x8 Ktmp8x8, Qtmp8x8;
|
||||
Ktmp8x8.b8x8 = Ktmp8x16.xy[qkratio];
|
||||
|
||||
for (int n = 0; n < 2; n++) {
|
||||
scalar_t* qptr = reinterpret_cast<scalar_t*>(
|
||||
&Qlocal[qkhe_depth][qkratio].xy[n]);
|
||||
|
||||
Qtmp8x8.b16x4[n * 2] =
|
||||
vllm::fp8::scaled_vec_conversion<uint16_t, float2>(
|
||||
make_float2(to_float<scalar_t>(qptr[0]),
|
||||
to_float<scalar_t>(qptr[1])),
|
||||
q_scale);
|
||||
Qtmp8x8.b16x4[n * 2 + 1] =
|
||||
vllm::fp8::scaled_vec_conversion<uint16_t, float2>(
|
||||
make_float2(to_float<scalar_t>(qptr[2]),
|
||||
to_float<scalar_t>(qptr[3])),
|
||||
q_scale);
|
||||
}
|
||||
|
||||
d_out[token_depth] =
|
||||
gcn_mfma16x16x32_instr<__hip_fp8_e4m3, 0, 0, 0>(
|
||||
Ktmp8x8.i64, Qtmp8x8.i64, d_out[token_depth]);
|
||||
#else
|
||||
UNREACHABLE_CODE
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -629,17 +736,36 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
// disable rtz conversion due to its impact on accuracy.
|
||||
constexpr bool LOGITS_RTZ_CONVERSION = false;
|
||||
|
||||
#if defined(__HIP__FP8MFMA__)
|
||||
int rowid_8x8 = rowid / 2;
|
||||
int offset = rowid % 2;
|
||||
#endif
|
||||
|
||||
// write logits to shared mem
|
||||
for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
|
||||
d_out[token_depth] *= inv_sum_scale;
|
||||
if constexpr (LOGITS_RTZ_CONVERSION) {
|
||||
// use rtz conversion for better performance, with negligible impact on
|
||||
// accuracy
|
||||
shared_logits[warpid][token_depth][lane16id][rowid] =
|
||||
from_floatx4_rtz<scalar_t>(d_out[token_depth]);
|
||||
if constexpr (MFMA_TYPE != MFMAType::Fp8) {
|
||||
if constexpr (LOGITS_RTZ_CONVERSION) {
|
||||
// use rtz conversion for better performance, with negligible impact on
|
||||
// accuracy
|
||||
shared_logits[warpid][token_depth][lane16id][rowid] =
|
||||
from_floatx4_rtz<scalar_t>(d_out[token_depth]);
|
||||
} else {
|
||||
shared_logits[warpid][token_depth][lane16id][rowid] =
|
||||
from_floatx4<scalar_t>(d_out[token_depth]);
|
||||
}
|
||||
} else {
|
||||
shared_logits[warpid][token_depth][lane16id][rowid] =
|
||||
from_floatx4<scalar_t>(d_out[token_depth]);
|
||||
#if defined(__HIP__FP8MFMA__)
|
||||
// cast _B16x4* to _T8x8* so the packed fp8 pairs can be stored via b16x4
|
||||
_T8x8& logits_8x8 = *reinterpret_cast<_T8x8*>(
|
||||
&shared_logits[warpid][token_depth][lane16id][rowid_8x8]);
|
||||
logits_8x8.b16x4[offset * 2] = __builtin_amdgcn_cvt_pk_fp8_f32(
|
||||
d_out[token_depth][0], d_out[token_depth][1], 0, false);
|
||||
logits_8x8.b16x4[offset * 2 + 1] = __builtin_amdgcn_cvt_pk_fp8_f32(
|
||||
d_out[token_depth][2], d_out[token_depth][3], 0, false);
|
||||
#else
|
||||
UNREACHABLE_CODE
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@ -692,19 +818,42 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
_B8x16 Vtmp8x16 = *reinterpret_cast<_B8x16*>(&Vtmp);
|
||||
for (int j = 0; j < ELEMS16_ELEMS8_RATIO; j++) {
|
||||
_B8x8 Vtmp8x8 = Vtmp8x16.xy[j];
|
||||
_B16x8 Vlocaltmp = convert_b8x8_custom<scalar_t>(Vtmp8x8);
|
||||
for (int i = 0; i < ELEMS8_ELEMS4_RATIO; i++) {
|
||||
const int offset =
|
||||
rowid * ELEMS16_ELEMS8_RATIO * ELEMS8_ELEMS4_RATIO +
|
||||
j * ELEMS8_ELEMS4_RATIO + i;
|
||||
const int offset1 = offset % ROWS_PER_WARP;
|
||||
const int offset2 = offset / ROWS_PER_WARP;
|
||||
// output format is 16 qheads across 16 lanes, 16 head elems
|
||||
// spread across 4 rows
|
||||
tmp_out = gcn_mfma16x16x16_instr<scalar_t, 0, 0, 0>(
|
||||
Vlocaltmp.xy[i],
|
||||
shared_logits[vtoken_depth][offset2][lane16id][offset1],
|
||||
tmp_out);
|
||||
if constexpr (MFMA_TYPE == MFMAType::F16) {
|
||||
_B16x8 Vlocaltmp = convert_b8x8_custom<scalar_t>(Vtmp8x8);
|
||||
for (int i = 0; i < ELEMS8_ELEMS4_RATIO; i++) {
|
||||
const int offset =
|
||||
rowid * ELEMS16_ELEMS8_RATIO * ELEMS8_ELEMS4_RATIO +
|
||||
j * ELEMS8_ELEMS4_RATIO + i;
|
||||
const int offset1 = offset % ROWS_PER_WARP;
|
||||
const int offset2 = offset / ROWS_PER_WARP;
|
||||
// output format is 16 qheads across 16 lanes, 16 head elems
|
||||
// spread across 4 rows
|
||||
tmp_out = gcn_mfma16x16x16_instr<scalar_t, 0, 0, 0>(
|
||||
Vlocaltmp.xy[i],
|
||||
shared_logits[vtoken_depth][offset2][lane16id][offset1],
|
||||
tmp_out);
|
||||
}
|
||||
} else {
|
||||
#if defined(__HIP__FP8MFMA__)
|
||||
for (int i = 0; i < ELEMS8_ELEMS4_RATIO / 2; i++) {
|
||||
const int offset =
|
||||
rowid * ELEMS16_ELEMS8_RATIO * ELEMS8_ELEMS4_RATIO +
|
||||
j * ELEMS8_ELEMS4_RATIO + i;
|
||||
const int offset1 = (offset % ROWS_PER_WARP) / 2;
|
||||
const int offset2 = offset / ROWS_PER_WARP;
|
||||
// output format is 16 qheads across 16 lanes, 16 head elems
|
||||
// spread across 4 rows
|
||||
tmp_out = gcn_mfma16x16x32_instr<__hip_fp8_e4m3, 0, 0, 0>(
|
||||
reinterpret_cast<_T8x8*>(&Vtmp8x8)->i64,
|
||||
reinterpret_cast<_T8x8*>(
|
||||
&shared_logits[vtoken_depth][offset2][lane16id]
|
||||
[offset1])
|
||||
->i64,
|
||||
tmp_out);
|
||||
}
|
||||
#else
|
||||
UNREACHABLE_CODE
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1570,7 +1719,8 @@ __device__ __forceinline__ _B16x8 from_floatx8(const floatx8& inp) {
|
||||
// clang-format off
|
||||
template <typename scalar_t, typename cache_t,
|
||||
vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE,
|
||||
int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO>
|
||||
int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO,
|
||||
MFMAType MFMA_TYPE>
|
||||
__global__
|
||||
__launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
|
||||
@ -2337,7 +2487,8 @@ __device__ __forceinline__ _B16x8 from_floatx8(const floatx8& inp) {
|
||||
// clang-format off
|
||||
template <typename scalar_t, typename cache_t,
|
||||
vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE,
|
||||
int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO>
|
||||
int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO,
|
||||
MFMAType MFMA_TYPE>
|
||||
__global__
|
||||
__launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
|
||||
@ -2969,7 +3120,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
template <typename scalar_t, typename cache_t,
|
||||
vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE,
|
||||
int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED,
|
||||
int GQA_RATIO>
|
||||
int GQA_RATIO, MFMAType MFMA_TYPE>
|
||||
__global__
|
||||
__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
|
||||
@ -3041,7 +3192,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
#define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO) \
|
||||
paged_attention_ll4mi_QKV_mfma16_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE, \
|
||||
HEAD_SIZE, NTHR, ALIBI_ENABLED, \
|
||||
GQA_RATIO> \
|
||||
GQA_RATIO, MFMA_TYPE> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
|
||||
block_tables_ptr, seq_lens_ptr, query_start_loc_ptr, \
|
||||
@ -3069,7 +3220,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
|
||||
template <typename T, typename KVT, vllm::Fp8KVCacheDataType KV_DTYPE,
|
||||
int BLOCK_SIZE, int HEAD_SIZE, typename OUTT, int PARTITION_SIZE_OLD,
|
||||
bool ALIBI_ENABLED>
|
||||
bool ALIBI_ENABLED, MFMAType MFMA_TYPE>
|
||||
void paged_attention_custom_launcher(
|
||||
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
@ -3225,7 +3376,7 @@ void paged_attention_custom_launcher(
|
||||
|
||||
template <typename T, typename KVT, vllm::Fp8KVCacheDataType KV_DTYPE,
|
||||
int BLOCK_SIZE, int HEAD_SIZE, typename OUTT, int PARTITION_SIZE_OLD,
|
||||
bool ALIBI_ENABLED>
|
||||
bool ALIBI_ENABLED, MFMAType MFMA_TYPE>
|
||||
void paged_attention_custom_launcher_navi(
|
||||
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
@ -3397,74 +3548,77 @@ void paged_attention_custom_launcher_navi(
|
||||
}
|
||||
|
||||
#define CALL_CUSTOM_LAUNCHER(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, \
|
||||
PSIZE, ALIBI_ENABLED) \
|
||||
PSIZE, ALIBI_ENABLED, MFMA_TYPE) \
|
||||
if (!is_navi) { \
|
||||
paged_attention_custom_launcher<T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \
|
||||
OUTT, PSIZE, ALIBI_ENABLED>( \
|
||||
OUTT, PSIZE, ALIBI_ENABLED, MFMA_TYPE>( \
|
||||
out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
|
||||
num_kv_heads, scale, block_tables, seq_lens, query_start_loc, \
|
||||
max_seq_len, alibi_slopes, k_scale, v_scale, fp8_out_scale); \
|
||||
} else { \
|
||||
paged_attention_custom_launcher_navi< \
|
||||
T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, ALIBI_ENABLED>( \
|
||||
paged_attention_custom_launcher_navi<T, KVT, KV_DTYPE, BLK_SIZE, \
|
||||
HEAD_SIZE, OUTT, PSIZE, \
|
||||
ALIBI_ENABLED, MFMA_TYPE>( \
|
||||
out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
|
||||
num_kv_heads, scale, block_tables, seq_lens, query_start_loc, \
|
||||
max_seq_len, alibi_slopes, k_scale, v_scale); \
|
||||
}
|
||||
|
||||
#define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \
|
||||
OUTT, PSIZE) \
|
||||
OUTT, PSIZE, MFMA_TYPE) \
|
||||
if (alibi_slopes) { \
|
||||
CALL_CUSTOM_LAUNCHER(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, \
|
||||
true); \
|
||||
true, MFMA_TYPE); \
|
||||
} else { \
|
||||
CALL_CUSTOM_LAUNCHER(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, \
|
||||
false); \
|
||||
false, MFMA_TYPE); \
|
||||
}
|
||||
|
||||
#if defined(__HIPCC__) && defined(__gfx90a__)
|
||||
#define CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE) \
|
||||
#define CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \
|
||||
MFMA_TYPE) \
|
||||
if (fp8_out_scale) { \
|
||||
TORCH_CHECK(false, "fp8 out scale unsupported for gfx90a"); \
|
||||
} else { \
|
||||
CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, T, \
|
||||
256); \
|
||||
256, MFMA_TYPE); \
|
||||
}
|
||||
#else
|
||||
#define CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE) \
|
||||
#define CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \
|
||||
MFMA_TYPE) \
|
||||
if (fp8_out_scale) { \
|
||||
CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \
|
||||
uint8_t, 256); \
|
||||
uint8_t, 256, MFMA_TYPE); \
|
||||
} else { \
|
||||
CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, T, \
|
||||
256); \
|
||||
256, MFMA_TYPE); \
|
||||
}
|
||||
#endif
|
||||
|
||||
#define CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, HEAD_SIZE) \
|
||||
switch (block_size) { \
|
||||
case 16: \
|
||||
CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, 16, HEAD_SIZE); \
|
||||
break; \
|
||||
case 32: \
|
||||
CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, 32, HEAD_SIZE); \
|
||||
break; \
|
||||
default: \
|
||||
TORCH_CHECK(false, "Unsupported block size: ", block_size); \
|
||||
break; \
|
||||
#define CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, HEAD_SIZE, MFMA_TYPE) \
|
||||
switch (block_size) { \
|
||||
case 16: \
|
||||
CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, 16, HEAD_SIZE, MFMA_TYPE); \
|
||||
break; \
|
||||
case 32: \
|
||||
CALL_CUSTOM_LAUNCHER_OUT(T, KVT, KV_DTYPE, 32, HEAD_SIZE, MFMA_TYPE); \
|
||||
break; \
|
||||
default: \
|
||||
TORCH_CHECK(false, "Unsupported block size: ", block_size); \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define CALL_CUSTOM_LAUNCHER_BLK_HEAD(T, KVT, KV_DTYPE) \
|
||||
switch (head_size) { \
|
||||
case 64: \
|
||||
CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, 64); \
|
||||
break; \
|
||||
case 128: \
|
||||
CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, 128); \
|
||||
break; \
|
||||
default: \
|
||||
TORCH_CHECK(false, "Unsupported head size: ", head_size); \
|
||||
break; \
|
||||
#define CALL_CUSTOM_LAUNCHER_BLK_HEAD(T, KVT, KV_DTYPE, MFMA_TYPE) \
|
||||
switch (head_size) { \
|
||||
case 64: \
|
||||
CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, 64, MFMA_TYPE); \
|
||||
break; \
|
||||
case 128: \
|
||||
CALL_CUSTOM_LAUNCHER_BLK(T, KVT, KV_DTYPE, 128, MFMA_TYPE); \
|
||||
break; \
|
||||
default: \
|
||||
TORCH_CHECK(false, "Unsupported head size: ", head_size); \
|
||||
break; \
|
||||
}
|
||||
|
||||
bool is_navi_gpu() {
|
||||
@ -3503,28 +3657,43 @@ void paged_attention(
|
||||
const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
|
||||
torch::Tensor& v_scale,
|
||||
const std::optional<torch::Tensor>& fp8_out_scale) {
|
||||
const std::optional<torch::Tensor>& fp8_out_scale,
|
||||
const std::string& mfma_type) {
|
||||
// clang-format on
|
||||
bool is_navi = is_navi_gpu();
|
||||
|
||||
const int head_size = query.size(2);
|
||||
if (kv_cache_dtype == "auto") {
|
||||
if (query.dtype() == at::ScalarType::Half) {
|
||||
CALL_CUSTOM_LAUNCHER_BLK_HEAD(_Float16, _Float16,
|
||||
vllm::Fp8KVCacheDataType::kAuto);
|
||||
CALL_CUSTOM_LAUNCHER_BLK_HEAD(
|
||||
_Float16, _Float16, vllm::Fp8KVCacheDataType::kAuto, MFMAType::F16);
|
||||
} else if (query.dtype() == at::ScalarType::BFloat16) {
|
||||
CALL_CUSTOM_LAUNCHER_BLK_HEAD(__hip_bfloat16, __hip_bfloat16,
|
||||
vllm::Fp8KVCacheDataType::kAuto);
|
||||
vllm::Fp8KVCacheDataType::kAuto,
|
||||
MFMAType::F16);
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
|
||||
}
|
||||
} else if (kv_cache_dtype == "fp8" || kv_cache_dtype == "fp8_e4m3") {
|
||||
if (query.dtype() == at::ScalarType::Half) {
|
||||
CALL_CUSTOM_LAUNCHER_BLK_HEAD(_Float16, uint8_t,
|
||||
vllm::Fp8KVCacheDataType::kFp8E4M3);
|
||||
if (mfma_type == "fp8") {
|
||||
CALL_CUSTOM_LAUNCHER_BLK_HEAD(_Float16, uint8_t,
|
||||
vllm::Fp8KVCacheDataType::kFp8E4M3,
|
||||
MFMAType::Fp8);
|
||||
} else {
|
||||
CALL_CUSTOM_LAUNCHER_BLK_HEAD(_Float16, uint8_t,
|
||||
vllm::Fp8KVCacheDataType::kFp8E4M3,
|
||||
MFMAType::F16);
|
||||
}
|
||||
} else if (query.dtype() == at::ScalarType::BFloat16) {
|
||||
CALL_CUSTOM_LAUNCHER_BLK_HEAD(__hip_bfloat16, uint8_t,
|
||||
vllm::Fp8KVCacheDataType::kFp8E4M3);
|
||||
if (mfma_type == "fp8") {
|
||||
CALL_CUSTOM_LAUNCHER_BLK_HEAD(__hip_bfloat16, uint8_t,
|
||||
vllm::Fp8KVCacheDataType::kFp8E4M3,
|
||||
MFMAType::Fp8);
|
||||
} else {
|
||||
CALL_CUSTOM_LAUNCHER_BLK_HEAD(__hip_bfloat16, uint8_t,
|
||||
vllm::Fp8KVCacheDataType::kFp8E4M3,
|
||||
MFMAType::F16);
|
||||
}
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
|
||||
}
|
||||
|
||||
@ -19,4 +19,5 @@ void paged_attention(
|
||||
const std::optional<torch::Tensor>& query_start_loc, int64_t block_size,
|
||||
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
|
||||
torch::Tensor& v_scale, const std::optional<torch::Tensor>& fp8_out_scale);
|
||||
torch::Tensor& v_scale, const std::optional<torch::Tensor>& fp8_out_scale,
|
||||
const std::string& mfma_type);
|
||||
|
||||
@ -48,7 +48,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
|
||||
" Tensor? alibi_slopes,"
|
||||
" str kv_cache_dtype,"
|
||||
" Tensor k_scale, Tensor v_scale,"
|
||||
" Tensor? fp8_out_scale) -> ()");
|
||||
" Tensor? fp8_out_scale,"
|
||||
" str mfma_type) -> ()");
|
||||
rocm_ops.impl("paged_attention", torch::kCUDA, &paged_attention);
|
||||
}
|
||||
|
||||
|
||||
@ -613,6 +613,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
"int pad_slot_id) -> ()");
|
||||
ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
|
||||
|
||||
// Hadamard transforms
|
||||
ops.def("hadacore_transform(Tensor! x, bool inplace) -> Tensor");
|
||||
|
||||
#ifndef USE_ROCM
|
||||
// Compute per-token-group FP8 quantized tensor and scaling factor.
|
||||
ops.def(
|
||||
|
||||
@ -196,6 +196,7 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||
|
||||
# Flag to control whether to use pre-built vLLM wheels
|
||||
ARG VLLM_USE_PRECOMPILED=""
|
||||
ARG VLLM_MAIN_CUDA_VERSION=""
|
||||
|
||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
@ -213,6 +214,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
&& export SCCACHE_IDLE_TIMEOUT=0 \
|
||||
&& export CMAKE_BUILD_TYPE=Release \
|
||||
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
|
||||
&& export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
|
||||
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
|
||||
&& sccache --show-stats \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
|
||||
@ -375,7 +377,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
||||
# Install FlashInfer from source
|
||||
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
|
||||
# Keep this in sync with "flashinfer" extra in setup.py
|
||||
ARG FLASHINFER_GIT_REF="v0.3.0"
|
||||
ARG FLASHINFER_GIT_REF="v0.3.1"
|
||||
# Flag to control whether to compile FlashInfer AOT kernels
|
||||
# Set to "true" to enable AOT compilation:
|
||||
# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
|
||||
|
||||
@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
|
||||
|
||||
|
||||
# build flashinfer for torch nightly from source (takes around 10 mins)
|
||||
# release version: v0.2.2.post1
|
||||
# release version: v0.3.1
|
||||
# todo(elainewy): cache flashinfer build result for faster build
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
echo "git clone flashinfer..." \
|
||||
&& git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
|
||||
&& cd flashinfer \
|
||||
&& git checkout v0.2.2.post1 \
|
||||
&& git checkout v0.3.1 \
|
||||
&& git submodule update --init --recursive \
|
||||
&& echo "finish git clone flashinfer..." \
|
||||
&& rm -rf build \
|
||||
|
||||
@ -840,7 +840,6 @@ Some HF processors directly insert feature tokens without replacing anything in
|
||||
Examples:
|
||||
|
||||
- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
|
||||
- Florence2 (insert at start of prompt): <gh-file:vllm/model_executor/models/florence2.py>
|
||||
- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
|
||||
|
||||
### Handling prompt updates unrelated to multi-modal data
|
||||
|
||||
@ -6,35 +6,33 @@ It can be quickly integrated with vLLM as a backend API server, enabling powerfu
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Setup vLLM environment
|
||||
Set up the vLLM environment by installing all required packages:
|
||||
|
||||
```bash
|
||||
pip install vllm streamlit openai
|
||||
```
|
||||
|
||||
## Deploy
|
||||
|
||||
- Start the vLLM server with the supported chat completion model, e.g.
|
||||
1. Start the vLLM server with a supported chat completion model, e.g.
|
||||
|
||||
```bash
|
||||
vllm serve qwen/Qwen1.5-0.5B-Chat
|
||||
```
|
||||
```bash
|
||||
vllm serve Qwen/Qwen1.5-0.5B-Chat
|
||||
```
|
||||
|
||||
- Install streamlit and openai:
|
||||
1. Use the script: <gh-file:examples/online_serving/streamlit_openai_chatbot_webserver.py>
|
||||
|
||||
```bash
|
||||
pip install streamlit openai
|
||||
```
|
||||
1. Start the streamlit web UI and start to chat:
|
||||
|
||||
- Use the script: <gh-file:examples/online_serving/streamlit_openai_chatbot_webserver.py>
|
||||
|
||||
- Start the streamlit web UI and start to chat:
|
||||
|
||||
```bash
|
||||
streamlit run streamlit_openai_chatbot_webserver.py
|
||||
|
||||
# or specify the VLLM_API_BASE or VLLM_API_KEY
|
||||
VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" \
|
||||
```bash
|
||||
streamlit run streamlit_openai_chatbot_webserver.py
|
||||
|
||||
# start with debug mode to view more details
|
||||
streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug
|
||||
```
|
||||
# or specify the VLLM_API_BASE or VLLM_API_KEY
|
||||
VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" \
|
||||
streamlit run streamlit_openai_chatbot_webserver.py
|
||||
|
||||

|
||||
# start with debug mode to view more details
|
||||
streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug
|
||||
```
|
||||
|
||||

|
||||
|
||||
@ -1,31 +1,31 @@
|
||||
# Integration with Hugging Face
|
||||
|
||||
This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`.
|
||||
This document describes how vLLM integrates with Hugging Face libraries. We will explain step by step what happens under the hood when we run `vllm serve`.
|
||||
|
||||
Let's say we want to serve the popular QWen model by running `vllm serve Qwen/Qwen2-7B`.
|
||||
Let's say we want to serve the popular Qwen model by running `vllm serve Qwen/Qwen2-7B`.
|
||||
|
||||
1. The `model` argument is `Qwen/Qwen2-7B`. vLLM determines whether this model exists by checking for the corresponding config file `config.json`. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L162-L182) for the implementation. Within this process:
|
||||
- If the `model` argument corresponds to an existing local path, vLLM will load the config file directly from this path.
|
||||
- If the `model` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the HuggingFace cache works.
|
||||
- If the `model` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file.
|
||||
- If the `model` argument is a Hugging Face model ID consisting of a username and model name, vLLM will first try to use the config file from the Hugging Face local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the Hugging Face cache works.
|
||||
- If the `model` argument is a Hugging Face model ID but it is not found in the cache, vLLM will download the config file from the Hugging Face model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file.
|
||||
|
||||
2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L185-L186) for the implementation.
|
||||
|
||||
3. Next, vLLM [inspects](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L189) the `model_type` field in the config dictionary to [generate](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L190-L216) the config object to use. There are some `model_type` values that vLLM directly supports; see [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L48) for the list. If the `model_type` is not in the list, vLLM will use [AutoConfig.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoConfig.from_pretrained) to load the config class, with `model`, `--revision`, and `--trust_remote_code` as the arguments. Please note that:
|
||||
- HuggingFace also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, HuggingFace will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example.
|
||||
- The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled.
|
||||
- Hugging Face also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, Hugging Face will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example.
|
||||
- The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, Hugging Face will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled.
|
||||
|
||||
4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see [here](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/config.py#L244) for the implementation.
|
||||
|
||||
5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the `architectures` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in [its registry](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/registry.py#L80). If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For `Qwen/Qwen2-7B`, the `architectures` field is `["Qwen2ForCausalLM"]`, which corresponds to the `Qwen2ForCausalLM` class in [vLLM's code](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/qwen2.py#L364). This class will initialize itself depending on various configs.
|
||||
|
||||
Beyond that, there are two more things vLLM depends on HuggingFace for.
|
||||
Beyond that, there are two more things vLLM depends on Hugging Face for.
|
||||
|
||||
1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24).
|
||||
1. **Tokenizer**: vLLM uses the tokenizer from Hugging Face to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check Hugging Face's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24).
|
||||
|
||||
2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights.
|
||||
2. **Model weights**: vLLM downloads the model weights from the Hugging Face model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights.
|
||||
- It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385). Please note that:
|
||||
|
||||
This completes the integration between vLLM and HuggingFace.
|
||||
This completes the integration between vLLM and Hugging Face.
|
||||
|
||||
In summary, vLLM reads the config file `config.json`, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository.
|
||||
In summary, vLLM reads the config file `config.json`, the tokenizer, and the model weights from the Hugging Face model hub or a local directory. It either uses the config class from vLLM or Hugging Face Transformers, or loads the config class from the model's repository.
|
||||
|
||||
@ -165,7 +165,19 @@ There are scenarios where the PyTorch dependency cannot be easily installed with
|
||||
- Building vLLM with PyTorch nightly or a custom PyTorch build.
|
||||
- Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run `uv pip install --index-url https://download.pytorch.org/whl/nightly/cu128 torch torchvision torchaudio` to [install PyTorch nightly](https://pytorch.org/get-started/locally/) and then build vLLM on top of it.
|
||||
|
||||
To build vLLM using an existing PyTorch installation, it is recommended to use `uv`, because it has [a unique mechanism](https://docs.astral.sh/uv/concepts/projects/config/#disabling-build-isolation) for disabling build isolation for specific packages and vLLM leverages this mechanism to specify `torch` as the package to disable build isolation.
|
||||
To build vLLM using an existing PyTorch installation:
|
||||
|
||||
```bash
|
||||
# install PyTorch first, either from PyPI or from source
|
||||
git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
python use_existing_torch.py
|
||||
uv pip install -r requirements/build.txt
|
||||
uv pip install --no-build-isolation -e .
|
||||
```
|
||||
|
||||
Alternatively, if you use `uv` exclusively to create and manage virtual environments, you can rely on [its mechanism](https://docs.astral.sh/uv/concepts/projects/config/#disabling-build-isolation)
|
||||
for disabling build isolation for specific packages. vLLM can use this mechanism to disable build isolation for the `torch` package only:
|
||||
|
||||
```bash
|
||||
# install PyTorch first, either from PyPI or from source
|
||||
|
||||
@ -228,7 +228,7 @@ outputs = llm.embed(["Follow the white rabbit."],
|
||||
print(outputs[0].outputs)
|
||||
```
|
||||
|
||||
A code example can be found here: <gh-file:examples/offline_inference/embed_matryoshka_fy.py>
|
||||
A code example can be found here: <gh-file:examples/offline_inference/pooling/embed_matryoshka_fy.py>
|
||||
|
||||
### Online Inference
|
||||
|
||||
@ -258,4 +258,4 @@ Expected output:
|
||||
{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
|
||||
```
|
||||
|
||||
An OpenAI client example can be found here: <gh-file:examples/online_serving/openai_embedding_matryoshka_fy.py>
|
||||
An OpenAI client example can be found here: <gh-file:examples/online_serving/pooling/openai_embedding_matryoshka_fy.py>
|
||||
|
||||
@ -328,10 +328,9 @@ th {
|
||||
| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `BailingMoeV2ForCausalLM` | Ling | `inclusionAI/Ling-mini-2.0`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | ✅︎ |
|
||||
| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | |
|
||||
| `MBartForConditionalGeneration` | mBART | `facebook/mbart-large-en-ro`, `facebook/mbart-large-50`, etc. | | | |
|
||||
| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
@ -425,9 +424,6 @@ Some models are supported only via the [Transformers backend](#transformers). Th
|
||||
!!! note
|
||||
Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.
|
||||
|
||||
!!! note
|
||||
Some mBART models' config files do not have an `architecture` defined. Therefore, you need to use `--hf-overrides '{"architectures": ["MBartForConditionalGeneration"]}'` to explicitly specify the use of the `MBartForConditionalGeneration` architecture.
|
||||
|
||||
### Pooling Models
|
||||
|
||||
See [this page](./pooling_models.md) for more information on how to use pooling models.
|
||||
@ -530,7 +526,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
|
||||
```
|
||||
|
||||
!!! note
|
||||
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: <gh-file:examples/offline_inference/qwen3_reranker.py>.
|
||||
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: <gh-file:examples/offline_inference/pooling/qwen3_reranker.py>.
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
|
||||
@ -624,9 +620,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ |
|
||||
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ |
|
||||
| `DonutForConditionalGeneration`<sup>^</sup> | Donut | T + I | `ByteDance/Dolphin`, `naver-clova-ix/donut-base-finetuned-docvqa`, etc. | | | |
|
||||
| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | ✅︎ |
|
||||
| `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | |
|
||||
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
|
||||
| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
|
||||
@ -653,7 +647,6 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ |
|
||||
| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | |
|
||||
| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ |
|
||||
|
||||
@ -239,7 +239,7 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
|
||||
If the model has a [chat template][chat-template], you can replace `inputs` with a list of `messages` (same schema as [Chat API][chat-api])
|
||||
which will be treated as a single prompt to the model.
|
||||
|
||||
Code example: <gh-file:examples/online_serving/openai_embedding_client.py>
|
||||
Code example: <gh-file:examples/online_serving/pooling/openai_embedding_client.py>
|
||||
|
||||
#### Multi-modal inputs
|
||||
|
||||
@ -313,7 +313,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
|
||||
`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
|
||||
example below for details.
|
||||
|
||||
Full example: <gh-file:examples/online_serving/openai_chat_embedding_client_for_multimodal.py>
|
||||
Full example: <gh-file:examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py>
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
@ -421,7 +421,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_
|
||||
|
||||
The input format is the same as [Embeddings API][embeddings-api], but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
|
||||
|
||||
Code example: <gh-file:examples/online_serving/openai_pooling_client.py>
|
||||
Code example: <gh-file:examples/online_serving/pooling/openai_pooling_client.py>
|
||||
|
||||
[](){ #classification-api }
|
||||
|
||||
@ -431,7 +431,7 @@ Our Classification API directly supports Hugging Face sequence-classification mo
|
||||
|
||||
We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities.
|
||||
|
||||
Code example: <gh-file:examples/online_serving/openai_classification_client.py>
|
||||
Code example: <gh-file:examples/online_serving/pooling/openai_classification_client.py>
|
||||
|
||||
#### Example Requests
|
||||
|
||||
@ -760,7 +760,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin
|
||||
[Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
|
||||
popular open-source tools.
|
||||
|
||||
Code example: <gh-file:examples/online_serving/jinaai_rerank_client.py>
|
||||
Code example: <gh-file:examples/online_serving/pooling/jinaai_rerank_client.py>
|
||||
|
||||
#### Example Request
|
||||
|
||||
|
||||
@ -120,7 +120,7 @@ Please note that prefix caching is not yet supported for any of the above models
|
||||
|
||||
Whisper is supported. Other models requiring cross-attention between separate
|
||||
encoder and decoder (e.g., `BartForConditionalGeneration`,
|
||||
`MllamaForConditionalGeneration`) are not yet supported.
|
||||
`MllamaForConditionalGeneration`) are not supported.
|
||||
|
||||
### Features
|
||||
|
||||
|
||||
@ -1,311 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import copy
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import regex as re
|
||||
from PIL import Image
|
||||
from transformers import DonutProcessor
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
|
||||
|
||||
# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
|
||||
@dataclass
class ImageDimensions:
    """Sizes of an image before and after padding."""

    # Width and height of the source image.
    original_w: int
    original_h: int
    # Width and height of the padded image.
    padded_w: int
    padded_h: int
|
||||
|
||||
|
||||
# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
|
||||
def map_to_original_coordinates(
    x1, y1, x2, y2, dims: ImageDimensions
) -> tuple[int, int, int, int]:
    """Convert a box from padded-image coordinates to original-image ones.

    The padding offsets are taken as half of the size difference between the
    padded and the original image. The shifted box is clipped to the original
    image and widened to at least one pixel per axis where possible.

    Args:
        x1, y1, x2, y2: Box corners in padded-image coordinates.
        dims: Original and padded image sizes.

    Returns:
        ``(x1, y1, x2, y2)`` as ints in original-image coordinates. If any
        step raises, the error is printed and a fallback box anchored at the
        origin (at most 100 pixels per side) is returned instead.
    """
    try:
        pad_top = (dims.padded_h - dims.original_h) // 2
        pad_left = (dims.padded_w - dims.original_w) // 2

        # Shift by the padding offset and clip to the original image.
        left_edge = max(0, x1 - pad_left)
        top_edge = max(0, y1 - pad_top)
        right_edge = min(dims.original_w, x2 - pad_left)
        bottom_edge = min(dims.original_h, y2 - pad_top)

        # Avoid empty or inverted boxes.
        if right_edge <= left_edge:
            right_edge = min(left_edge + 1, dims.original_w)
        if bottom_edge <= top_edge:
            bottom_edge = min(top_edge + 1, dims.original_h)

        return int(left_edge), int(top_edge), int(right_edge), int(bottom_edge)
    except Exception as e:
        print(f"map_to_original_coordinates error: {str(e)}")
        return 0, 0, min(100, dims.original_w), min(100, dims.original_h)
|
||||
|
||||
|
||||
# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
|
||||
def adjust_box_edges(image, boxes: list[list[float]], max_pixels=15, threshold=0.2):
    """Shift box edges outward until they stop cutting through content.

    Each edge of a box is scored by the mean absolute difference between
    neighbouring pixels along that edge in an inverted, Otsu-thresholded copy
    of the image. When an edge scores above ``threshold`` it is moved outward
    one pixel at a time, for at most ``max_pixels`` steps, and the box with
    the lowest score seen so far is remembered.

    Args:
        image: A BGR image array, or a path that is read with ``cv2.imread``.
        boxes: Boxes as ``[x1, y1, x2, y2]``. The values are used as array
            indices, so callers are expected to pass integers.
        max_pixels: Maximum number of one-pixel steps tried per edge.
        threshold: Score at or below which an edge is accepted as is.

    Returns:
        A list with one adjusted box per input box, in input order.
    """
    if isinstance(image, str):
        image = cv2.imread(image)
    img_h, img_w = image.shape[:2]

    # The binarized image depends only on `image`, so it is computed once, on
    # first use, instead of on every edge evaluation. Computing it lazily keeps
    # the behavior for an empty `boxes` unchanged (no OpenCV call at all).
    binary = None

    def check_edge(current_box, i, is_vertical):
        """Return the transition score of edge ``i`` of ``current_box``."""
        edge = current_box[i]
        if is_vertical:
            line = binary[current_box[1] : current_box[3] + 1, edge]
        else:
            line = binary[edge, current_box[0] : current_box[2] + 1]
        # NOTE(review): if `binary` is uint8, np.diff wraps around, so a
        # 255 -> 0 step contributes 1 while 0 -> 255 contributes 255. Confirm
        # that this asymmetry is intended before changing it.
        transitions = np.abs(np.diff(line))
        return np.sum(transitions) / len(transitions)

    # (coordinate index, step direction, edge is a vertical line)
    edges = [(0, -1, True), (2, 1, True), (1, -1, False), (3, 1, False)]

    new_boxes = []
    for box in boxes:
        if binary is None:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            _, binary = cv2.threshold(
                gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
            )

        best_box = copy.deepcopy(box)

        # Work on a copy clamped to valid pixel indices.
        current_box = copy.deepcopy(box)
        current_box[0] = min(max(current_box[0], 0), img_w - 1)
        current_box[1] = min(max(current_box[1], 0), img_h - 1)
        current_box[2] = min(max(current_box[2], 0), img_w - 1)
        current_box[3] = min(max(current_box[3], 0), img_h - 1)

        for i, direction, is_vertical in edges:
            best_score = check_edge(current_box, i, is_vertical)
            if best_score <= threshold:
                continue
            # x coordinates are bounded by the width, y by the height.
            upper = img_w - 1 if i in (0, 2) else img_h - 1
            for _ in range(max_pixels):
                current_box[i] = min(max(current_box[i] + direction, 0), upper)
                score = check_edge(current_box, i, is_vertical)
                if score < best_score:
                    best_score = score
                    best_box = copy.deepcopy(current_box)
                if score <= threshold:
                    break
        new_boxes.append(best_box)
    return new_boxes
|
||||
|
||||
|
||||
# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
|
||||
def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_box=None):
    """Turn relative box coordinates into pixel boxes on both image versions.

    The relative ``coords`` are scaled by the padded image size, clamped,
    refined with ``adjust_box_edges``, clamped again, and pushed below
    ``previous_box`` when the two overlap. The result is also mapped back to
    the original image with ``map_to_original_coordinates``.

    Args:
        coords: ``[x1, y1, x2, y2]`` multiplied by the padded width/height.
        padded_image: Padded image handed to ``adjust_box_edges``.
        dims: Original and padded image sizes.
        previous_box: Box returned for the preceding element, or ``None``.

    Returns:
        A 9-tuple ``(x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2,
        new_previous_box)``. If anything raises, the error is printed and a
        fallback box anchored at the origin is returned.
    """

    def _fit_to_padded(left, top, right, bottom):
        # Clamp to the padded image and keep the box at least one pixel wide
        # and tall.
        left = max(0, min(left, dims.padded_w - 1))
        top = max(0, min(top, dims.padded_h - 1))
        right = max(0, min(right, dims.padded_w))
        bottom = max(0, min(bottom, dims.padded_h))
        if right <= left:
            right = min(left + 1, dims.padded_w)
        if bottom <= top:
            bottom = min(top + 1, dims.padded_h)
        return left, top, right, bottom

    try:
        x1, y1 = int(coords[0] * dims.padded_w), int(coords[1] * dims.padded_h)
        x2, y2 = int(coords[2] * dims.padded_w), int(coords[3] * dims.padded_h)
        x1, y1, x2, y2 = _fit_to_padded(x1, y1, x2, y2)

        # Refine the edges, then re-validate the refined box.
        refined = adjust_box_edges(padded_image, [[x1, y1, x2, y2]])
        x1, y1, x2, y2 = refined[0]
        x1, y1, x2, y2 = _fit_to_padded(x1, y1, x2, y2)

        if previous_box is not None:
            prev_x1, prev_y1, prev_x2, prev_y2 = previous_box
            if (x1 < prev_x2 and x2 > prev_x1) and (y1 < prev_y2 and y2 > prev_y1):
                # Start this box where the previous one ends.
                y1 = min(prev_y2, dims.padded_h - 1)
                if y2 <= y1:
                    y2 = min(y1 + 1, dims.padded_h)

        new_previous_box = [x1, y1, x2, y2]
        orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(
            x1, y1, x2, y2, dims
        )
        return x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box
    except Exception as e:
        print(f"process_coordinates error: {str(e)}")
        orig_x1, orig_y1 = 0, 0
        orig_x2 = min(100, dims.original_w)
        orig_y2 = min(100, dims.original_h)
        return 0, 0, 100, 100, orig_x1, orig_y1, orig_x2, orig_y2, [0, 0, 100, 100]
|
||||
|
||||
|
||||
# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
|
||||
def prepare_image(image) -> tuple[np.ndarray, ImageDimensions]:
    """Convert an image to BGR and pad it to a square with black borders.

    Args:
        image: Input image; it is converted with ``np.array`` and treated as
            RGB. The fallback path reads ``image.height`` and ``image.width``
            (presumably a PIL image -- confirm against the caller).

    Returns:
        ``(padded_image, dimensions)``. If the conversion or padding raises,
        the error is printed and an all-zero image of the input size is
        returned, with padded and original sizes set equal.
    """
    try:
        bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        original_h, original_w = bgr.shape[:2]

        # Split the missing rows/columns between the two sides; the odd pixel,
        # if any, goes to the bottom/right.
        side = max(original_h, original_w)
        extra_h = side - original_h
        extra_w = side - original_w
        top = extra_h // 2
        left = extra_w // 2

        padded_image = cv2.copyMakeBorder(
            bgr,
            top,
            extra_h - top,
            left,
            extra_w - left,
            cv2.BORDER_CONSTANT,
            value=(0, 0, 0),
        )
        padded_h, padded_w = padded_image.shape[:2]
        return padded_image, ImageDimensions(
            original_w=original_w,
            original_h=original_h,
            padded_w=padded_w,
            padded_h=padded_h,
        )
    except Exception as e:
        print(f"prepare_image error: {str(e)}")
        h, w = image.height, image.width
        dimensions = ImageDimensions(original_w=w, original_h=h, padded_w=w, padded_h=h)
        return np.zeros((h, w, 3), dtype=np.uint8), dimensions
|
||||
|
||||
|
||||
# Copied from https://github.com/bytedance/Dolphin/utils/utils.py
|
||||
def parse_layout_string(bbox_str):
    """Extract ``([x1, y1, x2, y2], label)`` pairs from a layout string.

    Each match is a bracketed list of four numbers followed by a word, e.g.
    ``[0.1, 0.2, 0.3, 0.4] tab``. Matches are returned in order of
    appearance; text that does not match is ignored.
    """
    pattern = r"\[(\d*\.?\d+),\s*(\d*\.?\d+),\s*(\d*\.?\d+),\s*(\d*\.?\d+)\]\s*(\w+)"
    return [
        ([float(number) for number in found.groups()[:4]], found.group(5).strip())
        for found in re.finditer(pattern, bbox_str)
    ]
|
||||
|
||||
|
||||
# Script flow:
#   1. Parse the command line and load the input image.
#   2. Load the Dolphin model and ask it for the page layout.
#   3. Crop every non-figure element and run a second, batched pass that
#      reads the text or table inside each crop.
#   4. Print the recognized elements in reading order.

model_id = "ByteDance/Dolphin"

# The input image size for Dolphin is 896 x 896,
# and the patch_size is 4 x 4.
# Therefore, the initial number of patches is:
# Height: 896 / 4 = 224 patches
# Width: 896 / 4 = 224 patches

# The Dolphin model uses a staged downsampling approach,
# defined by the "depths": [2, 2, 14, 2] configuration.
# Before entering stages 2, 3, and 4, a "Patch Merging" operation is performed,
# which halves the feature map's dimensions (dividing both height and width by 2).
# Before Stage 2: The size changes from 224 x 224 to (224/2) x (224/2) = 112 x 112.
# Before Stage 3: The size changes from 112 x 112 to (112/2) x (112/2) = 56 x 56.
# Before Stage 4: The size changes from 56 x 56 to (56/2) x (56/2) = 28 x 28.

# Because vLLM needs to fill the image features with an encoder_prompt,
# and the encoder_prompt will have `<pad>` tokens added when tokenized,
# we need to construct an encoder_prompt with a length of 28 x 28 - 1 = 783.
encoder_prompt = "0" * 783
sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=2048,
)

# Handle the command line and the input image before the model is built, so
# that `--help` or a wrong `--image_path` returns immediately instead of after
# the model has been loaded.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--image_path", type=str, default=None, help="Path to a local image file."
)
args = parser.parse_args()

if args.image_path:
    if not os.path.exists(args.image_path):
        raise FileNotFoundError(f"Error: File not found at {args.image_path}")
    image = Image.open(args.image_path).convert("RGB")
else:
    image = fetch_image(
        "https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/0.jpg"
    )

processor = DonutProcessor.from_pretrained(model_id)
llm = LLM(
    model=model_id,
    dtype="float16",
    max_num_seqs=8,
    hf_overrides={"architectures": ["DonutForConditionalGeneration"]},
)

# Stage 1: layout analysis on the full page.
prompt = "Parse the reading order of this document. "
decoder_prompt = f"<s>{prompt}<Answer/>"
decoder_prompt_tokens = TokensPrompt(
    prompt_token_ids=processor.tokenizer(decoder_prompt, add_special_tokens=False)[
        "input_ids"
    ]
)
enc_dec_prompt = ExplicitEncoderDecoderPrompt(
    encoder_prompt=TextPrompt(prompt=encoder_prompt, multi_modal_data={"image": image}),
    decoder_prompt=decoder_prompt_tokens,
)
layout_outputs = llm.generate(prompts=enc_dec_prompt, sampling_params=sampling_params)
layout_result_str = layout_outputs[0].outputs[0].text
print(f"Layout analysis output:\n{layout_result_str}")

# Stage 2 preparation: crop each text/table element out of the padded page.
padded_image, dims = prepare_image(image)
layout_results = parse_layout_string(layout_result_str)
text_table_elements = []
previous_box = None
reading_order = 0
for bbox_coords, label in layout_results:
    # Figures are not sent through the recognition pass.
    if label == "fig":
        continue
    try:
        x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, previous_box = (
            process_coordinates(bbox_coords, padded_image, dims, previous_box)
        )
        cropped = padded_image[y1:y2, x1:x2]
        # Skip crops that are empty or only a few pixels wide/tall.
        if cropped.size > 0 and cropped.shape[0] > 3 and cropped.shape[1] > 3:
            pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
            prompt_ocr = (
                "Parse the table in the image. "
                if label == "tab"
                else "Read text in the image. "
            )
            text_table_elements.append(
                {
                    "crop": pil_crop,
                    "prompt": prompt_ocr,
                    "reading_order": reading_order,
                }
            )
        reading_order += 1
    except Exception as e:
        # Best effort: one bad box must not abort the whole page.
        print(f"Error processing bbox (label: {label}): {str(e)}")
        continue

# Stage 2: recognize all crops in a single batched call.
if text_table_elements:
    batch_prompts = []
    for elem in text_table_elements:
        decoder_prompt_str = f"<s>{elem['prompt']}<Answer/>"
        decoder_prompt_tokens = TokensPrompt(
            prompt_token_ids=processor.tokenizer(
                decoder_prompt_str, add_special_tokens=False
            )["input_ids"]
        )
        enc_dec_prompt = ExplicitEncoderDecoderPrompt(
            encoder_prompt=TextPrompt(
                prompt=encoder_prompt, multi_modal_data={"image": elem["crop"]}
            ),
            decoder_prompt=decoder_prompt_tokens,
        )
        batch_prompts.append(enc_dec_prompt)
    batch_outputs = llm.generate(prompts=batch_prompts, sampling_params=sampling_params)
    # Outputs are matched to the elements by position.
    for i, output in enumerate(batch_outputs):
        text_table_elements[i]["text"] = output.outputs[0].text.strip()

print("------" * 8)
text_table_elements.sort(key=lambda x: x["reading_order"])
for elem in text_table_elements:
    print(elem.get("text", ""))
|
||||
@ -1,195 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Demonstrate prompting of text-to-text
|
||||
encoder/decoder models, specifically BART and mBART.
|
||||
|
||||
This script is refactored to allow model selection via command-line arguments.
|
||||
|
||||
NOTE: This example is not yet supported in V1.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from typing import NamedTuple, Optional
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.inputs import (
|
||||
ExplicitEncoderDecoderPrompt,
|
||||
TextPrompt,
|
||||
TokensPrompt,
|
||||
zip_enc_dec_prompts,
|
||||
)
|
||||
|
||||
|
||||
class ModelRequestData(NamedTuple):
    """
    Demo configuration for one model: which model to load and which
    encoder/decoder prompts to feed it.
    """

    # Model identifier passed to the engine.
    model_id: str
    # Raw prompts for the encoder and decoder sides.
    encoder_prompts: list
    decoder_prompts: list
    # Optional Hugging Face config overrides; None when not needed.
    hf_overrides: Optional[dict] = None
|
||||
|
||||
|
||||
def get_bart_config() -> ModelRequestData:
    """
    Build the demo configuration for facebook/bart-large-cnn, with four
    encoder prompts and two decoder prompts.
    """
    return ModelRequestData(
        model_id="facebook/bart-large-cnn",
        encoder_prompts=[
            "Hello, my name is",
            "The president of the United States is",
            "The capital of France is",
            "An encoder prompt",
        ],
        decoder_prompts=[
            "A decoder prompt",
            "Another decoder prompt",
        ],
    )
|
||||
|
||||
|
||||
def get_mbart_config() -> ModelRequestData:
    """
    Build the demo configuration for facebook/mbart-large-en-ro, with two
    English encoder prompts, empty decoder prompts and an explicit
    architecture override.
    """
    return ModelRequestData(
        model_id="facebook/mbart-large-en-ro",
        encoder_prompts=[
            "The quick brown fox jumps over the lazy dog.",
            "How are you today?",
        ],
        decoder_prompts=["", ""],
        hf_overrides={"architectures": ["MBartForConditionalGeneration"]},
    )
|
||||
|
||||
|
||||
# Short model name (as accepted by `--model`) -> function that builds the
# demo configuration for that model.
MODEL_GETTERS = {
    "bart": get_bart_config,
    "mbart": get_mbart_config,
}
|
||||
|
||||
|
||||
def create_all_prompt_types(
    encoder_prompts_raw: list,
    decoder_prompts_raw: list,
    tokenizer,
) -> list:
    """
    Build one example of each supported prompt form from the raw prompts.

    The result contains, in order: three single prompts (plain string,
    TextPrompt, TokensPrompt), three explicit encoder/decoder pairs mixing
    those forms, and the pairs produced by zipping the raw encoder and
    decoder prompts. Indices wrap around when fewer than three encoder
    prompts or fewer than two decoder prompts are given.
    """
    raw_text = encoder_prompts_raw[0]
    num_encoder = len(encoder_prompts_raw)
    wrapped_text = TextPrompt(prompt=encoder_prompts_raw[1 % num_encoder])
    encoder_tokens = TokensPrompt(
        prompt_token_ids=tokenizer.encode(encoder_prompts_raw[2 % num_encoder])
    )
    decoder_tokens = TokensPrompt(
        prompt_token_ids=tokenizer.encode(decoder_prompts_raw[0])
    )

    # Prompts given as a single item, without an explicit decoder part.
    singles = [raw_text, wrapped_text, encoder_tokens]

    # Explicit encoder/decoder pairs that mix the three prompt forms.
    pairs = [
        ExplicitEncoderDecoderPrompt(
            encoder_prompt=raw_text,
            decoder_prompt=decoder_tokens,
        ),
        ExplicitEncoderDecoderPrompt(
            encoder_prompt=wrapped_text,
            decoder_prompt=decoder_prompts_raw[1 % len(decoder_prompts_raw)],
        ),
        ExplicitEncoderDecoderPrompt(
            encoder_prompt=encoder_tokens,
            decoder_prompt=wrapped_text,
        ),
    ]

    zipped = zip_enc_dec_prompts(encoder_prompts_raw, decoder_prompts_raw)
    return singles + pairs + zipped
|
||||
|
||||
|
||||
def create_sampling_params() -> SamplingParams:
    """Return the sampling settings shared by all demo prompts."""
    settings = {
        "temperature": 0,
        "top_p": 1.0,
        "min_tokens": 0,
        "max_tokens": 30,
    }
    return SamplingParams(**settings)
|
||||
|
||||
|
||||
def print_outputs(outputs: list):
    """Print the encoder prompt, decoder prompt and generated text of each
    output, with an 80-dash rule before the first and after every entry."""
    rule = "-" * 80
    print(rule)
    for i, output in enumerate(outputs):
        # Read every field before printing anything for this entry.
        prompt = output.prompt
        encoder_prompt = output.encoder_prompt
        generated_text = output.outputs[0].text
        print(
            f"Output {i + 1}:",
            f"Encoder Prompt: {encoder_prompt!r}",
            f"Decoder Prompt: {prompt!r}",
            f"Generated Text: {generated_text!r}",
            rule,
            sep="\n",
        )
|
||||
|
||||
|
||||
def main(args):
    """Run the encoder/decoder demo for the model named by ``args.model``.

    Raises:
        ValueError: If ``args.model`` is not a key of ``MODEL_GETTERS``.
    """
    model_key = args.model
    if model_key not in MODEL_GETTERS:
        raise ValueError(
            f"Unknown model: {model_key}. "
            f"Available models: {list(MODEL_GETTERS.keys())}"
        )
    model_config = MODEL_GETTERS[model_key]()

    print(f"🚀 Running demo for model: {model_config.model_id}")
    llm = LLM(
        model=model_config.model_id,
        dtype="float",
        hf_overrides=model_config.hf_overrides,
    )

    # The engine's tokenizer is reused to build the token-id prompt examples.
    tokenizer = llm.llm_engine.get_tokenizer_group()
    prompts = create_all_prompt_types(
        encoder_prompts_raw=model_config.encoder_prompts,
        decoder_prompts_raw=model_config.decoder_prompts,
        tokenizer=tokenizer,
    )

    outputs = llm.generate(prompts, create_sampling_params())
    print_outputs(outputs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: choose which demo model to run.
    parser = argparse.ArgumentParser(
        description="A flexible demo for vLLM encoder-decoder models."
    )
    parser.add_argument(
        "--model",
        "-m",
        type=str,
        default="bart",
        choices=MODEL_GETTERS.keys(),
        help="The short name of the model to run.",
    )
    args = parser.parse_args()
    main(args)
|
||||
@ -13,8 +13,6 @@ from typing import NamedTuple
|
||||
|
||||
from vllm import LLM, EngineArgs, PromptType, SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
@ -23,113 +21,6 @@ class ModelRequestData(NamedTuple):
|
||||
prompts: Sequence[PromptType]
|
||||
|
||||
|
||||
def run_donut():
    """Build the engine arguments and one explicit encoder/decoder prompt for
    the Donut document-VQA checkpoint."""
    engine_args = EngineArgs(
        model="naver-clova-ix/donut-base-finetuned-docvqa",
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
        dtype="float16",
        hf_overrides={"architectures": ["DonutForConditionalGeneration"]},
    )

    # The input image size for donut-base-finetuned-docvqa is 2560 x 1920,
    # and the patch_size is 4 x 4.
    # Therefore, the initial number of patches is:
    # Height: 1920 / 4 = 480 patches
    # Width: 2560 / 4 = 640 patches
    # The Swin model uses a staged downsampling approach,
    # defined by the "depths": [2, 2, 14, 2] configuration.
    # Before entering stages 2, 3, and 4, a "Patch Merging" operation is performed,
    # which halves the feature map's dimensions (dividing both height and width by 2).
    # Before Stage 2: The size changes from 480 x 640 to (480/2) x (640/2) = 240 x 320.
    # Before Stage 3: The size changes from 240 x 320 to (240/2) x (320/2) = 120 x 160.
    # Before Stage 4: The size changes from 120 x 160 to (120/2) x (160/2) = 60 x 80.
    # Because vLLM needs to fill the image features with an encoder_prompt,
    # and the encoder_prompt will have `<pad>` tokens added when tokenized,
    # we need to construct an encoder_prompt with a length of 60 x 80 - 1 = 4799.
    placeholder_text = "$" * 4799
    document_image = fetch_image(
        "https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/0.jpg"
    )  # noqa: E501
    question_prompt = {
        "encoder_prompt": {
            "prompt": placeholder_text,
            "multi_modal_data": {"image": document_image},
        },
        "decoder_prompt": "<s_docvqa><s_question>What time is the coffee break?</s_question><s_answer>",  # noqa: E501
    }

    return ModelRequestData(
        engine_args=engine_args,
        prompts=[question_prompt],
    )
|
||||
|
||||
|
||||
def run_florence2():
|
||||
engine_args = EngineArgs(
|
||||
model="microsoft/Florence-2-large",
|
||||
tokenizer="Isotr0py/Florence-2-tokenizer",
|
||||
max_num_seqs=8,
|
||||
trust_remote_code=True,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
dtype="half",
|
||||
)
|
||||
|
||||
prompts = [
|
||||
{ # implicit prompt with task token
|
||||
"prompt": "<DETAILED_CAPTION>",
|
||||
"multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
|
||||
},
|
||||
{ # explicit encoder/decoder prompt
|
||||
"encoder_prompt": {
|
||||
"prompt": "Describe in detail what is shown in the image.",
|
||||
"multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
|
||||
},
|
||||
"decoder_prompt": "",
|
||||
},
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
def run_mllama():
|
||||
engine_args = EngineArgs(
|
||||
model="meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
dtype="half",
|
||||
)
|
||||
|
||||
prompts = [
|
||||
{ # Implicit prompt
|
||||
"prompt": "<|image|><|begin_of_text|>What is the content of this image?", # noqa: E501
|
||||
"multi_modal_data": {
|
||||
"image": ImageAsset("stop_sign").pil_image,
|
||||
},
|
||||
},
|
||||
{ # Explicit prompt
|
||||
"encoder_prompt": {
|
||||
"prompt": "<|image|>",
|
||||
"multi_modal_data": {
|
||||
"image": ImageAsset("stop_sign").pil_image,
|
||||
},
|
||||
},
|
||||
"decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501
|
||||
},
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
def run_whisper():
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
@ -166,9 +57,6 @@ def run_whisper():
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"donut": run_donut,
|
||||
"florence2": run_florence2,
|
||||
"mllama": run_mllama,
|
||||
"whisper": run_whisper,
|
||||
}
|
||||
|
||||
@ -182,7 +70,7 @@ def parse_args():
|
||||
"--model-type",
|
||||
"-m",
|
||||
type=str,
|
||||
default="mllama",
|
||||
default="whisper",
|
||||
choices=model_example_map.keys(),
|
||||
help='Huggingface "model_type".',
|
||||
)
|
||||
|
||||
33
examples/offline_inference/pooling/README.md
Normal file
33
examples/offline_inference/pooling/README.md
Normal file
@ -0,0 +1,33 @@
|
||||
# Pooling models
|
||||
|
||||
## Convert llm model to seq cls
|
||||
|
||||
```bash
|
||||
# for BAAI/bge-reranker-v2-gemma
|
||||
# Caution: "Yes" and "yes" are two different tokens
|
||||
python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls
|
||||
# for mxbai-rerank-v2
|
||||
python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls
|
||||
# for Qwen3-Reranker
|
||||
python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls
|
||||
```
|
||||
|
||||
## Embed jina_embeddings_v3 usage
|
||||
|
||||
Only text matching task is supported for now. See <gh-pr:16120>
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
|
||||
```
|
||||
|
||||
## Embed matryoshka dimensions usage
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/pooling/embed_matryoshka_fy.py
|
||||
```
|
||||
|
||||
## Qwen3 reranker usage
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/pooling/qwen3_reranker.py
|
||||
```
|
||||
@ -204,28 +204,6 @@ def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# Florence2
|
||||
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="microsoft/Florence-2-large",
|
||||
tokenizer="Isotr0py/Florence-2-tokenizer",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Fuyu
|
||||
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@ -1008,44 +986,6 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# LLama 3.2
|
||||
def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
||||
|
||||
# Note: The default setting of max_num_seqs (256) and
|
||||
# max_model_len (131072) for this model may cause OOM.
|
||||
# You may lower either to run this example on lower-end GPUs.
|
||||
|
||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [{"type": "image"}, {"type": "text", "text": question}],
|
||||
}
|
||||
]
|
||||
for question in questions
|
||||
]
|
||||
prompts = tokenizer.apply_chat_template(
|
||||
messages, add_generation_prompt=True, tokenize=False
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Molmo
|
||||
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@ -1665,7 +1605,6 @@ model_example_map = {
|
||||
"command_a_vision": run_command_a_vision,
|
||||
"deepseek_vl_v2": run_deepseek_vl2,
|
||||
"ernie45_vl": run_ernie45_vl,
|
||||
"florence2": run_florence2,
|
||||
"fuyu": run_fuyu,
|
||||
"gemma3": run_gemma3,
|
||||
"gemma3n": run_gemma3n,
|
||||
@ -1691,7 +1630,6 @@ model_example_map = {
|
||||
"minicpmv": run_minicpmv,
|
||||
"minimax_vl_01": run_minimax_vl_01,
|
||||
"mistral3": run_mistral3,
|
||||
"mllama": run_mllama,
|
||||
"molmo": run_molmo,
|
||||
"nemotron_vl": run_nemotron_vl,
|
||||
"NVLM_D": run_nvlm_d,
|
||||
@ -1716,6 +1654,13 @@ model_example_map = {
|
||||
}
|
||||
|
||||
|
||||
MODELS_NEED_VIDEO_METADATA = [
|
||||
"glm4_1v",
|
||||
"glm4_5v",
|
||||
"glm4_5v_fp8",
|
||||
]
|
||||
|
||||
|
||||
def get_multi_modal_input(args):
|
||||
"""
|
||||
return {
|
||||
@ -1740,12 +1685,13 @@ def get_multi_modal_input(args):
|
||||
|
||||
if args.modality == "video":
|
||||
# Input video and question
|
||||
needs_metadata = args.model_type in MODELS_NEED_VIDEO_METADATA
|
||||
video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays
|
||||
metadata = VideoAsset(name="baby_reading", num_frames=args.num_frames).metadata
|
||||
vid_questions = ["Why is this video funny?"]
|
||||
|
||||
return {
|
||||
"data": [(video, metadata)] if args.model_type == "glm4_1v" else video,
|
||||
"data": ([(video, metadata)] if needs_metadata else video),
|
||||
"questions": vid_questions,
|
||||
}
|
||||
|
||||
|
||||
@ -637,26 +637,6 @@ def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
||||
|
||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
img_prompt = "Given the first image <|image|> and the second image<|image|>"
|
||||
prompt = f"<|begin_of_text|>{img_prompt}, {question}?"
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "nvidia/NVLM-D-72B"
|
||||
|
||||
@ -1253,7 +1233,6 @@ model_example_map = {
|
||||
"llava-next": load_llava_next,
|
||||
"llava-onevision": load_llava_onevision,
|
||||
"mistral3": load_mistral3,
|
||||
"mllama": load_mllama,
|
||||
"NVLM_D": load_nvlm_d,
|
||||
"ovis": load_ovis,
|
||||
"ovis2_5": load_ovis2_5,
|
||||
|
||||
@ -120,7 +120,7 @@ echo " - API Key: $API_KEY"
|
||||
echo " - Native Pooling: $POOLING_TYPE | Cross-chunk: MEAN"
|
||||
echo ""
|
||||
echo "🧪 Test the server with:"
|
||||
echo " python examples/online_serving/openai_embedding_long_text_client.py"
|
||||
echo " python examples/online_serving/openai_embedding_long_text/client.py"
|
||||
echo ""
|
||||
echo "📚 Enhanced features enabled:"
|
||||
echo " ✅ Intelligent native pooling type detection"
|
||||
|
||||
43
examples/online_serving/pooling/README.md
Normal file
43
examples/online_serving/pooling/README.md
Normal file
@ -0,0 +1,43 @@
|
||||
# Pooling models
|
||||
|
||||
## Cohere rerank usage
|
||||
|
||||
```bash
|
||||
python examples/online_serving/pooling/cohere_rerank_client.py
|
||||
```
|
||||
|
||||
## Jinaai rerank usage
|
||||
|
||||
```bash
|
||||
python examples/online_serving/pooling/jinaai_rerank_client.py
|
||||
```
|
||||
|
||||
## Openai chat embedding for multimodal usage
|
||||
|
||||
```bash
|
||||
python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
|
||||
```
|
||||
|
||||
## Openai classification usage
|
||||
|
||||
```bash
|
||||
python examples/online_serving/pooling/openai_classification_client.py
|
||||
```
|
||||
|
||||
## Openai embedding usage
|
||||
|
||||
```bash
|
||||
python examples/online_serving/pooling/openai_embedding_client.py
|
||||
```
|
||||
|
||||
## Openai embedding matryoshka dimensions usage
|
||||
|
||||
```bash
|
||||
python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py
|
||||
```
|
||||
|
||||
## Openai pooling usage
|
||||
|
||||
```bash
|
||||
python examples/online_serving/pooling/openai_pooling_client.py
|
||||
```
|
||||
@ -1,5 +1,11 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# ruff: noqa: E501
|
||||
"""Example Python client for multimodal embedding API using vLLM API server
|
||||
NOTE:
|
||||
start a supported multimodal embeddings model server with `vllm serve`, e.g.
|
||||
vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling --trust_remote_code --max_model_len=1024
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
@ -1,5 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Example Python client for classification API using vLLM API server
|
||||
NOTE:
|
||||
start a supported classification model server with `vllm serve`, e.g.
|
||||
vllm serve jason9693/Qwen2.5-1.5B-apeach
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import pprint
|
||||
@ -1,5 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Example Python client for embedding API using vLLM API server
|
||||
NOTE:
|
||||
start a supported embeddings model server with `vllm serve`, e.g.
|
||||
vllm serve intfloat/e5-small
|
||||
"""
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
@ -4,7 +4,9 @@
|
||||
Example online usage of Pooling API.
|
||||
|
||||
Run `vllm serve <model> --runner pooling`
|
||||
to start up the server in vLLM.
|
||||
to start up the server in vLLM. e.g.
|
||||
|
||||
vllm serve internlm/internlm2-1_8b-reward --trust-remote-code
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@ -23,7 +25,7 @@ def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--host", type=str, default="localhost")
|
||||
parser.add_argument("--port", type=int, default=8000)
|
||||
parser.add_argument("--model", type=str, default="jason9693/Qwen2.5-1.5B-apeach")
|
||||
parser.add_argument("--model", type=str, default="internlm/internlm2-1_8b-reward")
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
@ -6,6 +6,7 @@ pytest-asyncio
|
||||
pytest-rerunfailures
|
||||
pytest-shard
|
||||
pytest-timeout
|
||||
pytest-cov
|
||||
|
||||
# testing utils
|
||||
backoff # required for phi4mm test
|
||||
|
||||
@ -135,6 +135,8 @@ colorful==0.5.6
|
||||
# via ray
|
||||
contourpy==1.3.0
|
||||
# via matplotlib
|
||||
coverage==7.10.6
|
||||
# via pytest-cov
|
||||
cramjam==2.9.0
|
||||
# via fastparquet
|
||||
cupy-cuda12x==13.6.0
|
||||
@ -686,7 +688,9 @@ platformdirs==4.3.6
|
||||
plotly==5.24.1
|
||||
# via genai-perf
|
||||
pluggy==1.5.0
|
||||
# via pytest
|
||||
# via
|
||||
# pytest
|
||||
# pytest-cov
|
||||
polars==1.29.0
|
||||
# via mteb
|
||||
pooch==1.8.2
|
||||
@ -786,6 +790,7 @@ pytest==8.3.5
|
||||
# buildkite-test-collector
|
||||
# genai-perf
|
||||
# pytest-asyncio
|
||||
# pytest-cov
|
||||
# pytest-forked
|
||||
# pytest-mock
|
||||
# pytest-rerunfailures
|
||||
@ -796,6 +801,8 @@ pytest==8.3.5
|
||||
# terratorch
|
||||
pytest-asyncio==0.24.0
|
||||
# via -r requirements/test.in
|
||||
pytest-cov==6.3.0
|
||||
# via -r requirements/test.in
|
||||
pytest-forked==1.6.0
|
||||
# via -r requirements/test.in
|
||||
pytest-mock==3.14.0
|
||||
|
||||
8
setup.py
8
setup.py
@ -56,8 +56,6 @@ elif (sys.platform.startswith("linux") and torch.version.cuda is None
|
||||
# fallback to cpu
|
||||
VLLM_TARGET_DEVICE = "cpu"
|
||||
|
||||
MAIN_CUDA_VERSION = "12.8"
|
||||
|
||||
|
||||
def is_sccache_available() -> bool:
|
||||
return which("sccache") is not None and \
|
||||
@ -507,7 +505,7 @@ def get_vllm_version() -> str:
|
||||
version += f"{sep}precompiled"
|
||||
else:
|
||||
cuda_version = str(get_nvcc_cuda_version())
|
||||
if cuda_version != MAIN_CUDA_VERSION:
|
||||
if cuda_version != envs.VLLM_MAIN_CUDA_VERSION:
|
||||
cuda_version_str = cuda_version.replace(".", "")[:3]
|
||||
# skip this for source tarball, required for pypi
|
||||
if "sdist" not in sys.argv:
|
||||
@ -515,7 +513,7 @@ def get_vllm_version() -> str:
|
||||
elif _is_hip():
|
||||
# Get the ROCm version
|
||||
rocm_version = get_rocm_version() or torch.version.hip
|
||||
if rocm_version and rocm_version != MAIN_CUDA_VERSION:
|
||||
if rocm_version and rocm_version != envs.VLLM_MAIN_CUDA_VERSION:
|
||||
version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
|
||||
elif _is_tpu():
|
||||
version += f"{sep}tpu"
|
||||
@ -664,7 +662,7 @@ setup(
|
||||
"mistral_common[audio]"], # Required for audio processing
|
||||
"video": [], # Kept for backwards compatibility
|
||||
# FlashInfer should be updated together with the Dockerfile
|
||||
"flashinfer": ["flashinfer-python==0.3.0"],
|
||||
"flashinfer": ["flashinfer-python==0.3.1"],
|
||||
# Optional deps for AMD FP4 quantization support
|
||||
"petit-kernel": ["petit-kernel"],
|
||||
},
|
||||
|
||||
@ -64,4 +64,8 @@ class TestBackend:
|
||||
num_pre = len(list(find_op_nodes(op, self.graph_pre_pass)))
|
||||
num_post = len(list(find_op_nodes(op, self.graph_post_pass)))
|
||||
assert num_pre == 0, f"Unexpected op {op.name()} in pre-pass graph"
|
||||
assert num_post > 0, f"Op {op.name()} not found in post-pass graph"
|
||||
assert num_post > 0, f"Op {op.name()} not found in post-pass graph"
|
||||
|
||||
def op_count(self, op: OpOverload, before=False) -> int:
|
||||
graph = self.graph_pre_pass if before else self.graph_post_pass
|
||||
return len(list(find_op_nodes(op, graph)))
|
||||
|
||||
106
tests/compile/test_noop_elimination.py
Normal file
106
tests/compile/test_noop_elimination.py
Normal file
@ -0,0 +1,106 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import vllm
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.config import (CompilationConfig, CompilationLevel, PassConfig,
|
||||
VllmConfig)
|
||||
|
||||
from .backend import TestBackend
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype",
|
||||
[torch.float16, torch.bfloat16, torch.float32])
|
||||
@pytest.mark.parametrize("num_tokens", [256, 1024])
|
||||
@pytest.mark.parametrize("hidden_size", [64, 4096])
|
||||
def test_noop_elimination(dtype, num_tokens, hidden_size):
|
||||
torch.set_default_device("cuda")
|
||||
torch.set_default_dtype(dtype)
|
||||
torch.manual_seed(1)
|
||||
|
||||
class Model(torch.nn.Module):
|
||||
|
||||
def forward(self, x):
|
||||
# Chain of reshapes
|
||||
y = x.reshape(-1, 128, 32)
|
||||
z = y.reshape(-1, 4096)
|
||||
# No-op reshape
|
||||
a = z.reshape(-1, 4096)
|
||||
# Final reshape that should remain
|
||||
b = a.reshape(-1, 128, 32)
|
||||
# No-op slice
|
||||
c = b[0:b.shape[0]]
|
||||
# The pass should replace the result of this op with `c`.
|
||||
d = torch.slice_scatter(
|
||||
torch.ones_like(c), # Dummy tensor to be scattered into
|
||||
c, # Source tensor
|
||||
0, # dim
|
||||
0, # start
|
||||
c.shape[0], # end
|
||||
)
|
||||
return d
|
||||
|
||||
vllm_config = VllmConfig(compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
pass_config=PassConfig(enable_noop=True),
|
||||
))
|
||||
with vllm.config.set_current_vllm_config(vllm_config):
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
|
||||
backend = TestBackend(noop_pass)
|
||||
|
||||
model = Model()
|
||||
# First dimension dynamic
|
||||
x = torch.rand(num_tokens, hidden_size)
|
||||
torch._dynamo.mark_dynamic(x, 0)
|
||||
|
||||
result = model(x)
|
||||
|
||||
model2 = torch.compile(model, backend=backend)
|
||||
result2 = model2(x)
|
||||
|
||||
ATOL, RTOL = (2e-3, 2e-3)
|
||||
torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL)
|
||||
|
||||
# The no-op reshape and slice should be eliminated.
|
||||
# The chain of reshapes should be fused into a single reshape.
|
||||
assert backend.op_count(torch.ops.aten.reshape.default) == 1
|
||||
assert backend.op_count(torch.ops.aten.slice.Tensor) == 0
|
||||
assert backend.op_count(torch.ops.aten.slice_scatter.default) == 0
|
||||
|
||||
|
||||
def test_non_noop_slice_preserved():
|
||||
"""Ensure that a slice with end=-1 (dropping last row) is NOT eliminated.
|
||||
|
||||
Regression test for a bug where end=-1 was treated like an inferred
|
||||
dimension (reshape semantics) leading to incorrect elimination.
|
||||
"""
|
||||
torch.set_default_device("cuda")
|
||||
x = torch.randn(16, 16)
|
||||
|
||||
class SliceModel(torch.nn.Module):
|
||||
|
||||
def forward(self, x):
|
||||
base = x.clone()
|
||||
src = torch.ones(15, 16)
|
||||
y = torch.slice_scatter(base, src, dim=0, start=0, end=-1)
|
||||
return x[0:-1, :], y
|
||||
|
||||
vllm_config = VllmConfig(compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
pass_config=PassConfig(enable_noop=True),
|
||||
))
|
||||
with vllm.config.set_current_vllm_config(vllm_config):
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
backend = TestBackend(noop_pass)
|
||||
model = SliceModel()
|
||||
ref = model(x)
|
||||
compiled = torch.compile(model, backend=backend)
|
||||
out = compiled(x)
|
||||
torch.testing.assert_close(ref, out)
|
||||
# The slice should remain (not a no-op).
|
||||
assert backend.op_count(torch.ops.aten.slice.Tensor) == 1
|
||||
assert backend.op_count(torch.ops.aten.slice_scatter.default) == 1
|
||||
@ -3,15 +3,12 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
|
||||
STR_NOT_IMPL_ENC_DEC_SWA)
|
||||
from vllm.core.block_manager import SelfAttnBlockSpaceManager
|
||||
from vllm.core.interfaces import AllocStatus
|
||||
from vllm.sequence import Logprob, SequenceStatus
|
||||
from vllm.utils import chunk_list
|
||||
|
||||
from ..utils import (create_dummy_prompt, create_seq_group,
|
||||
create_seq_group_encoder_decoder)
|
||||
from ..utils import create_dummy_prompt, create_seq_group
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@ -58,156 +55,6 @@ def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
|
||||
assert can_allocate_result == AllocStatus.LATER
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [16, 80, 160])
|
||||
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
|
||||
@pytest.mark.parametrize("watermark", [0.0, 0.5])
|
||||
def test_can_allocate_seq_group_encoder_decoder(block_size: int,
|
||||
num_seqs_per_group: int,
|
||||
num_gpu_blocks: int,
|
||||
watermark: float):
|
||||
block_manager = SelfAttnBlockSpaceManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
watermark=watermark,
|
||||
)
|
||||
num_watermark_blocks = int(watermark * num_gpu_blocks)
|
||||
|
||||
num_output_blocks_per_seq = 1
|
||||
|
||||
# NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
|
||||
# the current implementation assumes all seqs are new prompts / don't have
|
||||
# different output lens.
|
||||
num_output_blocks = num_output_blocks_per_seq
|
||||
|
||||
for bdx, num_prompt_blocks in enumerate(
|
||||
range(1, num_gpu_blocks - num_output_blocks)):
|
||||
num_cross_blocks_per_seq = num_prompt_blocks
|
||||
|
||||
seq_group = create_seq_group_encoder_decoder(
|
||||
seq_prompt_len=block_size * num_prompt_blocks,
|
||||
seq_output_lens=[
|
||||
block_size * num_output_blocks_per_seq
|
||||
for _ in range(num_seqs_per_group)
|
||||
],
|
||||
request_id=str(bdx))
|
||||
|
||||
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
|
||||
|
||||
can_allocate_result = block_manager.can_allocate(seq_group)
|
||||
|
||||
num_required_blocks = num_prompt_blocks + \
|
||||
num_output_blocks + \
|
||||
num_cross_blocks_per_seq
|
||||
|
||||
if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
|
||||
assert can_allocate_result == AllocStatus.NEVER
|
||||
elif num_gpu_blocks >= num_required_blocks:
|
||||
assert can_allocate_result == AllocStatus.OK
|
||||
else:
|
||||
assert can_allocate_result == AllocStatus.LATER
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [16])
|
||||
@pytest.mark.parametrize("num_seqs_per_group", [1])
|
||||
@pytest.mark.parametrize("watermark", [0.0, 0.5])
|
||||
def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
|
||||
num_seqs_per_group: int,
|
||||
num_gpu_blocks: int,
|
||||
watermark: float):
|
||||
'''
|
||||
SWA short for Sliding Window Attention.
|
||||
|
||||
At time of writing block manager does not support SWA.
|
||||
|
||||
However even when SWA is implemented for block manager,
|
||||
there will still most likely be a separate workstream required
|
||||
to enable SWA for encoder/decoder models.
|
||||
|
||||
Therefore this test enforces that one of the following cases
|
||||
hold true:
|
||||
1. Block manager does not support SWA at all (true at time of writing)
|
||||
2. Block manager fails with NotImplementedError when SWA is enabled
|
||||
AND a SequenceGroup with an encoder sequence (i.e. in support of an
|
||||
encoder/decoder model) is passed into can_allocate() as an argument
|
||||
|
||||
The setup for this test is a stripped-down version of
|
||||
test_can_allocate_seq_group_encoder_decoder()
|
||||
'''
|
||||
|
||||
with pytest.raises((NotImplementedError, AssertionError)) as exc_info:
|
||||
block_manager = SelfAttnBlockSpaceManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
watermark=watermark,
|
||||
sliding_window=5 # SWA
|
||||
)
|
||||
|
||||
num_output_blocks_per_seq = 1
|
||||
num_prompt_blocks = 1
|
||||
num_output_blocks = num_output_blocks_per_seq
|
||||
seq_group = create_seq_group_encoder_decoder(
|
||||
seq_prompt_len=block_size * num_prompt_blocks,
|
||||
seq_output_lens=[
|
||||
block_size * num_output_blocks_per_seq
|
||||
for _ in range(num_seqs_per_group)
|
||||
],
|
||||
request_id="0")
|
||||
|
||||
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
|
||||
block_manager.can_allocate(seq_group)
|
||||
|
||||
# Assert that either
|
||||
# 1. Block manager constructor fails with assertion that sliding window
|
||||
# is not yet supported (most likely near-term outcome at time of
|
||||
# writing), or
|
||||
# 2. can_allocate() fails with NotImplementedError due to combination of
|
||||
# encoder/decoder and sliding window attention
|
||||
if isinstance(exc_info.value, NotImplementedError):
|
||||
assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA
|
||||
elif isinstance(exc_info.value, AssertionError):
|
||||
assert str(exc_info.value) == "Sliding window not yet supported"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [16])
|
||||
@pytest.mark.parametrize("num_seqs_per_group", [1])
|
||||
@pytest.mark.parametrize("watermark", [0.0, 0.5])
|
||||
def test_can_allocate_encoder_decoder_fails_with_prefix_cache(
|
||||
block_size: int, num_seqs_per_group: int, num_gpu_blocks: int,
|
||||
watermark: float):
|
||||
|
||||
block_manager = SelfAttnBlockSpaceManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
watermark=watermark,
|
||||
enable_caching=True # Prefix cache
|
||||
)
|
||||
|
||||
num_output_blocks_per_seq = 1
|
||||
num_prompt_blocks = 1
|
||||
num_output_blocks = num_output_blocks_per_seq
|
||||
seq_group = create_seq_group_encoder_decoder(
|
||||
seq_prompt_len=block_size * num_prompt_blocks,
|
||||
seq_output_lens=[
|
||||
block_size * num_output_blocks_per_seq
|
||||
for _ in range(num_seqs_per_group)
|
||||
],
|
||||
request_id="0")
|
||||
|
||||
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
|
||||
|
||||
# Assert that either can_allocate() fails with NotImplementedError
|
||||
# due to combination of encoder/decoder and prefix cache
|
||||
with pytest.raises(NotImplementedError) as exc_info:
|
||||
block_manager.can_allocate(seq_group)
|
||||
assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
|
||||
@pytest.mark.parametrize("prompt_len", [1, 7, 8])
|
||||
@pytest.mark.parametrize("num_slots_to_append", [1, 8, 129])
|
||||
|
||||
@ -1,105 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest # noqa
|
||||
|
||||
from vllm.config import CacheConfig, SchedulerConfig
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.sequence import SequenceGroup
|
||||
|
||||
from .utils import (append_new_token, create_dummy_prompt_encoder_decoder,
|
||||
get_sequence_groups, schedule_and_update_computed_tokens)
|
||||
|
||||
|
||||
def test_scheduler_schedule_simple_encoder_decoder():
|
||||
'''
|
||||
Test basic scheduler functionality in the context
|
||||
of an encoder/decoder model. Focus on testing
|
||||
enc/dec-specific functionality since tests already
|
||||
exist for decoder-only functionality
|
||||
|
||||
Test behavior:
|
||||
* Construct Scheduler
|
||||
* Construct dummy encoder/decoder sequence groups
|
||||
* Add dummy seq groups to scheduler backlog
|
||||
* Schedule the next seq group & validate:
|
||||
* Cross-attn block tables
|
||||
* Updated states of seq groups
|
||||
* Number of batched tokens
|
||||
* Number of blocks to copy/swap-in/swap-out
|
||||
* Number of scheduled seq groups
|
||||
* Repeat for both prefill- and decode-phase
|
||||
* Abort scheduled seq groups
|
||||
* Assert that aborted seq groups no longer appear in
|
||||
cross-attention block table
|
||||
'''
|
||||
|
||||
block_size = 4
|
||||
num_seq_group = 4
|
||||
max_model_len = 16
|
||||
scheduler_config = SchedulerConfig(
|
||||
"generate",
|
||||
max_num_batched_tokens=64,
|
||||
max_num_seqs=num_seq_group,
|
||||
max_model_len=max_model_len,
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group
|
||||
cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
running: list[SequenceGroup] = []
|
||||
|
||||
# Add seq groups to scheduler.
|
||||
req_id_list = []
|
||||
for i in range(num_seq_group):
|
||||
req_id = str(i)
|
||||
req_id_list.append(req_id)
|
||||
_, _, seq_group = create_dummy_prompt_encoder_decoder(
|
||||
req_id, block_size, block_size, block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
running.append(seq_group)
|
||||
|
||||
# Schedule seq groups prefill.
|
||||
num_tokens = block_size * num_seq_group
|
||||
seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)
|
||||
# - Verify that sequence group cross-attention block tables are
|
||||
# registered with the block manager
|
||||
assert all([(req_id in scheduler.block_manager.cross_block_tables)
|
||||
for req_id in req_id_list])
|
||||
# - Validate sequence-group status
|
||||
assert set(get_sequence_groups(out)) == set(running)
|
||||
# - Validate number of batched tokens
|
||||
assert out.num_batched_tokens == num_tokens
|
||||
# - Validate there are no remaining blocks to swap
|
||||
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
|
||||
and not out.blocks_to_swap_out)
|
||||
# - Validate all seq groups were scheduled
|
||||
assert len(seq_group_meta_list) == num_seq_group
|
||||
append_new_token(out, 1)
|
||||
|
||||
# Schedule seq groups decode.
|
||||
seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)
|
||||
# - Verify that sequence group metadata includes encoder attention
|
||||
# and cross-attention metadata
|
||||
assert all([
|
||||
not ((seq_group_meta.encoder_seq_data is None) or
|
||||
(seq_group_meta.cross_block_table is None))
|
||||
for seq_group_meta in seq_group_meta_list
|
||||
])
|
||||
# - Validate sequence-group status
|
||||
assert set(get_sequence_groups(out)) == set(running)
|
||||
# - Validate there is one batched token per seq group
|
||||
assert out.num_batched_tokens == num_seq_group
|
||||
# - Validate there are no remaining blocks to swap
|
||||
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
|
||||
and not out.blocks_to_swap_out)
|
||||
# - Validate that all seq groups were scheduled
|
||||
assert len(seq_group_meta_list) == num_seq_group
|
||||
append_new_token(out, 1)
|
||||
|
||||
# Abort sequences
|
||||
for req_id in req_id_list:
|
||||
scheduler.abort_seq_group(req_id)
|
||||
# - Verify that sequence group cross-attention block tables are
|
||||
# NO LONGER registered with the block manager
|
||||
assert req_id not in scheduler.block_manager.cross_block_tables
|
||||
@ -215,9 +215,7 @@ TEXT_GENERATION_MODELS = {
|
||||
EMBEDDING_MODELS = { # type: ignore[var-annotated]
|
||||
# [Text-only]
|
||||
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"),
|
||||
# TODO: re-enable when https://github.com/vllm-project/vllm/issues/23883
|
||||
# is fixed
|
||||
#"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
|
||||
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
|
||||
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
|
||||
load_format="dummy", runner="pooling"
|
||||
),
|
||||
@ -244,9 +242,6 @@ MULTIMODAL_MODELS = {
|
||||
"Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
|
||||
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
|
||||
"fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
|
||||
# [Encoder-decoder]
|
||||
# TODO: Implement PP
|
||||
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
|
||||
}
|
||||
# yapf: enable
|
||||
|
||||
|
||||
@ -235,7 +235,6 @@ def _compare_sp(
|
||||
'level': 3,
|
||||
'custom_ops': ["+rms_norm"],
|
||||
'compile_sizes': [4, 8],
|
||||
'splitting_ops': [],
|
||||
'pass_config': {
|
||||
'enable_sequence_parallelism': True,
|
||||
'enable_fusion': enable_fusion,
|
||||
@ -251,6 +250,8 @@ def _compare_sp(
|
||||
*common_args,
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"--pipeline-parallel-size",
|
||||
str(pp_size),
|
||||
"--distributed-executor-backend",
|
||||
distributed_backend,
|
||||
"--compilation_config",
|
||||
|
||||
@ -1,131 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""E2E tests to verify the correctness of the encoder-decoder framework
|
||||
|
||||
Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
|
||||
"""
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModelForSeq2SeqLM
|
||||
|
||||
from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
|
||||
global_force_attn_backend_context_manager)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import DecoderPromptType
|
||||
from ..models.utils import check_logprobs_close
|
||||
|
||||
LIST_ENC_DEC_SUPPORTED_BACKENDS = [
|
||||
_Backend.XFORMERS, _Backend.FLASH_ATTN, None
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Pin every test in this module to the V0 engine.

    Autouse, function-scoped: sets the ``VLLM_USE_V1`` environment
    variable to ``'0'`` through ``monkeypatch``.
    """
    env_name, env_value = 'VLLM_USE_V1', '0'
    monkeypatch.setenv(env_name, env_value)
|
||||
|
||||
|
||||
def vllm_to_hf_output(
    vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
    decoder_prompt_type: DecoderPromptType,
):
    """Reshape one vLLM output tuple so it can be compared with HF output.

    The token ids and logprobs are passed through unchanged. The text gets
    ``"</s>"`` appended, and additionally ``"<s>"`` prepended when
    ``decoder_prompt_type`` is ``DecoderPromptType.NONE``.

    Returns:
        ``(output_ids, hf_style_text, out_logprobs)``.
    """
    token_ids, text, logprobs = vllm_output

    prefix = "<s>" if decoder_prompt_type == DecoderPromptType.NONE else ""
    return token_ids, prefix + text + "</s>", logprobs
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def clear_cache():
    """Reset the cached attention-backend lookup before each test runs."""
    # Setup phase: drop whatever `_cached_get_attn_backend` has memoized.
    _cached_get_attn_backend.cache_clear()
    # Hand control to the test; there is no teardown work afterwards.
    yield
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.skipif(
    current_platform.is_cpu(),
    reason="CPU backend is not currently supported with encoder/decoder models"
)
@pytest.mark.skip(reason="bart not supported in V1")
def test_encoder_decoder_e2e(
    hf_runner,
    vllm_runner,
    example_encoder_decoder_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    decoder_prompt_type: DecoderPromptType,
    enforce_eager: bool,
    attn_backend: _Backend,
) -> None:
    '''
    End-to-end check of the encoder-decoder path using BART.

    Greedy logprobs are generated for the same prompts with the
    Hugging Face runner and the vLLM runner, and the two result sets
    are compared with `check_logprobs_close`.
    '''
    # NOTE(review): the nesting below the `with` was reconstructed from a
    # source whose indentation was lost - confirm against the real file.
    with global_force_attn_backend_context_manager(attn_backend):
        if attn_backend == _Backend.FLASH_ATTN:
            # Flash Attention works only with bfloat16 data-type
            dtype = 'bfloat16'
        prompts = example_encoder_decoder_prompts[decoder_prompt_type]

        # Generation settings passed to the HF baseline.
        hf_generate_kwargs = {
            "top_k": None,
            "num_beams": 1,
            "repetition_penalty": 1.0,
            "top_p": 1.0,
            "length_penalty": 1.0,
            "early_stopping": False,
            "no_repeat_ngram_size": None,
            "min_length": 0
        }

        with hf_runner(model, dtype=dtype,
                       auto_cls=AutoModelForSeq2SeqLM) as hf_model:
            hf_outputs = (
                hf_model.generate_encoder_decoder_greedy_logprobs_limit(
                    prompts,
                    max_tokens,
                    num_logprobs,
                    **hf_generate_kwargs,
                ))
        with vllm_runner(model, dtype=dtype,
                         enforce_eager=enforce_eager) as vllm_model:
            vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
                prompts, max_tokens, num_logprobs)

        # One leading HF token is skipped when there is no decoder prompt.
        if decoder_prompt_type == DecoderPromptType.NONE:
            hf_skip_tokens = 1
        else:
            hf_skip_tokens = 0

        converted_vllm_outputs = [
            vllm_to_hf_output(vllm_output, decoder_prompt_type)
            for vllm_output in vllm_outputs
        ]
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=converted_vllm_outputs,
            name_0="hf",
            name_1="vllm",
            num_outputs_0_skip_tokens=hf_skip_tokens,
        )
|
||||
228
tests/engine/test_stop_checker.py
Normal file
228
tests/engine/test_stop_checker.py
Normal file
@ -0,0 +1,228 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.engine.output_processor.stop_checker import StopChecker
|
||||
from vllm.reasoning import ReasoningParser
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import Sequence, SequenceStatus
|
||||
|
||||
REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
|
||||
|
||||
|
||||
class MockReasoningParser(ReasoningParser):
    """Reasoning parser stub whose state is driven by a single flag.

    ``reasoning_active`` is a plain attribute that tests flip directly;
    reasoning counts as ended exactly when the flag is false.
    """

    def __init__(self,
                 tokenizer: AutoTokenizer,
                 reasoning_active: bool = False):
        super().__init__(tokenizer)
        self.reasoning_active = reasoning_active

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        """Return True when the mock is not in reasoning mode.

        ``input_ids`` is ignored.
        """
        still_reasoning = self.reasoning_active
        return not still_reasoning

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """Return ``input_ids`` unchanged."""
        return input_ids
|
||||
|
||||
|
||||
class MockSequence(Sequence):
    """Lightweight stand-in for ``Sequence`` used by the stop-checker tests.

    ``Sequence.__init__`` is not called; only the attributes assigned
    below are set on the instance.
    """

    def __init__(self, token_ids, output_text="test_output", eos_token_id=0):
        self.token_ids = token_ids
        self.output_text = output_text
        self.eos_token_id = eos_token_id
        # Every mock starts out running with no stop reason recorded.
        self.status = SequenceStatus.RUNNING
        self.stop_reason = None

    def get_token_ids(self):
        """Return the stored token id list."""
        return self.token_ids

    def get_last_token_id(self):
        """Return the final token id, or None when there are no tokens."""
        if not self.token_ids:
            return None
        return self.token_ids[-1]

    def get_len(self):
        """Return the total number of stored tokens."""
        return len(self.token_ids)

    def get_output_len(self):
        """Return the token count minus one (simulates prompt + outputs)."""
        total = len(self.token_ids)
        return total - 1
|
||||
|
||||
|
||||
@pytest.fixture
def deepseek_r1_qwen_tokenizer():
    """Load the tokenizer named by ``REASONING_MODEL_NAME``."""
    tokenizer = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
    return tokenizer
|
||||
|
||||
|
||||
@pytest.fixture
def stop_checker(deepseek_r1_qwen_tokenizer):
    """Build a ``StopChecker`` with ``max_model_len=10`` and no reasoner.

    The tokenizer is requested as a fixture so that
    ``get_tokenizer_for_seq`` is a real per-sequence callable returning a
    tokenizer. Passing the fixture function itself would raise pytest's
    "fixture called directly" error as soon as the checker invoked it.
    """
    tokenizer = deepseek_r1_qwen_tokenizer
    return StopChecker(max_model_len=10,
                       get_tokenizer_for_seq=lambda _seq: tokenizer)
|
||||
|
||||
|
||||
@pytest.fixture
def stop_checker_with_reasoner(deepseek_r1_qwen_tokenizer):
    """Build a ``StopChecker`` (``max_model_len=10``) with a mock reasoner.

    The tokenizer is requested as a fixture so that both the
    ``MockReasoningParser`` and ``get_tokenizer_for_seq`` receive an
    actual tokenizer rather than the fixture function object.
    """
    tokenizer = deepseek_r1_qwen_tokenizer
    reasoner = MockReasoningParser(tokenizer)
    return StopChecker(max_model_len=10,
                       get_tokenizer_for_seq=lambda _seq: tokenizer,
                       reasoner=reasoner)
|
||||
|
||||
|
||||
def test_eos_token_stopping(stop_checker):
    """A sequence ending in its EOS token id finishes as stopped."""
    params = SamplingParams()
    sequence = MockSequence(token_ids=[1, 2, 0], eos_token_id=0)

    stop_checker.maybe_stop_sequence(sequence,
                                     new_char_count=1,
                                     sampling_params=params)

    assert sequence.status == SequenceStatus.FINISHED_STOPPED
|
||||
|
||||
|
||||
def test_ignore_eos(stop_checker):
    """With ``ignore_eos=True`` a trailing EOS token leaves the seq running."""
    params = SamplingParams(ignore_eos=True)
    sequence = MockSequence(token_ids=[1, 2, 0], eos_token_id=0)

    stop_checker.maybe_stop_sequence(sequence,
                                     new_char_count=1,
                                     sampling_params=params)

    assert sequence.status == SequenceStatus.RUNNING
|
||||
|
||||
|
||||
def test_min_tokens(stop_checker):
    """``min_tokens`` keeps the sequence running despite a trailing EOS."""
    # The mock reports an output length of 2, below min_tokens=3.
    params = SamplingParams(min_tokens=3)
    sequence = MockSequence(token_ids=[1, 2, 0], eos_token_id=0)

    stop_checker.maybe_stop_sequence(sequence,
                                     new_char_count=1,
                                     sampling_params=params)

    assert sequence.status == SequenceStatus.RUNNING
|
||||
|
||||
|
||||
def test_stop_token_ids(stop_checker):
    """A last token listed in ``stop_token_ids`` stops the sequence."""
    stop_id = 3
    params = SamplingParams(stop_token_ids=[stop_id])
    sequence = MockSequence(token_ids=[1, 2, stop_id], eos_token_id=0)

    stop_checker.maybe_stop_sequence(sequence,
                                     new_char_count=1,
                                     sampling_params=params)

    assert sequence.status == SequenceStatus.FINISHED_STOPPED
    # The matching token id is recorded as the stop reason.
    assert sequence.stop_reason == 3
|
||||
|
||||
|
||||
def test_stop_strings(stop_checker):
    """A stop string at the end of the output text stops the sequence."""
    params = SamplingParams(stop=["STOP"])
    sequence = MockSequence(token_ids=[1, 2, 3],
                            output_text="test output with STOP",
                            eos_token_id=0)

    stop_checker.maybe_stop_sequence(sequence,
                                     new_char_count=1,
                                     sampling_params=params)

    assert sequence.status == SequenceStatus.FINISHED_STOPPED
    assert sequence.stop_reason == "STOP"
    # Default behavior removes stop string
    assert "STOP" not in sequence.output_text
|
||||
|
||||
|
||||
def test_include_stop_str_in_output(stop_checker):
    """``include_stop_str_in_output=True`` keeps the stop string in the text."""
    params = SamplingParams(stop=["STOP"],
                            include_stop_str_in_output=True)
    sequence = MockSequence(token_ids=[1, 2, 3],
                            output_text="test output with STOP",
                            eos_token_id=0)

    stop_checker.maybe_stop_sequence(sequence,
                                     new_char_count=5,
                                     sampling_params=params)

    assert sequence.status == SequenceStatus.FINISHED_STOPPED
    assert "STOP" in sequence.output_text
|
||||
|
||||
|
||||
def test_max_tokens(stop_checker):
    """Reaching ``max_tokens`` finishes the sequence as length-capped."""
    # The mock reports an output length of 2, equal to max_tokens.
    params = SamplingParams(max_tokens=2)
    sequence = MockSequence(token_ids=[1, 2, 3], eos_token_id=0)

    stop_checker.maybe_stop_sequence(sequence,
                                     new_char_count=1,
                                     sampling_params=params)

    assert sequence.status == SequenceStatus.FINISHED_LENGTH_CAPPED
|
||||
|
||||
|
||||
def test_max_model_len(stop_checker):
    """Exceeding the checker's ``max_model_len`` caps the sequence."""
    params = SamplingParams()
    # 11 tokens, max is 10
    too_many_tokens = list(range(11))
    sequence = MockSequence(token_ids=too_many_tokens, eos_token_id=0)

    stop_checker.maybe_stop_sequence(sequence,
                                     new_char_count=1,
                                     sampling_params=params)

    assert sequence.status == SequenceStatus.FINISHED_LENGTH_CAPPED
|
||||
|
||||
|
||||
def test_reasoning_skip_stops(stop_checker_with_reasoner):
    """Stop tokens and stop strings are ignored while reasoning is active."""
    checker = stop_checker_with_reasoner
    # Put the mock reasoner into reasoning mode.
    checker.reasoner.reasoning_active = True

    def status_after_check(sequence, params, new_chars):
        # Run the stop check once and report the resulting status.
        checker.maybe_stop_sequence(sequence,
                                    new_char_count=new_chars,
                                    sampling_params=params)
        return sequence.status

    # Stop token: ignored during reasoning.
    token_seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0)
    token_params = SamplingParams(stop_token_ids=[3])
    assert status_after_check(token_seq, token_params,
                              1) == SequenceStatus.RUNNING

    # Stop string: ignored during reasoning.
    string_seq = MockSequence(token_ids=[1, 2, 3], output_text="test STOP")
    string_params = SamplingParams(stop=["STOP"])
    assert status_after_check(string_seq, string_params,
                              4) == SequenceStatus.RUNNING

    # But EOS token still stops the sequence
    eos_seq = MockSequence(token_ids=[1, 2, 0], eos_token_id=0)
    eos_params = SamplingParams()
    assert status_after_check(eos_seq, eos_params,
                              1) == SequenceStatus.FINISHED_STOPPED
|
||||
|
||||
|
||||
def test_reasoning_end_enables_stops(stop_checker_with_reasoner):
    """Stop tokens and stop strings take effect once reasoning has ended."""
    checker = stop_checker_with_reasoner
    # Take the mock reasoner out of reasoning mode.
    checker.reasoner.reasoning_active = False

    # Stop token case.
    token_seq = MockSequence(token_ids=[1, 2, 3], eos_token_id=0)
    token_params = SamplingParams(stop_token_ids=[3])
    checker.maybe_stop_sequence(token_seq,
                                new_char_count=1,
                                sampling_params=token_params)
    assert token_seq.status == SequenceStatus.FINISHED_STOPPED

    # Stop string case.
    string_seq = MockSequence(token_ids=[1, 2, 3], output_text="test STOP")
    string_params = SamplingParams(stop=["STOP"])
    checker.maybe_stop_sequence(string_seq,
                                new_char_count=4,
                                sampling_params=string_params)
    assert string_seq.status == SequenceStatus.FINISHED_STOPPED
|
||||
@ -1,56 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "facebook/bart-base"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Start a ``RemoteOpenAIServer`` for ``MODEL_NAME`` once per module.

    The server runs with bfloat16 dtype and eager execution, and is
    yielded from inside its context manager.
    """
    cli_args = ["--dtype", "bfloat16", "--enforce-eager"]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the ``server`` fixture."""
    async with server.get_async_client() as openai_client:
        yield openai_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.skip(reason="bart is not yet supported in V1")
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
    """Request one completion from a text prompt, then one from token IDs."""
    text_completion = await client.completions.create(
        model=model_name,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0)

    assert text_completion.id is not None
    choices = text_completion.choices
    assert choices is not None and len(choices) == 1

    first_choice = text_completion.choices[0]
    assert len(first_choice.text) >= 5
    assert first_choice.finish_reason == "length"
    expected_usage = openai.types.CompletionUsage(completion_tokens=5,
                                                  prompt_tokens=2,
                                                  total_tokens=7)
    assert text_completion.usage == expected_usage

    # test using token IDs
    token_completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(token_completion.choices[0].text) >= 1
|
||||
@ -9,7 +9,7 @@ from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import MultiModalConfig
|
||||
from vllm.config.multimodal import MultiModalConfig
|
||||
from vllm.engine.multiprocessing.client import MQLLMEngineClient
|
||||
from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
|
||||
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
|
||||
|
||||
@ -232,6 +232,9 @@ EXPECTED_METRICS_V1 = [
|
||||
"vllm:gpu_cache_usage_perc",
|
||||
"vllm:gpu_prefix_cache_queries",
|
||||
"vllm:gpu_prefix_cache_hits",
|
||||
"vllm:kv_cache_usage_perc",
|
||||
"vllm:prefix_cache_queries",
|
||||
"vllm:prefix_cache_hits",
|
||||
"vllm:num_preemptions_total",
|
||||
"vllm:prompt_tokens_total",
|
||||
"vllm:generation_tokens_total",
|
||||
@ -277,6 +280,9 @@ EXPECTED_METRICS_V1 = [
|
||||
]
|
||||
|
||||
HIDDEN_DEPRECATED_METRICS: list[str] = [
|
||||
"vllm:gpu_cache_usage_perc",
|
||||
"vllm:gpu_prefix_cache_queries",
|
||||
"vllm:gpu_prefix_cache_hits",
|
||||
"vllm:time_per_output_token_seconds_sum",
|
||||
"vllm:time_per_output_token_seconds_bucket",
|
||||
"vllm:time_per_output_token_seconds_count",
|
||||
@ -307,7 +313,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
|
||||
client: openai.AsyncClient, use_v1: bool):
|
||||
|
||||
running_requests, waiting_requests, kv_cache_usage = (
|
||||
_get_running_metrics_from_api(server))
|
||||
_get_running_metrics_from_api(server, use_v1))
|
||||
|
||||
# Expect no running requests or kvcache usage
|
||||
assert running_requests == 0
|
||||
@ -330,7 +336,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
|
||||
|
||||
# Check that we have running requests
|
||||
running_requests, waiting_requests, kv_cache_usage = (
|
||||
_get_running_metrics_from_api(server))
|
||||
_get_running_metrics_from_api(server, use_v1))
|
||||
|
||||
# Expect running requests and kvcache usage
|
||||
assert running_requests > 0
|
||||
@ -349,7 +355,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
|
||||
|
||||
# Verify running and waiting requests counts and KV cache usage are zero
|
||||
running_requests_after, waiting_requests_after, kv_cache_usage_after = (
|
||||
_get_running_metrics_from_api(server))
|
||||
_get_running_metrics_from_api(server, use_v1))
|
||||
|
||||
assert running_requests_after == 0,\
|
||||
(f"Expected 0 running requests after abort, got "
|
||||
@ -362,7 +368,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
|
||||
f"{kv_cache_usage_after}")
|
||||
|
||||
|
||||
def _get_running_metrics_from_api(server: RemoteOpenAIServer):
|
||||
def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
|
||||
"""Return (running_count, waiting_count, kv_cache_usage)"""
|
||||
|
||||
response = requests.get(server.url_for("metrics"))
|
||||
@ -371,6 +377,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
|
||||
# Verify running and waiting requests counts and KV cache usage are zero
|
||||
running_requests, waiting_requests, kv_cache_usage = None, None, None
|
||||
|
||||
kv_cache_usage_metric = ("vllm:kv_cache_usage_perc"
|
||||
if use_v1 else "vllm:gpu_cache_usage_perc")
|
||||
|
||||
for family in text_string_to_metric_families(response.text):
|
||||
if family.name == "vllm:num_requests_running":
|
||||
for sample in family.samples:
|
||||
@ -382,9 +391,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
|
||||
if sample.name == "vllm:num_requests_waiting":
|
||||
waiting_requests = sample.value
|
||||
break
|
||||
elif family.name == "vllm:gpu_cache_usage_perc":
|
||||
elif family.name == kv_cache_usage_metric:
|
||||
for sample in family.samples:
|
||||
if sample.name == "vllm:gpu_cache_usage_perc":
|
||||
if sample.name == kv_cache_usage_metric:
|
||||
kv_cache_usage = sample.value
|
||||
break
|
||||
|
||||
|
||||
@ -74,6 +74,20 @@ async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
|
||||
assert response.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
    """A response cut off by ``max_output_tokens`` is reported as incomplete."""
    request_kwargs = {
        "model": model_name,
        "input": "What is the first paragraph of Moby Dick?",
        "reasoning": {
            "effort": "low"
        },
        "max_output_tokens": 30,
    }
    response = await client.responses.create(**request_kwargs)

    assert response is not None
    assert response.status == "incomplete"
    assert response.incomplete_details.reason == "max_output_tokens"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_chat(client: OpenAI, model_name: str):
|
||||
|
||||
@ -12,7 +12,7 @@ from unittest.mock import MagicMock
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from vllm.config import MultiModalConfig
|
||||
from vllm.config.multimodal import MultiModalConfig
|
||||
from vllm.engine.multiprocessing.client import MQLLMEngineClient
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
|
||||
@ -20,7 +20,6 @@ from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
|
||||
parse_chat_messages_futures,
|
||||
resolve_chat_template_content_format,
|
||||
resolve_hf_chat_template)
|
||||
from vllm.entrypoints.llm import apply_hf_chat_template
|
||||
from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
|
||||
from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64,
|
||||
encode_video_base64)
|
||||
@ -38,7 +37,6 @@ QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
|
||||
QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
|
||||
QWEN25VL_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
QWEN25OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B"
|
||||
MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
||||
LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B"
|
||||
HERMES_MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||
@ -125,27 +123,6 @@ def qwen25omni_tokenizer():
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def mllama_model_config():
    """Module-scoped ``ModelConfig`` for ``MLLAMA_MODEL_ID``.

    Uses the "generate" runner and allows up to two images per prompt.
    """
    per_prompt_limits = {"image": 2}
    return ModelConfig(
        MLLAMA_MODEL_ID,
        runner="generate",
        limit_mm_per_prompt=per_prompt_limits,
    )
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def mllama_tokenizer():
    """Module-scoped ``TokenizerGroup`` for ``MLLAMA_MODEL_ID``."""
    group_options = {
        "enable_lora": False,
        "max_num_seqs": 5,
        "max_input_length": None,
    }
    return TokenizerGroup(MLLAMA_MODEL_ID, **group_options)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def mistral_model_config():
|
||||
return ModelConfig(
|
||||
@ -2249,180 +2226,6 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
|
||||
)
|
||||
|
||||
|
||||
### Mllama currently wraps images / texts as interleaved dictionaries
|
||||
def test_mllama_single_image(
    mllama_model_config,
    mllama_tokenizer,
    image_url,
):
    """A text part plus one image part parses into a text/image dict pair."""
    text_part = {"type": "text", "text": "The content of this image is:"}
    messages = [{
        "role": "user",
        "content": [
            text_part,
            {
                "image_url": image_url
            },
        ],
    }]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        messages,
        mllama_model_config,
        mllama_tokenizer,
        content_format="openai",
    )

    _assert_mm_data_is_image_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])

    # The image part is expected to come back as a bare {"type": "image"}.
    expected_conversation = [{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "The content of this image is:"
            },
            {
                "type": "image"
            },
        ],
    }]
    assert conversation == expected_conversation
|
||||
|
||||
|
||||
def test_mllama_interleaved_images(
    mllama_model_config,
    mllama_tokenizer,
    image_url,
):
    """Two text/image pairs keep their interleaved order after parsing."""
    first_text = "The content of the first image is:"
    second_text = "The content of the second image is:"
    messages = [{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": first_text,
            },
            {
                "image_url": image_url
            },
            {
                "type": "text",
                "text": second_text,
            },
            {
                "image_url": image_url
            },
        ],
    }]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        messages,
        mllama_model_config,
        mllama_tokenizer,
        content_format="openai",
    )

    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])

    # Each image part is expected to come back as a bare {"type": "image"}.
    expected_conversation = [{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "The content of the first image is:"
            },
            {
                "type": "image"
            },
            {
                "type": "text",
                "text": "The content of the second image is:"
            },
            {
                "type": "image"
            },
        ],
    }]
    assert conversation == expected_conversation
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [MLLAMA_MODEL_ID])
def test_multimodal_image_parsing_matches_hf(model, image_url):
    """The vLLM chat-template result matches the tokenizer's own template.

    The same two-image conversation is rendered twice: directly with the
    tokenizer's ``apply_chat_template`` using ``{"type": "image"}`` parts,
    and through ``parse_chat_messages`` + ``apply_hf_chat_template`` using
    ``image_url`` parts. The two results must be equal.
    """

    def get_conversation(is_hf: bool):
        # HF form uses a bare image marker; vLLM form carries the URL.
        if is_hf:
            img_part = {"type": "image"}
        else:
            img_part = {"type": "image_url", "image_url": {"url": image_url}}
        content = [
            {
                "type": "text",
                "text": "The content of the first image is:",
            },
            img_part,
            {
                "type": "text",
                "text": "The content of the second image is:",
            },
            img_part,
            {
                "type": "text",
                "text": "What animal is in the first image?",
            },
        ]
        return [{"role": "user", "content": content}]

    # Model config allowing two images per prompt.
    model_config = ModelConfig(
        model,
        runner="generate",
        limit_mm_per_prompt={
            "image": 2,
        },
    )

    # Tokenizer group, plus the underlying tokenizer it wraps.
    tokenizer_group = TokenizerGroup(
        model,
        enable_lora=False,
        max_num_seqs=5,
        max_input_length=None,
        trust_remote_code=model_config.trust_remote_code,
    )
    tokenizer = tokenizer_group.tokenizer

    # Reference rendering: the tokenizer's own chat template.
    hf_result = tokenizer.apply_chat_template(
        get_conversation(is_hf=True),
        tokenize=False,
        add_generation_prompt=True,
    )

    # vLLM rendering: parse with chat utils, then apply the template.
    parsed_conversation, _, _ = parse_chat_messages(
        get_conversation(is_hf=False),
        model_config,
        tokenizer_group,
        content_format="openai",
    )
    vllm_result = apply_hf_chat_template(
        tokenizer=tokenizer,
        conversation=parsed_conversation,
        chat_template=None,
        model_config=model_config,
        tools=None,
        add_generation_prompt=True,
    )

    assert hf_result == vllm_result
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
@ -2486,7 +2289,6 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
|
||||
(QWEN25VL_MODEL_ID, "openai"),
|
||||
(ULTRAVOX_MODEL_ID, "string"),
|
||||
(QWEN2AUDIO_MODEL_ID, "openai"),
|
||||
(MLLAMA_MODEL_ID, "openai"),
|
||||
(LLAMA_GUARD_MODEL_ID, "openai")],
|
||||
)
|
||||
# yapf: enable
|
||||
@ -2545,7 +2347,6 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
||||
[("Salesforce/blip2-opt-2.7b", "string"),
|
||||
("facebook/chameleon-7b", "string"),
|
||||
("deepseek-ai/deepseek-vl2-tiny", "string"),
|
||||
("microsoft/Florence-2-base", "string"),
|
||||
("adept/fuyu-8b", "string"),
|
||||
("google/paligemma-3b-mix-224", "string"),
|
||||
("Qwen/Qwen-VL", "string"),
|
||||
|
||||
@ -0,0 +1,6 @@
|
||||
model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8"
|
||||
accuracy_threshold: 0.72
|
||||
num_questions: 1319
|
||||
num_fewshot: 5
|
||||
max_model_len: 4096
|
||||
|
||||
@ -3,3 +3,4 @@ Llama-3.2-1B-Instruct-INT8-CT.yaml
|
||||
Llama-3-8B-Instruct-nonuniform-CT.yaml
|
||||
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
|
||||
Qwen1.5-MoE-W4A16-CT.yaml
|
||||
DeepSeek-V2-Lite-Instruct-FP8.yaml
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
25
tests/kernels/quantization/test_hadacore.py
Normal file
25
tests/kernels/quantization/test_hadacore.py
Normal file
@ -0,0 +1,25 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import math
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from compressed_tensors.transform import deterministic_hadamard_matrix
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("hidden_dim", [2**n for n in range(10)])
def test_hadacore(batch_size, hidden_dim, dtype=torch.bfloat16, device="cuda"):
    """Check ``ops.hadacore_transform`` against a dense Hadamard reference.

    The identity matrix is transformed once and compared with
    ``I @ H.T`` for the normalized deterministic Hadamard matrix ``H``.
    Transforming the result again must give back the identity.
    """
    # NOTE(review): `batch_size` is parametrized but never used in the
    # body, so both parametrizations run the same check - confirm intent.
    x = torch.eye(hidden_dim, dtype=dtype, device=device)
    # Build the reference on the same device as the input instead of a
    # hard-coded "cuda", so the `device` argument is honored throughout.
    hadamard = deterministic_hadamard_matrix(
        hidden_dim, dtype=torch.float64, device=device) / math.sqrt(hidden_dim)

    # Pass a clone so `x` is still the untouched identity for the checks below.
    y = ops.hadacore_transform(x.clone())
    y_true = (x.to(hadamard.dtype) @ hadamard.T).to(y.dtype)
    assert torch.allclose(y, y_true)

    # Applying the transform a second time should undo the first.
    y = ops.hadacore_transform(y)
    assert torch.allclose(y, x)
|
||||
@ -5,6 +5,8 @@ import torch
|
||||
|
||||
import vllm._custom_ops as ops
|
||||
from tests.kernels.quant_utils import ref_dynamic_per_tensor_fp8_quant
|
||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||
rocm_per_tensor_w8a8_scaled_mm_impl)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
DTYPES = [torch.bfloat16, torch.float16]
|
||||
@ -116,3 +118,32 @@ def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed):
|
||||
current_platform.get_cu_count())
|
||||
|
||||
assert torch.allclose(out, ref_out, rtol=0.01)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK_FP8)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.skipif(
    not (current_platform.is_rocm() and current_platform.supports_fp8()),
    reason="only test for rocm fp8")
def test_rocm_per_tensor_w8a8_scaled_mm_impl(n, k, m, dtype, seed, use_bias):
    """Compare the ROCm per-tensor w8a8 scaled-mm impl to ``torch._scaled_mm``.

    Both operands are random tensors quantized with
    ``ref_dynamic_per_tensor_fp8_quant``; a bias row is added when
    ``use_bias`` is set. Outputs must agree within ``rtol=0.01``.
    """
    torch.manual_seed(seed)

    # Random draws stay in the order A, B, bias so seeded values match.
    a_fp = torch.rand(n, k, device="cuda")
    b_fp = torch.rand(m, k, device="cuda")

    a_q, scale_a = ref_dynamic_per_tensor_fp8_quant(a_fp)
    b_q, scale_b = ref_dynamic_per_tensor_fp8_quant(b_fp)

    if use_bias:
        bias = torch.rand(1, m, dtype=dtype, device="cuda")
    else:
        bias = None

    output = rocm_per_tensor_w8a8_scaled_mm_impl(a_q, b_q.t(), dtype, scale_a,
                                                 scale_b, bias)
    ref_out = torch._scaled_mm(a_q,
                               b_q.t(),
                               out_dtype=dtype,
                               scale_a=scale_a,
                               scale_b=scale_b,
                               bias=bias)
    assert torch.allclose(output, ref_out, rtol=0.01)
|
||||
|
||||
@ -1,222 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModelForSeq2SeqLM
|
||||
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt,
|
||||
HfRunner, VllmRunner)
|
||||
from ....utils import multi_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
def vllm_to_hf_output(
    vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
    decoder_prompt_type: DecoderPromptType,
):
    """Rewrite a vLLM output tuple so its text matches the HF text format.

    The text always gets a trailing ``</s>``; when the decoder prompt type is
    ``DecoderPromptType.NONE`` it additionally gets a leading ``<s>``. Token
    ids and logprobs are returned untouched.
    """
    token_ids, text, logprobs = vllm_output

    if decoder_prompt_type == DecoderPromptType.NONE:
        bos_prefix = "<s>"
    else:
        bos_prefix = ""

    return token_ids, bos_prefix + text + "</s>", logprobs
|
||||
|
||||
|
||||
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
    decoder_prompt_type: DecoderPromptType,
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
) -> None:
    """Validate vLLM BART against HuggingFace (HF) BART on encoder/decoder
    prompts.

    Args:
        hf_runner: HuggingFace (HF) test model runner.
        vllm_runner: vLLM test model runner.
        prompts: explicit encoder/decoder prompts for the scenario selected
            by ``decoder_prompt_type``.
        decoder_prompt_type: which encoder/decoder prompt scenario is being
            tested; decides how the outputs are aligned (see below).
        model: the HF ID of the specific BART variant under test.
        dtype: the tensor datatype to employ.
        max_tokens: forwarded to both runners' generate calls.
        num_logprobs: forwarded to both runners' generate calls.
        tensor_parallel_size: forwarded to the vLLM runner.
        distributed_executor_backend: forwarded to the vLLM runner.

    Why the decoder-prompt-is-None scenario needs special handling:

    The HF GenerationMixin's default behavior is to force the first decoded
    token to be <BOS> if the prompt does not already contain <BOS> (done
    through a logit processor setting). For a None decoder prompt (a
    singleton encoder prompt, or an explicit encoder/decoder prompt whose
    decoder sub-prompt is None) the two frameworks therefore diverge:

    * HF (1) tokenizes the None prompt as an empty token list, (2) prepends
      <decoder-start-token>, yielding [<decoder-start-token>], (3) passes
      that list to the model, and (4) after prefill overrides the logits to
      force <BOS> as the first generated token.

    * vLLM (1) tokenizes the None prompt as [<BOS>], (2) prepends
      <decoder-start-token>, yielding [<decoder-start-token><BOS>], and
      (3) passes these tokens to the model and proceeds with generation.

    The complete sequences (prompt + decoded tokens) match, but the HF
    *decoded* tokens contain one more initial <BOS> than vLLM's, because
    vLLM injects <BOS> into the prompt rather than the generated output.

    Setting forced_bos_token_id = None on the HF side would remove the
    difference, but that is not "normal" BART usage. Instead, in the
    decoder-prompt-is-None scenario (and only there), the first HF decoded
    token is skipped when validating the vLLM decoded output.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default).

    # Note: encoder/decoder models are currently only compatible with
    # enforce_eager=True. vLLM itself defaults to that for encoder/decoder
    # models when enforce_eager is left unspecified, but the VllmRunner test
    # fixture (which wraps the LLM class) defaults to enforce_eager=False
    # (which a number of existing decoder-only unit tests expect), so it has
    # to be requested explicitly here.
    with vllm_runner(
            model,
            dtype=dtype,
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
            enforce_eager=True,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
            prompts, max_tokens, num_logprobs)

    # Configuration settings for HF baseline
    hf_generation_kwargs = dict(
        top_k=None,
        num_beams=1,
        repetition_penalty=1.0,
        top_p=1.0,
        length_penalty=1.0,
        early_stopping=False,
        no_repeat_ngram_size=None,
        min_length=0,
    )

    with hf_runner(model, dtype=dtype,
                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
        hf_outputs = hf_model.generate_encoder_decoder_greedy_logprobs_limit(
            prompts,
            max_tokens,
            num_logprobs,
            **hf_generation_kwargs,
        )

    # Drop HF's forced leading <BOS> only in the None-decoder-prompt scenario.
    if decoder_prompt_type == DecoderPromptType.NONE:
        hf_skip_tokens = 1
    else:
        hf_skip_tokens = 0

    vllm_outputs_as_hf = [
        vllm_to_hf_output(single_output, decoder_prompt_type)
        for single_output in vllm_outputs
    ]

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs_as_hf,
        name_0="hf",
        name_1="vllm",
        num_outputs_0_skip_tokens=hf_skip_tokens,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        pytest.param(
            "facebook/bart-base",
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        ),
        pytest.param("facebook/bart-large-cnn"),
    ],
)
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
@pytest.mark.skip(reason="bart not supported in V1")
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
                dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None:
    """Single-GPU BART comparison for every decoder prompt type.

    Currently skipped unconditionally (see the skip marker).
    """
    # The fixture is indexed by decoder prompt type.
    scenario_prompts = example_encoder_decoder_prompts[decoder_prompt_type]

    run_test(
        hf_runner,
        vllm_runner,
        scenario_prompts,
        decoder_prompt_type,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
@pytest.mark.skip(reason="bart not supported in V1")
def test_models_distributed(hf_runner, vllm_runner,
                            example_encoder_decoder_prompts,
                            distributed_executor_backend, model, dtype,
                            max_tokens, num_logprobs,
                            decoder_prompt_type) -> None:
    """BART comparison with tensor_parallel_size=2 for each executor backend.

    Currently skipped unconditionally (see the skip marker).
    """
    # The fixture is indexed by decoder prompt type.
    scenario_prompts = example_encoder_decoder_prompts[decoder_prompt_type]

    run_test(
        hf_runner,
        vllm_runner,
        scenario_prompts,
        decoder_prompt_type,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
    )
|
||||
@ -1,123 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModelForSeq2SeqLM
|
||||
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import DecoderPromptType, HfRunner, VllmRunner
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
def vllm_to_hf_output(
    vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
    decoder_prompt_type: DecoderPromptType,
):
    """Rewrite a vLLM output tuple so its text matches the HF text format.

    Only the text changes: ``</s>`` is appended. ``decoder_prompt_type`` is
    accepted but not consulted by this function.
    """
    token_ids, text, logprobs = vllm_output
    return token_ids, text + "</s>", logprobs
|
||||
|
||||
|
||||
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    prompts: list[dict[str, str]],
    decoder_prompt_type: DecoderPromptType,
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
) -> None:
    """Test the vLLM mBART model by validating it against HuggingFace (HF).

    vLLM is run first, then HF; the vLLM outputs are sanitized with
    ``vllm_to_hf_output`` and compared to the HF outputs via
    ``check_logprobs_close`` without skipping any HF tokens.

    For ``DecoderPromptType.NONE`` the vLLM side receives copies of the
    prompts whose decoder prompt is the empty string, while HF still gets the
    original prompts.
    """
    if decoder_prompt_type == DecoderPromptType.NONE:
        vllm_prompts = [{
            "encoder_prompt": original['encoder_prompt'],
            "decoder_prompt": ""
        } for original in prompts]
    else:
        vllm_prompts = prompts

    # Pin the architecture vLLM should load for this checkpoint.
    vllm_kwargs = {
        "hf_overrides": {
            "architectures": ["MBartForConditionalGeneration"]
        }
    }

    with vllm_runner(
            model,
            dtype=dtype,
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
            enforce_eager=True,
            **vllm_kwargs) as vllm_model:  # type: ignore
        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
            vllm_prompts, max_tokens, num_logprobs)

    hf_generation_kwargs = dict(
        top_k=None,
        num_beams=1,
        repetition_penalty=1.0,
        top_p=1.0,
        length_penalty=1.0,
        early_stopping=False,
        no_repeat_ngram_size=None,
        min_length=0,
    )

    with hf_runner(model, dtype=dtype,
                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
        # Start HF decoding from the tokenizer's "ro_RO" language-code id.
        ro_lang_id = hf_model.tokenizer.lang_code_to_id["ro_RO"]
        hf_generation_kwargs["decoder_start_token_id"] = ro_lang_id

        hf_outputs = hf_model.generate_encoder_decoder_greedy_logprobs_limit(
            prompts,  # HF runner still uses the original prompts
            max_tokens,
            num_logprobs,
            **hf_generation_kwargs,
        )

    vllm_outputs_as_hf = [
        vllm_to_hf_output(single_output, decoder_prompt_type)
        for single_output in vllm_outputs
    ]

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs_as_hf,
        name_0="hf",
        name_1="vllm",
        num_outputs_0_skip_tokens=0,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [pytest.param("facebook/mbart-large-en-ro")],
)
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
                dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None:
    """Single-GPU mBART comparison for every decoder prompt type."""
    # The fixture is indexed by decoder prompt type.
    scenario_prompts = example_encoder_decoder_prompts[decoder_prompt_type]

    run_test(
        hf_runner,
        vllm_runner,
        scenario_prompts,
        decoder_prompt_type,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
|
||||
@ -1,147 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.inputs.data import ExplicitEncoderDecoderPrompt, TextPrompt
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, ImageTestAssets, VllmRunner
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
# Checkpoints exercised by the tests in this module.
MODELS = [
    "microsoft/Florence-2-base",
]

# Florence-2 model repo's tokenizer config is missing some special tokens.
# Therefore, we use a converted tokenizer from a forked repo
TOKENIZER = "Isotr0py/Florence-2-tokenizer"

# One prompt per image asset, keyed by asset name.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    # special task token which will output special tokens
    "stop_sign": "<OD>",
    "cherry_blossom": "Describe in detail what is shown in the image.",
})
|
||||
|
||||
|
||||
def get_hf_images_prompts(
    prompts_: list[ExplicitEncoderDecoderPrompt[str, TextPrompt]],
) -> tuple[list[ExplicitEncoderDecoderPrompt[str, str]], list[Image.Image]]:
    """Split multimodal encoder prompts into text prompts and images.

    For each input, the encoder prompt's ``"prompt"`` text becomes the encoder
    prompt of a new ``ExplicitEncoderDecoderPrompt`` (decoder prompt ``None``)
    and its ``multi_modal_data["image"]`` entry is collected separately.

    Returns:
        ``(prompts, images)``, two lists aligned with ``prompts_``.
    """
    text_prompts = []
    pil_images = []

    for item in prompts_:
        encoder_part = item["encoder_prompt"]

        text_only = ExplicitEncoderDecoderPrompt(
            encoder_prompt=encoder_part["prompt"],
            decoder_prompt=None,
        )
        text_prompts.append(text_only)

        pil_images.append(encoder_part["multi_modal_data"]["image"])

    return text_prompts, pil_images
|
||||
|
||||
|
||||
def hf_to_vllm_output(hf_output: tuple[list[int], str,
                                       Optional[SampleLogprobs]]):
    """Strip ``</s>`` and ``<s>`` from the HF text so it matches vLLM's text.

    Token ids and logprobs are passed through unchanged.
    """
    token_ids, text, logprobs = hf_output

    # Same removal order as before: end marker first, then start marker.
    for marker in ("</s>", "<s>"):
        text = text.replace(marker, "")

    return token_ids, text, logprobs
|
||||
|
||||
|
||||
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: list[list[ExplicitEncoderDecoderPrompt]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
) -> None:
    """Compare vLLM and HF greedy logprobs for Florence-2, case by case.

    ``inputs`` holds one list of encoder/decoder prompts per test case. vLLM
    runs first on every case, then HF runs on the text/image pairs produced
    by ``get_hf_images_prompts``. Each pair of results is compared with
    ``check_logprobs_close``, skipping the first HF token.
    """
    with vllm_runner(
            model,
            max_num_seqs=8,
            tokenizer_name=TOKENIZER,
            dtype=dtype,
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
            enforce_eager=True,
    ) as vllm_model:
        vllm_outputs_per_case = []
        for case_prompts in inputs:
            case_outputs = (
                vllm_model.generate_encoder_decoder_greedy_logprobs(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    skip_special_tokens=False,
                ))
            vllm_outputs_per_case.append(case_outputs)

    hf_inputs = [get_hf_images_prompts(case_prompts) for case_prompts in inputs]

    with hf_runner(model, dtype=dtype, skip_tokenizer_init=True) as hf_model:
        # Route get_output_embeddings() to the language model's lm_head.
        # NOTE(review): presumably the HF runner's logprob helper relies on
        # this accessor - confirm against HfRunner.
        hf_model.model.get_output_embeddings = lambda: \
            hf_model.model.language_model.lm_head

        hf_outputs_per_case = []
        for case_prompts, case_images in hf_inputs:
            hf_outputs_per_case.append(
                hf_model.generate_encoder_decoder_greedy_logprobs_limit(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                ))

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
                                        vllm_outputs_per_case):
        sanitized_hf_outputs = [
            hf_to_vllm_output(single_output) for single_output in hf_outputs
        ]
        check_logprobs_close(
            outputs_0_lst=sanitized_hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
            num_outputs_0_skip_tokens=1,
        )
|
||||
|
||||
|
||||
# FIXME: https://github.com/huggingface/transformers/issues/38358
@pytest.mark.skip("Model initialization fails")
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
                image_assets: ImageTestAssets, model: str,
                size_factors: list[float], dtype: str, max_tokens: int,
                num_logprobs: int) -> None:
    """Compare vLLM and HF Florence-2 outputs across image scale factors.

    For every image asset / prompt pair, one encoder/decoder prompt is built
    per entry of ``size_factors``, with the image rescaled by that factor and
    the decoder prompt left as ``None``. An empty ``size_factors`` therefore
    yields an empty prompt list for each image.

    Currently skipped unconditionally (see the FIXME above the decorators).
    """
    images = [asset.pil_image for asset in image_assets]

    # size_factors are fractional scale factors (e.g. 0.25), hence
    # list[float] rather than list[int].
    inputs_per_image = [[
        ExplicitEncoderDecoderPrompt(
            encoder_prompt=TextPrompt(
                prompt=prompt,
                multi_modal_data={"image": rescale_image_size(image, factor)}),
            decoder_prompt=None,
        ) for factor in size_factors
    ] for image, prompt in zip(images, HF_IMAGE_PROMPTS)]

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_image,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
|
||||
@ -1,768 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional, overload
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from packaging.version import Version
|
||||
from transformers import AutoConfig, AutoModelForImageTextToText, AutoTokenizer
|
||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.attention.backends.flash_attn import FlashAttentionMetadata
|
||||
from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
|
||||
global_force_attn_backend_context_manager)
|
||||
from vllm.model_executor.models.mllama import MllamaForConditionalGeneration
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, ImageTestAssets,
|
||||
PromptImageInput, VllmRunner)
|
||||
from ....quantization.utils import is_quant_method_supported
|
||||
from ....utils import (create_new_process_for_each_test, large_gpu_test,
|
||||
multi_gpu_test)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
# Upper bound on images per prompt handed to the vLLM runner.
_LIMIT_IMAGE_PER_PROMPT = 3
# Token id standing for <|image|> in the pre-tokenized prompts below.
MLLAMA_IMAGE_TOKEN_ID = 128256

# Attention backends the image tests are parametrized over.
LIST_ENC_DEC_SUPPORTED_BACKENDS = [
    _Backend.XFORMERS,
    _Backend.FLASH_ATTN,
]

# One prompt per image asset, keyed by asset name.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign": "<|image|><|begin_of_text|>The meaning of the image is",
    "cherry_blossom": "<|image|><|begin_of_text|>The city is",
})

text_only_prompts = [
    "The color of the sky is blue but sometimes it can also be",
]

models = [
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
]

# Indices for inputs
TEXT_ONLY = "0"
IMAGE_AT_BEG = "1"
IMAGE_AT_MIDDLE = "2"
TWO_IMAGES = "3"

# Input tokenized
prompt_data = {
    # Tell me a story
    TEXT_ONLY: [41551, 757, 264, 3446],
    # <|image|> What's the content of this image
    IMAGE_AT_BEG: [
        MLLAMA_IMAGE_TOKEN_ID, 3639, 596, 279, 2262, 315, 420, 2217, 220
    ],
    # Hello <|image|>What' the content of this image
    IMAGE_AT_MIDDLE: [
        9906, 220, MLLAMA_IMAGE_TOKEN_ID, 3923, 6, 279, 2262, 315, 420, 2217
    ],
    #<|image|>Is there a duck in this image?<|image|>What's the animal in this image? # noqa: E501
    TWO_IMAGES: [
        MLLAMA_IMAGE_TOKEN_ID, 3957, 1070, 264, 37085, 304, 420, 2217, 30,
        MLLAMA_IMAGE_TOKEN_ID, 3923, 596, 279, 10065, 304, 420, 2217, 30
    ],
}
|
||||
|
||||
|
||||
def vllm_to_hf_output(vllm_output: tuple[list[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Sanitize vllm output to be comparable with hf output.

    Runs of consecutive image tokens in the output ids are collapsed to a
    single image token, and the decoded EOS token is appended to the text
    when the (collapsed) ids end with EOS.

    Args:
        vllm_output: ``(output_ids, output_str, out_logprobs)`` from vLLM.
        model: HF model ID used to look up the image token id (from the
            model config) and the EOS token (from the tokenizer).

    Returns:
        ``(hf_output_ids, hf_output_str, out_logprobs)``; logprobs are
        passed through unchanged.
    """
    output_ids, output_str, out_logprobs = vllm_output

    config = AutoConfig.from_pretrained(model)
    image_token_id = config.image_token_index

    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id

    # Drop an image token only if the token right before it is also an image
    # token. The first token has no predecessor: without the idx == 0 guard,
    # output_ids[idx - 1] would wrap around to the last element.
    hf_output_ids = [
        token_id for idx, token_id in enumerate(output_ids)
        if (token_id != image_token_id or idx == 0
            or output_ids[idx - 1] != image_token_id)
    ]

    hf_output_str = output_str
    # Guard against an empty generation before peeking at the last id.
    if hf_output_ids and hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)

    return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def _get_inputs(
    image_assets: ImageTestAssets,
    *,
    size_factors: Optional[list[float]] = None,
    sizes: Optional[list[tuple[int, int]]] = None,
) -> list[tuple[list[str], PromptImageInput]]:
    """Build ``(prompts, images)`` pairs, one per image asset.

    ``size_factors`` takes precedence when both arguments are given:

    * with ``size_factors``, each asset's prompt is repeated once per factor
      and paired with the image rescaled by that factor;
    * with ``sizes``, each entry yields the image resized to that size, or,
      for a ``None`` entry, the first text-only prompt with no image. An
      empty ``sizes`` additionally appends one pure text-only case.

    Raises:
        ValueError: if neither ``size_factors`` nor ``sizes`` is provided.
    """
    images = [asset.pil_image for asset in image_assets]

    if size_factors is None and sizes is None:
        raise ValueError("You must provide either `size_factors` or `sizes`")

    inputs_per_image = []

    if size_factors is not None:
        for image, prompt in zip(images, HF_IMAGE_PROMPTS):
            repeated_prompts = [prompt for _ in size_factors]
            rescaled_images = [
                rescale_image_size(image, factor) for factor in size_factors
            ]
            inputs_per_image.append((repeated_prompts, rescaled_images))
        return inputs_per_image

    for image, prompt in zip(images, HF_IMAGE_PROMPTS):
        # A None size stands for a text-only request within the batch.
        batch_prompts = [
            text_only_prompts[0] if size is None else prompt for size in sizes
        ]
        batch_images = [
            None if size is None else image.resize(size) for size in sizes
        ]
        inputs_per_image.append((batch_prompts, batch_images))

    if len(sizes) == 0:
        inputs_per_image.append(
            (text_only_prompts, [None] * len(text_only_prompts)))

    return inputs_per_image
|
||||
|
||||
|
||||
@overload
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
    model: str,
    *,
    size_factors: list[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    ...


@overload
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
    model: str,
    *,
    sizes: list[tuple[int, int]],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    ...


def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
    model: str,
    *,
    size_factors: Optional[list[float]] = None,
    sizes: Optional[list[tuple[int, int]]] = None,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Build inputs from the image assets and delegate to ``_run_test``.

    The two overloads above document the intended call shapes: pass either
    ``size_factors`` or ``sizes``. Input construction is done by
    ``_get_inputs``; every other argument is forwarded unchanged.
    """
    test_inputs = _get_inputs(
        image_assets,
        size_factors=size_factors,
        sizes=sizes,
    )

    _run_test(
        hf_runner,
        vllm_runner,
        test_inputs,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
    )
|
||||
|
||||
|
||||
def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: list[tuple[list[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Inference result should be the same between hf and vllm.

    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.

    ``inputs`` holds one ``(prompts, images)`` pair per case; every case is
    generated with both runners and compared via ``check_logprobs_close``.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).

    # max_model_len should be greater than image_feature_size
    with vllm_runner(
            model,
            dtype=dtype,
            max_model_len=19212,  # 3 max size images
            max_num_seqs=3,
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
            limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT},
    ) as vllm_model:
        vllm_outputs_per_image = []
        for case_prompts, case_images in inputs:
            vllm_outputs_per_image.append(
                vllm_model.generate_greedy_logprobs(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                ))

    with hf_runner(
            model,
            dtype=dtype,
            model_kwargs={"device_map": "auto"},
            auto_cls=AutoModelForImageTextToText,
    ) as hf_model:
        hf_outputs_per_image = []
        for case_prompts, case_images in inputs:
            hf_outputs_per_image.append(
                hf_model.generate_greedy_logprobs_limit(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                ))

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
                                        vllm_outputs_per_image):
        vllm_outputs_as_hf = [
            vllm_to_hf_output(single_output, model)
            for single_output in vllm_outputs
        ]
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs_as_hf,
            name_0="hf",
            name_1="vllm",
        )
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def clear_cache():
    """Reset the cached attention-backend lookup before every test.

    The fixture is autouse, so each test in this module starts with an empty
    ``_cached_get_attn_backend`` cache; nothing is done on teardown.
    """
    # Setup: drop any backend cached by an earlier test.
    _cached_get_attn_backend.cache_clear()

    # Hand control to the test body.
    yield
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "sizes",
    [
        # Text only
        [],
        # Single-size
        [(512, 512)],
        # Single-size, batched
        [(512, 512), (512, 512), (512, 512)],
        # mllama has 8 possible aspect ratios, carefully set the sizes
        # to cover all of them
        # NOTE(review): (512, 2028) below looks like it may have been meant
        # as (512, 2048); kept as-is - confirm before changing.
        # Multi-size, batched
        [
            (512, 512),
            (1024, 512),
            (1536, 512),
            (2048, 512),
            (512, 1024),
            (1024, 1024),
            (512, 1536),
            (512, 2028),
        ],
        # Multi-size, batched, including text only
        [
            (512, 512),
            (1024, 512),
            (1536, 512),
            (2048, 512),
            (512, 1024),
            (1024, 1024),
            (512, 1536),
            (512, 2028),
            None,
        ],
    ])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
@pytest.mark.skipif(
    Version(TRANSFORMERS_VERSION) <= Version("4.55.2"),
    reason="Transformers v4.55 has a regression issue on mllama, "
    "see: https://github.com/huggingface/transformers/pull/40083")
def test_models_single_leading_image(hf_runner, vllm_runner, image_assets,
                                     model, sizes, dtype, max_tokens,
                                     num_logprobs,
                                     attn_backend: _Backend) -> None:
    """Compare HF and vLLM mllama outputs for prompts with one leading image.

    Each ``sizes`` entry resizes the asset images to the given sizes (``None``
    meaning a text-only request); the comparison runs with the requested
    attention backend forced globally.
    """
    with global_force_attn_backend_context_manager(attn_backend):
        # Flash Attention works only with bfloat16 data-type
        effective_dtype = ('bfloat16'
                           if attn_backend == _Backend.FLASH_ATTN else dtype)
        run_test(
            hf_runner,
            vllm_runner,
            image_assets,
            model,
            sizes=sizes,
            dtype=effective_dtype,
            max_tokens=max_tokens,
            num_logprobs=num_logprobs,
            tensor_parallel_size=1,
        )
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
@pytest.mark.skipif(
    Version(TRANSFORMERS_VERSION) <= Version("4.55.2"),
    reason="Transformers v4.55 has a regression issue on mllama, "
    "see: https://github.com/huggingface/transformers/pull/40083")
def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets,
                                     model, dtype, max_tokens, num_logprobs,
                                     attn_backend: _Backend) -> None:
    """Compare HF and vLLM mllama outputs when several images lead the prompt.

    A single batch of three prompts is used: two images at original size,
    two images of different sizes, and three images of different sizes.
    """
    stop_sign = image_assets[0].pil_image
    cherry_blossom = image_assets[1].pil_image

    prompts = [
        "<|image|><|image|><|begin_of_text|>Describe 2 images.",  # noqa: E501
        "<|image|><|image|><|begin_of_text|>Describe 2 images.",  # noqa: E501
        "<|image|><|image|><|image|><|begin_of_text|>Describe 3 images.",  # noqa: E501
    ]
    images_per_prompt = [
        [stop_sign, cherry_blossom],
        # Images with different sizes.
        [
            stop_sign.resize((512, 512)),
            stop_sign,
        ],
        [
            stop_sign,
            stop_sign.resize((512, 1536)),
            cherry_blossom.resize((512, 1024)),
        ],
    ]
    inputs = [(prompts, images_per_prompt)]

    with global_force_attn_backend_context_manager(attn_backend):
        # Flash Attention works only with bfloat16 data-type
        effective_dtype = ('bfloat16'
                           if attn_backend == _Backend.FLASH_ATTN else dtype)
        _run_test(
            hf_runner,
            vllm_runner,
            inputs,
            model,
            dtype=effective_dtype,
            max_tokens=max_tokens,
            num_logprobs=num_logprobs,
            tensor_parallel_size=1,
        )
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
@pytest.mark.skipif(
    Version(TRANSFORMERS_VERSION) <= Version("4.55.2"),
    reason="Transformers v4.55 has a regression issue on mllama, "
    "see: https://github.com/huggingface/transformers/pull/40083")
def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
                                   dtype, max_tokens, num_logprobs,
                                   attn_backend: _Backend) -> None:
    """Compare HF and vLLM mllama outputs when images sit inside the text.

    A single batch of two prompts is used: one with a single image
    placeholder mid-sentence and one with two placeholders.
    """
    stop_sign = image_assets[0].pil_image
    cherry_blossom = image_assets[1].pil_image

    prompts = [
        "<|begin_of_text|>The content of the image <|image|> is",  # noqa: E501
        "<|begin_of_text|>Between the first image <|image|> and the second image<|image|>, "  # noqa: E501
        "which is a stop sign and which is a cherry blossom?",  # noqa: E501
    ]
    images_per_prompt = [
        [stop_sign],
        [stop_sign, cherry_blossom],
    ]
    inputs = [(prompts, images_per_prompt)]

    with global_force_attn_backend_context_manager(attn_backend):
        # Flash Attention works only with bfloat16 data-type
        effective_dtype = ('bfloat16'
                           if attn_backend == _Backend.FLASH_ATTN else dtype)
        _run_test(
            hf_runner,
            vllm_runner,
            inputs,
            model,
            dtype=effective_dtype,
            max_tokens=max_tokens,
            num_logprobs=num_logprobs,
            tensor_parallel_size=1,
        )
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.skipif(
    Version(TRANSFORMERS_VERSION) <= Version("4.55.2"),
    reason="Transformers v4.55 has a regression issue on mllama, "
    "see: https://github.com/huggingface/transformers/pull/40083")
def test_models_distributed(
    hf_runner,
    vllm_runner,
    image_assets,
    distributed_executor_backend,
    model,
    dtype,
    max_tokens,
    num_logprobs,
) -> None:
    """mllama comparison with tensor_parallel_size=2 for each executor
    backend, using the image assets at three scale factors."""
    scale_factors = [0.25, 0.5, 1.0]

    run_test(
        hf_runner,
        vllm_runner,
        image_assets,
        model=model,
        size_factors=scale_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
    )
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
def test_bnb_regression(
    image_assets: ImageTestAssets,
    model: str,
    dtype: str,
    max_tokens: int,
):
    """Generate with bitsandbytes quantization and check outputs are returned.

    The batch mixes one image prompt and one text-only prompt.
    """
    stop_sign = image_assets[0].pil_image

    image_request = {
        "prompt": "<|begin_of_text|>The content of the image <|image|> is",
        "multi_modal_data": {
            "image": stop_sign
        },
    }
    text_request = {
        "prompt":
        "The color of the sky is blue but sometimes it can also be",
    }

    # Test regression about QKVCrossParallelLinear
    llm = LLM(
        model=model,
        dtype=dtype,
        max_model_len=8192,
        max_num_seqs=2,
        quantization="bitsandbytes",
    )
    greedy_params = SamplingParams(temperature=0, max_tokens=max_tokens)

    outputs = llm.generate([image_request, text_request], greedy_params)
    assert outputs
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
def test_explicit_implicit_prompt(
    image_assets: ImageTestAssets,
    model: str,
    dtype: str,
    max_tokens: int,
):
    """Check explicit encoder/decoder prompts match their implicit forms.

    The first half of the batch uses explicit `encoder_prompt` /
    `decoder_prompt` dicts, the second half plain `prompt` dicts; the
    generated text must be equal pairwise.
    """
    stop_sign = image_assets[0].pil_image
    sky_prompt = "The color of the sky is blue but sometimes it can also be"  # noqa: E501

    # yapf: disable
    explicit_prompts = [
        {
            "encoder_prompt": {
                "prompt": "<|image|>",
                "multi_modal_data": {"image": stop_sign},
            },
            "decoder_prompt": {
                "prompt_token_ids": [128000, 791, 2262, 315, 279, 2217, 220, 128256, 374],  # noqa: E501
            }
        },
        {
            "encoder_prompt": "Not <|image|>",
            "decoder_prompt": sky_prompt,
        },
    ]
    implicit_prompts = [
        {
            "prompt": "<|begin_of_text|>The content of the image <|image|> is",  # noqa: E501
            "multi_modal_data": {"image": stop_sign},
        },
        {
            "prompt": sky_prompt,
        },
    ]
    # yapf: enable

    # Explicit prompts first, then their implicit counterparts in the
    # same order, so the batch can be split in half afterwards.
    prompts = explicit_prompts + implicit_prompts

    llm = LLM(
        model=model,
        dtype=dtype,
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=1,
    )
    sampling_params = SamplingParams(temperature=0, max_tokens=max_tokens)
    outputs = llm.generate(prompts, sampling_params)

    half = len(prompts) // 2
    for exp_output, imp_output in zip(outputs[:half], outputs[half:]):
        assert exp_output.outputs[0].text == imp_output.outputs[0].text
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
                    num_logprobs, attn_backend: _Backend) -> None:
    """Replay mixed text/image batches that previously triggered bugs.

    First a prompt with more image tags than images must raise
    `ValueError`; then several mixed batches must run without raising.
    """
    stop_sign = image_assets[0].pil_image

    image_question = "<|begin_of_text|>Some text before.<|image|>What is in the image?"  # noqa: E501

    # (prompts, images) batches that must generate without raising,
    # executed in exactly this order.
    passing_batches = [
        # Batch of a text-only and image request that requires cross-attention
        (
            [
                "What is the capital of spain?",
                "Text before the image...<|image|>What is in the image?",  # noqa: E501
            ],
            [None, [stop_sign]],
        ),
        # Test the reverse order too for good measure
        (
            [
                "<|begin_of_text|>Text before the image...<|image|>What is in the image?",  # noqa: E501
                "<|begin_of_text|>Hello!",
            ],
            [[stop_sign], None],
        ),
        # Mixed batch with text and images with different numbers of tiles
        (
            ["<|begin_of_text|>Hello!", image_question, image_question],
            [
                None,
                [stop_sign],
                # smaller image must be 2nd for the repro
                [stop_sign.resize((448, 448))],
            ],
        ),
    ]

    with global_force_attn_backend_context_manager(attn_backend), vllm_runner(
            model,
            dtype=dtype,
            max_model_len=8192,
            max_num_seqs=4,
            tensor_parallel_size=1,
            limit_mm_per_prompt={"image":
                                 _LIMIT_IMAGE_PER_PROMPT}) as vllm_model:

        # Regression tests for https://github.com/vllm-project/vllm/issues/10648

        # Number of groups of image tokens is greater than the number of images
        # provided (the whitespace between the tags is necessary)
        too_many_tags = "<|begin_of_text|><|image|> <|image|> Compare the two images"  # noqa: E501
        with pytest.raises(ValueError):
            vllm_model.generate_greedy_logprobs([too_many_tags],
                                                max_tokens,
                                                num_logprobs,
                                                images=[stop_sign])

        for prompts, images in passing_batches:
            vllm_model.generate_greedy_logprobs(prompts,
                                                max_tokens,
                                                num_logprobs,
                                                images=images)
|
||||
|
||||
|
||||
class DummyModel:
    # Minimal stand-in passed as `self` to MllamaForConditionalGeneration
    # methods in the tests below; it only carries the image token id.
    image_token_id = MLLAMA_IMAGE_TOKEN_ID
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.parametrize(
    "input_indices_and_output",
    # inputs, (cross_attention_mask, kv_range_for_decode)
    [([TEXT_ONLY], (None, None)), ([IMAGE_AT_BEG], (None, None)),
     ([TEXT_ONLY, IMAGE_AT_BEG], (None, None)),
     ([IMAGE_AT_MIDDLE], ((10, 12), [[0, 6]])),
     ([TEXT_ONLY, IMAGE_AT_MIDDLE], ((14, 12), [[0, 6]])),
     ([TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE],
      ((23, 24), [[0, 6], [6, 12]])),
     ([IMAGE_AT_MIDDLE, TEXT_ONLY], ((14, 12), [[0, 6]])),
     ([TWO_IMAGES], ((18, 12), [[6, 12]])),
     ([TEXT_ONLY, TWO_IMAGES], ((22, 12), [[6, 12]]))])
def test_get_cross_attention_mask(input_indices_and_output) -> None:
    """Check the mask shape and decode KV ranges of `get_cross_attention_mask`.

    Each case gives the indices into `prompt_data` to batch together and
    the expected `(mask shape or None, kv_range_for_decode)` pair.
    """
    input_indices, expected_output = input_indices_and_output
    expected_mask_shape, expected_kv_range = expected_output

    sequences = [torch.tensor(prompt_data[i]) for i in input_indices]
    seq_lens = [len(seq) for seq in sequences]
    input_ids = torch.cat(sequences)

    # One [2, 2] tile entry per non-text prompt; text-only prompts
    # contribute nothing.
    num_tiles = [[2, 2] for i in input_indices if i != TEXT_ONLY]

    attn_data = FlashAttentionMetadata(
        seq_lens=seq_lens,
        # Dummy values
        enable_kv_scales_calculation=False,
        num_prefills=0,
        num_prefill_tokens=0,
        num_decode_tokens=0,
        slot_mapping=0,
        multi_modal_placeholder_index_maps=None,
        seq_lens_tensor=0,
        max_prefill_seq_len=0,
        max_decode_seq_len=0,
        context_lens_tensor=None,
        block_tables=None,
        use_cuda_graph=False,
    )

    # The method is called unbound, with DummyModel standing in for `self`.
    cross_attention_mask, kv_range_for_decode = (
        MllamaForConditionalGeneration.get_cross_attention_mask(
            DummyModel(),
            input_ids,
            attn_data,
            num_tiles=num_tiles,
            num_tokens_per_tile=3,
            dtype=torch.bfloat16,
        ))

    assert kv_range_for_decode == expected_kv_range
    if expected_mask_shape is None:
        assert cross_attention_mask is None
    else:
        assert cross_attention_mask is not None
        assert cross_attention_mask.shape == expected_mask_shape
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.parametrize(
    "input_indices",
    [[TEXT_ONLY], [IMAGE_AT_BEG], [TEXT_ONLY, IMAGE_AT_BEG], [IMAGE_AT_MIDDLE],
     [TEXT_ONLY, IMAGE_AT_MIDDLE], [TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE],
     [IMAGE_AT_MIDDLE, TEXT_ONLY], [TWO_IMAGES], [TEXT_ONLY, TWO_IMAGES]])
def test_get_full_text_row_masked_out_mask(input_indices) -> None:
    """Check the per-token mask from `get_full_text_row_masked_out_mask`.

    Every token of a text-only sequence must compare equal to False and
    every token of any other sequence must compare equal to True.
    """
    sequences = [torch.tensor(prompt_data[i]) for i in input_indices]
    seq_lens = [len(seq) for seq in sequences]
    num_prefill_tokens = sum(seq_lens)

    # TEXT_ONLY is zero, so it will be masked out,
    # other instances should not be.
    encoder_seq_lens = list(map(int, input_indices))

    attn_data = FlashAttentionMetadata(
        seq_lens=seq_lens,
        encoder_seq_lens=encoder_seq_lens,
        num_prefill_tokens=num_prefill_tokens,
        # Dummy values
        enable_kv_scales_calculation=False,
        num_prefills=0,
        num_decode_tokens=0,
        slot_mapping=0,
        multi_modal_placeholder_index_maps=None,
        seq_lens_tensor=0,
        max_prefill_seq_len=0,
        max_decode_seq_len=0,
        context_lens_tensor=None,
        block_tables=None,
        use_cuda_graph=False,
    )

    # The method is called unbound, with DummyModel standing in for `self`.
    mask_tensor = (
        MllamaForConditionalGeneration.get_full_text_row_masked_out_mask(
            DummyModel(), attn_data, torch.get_default_device()))
    full_text_row_masked_out_mask = mask_tensor.squeeze().tolist()

    assert len(full_text_row_masked_out_mask) == num_prefill_tokens

    # Walk the flattened mask one sequence at a time.
    offset = 0
    for prompt_index, seq_len in zip(input_indices, seq_lens):
        must_be_masked = prompt_index != TEXT_ONLY
        for idx in range(offset, offset + seq_len):
            assert full_text_row_masked_out_mask[idx] == must_be_masked, \
                f"full_text_row_masked_out_mask[{idx}] must be " \
                f"'{must_be_masked}' "
        offset += seq_len
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.parametrize("encoder_seq_lens, num_tiles, expected", [
    ([6404], [[4]], [6404]),
    ([0, 6404], [[4]], [6404]),
    ([0, 1601, 8005], [[1], [4, 1]], [1601, 8005]),
    ([0, 19212, 0, 3202], [[4, 4, 4], [2]], [19212, 3202]),
])
def test_parse_and_validate_encoder_lens(encoder_seq_lens, num_tiles,
                                         expected) -> None:
    """Compare `_get_and_validate_encoder_lens` output with `expected`.

    In every case above, `expected` is `encoder_seq_lens` with the zero
    entries removed.
    """
    num_tokens_per_tile = 1601

    # The method is called unbound, with DummyModel standing in for `self`.
    validate = MllamaForConditionalGeneration._get_and_validate_encoder_lens
    actual_encoder_seq_lens = validate(
        DummyModel(),
        encoder_seq_lens,
        num_tiles,
        num_tokens_per_tile,
    )

    assert actual_encoder_seq_lens == expected, \
        f"Expected {expected} but got {actual_encoder_seq_lens}"
|
||||
@ -32,11 +32,14 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
|
||||
# Ensure video metadata is included
|
||||
if "video" in mm_data:
|
||||
video = mm_data["video"]
|
||||
num_frames = len(video)
|
||||
mm_data["video"] = (video, {
|
||||
"total_num_frames": len(video),
|
||||
"fps": len(video),
|
||||
"total_num_frames": num_frames,
|
||||
"fps": num_frames,
|
||||
"duration": 1,
|
||||
"video_backend": "opencv"
|
||||
"frames_indices": [i for i in range(num_frames)],
|
||||
"video_backend": "opencv",
|
||||
"do_sample_frames": True,
|
||||
})
|
||||
return mm_data
|
||||
|
||||
@ -164,8 +167,6 @@ def _test_processing_correctness(
|
||||
# incorrect token ids. So we need to use `add_special_tokens=False` here
|
||||
# to leave bos_token to be added by the processor.
|
||||
_ADD_SPECIAL_TOKENS_OVERRIDES = {
|
||||
"donut": False,
|
||||
"mllama": False,
|
||||
"ovis": False,
|
||||
"ovis2_5": False,
|
||||
"paligemma": False,
|
||||
@ -275,9 +276,7 @@ def _test_processing_correctness_one(
|
||||
"facebook/chameleon-7b",
|
||||
"CohereLabs/command-a-vision-07-2025",
|
||||
"deepseek-ai/deepseek-vl2-tiny",
|
||||
"naver-clova-ix/donut-base-finetuned-docvqa",
|
||||
"baidu/ERNIE-4.5-VL-28B-A3B-PT",
|
||||
"microsoft/Florence-2-base",
|
||||
"adept/fuyu-8b",
|
||||
"google/gemma-3-4b-it",
|
||||
"google/gemma-3n-E2B-it",
|
||||
@ -302,7 +301,6 @@ def _test_processing_correctness_one(
|
||||
"llava-hf/llava-v1.6-mistral-7b-hf",
|
||||
"llava-hf/LLaVA-NeXT-Video-7B-hf",
|
||||
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
||||
"meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||
"TIGER-Lab/Mantis-8B-siglip-llama3",
|
||||
"mispeech/midashenglm-7b",
|
||||
"openbmb/MiniCPM-Llama3-V-2_5",
|
||||
|
||||
@ -12,8 +12,19 @@ from ...utils import build_model_context
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
|
||||
@pytest.mark.parametrize("expected_toks_per_frame", [299])
|
||||
@pytest.mark.parametrize("num_frames", [32, 128])
|
||||
@pytest.mark.parametrize("fps, expected_grid_t", [(1, 5), (2, 10)])
|
||||
@pytest.mark.parametrize(
|
||||
"num_frames, fps, expected_grid_t",
|
||||
[
|
||||
# pre-sampled fixed frames (unexpected behavior,
|
||||
# but we still expect it to work without errors)
|
||||
(32, 1, 16),
|
||||
(32, 2, 16),
|
||||
(128, 1, 64),
|
||||
(128, 2, 64),
|
||||
# post-sampled frames (expected behavior)
|
||||
(-1, 1, 5),
|
||||
(-1, 2, 10),
|
||||
])
|
||||
def test_processor_override(
|
||||
model_id: str,
|
||||
expected_toks_per_frame: int,
|
||||
@ -80,7 +91,7 @@ def test_video_loader_consistency(
|
||||
|
||||
static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
|
||||
dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
|
||||
video_bytes, requested_fps=fps)
|
||||
video_bytes, fps=fps)
|
||||
|
||||
# pre-sampled loader shouldn't read all frames
|
||||
assert len(dynamic_video) < len(static_video)
|
||||
|
||||
@ -1,72 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for mllama's multimodal preprocessing and profiling."""
|
||||
import pytest
|
||||
from transformers import MllamaConfig
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.profiling import MultiModalProfiler
|
||||
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id",
                         ["meta-llama/Llama-3.2-11B-Vision-Instruct"])
@pytest.mark.parametrize("max_model_len", [4096, 8192, 25600, 131072])
@pytest.mark.parametrize("max_num_seqs", [1, 2, 8])
def test_profiling(
    model_id: str,
    max_model_len: int,
    max_num_seqs: int,
):
    """Check the profiler's dummy encoder length against processed data.

    The token count derived from the processed dummy inputs' tiles must
    be at least the length of the dummy encoder prompt.
    """
    # regression test for https://github.com/vllm-project/vllm/issues/13929
    from vllm.model_executor.models.mllama import calc_token_per_chunk

    ctx = build_model_context(
        model_id,
        model_config_kwargs={"max_model_len": max_model_len},
        limit_mm_per_prompt={"image": 1},
    )

    mm_config = ctx.get_mm_config()
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    profiler = MultiModalProfiler(processor)

    mm_counts = mm_config.limit_per_prompt
    dummy_encoder_data = profiler.get_encoder_dummy_data(
        max_model_len,
        mm_counts=mm_counts,
    )
    dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs(
        max_model_len,
        mm_counts=mm_counts,
    )

    hf_config = ctx.get_hf_config(MllamaConfig)
    image_size = hf_config.vision_config.image_size

    # The same dummy encoder length is repeated once per sequence.
    dummy_encoder_len = len(dummy_encoder_data.prompt_token_ids)
    encoder_seq_lens = [dummy_encoder_len] * max_num_seqs

    processed = processor.apply(
        prompt=dummy_mm_data.prompt,
        mm_data=dummy_mm_data.mm_data,
        hf_processor_mm_kwargs={},
    )
    mm_data = processed["mm_kwargs"].get_data()

    # Get the actual number of encoder tokens for each sample.
    # Because attn_metadata.encoder_seq_lens only counts the last
    # group of images for each sample, which is used to cheat the
    # block manager to allocate blocks for those images only.
    # See MllamaMultiModalProcessor for more details.
    num_tiles = [[t] for t in mm_data.pop("num_tiles")]
    num_tokens_per_tile = calc_token_per_chunk(image_size)
    actual_encoder_seq_lens = [
        sum(tiles) * num_tokens_per_tile for tiles in num_tiles
    ]

    # simulate mllama image-present prefill.
    for actual_len, last_group_len in zip(actual_encoder_seq_lens,
                                          encoder_seq_lens):
        assert actual_len >= last_group_len
|
||||
@ -31,7 +31,6 @@ from ...utils import dummy_hf_overrides
|
||||
|
||||
ARCH_TO_SKIP = {
|
||||
"MolmoForCausalLM": "incompatible requirements",
|
||||
"Florence2ForConditionalGeneration": "not supported in V1",
|
||||
}
|
||||
ARCH_NEEDS_EXTRAS = [
|
||||
"InternVLChatModel",
|
||||
|
||||
@ -180,6 +180,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True),
|
||||
"BailingMoeForCausalLM": _HfExamplesInfo("inclusionAI/Ling-lite-1.5",
|
||||
trust_remote_code=True),
|
||||
"BailingMoeV2ForCausalLM": _HfExamplesInfo("inclusionAI/Ling-mini-2.0",
|
||||
trust_remote_code=True),
|
||||
"BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B-v1",
|
||||
min_transformers_version="4.55.3",
|
||||
extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}), # noqa: E501
|
||||
@ -352,11 +354,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
|
||||
trust_remote_code=True),
|
||||
"Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"),
|
||||
# [Encoder-decoder]
|
||||
"BartModel": _HfExamplesInfo("facebook/bart-base"),
|
||||
"BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"),
|
||||
"MBartForConditionalGeneration": _HfExamplesInfo("facebook/mbart-large-en-ro", # noqa: E501
|
||||
hf_overrides={"architectures": ["MBartForConditionalGeneration"]}), # noqa: E501
|
||||
}
|
||||
|
||||
_EMBEDDING_EXAMPLE_MODELS = {
|
||||
@ -494,7 +491,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True),
|
||||
"Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
|
||||
max_model_len=10240,
|
||||
extras={"llama-guard-4": "meta-llama/Llama-Guard-4-12B"}, # noqa: E501
|
||||
extras={"llama-guard-4": "meta-llama/Llama-Guard-4-12B"}, # noqa: E501
|
||||
),
|
||||
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
|
||||
extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501
|
||||
@ -581,15 +578,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
is_available_online=False,
|
||||
),
|
||||
# [Encoder-decoder]
|
||||
"DonutForConditionalGeneration": _HfExamplesInfo("naver-clova-ix/donut-base-finetuned-docvqa", # noqa: E501
|
||||
hf_overrides={"architectures": ["DonutForConditionalGeneration"], "model_type": "donut"}, # noqa: E501
|
||||
extras={"dolphin": "ByteDance/Dolphin"}), # noqa: E501
|
||||
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
|
||||
# Therefore, we borrow the BartTokenizer from the original Bart model
|
||||
"Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base", # noqa: E501
|
||||
tokenizer="Isotr0py/Florence-2-tokenizer", # noqa: E501
|
||||
trust_remote_code=True), # noqa: E501
|
||||
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
|
||||
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
|
||||
# [Cross-encoder]
|
||||
"JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"), # noqa: E501
|
||||
|
||||
@ -10,7 +10,7 @@ from vllm import LLM
|
||||
from vllm.config import ModelImpl
|
||||
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
|
||||
from vllm.utils import GiB_bytes
|
||||
from vllm.v1.core.kv_cache_utils import get_kv_cache_config
|
||||
from vllm.v1.core.kv_cache_utils import get_kv_cache_configs
|
||||
from vllm.v1.engine.core import EngineCore as V1EngineCore
|
||||
|
||||
from ..utils import create_new_process_for_each_test
|
||||
@ -68,11 +68,11 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
|
||||
|
||||
def _initialize_kv_caches_v1(self, vllm_config):
|
||||
kv_cache_specs = self.model_executor.get_kv_cache_specs()
|
||||
scheduler_kv_cache_config = get_kv_cache_config(
|
||||
scheduler_kv_cache_config = get_kv_cache_configs(
|
||||
vllm_config,
|
||||
kv_cache_specs[0],
|
||||
10 * GiB_bytes,
|
||||
)
|
||||
kv_cache_specs,
|
||||
[10 * GiB_bytes],
|
||||
)[0]
|
||||
|
||||
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
|
||||
return 1, 0, scheduler_kv_cache_config
|
||||
@ -92,10 +92,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
|
||||
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
|
||||
# L4 supports FA3.
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
|
||||
if model_arch == "Florence2ForConditionalGeneration":
|
||||
# An encoder-decoder model that's V0-only. Just skip it
|
||||
# since V0 is about to be removed.
|
||||
pytest.skip("Skipping Florence2ForConditionalGeneration")
|
||||
if model_arch == "WhisperForConditionalGeneration":
|
||||
m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
|
||||
LLM(
|
||||
|
||||
@ -50,7 +50,6 @@ def test_registry_imports(model_arch):
|
||||
@create_new_process_for_each_test()
|
||||
@pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [
|
||||
("LlamaForCausalLM", False, False, False),
|
||||
("MllamaForConditionalGeneration", True, False, False),
|
||||
("LlavaForConditionalGeneration", True, True, False),
|
||||
("BertForSequenceClassification", False, False, True),
|
||||
("RobertaForSequenceClassification", False, False, True),
|
||||
|
||||
@ -7,6 +7,7 @@ import pytest
|
||||
import torch
|
||||
|
||||
from vllm.config import ModelConfig, ParallelConfig, VllmConfig
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.cache import (MultiModalCache,
|
||||
MultiModalProcessorCacheItem,
|
||||
MultiModalProcessorCacheItemMetadata,
|
||||
@ -17,7 +18,6 @@ from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem,
|
||||
MultiModalKwargsItems,
|
||||
MultiModalSharedField)
|
||||
from vllm.multimodal.processing import PromptInsertion
|
||||
from vllm.multimodal.registry import MultiModalRegistry
|
||||
|
||||
|
||||
def _dummy_elem(
|
||||
@ -96,7 +96,9 @@ def _create_vllm_config(
|
||||
enable_ipc: bool,
|
||||
):
|
||||
return VllmConfig(
|
||||
model_config=ModelConfig(mm_processor_cache_gb=mm_processor_cache_gb),
|
||||
model_config=ModelConfig(
|
||||
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
||||
mm_processor_cache_gb=mm_processor_cache_gb),
|
||||
parallel_config=ParallelConfig(
|
||||
data_parallel_size=1 if enable_ipc else 2),
|
||||
)
|
||||
@ -113,15 +115,16 @@ def _compare_caches(
|
||||
n_iter: int = 100,
|
||||
seed: int = 0,
|
||||
):
|
||||
mm_registry = MultiModalRegistry()
|
||||
cache_0_p0 = processor_cache_from_config(config_0, mm_registry)
|
||||
cache_0_p1 = engine_receiver_cache_from_config(config_0, mm_registry)
|
||||
cache_1_p0 = processor_cache_from_config(config_1, mm_registry)
|
||||
cache_1_p1 = engine_receiver_cache_from_config(config_1, mm_registry)
|
||||
cache_0_p0 = processor_cache_from_config(config_0, MULTIMODAL_REGISTRY)
|
||||
cache_0_p1 = engine_receiver_cache_from_config(config_0,
|
||||
MULTIMODAL_REGISTRY)
|
||||
cache_1_p0 = processor_cache_from_config(config_1, MULTIMODAL_REGISTRY)
|
||||
cache_1_p1 = engine_receiver_cache_from_config(config_1,
|
||||
MULTIMODAL_REGISTRY)
|
||||
|
||||
cache_size_gb = max(
|
||||
config_0.model_config.mm_processor_cache_gb,
|
||||
config_1.model_config.mm_processor_cache_gb,
|
||||
config_0.model_config.multimodal_config.mm_processor_cache_gb,
|
||||
config_1.model_config.multimodal_config.mm_processor_cache_gb,
|
||||
)
|
||||
item_size_gb = int(cache_size_gb / item_capacity)
|
||||
|
||||
|
||||
@ -6,9 +6,9 @@ from dataclasses import MISSING, Field, asdict, dataclass, field
|
||||
import pytest
|
||||
|
||||
from vllm.compilation.backends import VllmBackend
|
||||
from vllm.config import (ModelConfig, PoolerConfig, VllmConfig, get_field,
|
||||
update_config)
|
||||
from vllm.config import ModelConfig, PoolerConfig, VllmConfig, update_config
|
||||
from vllm.config.load import LoadConfig
|
||||
from vllm.config.utils import get_field
|
||||
from vllm.model_executor.layers.pooler import PoolingType
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
@ -299,9 +299,8 @@ def test_rope_customization():
|
||||
reason="Encoder Decoder models not supported on ROCm.")
|
||||
@pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [
|
||||
("facebook/opt-125m", False),
|
||||
("facebook/bart-base", True),
|
||||
("openai/whisper-tiny", True),
|
||||
("meta-llama/Llama-3.2-1B-Instruct", False),
|
||||
("meta-llama/Llama-3.2-11B-Vision", True),
|
||||
])
|
||||
def test_is_encoder_decoder(model_id, is_encoder_decoder):
|
||||
config = ModelConfig(model_id)
|
||||
|
||||
@ -501,34 +501,6 @@ def test_bind_kv_cache_non_attention():
|
||||
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
|
||||
|
||||
|
||||
def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch):
    """Check how `bind_kv_cache` treats encoder, cross and decoder layers.

    The encoder self-attention layer must keep its existing `kv_cache`,
    while the cross-attention and decoder self-attention layers must
    both be bound to the provided cache tensor.
    """
    encoder_layer = 'encoder.layers.0.self_attn.attn'
    cross_layer = 'decoder.layers.0.encoder_attn.attn'
    decoder_layer = 'decoder.layers.0.self_attn.attn'

    # V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "0")

        # Imported only after VLLM_USE_V1 has been set, as in the original.
        from vllm.attention import Attention, AttentionType

        # example from bart
        layer_types = (
            (encoder_layer, AttentionType.ENCODER),
            (cross_layer, AttentionType.ENCODER_DECODER),
            (decoder_layer, AttentionType.DECODER),
        )
        ctx = {
            name: Attention(32, 128, 0.1, attn_type=attn_type)
            for name, attn_type in layer_types
        }

        kv_cache = [torch.zeros((1, ))]
        # Remember the encoder layer's cache object before binding.
        encoder_kv_cache = ctx[encoder_layer].kv_cache

        bind_kv_cache(ctx, [kv_cache])

        assert ctx[encoder_layer].kv_cache is encoder_kv_cache
        assert ctx[cross_layer].kv_cache[0] is kv_cache[0]
        assert ctx[decoder_layer].kv_cache[0] is kv_cache[0]
|
||||
|
||||
|
||||
def test_bind_kv_cache_pp():
|
||||
with patch("vllm.utils.cuda_device_count_stateless", lambda: 2):
|
||||
# this test runs with 1 GPU, but we simulate 2 GPUs
|
||||
|
||||
@ -18,19 +18,28 @@ from vllm.v1.core.kv_cache_manager import KVCacheManager
|
||||
from vllm.v1.core.kv_cache_utils import (
|
||||
BlockHash, FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics,
|
||||
estimate_max_model_len, generate_block_hash_extra_keys,
|
||||
get_kv_cache_config, get_max_concurrency_for_kv_cache_config,
|
||||
get_kv_cache_configs, get_max_concurrency_for_kv_cache_config,
|
||||
get_request_block_hasher, hash_block_tokens, init_none_hash,
|
||||
is_kv_cache_type_uniform, make_block_hash_with_group_id,
|
||||
unify_kv_cache_configs)
|
||||
is_kv_cache_type_uniform, make_block_hash_with_group_id)
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheGroupSpec, KVCacheTensor,
|
||||
SlidingWindowSpec)
|
||||
KVCacheGroupSpec, KVCacheSpec,
|
||||
KVCacheTensor, SlidingWindowSpec)
|
||||
from vllm.v1.metrics.stats import PrefixCacheStats
|
||||
from vllm.v1.request import Request
|
||||
|
||||
# yapf: enable
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _auto_init_hash_fn(request):
    """Initialise the none-hash before every test in this module.

    Tests that request a `hash_fn` fixture/parameter get the none-hash
    initialised with that function; all other tests fall back to
    `sha256`.

    Args:
        request: pytest's fixture request for the test being set up.
    """
    hash_fn: Callable
    if "hash_fn" in request.fixturenames:
        # Fix: keep the callable itself. The original stored the result
        # of `init_none_hash(...)` here and passed that back into
        # `init_none_hash` below; elsewhere in this file that result is
        # always discarded, so it is presumably not the hash function
        # (NOTE(review): confirm against init_none_hash's definition).
        hash_fn = request.getfixturevalue("hash_fn")
    else:
        hash_fn = sha256
    init_none_hash(hash_fn)
|
||||
|
||||
|
||||
def make_request(
|
||||
request_id: str,
|
||||
prompt_token_ids: list[int],
|
||||
@ -244,6 +253,18 @@ def test_free_kv_cache_block_queue_append_n():
|
||||
assert blocks[3].next_free_block is queue.fake_free_list_tail
|
||||
assert queue.fake_free_list_tail.prev_free_block is blocks[3]
|
||||
|
||||
# Create an empty FreeKVCacheBlockQueue
|
||||
invalid_queue = FreeKVCacheBlockQueue([])
|
||||
# set prev_free_block to None; this will trigger an assertion in append_n
|
||||
invalid_queue.fake_free_list_tail.prev_free_block = None
|
||||
with pytest.raises(AssertionError):
|
||||
# Append 1 block
|
||||
# fake_head->fake_tail
|
||||
invalid_queue.append_n(blocks[0:1])
|
||||
assert invalid_queue.num_free_blocks == 0
|
||||
assert (invalid_queue.fake_free_list_head.next_free_block ==
|
||||
invalid_queue.fake_free_list_tail)
|
||||
|
||||
|
||||
def test_free_kv_cache_block_queue_popleft_n():
|
||||
blocks = [KVCacheBlock(block_id=i) for i in range(6)]
|
||||
@ -269,9 +290,11 @@ def test_free_kv_cache_block_queue_popleft_n():
|
||||
# Pop 0 block
|
||||
# fake_head->b1->b3->b5->b4->b0->b2->fake_tail
|
||||
assert len(queue.popleft_n(0)) == 0
|
||||
assert queue.num_free_blocks == 6
|
||||
# Pop 1 block
|
||||
# fake_head->b3->b5->b4->b0->b2->fake_tail
|
||||
result_blocks = queue.popleft_n(1)
|
||||
assert queue.num_free_blocks == 5
|
||||
assert len(result_blocks) == 1
|
||||
assert result_blocks[0] is blocks[1]
|
||||
for block in result_blocks:
|
||||
@ -281,6 +304,7 @@ def test_free_kv_cache_block_queue_popleft_n():
|
||||
# fake_head->b4->b0->b2->fake_tail
|
||||
result_blocks = queue.popleft_n(2)
|
||||
assert len(result_blocks) == 2
|
||||
assert queue.num_free_blocks == 3
|
||||
assert result_blocks[0] is blocks[3]
|
||||
assert result_blocks[1] is blocks[5]
|
||||
for block in result_blocks:
|
||||
@ -290,6 +314,7 @@ def test_free_kv_cache_block_queue_popleft_n():
|
||||
# fake_head->fake_tail
|
||||
result_blocks = queue.popleft_n(3)
|
||||
assert len(result_blocks) == 3
|
||||
assert queue.num_free_blocks == 0
|
||||
assert result_blocks[0] is blocks[4]
|
||||
assert result_blocks[1] is blocks[0]
|
||||
assert result_blocks[2] is blocks[2]
|
||||
@ -409,7 +434,6 @@ def test_generate_block_hash_extra_keys_cache_salt():
|
||||
|
||||
@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
|
||||
def test_hash_block_tokens(hash_fn):
|
||||
init_none_hash(hash_fn)
|
||||
parent_block_hash = BlockHash(b"123")
|
||||
curr_block_token_ids = (1, 2, 3)
|
||||
extra_keys = ("key1", "key2")
|
||||
@ -422,8 +446,6 @@ def test_hash_block_tokens(hash_fn):
|
||||
|
||||
@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
|
||||
def test_request_block_hasher(hash_fn):
|
||||
kv_cache_utils.init_none_hash(hash_fn)
|
||||
|
||||
request = make_request(
|
||||
request_id="0",
|
||||
prompt_token_ids=[_ for _ in range(6)],
|
||||
@ -446,8 +468,6 @@ def test_request_block_hasher(hash_fn):
|
||||
|
||||
@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
|
||||
def test_hash_tokens_different_mm_input(hash_fn):
|
||||
init_none_hash(hash_fn)
|
||||
|
||||
request1 = make_request(
|
||||
request_id="0",
|
||||
prompt_token_ids=[_ for _ in range(6)],
|
||||
@ -476,8 +496,6 @@ def test_hash_tokens_different_mm_input(hash_fn):
|
||||
|
||||
@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
|
||||
def test_hash_request_tokens_no_mm_inputs(hash_fn):
|
||||
kv_cache_utils.init_none_hash(hash_fn)
|
||||
|
||||
request = make_request(
|
||||
request_id="0",
|
||||
prompt_token_ids=[_ for _ in range(6)],
|
||||
@ -531,102 +549,288 @@ def test_metrics():
|
||||
assert not metrics.query_queue
|
||||
|
||||
|
||||
def test_unify_kv_cache_configs():
|
||||
same_kv_cache_config = [
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(size=100, shared_by=["layer1"]),
|
||||
KVCacheTensor(size=100, shared_by=["layer2"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=4)),
|
||||
],
|
||||
),
|
||||
KVCacheConfig(
|
||||
num_blocks=20,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(size=100, shared_by=["layer1"]),
|
||||
KVCacheTensor(size=100, shared_by=["layer2"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=4)),
|
||||
],
|
||||
),
|
||||
]
|
||||
unify_kv_cache_configs(same_kv_cache_config)
|
||||
assert same_kv_cache_config[0].num_blocks == 10
|
||||
assert same_kv_cache_config[1].num_blocks == 10
|
||||
def test_get_kv_cache_configs_multiple_workers():
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
vllm_config = VllmConfig(model_config=model_config)
|
||||
|
||||
need_sort_kv_cache_config = [
|
||||
ref_kv_cache_spec = new_kv_cache_spec()
|
||||
same_kv_cache_specs = [{
|
||||
"layer1": new_kv_cache_spec(),
|
||||
"layer2": new_kv_cache_spec(),
|
||||
}, {
|
||||
"layer1": new_kv_cache_spec(),
|
||||
"layer2": new_kv_cache_spec(),
|
||||
}]
|
||||
|
||||
# Basic case. All things are the same.
|
||||
kv_cache_configs = get_kv_cache_configs(vllm_config, same_kv_cache_specs, [
|
||||
ref_kv_cache_spec.page_size_bytes * 2 * 10,
|
||||
ref_kv_cache_spec.page_size_bytes * 2 * 10
|
||||
])
|
||||
assert kv_cache_configs == [
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(size=100, shared_by=["layer1"]),
|
||||
KVCacheTensor(size=100, shared_by=["layer2"]),
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer1"]),
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer2"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=4)),
|
||||
KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec),
|
||||
],
|
||||
),
|
||||
KVCacheConfig(
|
||||
num_blocks=20,
|
||||
num_blocks=10,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(size=100, shared_by=["layer1"]),
|
||||
KVCacheTensor(size=100, shared_by=["layer2"]),
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer1"]),
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer2"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=4)),
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
unify_kv_cache_configs(need_sort_kv_cache_config)
|
||||
sorted_kv_cache_groups = [
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer2"], new_kv_cache_spec(num_kv_heads=4)),
|
||||
]
|
||||
assert (
|
||||
need_sort_kv_cache_config[0].kv_cache_groups == sorted_kv_cache_groups)
|
||||
assert (
|
||||
need_sort_kv_cache_config[1].kv_cache_groups == sorted_kv_cache_groups)
|
||||
|
||||
diff_kv_cache_config = [
|
||||
# Different available memory. This is the case for TP.
|
||||
# Use the smallest memory available.
|
||||
kv_cache_configs = get_kv_cache_configs(vllm_config, same_kv_cache_specs, [
|
||||
ref_kv_cache_spec.page_size_bytes * 2 * 10,
|
||||
ref_kv_cache_spec.page_size_bytes * 2 * 20
|
||||
])
|
||||
assert kv_cache_configs == [
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(size=100, shared_by=["layer1"]),
|
||||
KVCacheTensor(size=100, shared_by=["layer2"]),
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer1"]),
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer2"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=4)),
|
||||
KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec),
|
||||
],
|
||||
),
|
||||
KVCacheConfig(
|
||||
num_blocks=20,
|
||||
num_blocks=10,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(size=100, shared_by=["layer1"]),
|
||||
KVCacheTensor(size=100, shared_by=["layer2"]),
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20,
|
||||
shared_by=["layer1"]),
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20,
|
||||
shared_by=["layer2"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=8)),
|
||||
KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
# Different KV cache specs. This is the case for PP.
|
||||
different_layer_specs = [{
|
||||
"layer1": new_kv_cache_spec(),
|
||||
}, {
|
||||
"layer2": new_kv_cache_spec(),
|
||||
"layer3": new_kv_cache_spec(),
|
||||
}]
|
||||
|
||||
# Different workers have different layers.
|
||||
kv_cache_configs = get_kv_cache_configs(
|
||||
vllm_config, different_layer_specs, [
|
||||
ref_kv_cache_spec.page_size_bytes * 2 * 10,
|
||||
ref_kv_cache_spec.page_size_bytes * 2 * 10
|
||||
])
|
||||
assert kv_cache_configs == [
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20,
|
||||
shared_by=["layer1"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
],
|
||||
),
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer2"]),
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer3"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer2", "layer3"], new_kv_cache_spec()),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
# Some layers are the same, some are different. This is the case for TP+PP
|
||||
tp_pp_kv_cache_specs = [{
|
||||
"layer1": new_kv_cache_spec(),
|
||||
"layer2": new_kv_cache_spec(),
|
||||
}, {
|
||||
"layer1": new_kv_cache_spec(),
|
||||
"layer2": new_kv_cache_spec(),
|
||||
}, {
|
||||
"layer3": new_kv_cache_spec(),
|
||||
}, {
|
||||
"layer3": new_kv_cache_spec(),
|
||||
}]
|
||||
|
||||
kv_cache_configs = get_kv_cache_configs(
|
||||
vllm_config, tp_pp_kv_cache_specs, [
|
||||
ref_kv_cache_spec.page_size_bytes * 2 * 10,
|
||||
ref_kv_cache_spec.page_size_bytes * 2 * 10,
|
||||
ref_kv_cache_spec.page_size_bytes * 2 * 10,
|
||||
ref_kv_cache_spec.page_size_bytes * 2 * 10,
|
||||
])
|
||||
assert kv_cache_configs == [
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer1"]),
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer2"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec),
|
||||
],
|
||||
),
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer1"]),
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer2"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec),
|
||||
],
|
||||
),
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20,
|
||||
shared_by=["layer3"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer3"], ref_kv_cache_spec),
|
||||
],
|
||||
),
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 20,
|
||||
shared_by=["layer3"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer3"], ref_kv_cache_spec),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
# Different workers have different types of layers. This is the case for
|
||||
# hybrid models + PP.
|
||||
different_type_layer_specs = [{
|
||||
"layer1": new_kv_cache_spec(),
|
||||
"layer2": new_kv_cache_spec(),
|
||||
}, {
|
||||
"layer3": new_sliding_window_spec(),
|
||||
"layer4": new_sliding_window_spec(),
|
||||
}]
|
||||
kv_cache_configs = get_kv_cache_configs(
|
||||
vllm_config, different_type_layer_specs, [
|
||||
ref_kv_cache_spec.page_size_bytes * 2 * 10,
|
||||
ref_kv_cache_spec.page_size_bytes * 2 * 10,
|
||||
])
|
||||
assert kv_cache_configs == [
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer1"]),
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer2"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1", "layer2"], ref_kv_cache_spec),
|
||||
KVCacheGroupSpec([], new_sliding_window_spec()),
|
||||
],
|
||||
),
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer3"]),
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer4"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec([], ref_kv_cache_spec),
|
||||
KVCacheGroupSpec(["layer3", "layer4"],
|
||||
new_sliding_window_spec()),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
# When divided into multiple KVCacheGroups, need to ensure the number of
|
||||
# layers per group is similar.
|
||||
different_type_layer_specs = [{
|
||||
"layer1": new_kv_cache_spec(),
|
||||
"layer2": new_sliding_window_spec(),
|
||||
"layer3": new_sliding_window_spec(),
|
||||
}, {
|
||||
"layer4": new_kv_cache_spec(),
|
||||
"layer5": new_sliding_window_spec(),
|
||||
"layer6": new_sliding_window_spec(),
|
||||
}]
|
||||
kv_cache_configs = get_kv_cache_configs(
|
||||
vllm_config, different_type_layer_specs, [
|
||||
ref_kv_cache_spec.page_size_bytes * 10,
|
||||
ref_kv_cache_spec.page_size_bytes * 10,
|
||||
])
|
||||
assert kv_cache_configs == [
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer1", "layer2", "layer3"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1"], ref_kv_cache_spec),
|
||||
KVCacheGroupSpec(["layer2"], new_sliding_window_spec()),
|
||||
KVCacheGroupSpec(["layer3"], new_sliding_window_spec()),
|
||||
],
|
||||
),
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(size=ref_kv_cache_spec.page_size_bytes * 10,
|
||||
shared_by=["layer4", "layer5", "layer6"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer4"], ref_kv_cache_spec),
|
||||
KVCacheGroupSpec(["layer5"], new_sliding_window_spec()),
|
||||
KVCacheGroupSpec(["layer6"], new_sliding_window_spec()),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
# Have conflicting layers. Need to raise an error.
|
||||
conflicting_layer_specs = [{
|
||||
"layer1": new_kv_cache_spec(),
|
||||
}, {
|
||||
"layer1": new_sliding_window_spec(),
|
||||
}]
|
||||
with pytest.raises(AssertionError):
|
||||
unify_kv_cache_configs(diff_kv_cache_config)
|
||||
get_kv_cache_configs(vllm_config, conflicting_layer_specs, [
|
||||
ref_kv_cache_spec.page_size_bytes * 2 * 10,
|
||||
ref_kv_cache_spec.page_size_bytes * 2 * 10,
|
||||
])
|
||||
|
||||
|
||||
def test_merge_kv_cache_spec():
|
||||
@ -890,7 +1094,7 @@ def test_allocate_with_lookahead():
|
||||
assert len(blocks.get_block_ids()[0]) == 2
|
||||
|
||||
|
||||
def test_get_kv_cache_config():
|
||||
def test_get_kv_cache_config_one_worker():
|
||||
# pass max_model_len to pass check_enough_kv_cache_memory
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
vllm_config = VllmConfig(model_config=model_config)
|
||||
@ -901,8 +1105,10 @@ def test_get_kv_cache_config():
|
||||
'layer_1': new_kv_cache_spec(),
|
||||
'layer_2': new_kv_cache_spec(),
|
||||
}
|
||||
kv_cache_config_full = get_kv_cache_config(
|
||||
vllm_config, kv_cache_specs_full, mem_per_block_per_layer * 2 * 32)
|
||||
kv_cache_config_full = get_kv_cache_configs(
|
||||
vllm_config, [kv_cache_specs_full],
|
||||
[mem_per_block_per_layer * 2 * 32])[0]
|
||||
print(kv_cache_config_full)
|
||||
assert kv_cache_config_full == KVCacheConfig(
|
||||
num_blocks=32,
|
||||
kv_cache_tensors=[
|
||||
@ -920,8 +1126,9 @@ def test_get_kv_cache_config():
|
||||
'layer_1': new_sliding_window_spec(),
|
||||
'layer_2': new_sliding_window_spec(),
|
||||
}
|
||||
kv_cache_config_sliding = get_kv_cache_config(
|
||||
vllm_config, kv_cache_specs_sliding, mem_per_block_per_layer * 2 * 32)
|
||||
kv_cache_config_sliding = get_kv_cache_configs(
|
||||
vllm_config, [kv_cache_specs_sliding],
|
||||
[mem_per_block_per_layer * 2 * 32])[0]
|
||||
assert kv_cache_config_sliding == KVCacheConfig(
|
||||
num_blocks=32,
|
||||
kv_cache_tensors=[
|
||||
@ -940,8 +1147,9 @@ def test_get_kv_cache_config():
|
||||
'layer_1': new_kv_cache_spec(),
|
||||
'layer_2': new_sliding_window_spec(),
|
||||
}
|
||||
kv_cache_config_hybrid = get_kv_cache_config(
|
||||
vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 2 * 32)
|
||||
kv_cache_config_hybrid = get_kv_cache_configs(
|
||||
vllm_config, [kv_cache_specs_hybrid],
|
||||
[mem_per_block_per_layer * 2 * 32])[0]
|
||||
assert kv_cache_config_hybrid == KVCacheConfig(
|
||||
num_blocks=32,
|
||||
kv_cache_tensors=[
|
||||
@ -962,8 +1170,9 @@ def test_get_kv_cache_config():
|
||||
'layer_1': new_kv_cache_spec(),
|
||||
'layer_2': new_sliding_window_spec(),
|
||||
}
|
||||
kv_cache_config_hybrid = get_kv_cache_config(
|
||||
vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 2 * 32)
|
||||
kv_cache_config_hybrid = get_kv_cache_configs(
|
||||
vllm_config, [kv_cache_specs_hybrid],
|
||||
[mem_per_block_per_layer * 2 * 32])[0]
|
||||
assert kv_cache_config_hybrid == KVCacheConfig(
|
||||
num_blocks=64,
|
||||
kv_cache_tensors=[
|
||||
@ -985,21 +1194,22 @@ def test_get_kv_cache_config():
|
||||
'layer_5': new_sliding_window_spec(),
|
||||
'layer_6': new_sliding_window_spec(),
|
||||
}
|
||||
kv_cache_config_hybrid = get_kv_cache_config(
|
||||
vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 2 * 32)
|
||||
kv_cache_config_hybrid = get_kv_cache_configs(
|
||||
vllm_config, [kv_cache_specs_hybrid],
|
||||
[mem_per_block_per_layer * 2 * 32])[0]
|
||||
assert kv_cache_config_hybrid == KVCacheConfig(
|
||||
num_blocks=32,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(size=mem_per_block_per_layer * 32,
|
||||
shared_by=["layer_1", "layer_3", "layer_5"]),
|
||||
shared_by=["layer_1", "layer_3", "layer_4"]),
|
||||
KVCacheTensor(size=mem_per_block_per_layer * 32,
|
||||
shared_by=["layer_2", "layer_4", "layer_6"]),
|
||||
shared_by=["layer_2", "layer_5", "layer_6"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer_3", "layer_4"],
|
||||
KVCacheGroupSpec(["layer_3", "layer_5"],
|
||||
new_sliding_window_spec()),
|
||||
KVCacheGroupSpec(["layer_5", "layer_6"],
|
||||
KVCacheGroupSpec(["layer_4", "layer_6"],
|
||||
new_sliding_window_spec()),
|
||||
],
|
||||
)
|
||||
@ -1017,27 +1227,30 @@ def test_get_kv_cache_config():
|
||||
'layer_9': new_sliding_window_spec(),
|
||||
'layer_10': new_sliding_window_spec(),
|
||||
}
|
||||
kv_cache_config_hybrid = get_kv_cache_config(
|
||||
vllm_config, kv_cache_specs_hybrid, mem_per_block_per_layer * 3 * 32)
|
||||
kv_cache_config_hybrid = get_kv_cache_configs(
|
||||
vllm_config, [kv_cache_specs_hybrid],
|
||||
[mem_per_block_per_layer * 3 * 32])[0]
|
||||
assert kv_cache_config_hybrid == KVCacheConfig(
|
||||
num_blocks=32,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(
|
||||
size=mem_per_block_per_layer * 32,
|
||||
shared_by=["layer_1", "layer_4", "layer_7", "layer_10"]),
|
||||
shared_by=["layer_1", "layer_4", "layer_5", "layer_6"]),
|
||||
KVCacheTensor(
|
||||
size=mem_per_block_per_layer * 32,
|
||||
shared_by=["layer_2", "layer_7", "layer_8", "layer_9"]),
|
||||
KVCacheTensor(size=mem_per_block_per_layer * 32,
|
||||
shared_by=["layer_2", "layer_5", "layer_8"]),
|
||||
KVCacheTensor(size=mem_per_block_per_layer * 32,
|
||||
shared_by=["layer_3", "layer_6", "layer_9"]),
|
||||
shared_by=["layer_3", "layer_10"]),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer_1", "layer_2", "layer_3"],
|
||||
new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer_4", "layer_5", "layer_6"],
|
||||
KVCacheGroupSpec(["layer_4", "layer_7", "layer_10"],
|
||||
new_sliding_window_spec()),
|
||||
KVCacheGroupSpec(["layer_7", "layer_8", "layer_9"],
|
||||
KVCacheGroupSpec(["layer_5", "layer_8"],
|
||||
new_sliding_window_spec()),
|
||||
KVCacheGroupSpec(["layer_6", "layer_9"],
|
||||
new_sliding_window_spec()),
|
||||
KVCacheGroupSpec(["layer_10"], new_sliding_window_spec()),
|
||||
],
|
||||
)
|
||||
|
||||
@ -1047,13 +1260,14 @@ def test_get_kv_cache_config():
|
||||
'layer_2': new_kv_cache_spec(),
|
||||
}
|
||||
with pytest.raises(NotImplementedError):
|
||||
get_kv_cache_config(vllm_config, kv_cache_specs_hybrid,
|
||||
mem_per_block_per_layer * 2 * 32)
|
||||
get_kv_cache_configs(vllm_config, [kv_cache_specs_hybrid],
|
||||
[mem_per_block_per_layer * 2 * 32])[0]
|
||||
|
||||
# Test num_gpu_blocks_override
|
||||
vllm_config.cache_config.num_gpu_blocks_override = 16
|
||||
kv_cache_config_override_blocks = get_kv_cache_config(
|
||||
vllm_config, kv_cache_specs_full, mem_per_block_per_layer * 2 * 32)
|
||||
kv_cache_config_override_blocks = get_kv_cache_configs(
|
||||
vllm_config, [kv_cache_specs_full],
|
||||
[mem_per_block_per_layer * 2 * 32])[0]
|
||||
assert kv_cache_config_override_blocks == KVCacheConfig(
|
||||
num_blocks=16,
|
||||
kv_cache_tensors=[
|
||||
@ -1065,3 +1279,16 @@ def test_get_kv_cache_config():
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer_1", "layer_2"], new_kv_cache_spec())
|
||||
])
|
||||
|
||||
|
||||
def test_get_kv_cache_configs_attention_free():
    """Check the config produced for a worker that has no KV cache layers.

    A single worker with an empty spec dict and 0 bytes of available memory
    is expected to yield one ``KVCacheConfig`` with ``num_blocks=1`` and no
    tensors or groups.
    """
    # One worker, no layers at all.
    empty_specs: dict[str, KVCacheSpec] = {}
    model_config = ModelConfig(max_model_len=16)
    config = VllmConfig(model_config=model_config)

    actual = get_kv_cache_configs(config, [empty_specs], [0])

    expected = [
        KVCacheConfig(num_blocks=1, kv_cache_tensors=[], kv_cache_groups=[]),
    ]
    assert actual == expected
|
||||
|
||||
@ -25,6 +25,16 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheGroupSpec, SlidingWindowSpec)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _auto_init_hash_fn(request):
    """Call ``init_none_hash`` before each test that uses this fixture.

    The fixture is ``autouse``, so tests do not need to request it. If the
    test has a ``hash_fn`` fixture/parameter, that value is used; otherwise
    ``sha256`` is used.

    Args:
        request: The pytest ``FixtureRequest`` for the running test.
    """
    hash_fn: Callable
    if "hash_fn" in request.fixturenames:
        # Take the parametrized callable itself. Initialization happens
        # exactly once below, so ``hash_fn`` is always the hash function and
        # never the result of an earlier ``init_none_hash`` call.
        hash_fn = request.getfixturevalue("hash_fn")
    else:
        hash_fn = sha256
    init_none_hash(hash_fn)
|
||||
|
||||
|
||||
def make_request(
|
||||
request_id: str,
|
||||
prompt_token_ids: list[int],
|
||||
@ -105,7 +115,6 @@ def make_kv_cache_config_hybrid_model(block_size: int,
|
||||
|
||||
@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
|
||||
def test_prefill(hash_fn):
|
||||
init_none_hash(hash_fn)
|
||||
|
||||
block_size = 16
|
||||
manager = KVCacheManager(
|
||||
@ -736,7 +745,6 @@ def test_cache_blocks(hash_fn):
|
||||
This is a unit test that tests the correctness of the _cache_full_blocks
|
||||
function of KVCacheManager.
|
||||
"""
|
||||
init_none_hash(hash_fn)
|
||||
|
||||
block_size = 4
|
||||
block_pool = BlockPool(
|
||||
@ -849,7 +857,6 @@ def test_mm_prefix_caching():
|
||||
"""
|
||||
This tests that the multi-modal prefix caching is correct.
|
||||
"""
|
||||
kv_cache_utils.init_none_hash(sha256)
|
||||
|
||||
block_size = 16
|
||||
manager = KVCacheManager(
|
||||
@ -942,8 +949,6 @@ def test_cache_key_salting():
|
||||
This tests that cache salts are applied during hashing and the cache
|
||||
is separated cache as expected.
|
||||
"""
|
||||
kv_cache_utils.init_none_hash(sha256)
|
||||
|
||||
block_size = 16
|
||||
manager = KVCacheManager(
|
||||
make_kv_cache_config(block_size, 11),
|
||||
|
||||
@ -31,7 +31,7 @@ def _mk_processor(monkeypatch,
|
||||
raising=True)
|
||||
monkeypatch.setattr(ModelConfig,
|
||||
"__post_init__",
|
||||
lambda self: None,
|
||||
lambda self, *args: None,
|
||||
raising=True)
|
||||
monkeypatch.setattr(UnspecifiedPlatform,
|
||||
"is_async_output_supported",
|
||||
|
||||
@ -9,24 +9,9 @@ from vllm import LLM
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
|
||||
UNSUPPORTED_MODELS_V1 = [
|
||||
"facebook/bart-large-cnn", # encoder decoder
|
||||
]
|
||||
|
||||
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1)
def test_reject_unsupported_models(monkeypatch, model):
    """Engine config creation must fail for each unsupported model.

    With ``VLLM_USE_V1`` set to ``"1"``, ``create_engine_config()`` is
    expected to raise ``NotImplementedError`` for every entry of
    ``UNSUPPORTED_MODELS_V1``.
    """
    with monkeypatch.context() as patch:
        patch.setenv("VLLM_USE_V1", "1")
        engine_args = AsyncEngineArgs(model=model)

        with pytest.raises(NotImplementedError):
            engine_args.create_engine_config()

        # Drop the variable again before leaving the patched context.
        patch.delenv("VLLM_USE_V1")
|
||||
|
||||
|
||||
def test_reject_bad_config(monkeypatch):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
@ -77,12 +62,6 @@ def test_enable_by_default_fallback(monkeypatch):
|
||||
assert envs.VLLM_USE_V1
|
||||
m.delenv("VLLM_USE_V1")
|
||||
|
||||
# Should fall back to V0 for supported model.
|
||||
_ = AsyncEngineArgs(
|
||||
model=UNSUPPORTED_MODELS_V1[0]).create_engine_config()
|
||||
assert not envs.VLLM_USE_V1
|
||||
m.delenv("VLLM_USE_V1")
|
||||
|
||||
|
||||
def test_v1_llm_by_default(monkeypatch):
|
||||
with monkeypatch.context() as m:
|
||||
|
||||
@ -10,7 +10,7 @@ from vllm.pooling_params import PoolingParams
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import GiB_bytes
|
||||
from vllm.v1.core.kv_cache_utils import (estimate_max_model_len,
|
||||
get_kv_cache_config)
|
||||
get_kv_cache_configs)
|
||||
from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
|
||||
SchedulerOutput)
|
||||
from vllm.v1.worker.tpu_model_runner import (
|
||||
@ -477,8 +477,8 @@ def test_init_kv_cache_without_kv_sharing():
|
||||
# 2 (non-MLA) * 8 (num_heads) * 128 (head_dim)
|
||||
# * 2 (bfloat16, kv_cache dtype) * 128 (block_size) = 512KB
|
||||
num_expected_blocks = 20480 # 20GB / 512KB / 2 (num layers)
|
||||
kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
|
||||
available_memory)
|
||||
kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec],
|
||||
[available_memory])[0]
|
||||
assert kv_cache_config.num_blocks == num_expected_blocks
|
||||
assert len(kv_cache_config.kv_cache_tensors) == 2
|
||||
assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2
|
||||
@ -550,8 +550,8 @@ def test_init_kv_cache_with_kv_sharing_valid():
|
||||
# with KV sharing, we can allocate (available_mem//page_size//1) blocks
|
||||
# which is twice as many as without KV sharing
|
||||
num_expected_blocks = 2 * 20480 # 20GB / 512KB
|
||||
kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
|
||||
available_memory)
|
||||
kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec],
|
||||
[available_memory])[0]
|
||||
assert kv_cache_config.num_blocks == num_expected_blocks
|
||||
assert len(kv_cache_config.kv_cache_tensors) == 1
|
||||
# Each layer now has twice the available memory for KV cache
|
||||
|
||||
@ -15,7 +15,7 @@ from vllm.platforms import current_platform
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import GiB_bytes, update_environment_variables
|
||||
from vllm.v1.core.kv_cache_utils import (estimate_max_model_len,
|
||||
get_kv_cache_config)
|
||||
get_kv_cache_configs)
|
||||
from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
|
||||
SchedulerOutput)
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
@ -585,8 +585,8 @@ def test_init_kv_cache_without_kv_sharing():
|
||||
available_memory = 20 * GiB_bytes
|
||||
# page size for layer 0's kv_cache_spec is 32KB
|
||||
num_expected_blocks = 327680 # 20GB / 32KB / 2 (num layers)
|
||||
kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
|
||||
available_memory)
|
||||
kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec],
|
||||
[available_memory])[0]
|
||||
assert kv_cache_config.num_blocks == num_expected_blocks
|
||||
assert len(kv_cache_config.kv_cache_tensors) == 2
|
||||
assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2
|
||||
@ -657,8 +657,8 @@ def test_init_kv_cache_with_kv_sharing_valid():
|
||||
# with KV sharing, we can allocate (available_mem//page_size//1) blocks
|
||||
# which is twice as many as without KV sharing
|
||||
num_expected_blocks = 655360 # 20GB / 32KB
|
||||
kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
|
||||
available_memory)
|
||||
kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec],
|
||||
[available_memory])[0]
|
||||
assert kv_cache_config.num_blocks == num_expected_blocks
|
||||
assert len(kv_cache_config.kv_cache_tensors) == 1
|
||||
# Each layer now has twice the available memory for KV cache
|
||||
@ -788,8 +788,8 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
|
||||
kv_cache_spec = runner.get_kv_cache_spec()
|
||||
|
||||
available_memory = 5 * GiB_bytes
|
||||
kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
|
||||
available_memory)
|
||||
kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec],
|
||||
[available_memory])[0]
|
||||
runner.initialize_kv_cache(kv_cache_config)
|
||||
|
||||
# random partition of blocks
|
||||
|
||||
@ -1,648 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import itertools
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
|
||||
from vllm.utils import make_tensor_with_pad
|
||||
from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
|
||||
|
||||
BATCH_SIZES = [1, 4, 16, 64, 256]
|
||||
|
||||
|
||||
def _create_model_runner(model: str, *args,
                         **kwargs) -> EncoderDecoderModelRunner:
    """Build an ``EncoderDecoderModelRunner`` for ``model``.

    ``model`` and any extra positional/keyword arguments are forwarded to
    ``EngineArgs``. The engine config created from them is passed to the
    runner as ``vllm_config``, and the runner is constructed with
    ``is_driver_worker=True``.
    """
    vllm_config = EngineArgs(model, *args, **kwargs).create_engine_config()
    return EncoderDecoderModelRunner(
        vllm_config=vllm_config,
        is_driver_worker=True,
    )
|
||||
|
||||
|
||||
@pytest.mark.skipif(condition=current_platform.is_cpu(),
                    reason="CPU backend is currently "
                    "unsupported for encoder/ "
                    "decoder models")
def test_empty_seq_group():
    """Preparing model inputs from an empty seq group list yields no tensors.

    Builds a BART-base encoder/decoder runner, feeds it an empty list of
    sequence-group metadata, and checks that every inspected field of the
    resulting model input is ``None``.
    """
    runner = _create_model_runner(
        "facebook/bart-base",
        seed=0,
        dtype="float16",
        max_num_batched_tokens=100000,
        max_num_seqs=100000,
        enable_chunked_prefill=False,
        enforce_eager=True,
    )

    no_seq_groups: list[SequenceGroupMetadata] = []
    model_input = runner._prepare_model_input_tensors(no_seq_groups)

    # Read every field up front (same order as the checks below) so a missing
    # attribute surfaces before any assertion runs.
    checked_fields = (
        "input_tokens",
        "input_positions",
        "encoder_input_tokens",
        "encoder_input_positions",
        "attn_metadata",
        "seq_lens",
    )
    observed = [getattr(model_input, field) for field in checked_fields]

    for value in observed:
        assert value is None
|
||||
|
||||
|
||||
@pytest.mark.skipif(condition=current_platform.is_cpu(),
                    reason="CPU backend is currently "
                    "unsupported for encoder/ "
                    "decoder models")
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
def test_prepare_prompt(batch_size):
    '''
    Test the ability of the encoder/decoder model runner subclass to
    produce prefill-phase model inputs & attention metadata.

    Test behavior:

    * Instantiate BART base model & enc/dec model runner
    * Construct sequence-group metadata for dummy prompts
    * Test that encoder attention, decoder self-attention,
      and encoder/decoder cross-attention inputs are correct

    Arguments:

    * batch_size: Number of single-sequence prompt groups in the batch
    '''

    model_runner = _create_model_runner(
        "facebook/bart-base",
        seed=0,
        dtype="float16",
        max_num_batched_tokens=100000,
        max_num_seqs=100000,
        enable_chunked_prefill=False,
        enforce_eager=True,
    )

    seq_lens: list[int] = []
    encoder_seq_lens: list[int] = []
    seq_group_metadata_list: list[SequenceGroupMetadata] = []
    block_tables = {0: [1]}
    cross_block_table = [2]
    for i in range(batch_size):
        # make sure all tokens fit into one block
        seq_len = i % (model_runner.block_size - 1) + 1
        seq_lens.append(seq_len)
        seq_data = SequenceData.from_seqs(range(seq_len))
        encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1
        encoder_seq_lens.append(encoder_seq_len)
        encoder_seq_data = SequenceData.from_seqs(range(encoder_seq_len))
        seq_group_metadata = SequenceGroupMetadata(
            request_id=f"test_{i}",
            is_prompt=True,
            seq_data={0: seq_data},
            sampling_params=SamplingParams(temperature=0),
            block_tables=block_tables,
            encoder_seq_data=encoder_seq_data,
            cross_block_table=cross_block_table,
        )
        assert seq_group_metadata.token_chunk_size == seq_data.get_len()
        seq_group_metadata_list.append(seq_group_metadata)

    # Build
    # * Decoder model inputs
    # * Decoder self-attention KV caching data structures
    # * Encoder model inputs
    # * Encoder/decoder cross-attention KV caching data structures
    model_input = model_runner.prepare_model_input(seq_group_metadata_list)

    input_tokens = model_input.input_tokens
    input_positions = model_input.input_positions
    attn_metadata = model_input.attn_metadata
    return_seq_lens = model_input.seq_lens
    slot_mapping = attn_metadata.slot_mapping
    encoder_input_tokens = model_input.encoder_input_tokens
    encoder_input_positions = model_input.encoder_input_positions
    cross_slot_mapping = attn_metadata.cross_slot_mapping
    assert return_seq_lens == seq_lens
    assert len(slot_mapping) == len(input_tokens)
    assert len(cross_slot_mapping) == len(encoder_input_tokens)

    # Verify input metadata is correct for prompts.
    # - Decoder attention metadata
    device = model_runner.device
    assert attn_metadata.num_prefills > 0
    assert attn_metadata.num_decode_tokens == 0
    assert torch.equal(attn_metadata.seq_lens_tensor,
                       torch.tensor(seq_lens, device=device, dtype=torch.int))
    assert attn_metadata.seq_lens == seq_lens
    assert attn_metadata.max_prefill_seq_len == max(seq_lens)
    assert attn_metadata.max_decode_seq_len == 0
    # - Encoder attention metadata
    assert attn_metadata.encoder_seq_lens == encoder_seq_lens
    assert torch.equal(
        attn_metadata.encoder_seq_lens_tensor,
        torch.tensor(encoder_seq_lens, device=device, dtype=torch.int))
    assert attn_metadata.max_encoder_seq_len == max(encoder_seq_lens)
    assert attn_metadata.num_encoder_tokens == sum(encoder_seq_lens)

    # Test decoder subquery start locs: the prompts are concatenated, so
    # each start offset is the running total of the preceding lengths.
    start_loc = [0, *itertools.accumulate(seq_lens)]
    assert torch.equal(
        attn_metadata.query_start_loc,
        torch.tensor(start_loc, dtype=torch.int32, device=device),
    )

    # Test decoder seq start locs & context lengths. The same offsets are
    # expected here as for the query start locs.
    assert torch.equal(
        attn_metadata.seq_start_loc,
        torch.tensor(start_loc, dtype=torch.int32, device=device),
    )
    assert torch.equal(
        attn_metadata.context_lens_tensor,
        torch.zeros(attn_metadata.context_lens_tensor.shape[0],
                    dtype=torch.int,
                    device=device),
    )

    # Verify block tables are correct for prompts: one empty row per
    # sequence group is expected for both attention kinds.
    # - Decoder self-attention
    expected = torch.tensor(
        [[] for _ in range(len(seq_group_metadata_list))],
        dtype=torch.int32,
        device=device,
    )
    assert torch.equal(
        attn_metadata.block_tables,
        expected,
    )
    # - Encoder/decoder cross-attention
    assert torch.equal(
        attn_metadata.cross_block_tables,
        expected,
    )

    # Cuda graph should not be used for prefill.
    assert attn_metadata.use_cuda_graph is False

    # Verify the lengths of input tokens & positions
    # - Decoder
    assert len(input_tokens) == sum(seq_lens)
    assert len(input_positions) == sum(seq_lens)
    # -- An indirect check that model_input.input_tokens
    #    and model_input.input_positions are correct -
    #    by design of the test, the input tokens are
    #    equal to the input position values, so if
    #    the model_input data structure has the correct
    #    values then these two should be equal
    assert torch.equal(
        input_tokens,
        input_positions,
    )
    # - Encoder
    assert len(encoder_input_tokens) == sum(encoder_seq_lens)
    # -- An indirect check that model_input.encoder_input_tokens
    #    and model_input.encoder_input_positions are correct -
    #    by design of the test, the input tokens are
    #    equal to the input position values, so if
    #    the model_input data structure has the correct
    #    values then these two should be equal
    assert torch.equal(
        encoder_input_tokens,
        encoder_input_positions,
    )

    # Test that vLLM sampling infrastructure chooses the correct
    # sequence positions at which to sample (i.e. the end of
    # each sequence) in the prefill phase. The index of the final token
    # of each prompt is its cumulative end offset minus one (recall that
    # the prompts are concatenated).
    expected_selected_token_indices = [
        end_offset - 1 for end_offset in itertools.accumulate(seq_lens)
    ]

    sampling_metadata = model_input.sampling_metadata
    actual = sampling_metadata.selected_token_indices
    expected = torch.tensor(
        expected_selected_token_indices,
        device=actual.device,
        dtype=actual.dtype,
    )
    assert torch.equal(actual, expected)
|
||||
|
||||
|
||||
@pytest.mark.skipif(condition=current_platform.is_cpu(),
                    reason="CPU backend is currently "
                    "unsupported for encoder/ "
                    "decoder models")
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False])
def test_prepare_decode(batch_size, multiple_seqs_per_seq_group):
    '''
    Test the ability of the encoder/decoder model runner subclass to
    produce decode-phase model inputs & attention metadata.

    Test behavior:

    * Instantiate BART base model & enc/dec model runner
    * Construct sequence-group metadata for dummy prompts
    * Test that encoder attention, decoder self-attention,
      and encoder/decoder cross-attention inputs are correct

    Arguments:

    * batch_size: Number of sequence groups in the batch
    * multiple_seqs_per_seq_group: If True, every sequence group holds
      two sequences (ids 0 and 1); otherwise a single sequence (id 0)
    '''

    model_runner = _create_model_runner(
        "facebook/bart-base",
        seed=0,
        dtype="float16",
        max_num_batched_tokens=100000,
        max_num_seqs=100000,
        enable_chunked_prefill=False,
        enforce_eager=True,
    )

    seq_lens: list[int] = []
    encoder_seq_lens: list[int] = []
    seq_group_metadata_list: list[SequenceGroupMetadata] = []
    block_tables = {
        0: [1],
        1: [3]
    } if multiple_seqs_per_seq_group else {
        0: [1]
    }
    cross_block_table = [2]
    for i in range(batch_size):
        # make sure all tokens fit into one block
        seq_len = i % (model_runner.block_size - 1) + 1
        seq_data = SequenceData.from_seqs(range(seq_len))
        encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1
        encoder_seq_data = SequenceData.from_seqs(range(encoder_seq_len))

        seq_group_metadata = SequenceGroupMetadata(
            request_id=f"test_{i}",
            is_prompt=False,
            seq_data={
                0: seq_data,
                1: seq_data
            } if multiple_seqs_per_seq_group else {0: seq_data},
            sampling_params=SamplingParams(temperature=0),
            block_tables=block_tables,
            encoder_seq_data=encoder_seq_data,
            cross_block_table=cross_block_table,
        )
        assert seq_group_metadata.token_chunk_size == 1
        seq_group_metadata_list.append(seq_group_metadata)
        # One entry per sequence in the group, since the expected lengths
        # are compared per sequence rather than per group.
        num_seqs = len(seq_group_metadata.seq_data)
        seq_lens.extend([seq_len] * num_seqs)
        encoder_seq_lens.extend([encoder_seq_len] * num_seqs)

    # Build
    # * Decoder model inputs
    # * Decoder self-attention KV caching data structures
    # * Encoder model inputs
    # * Encoder/decoder cross-attention KV caching data structures
    model_input = model_runner.prepare_model_input(seq_group_metadata_list)
    input_tokens = model_input.input_tokens
    input_positions = model_input.input_positions
    attn_metadata = model_input.attn_metadata
    return_seq_lens = model_input.seq_lens
    slot_mapping = attn_metadata.slot_mapping
    encoder_input_tokens = model_input.encoder_input_tokens
    encoder_input_positions = model_input.encoder_input_positions
    cross_slot_mapping = attn_metadata.cross_slot_mapping
    assert return_seq_lens == seq_lens
    assert len(slot_mapping) == len(input_tokens)
    assert len(cross_slot_mapping) == len(encoder_input_tokens)

    # Verify input metadata is correct for decode phase.
    # - Decoder attention metadata
    device = model_runner.device
    assert attn_metadata.num_prefills == 0
    assert attn_metadata.num_decode_tokens > 0
    assert torch.equal(attn_metadata.seq_lens_tensor,
                       torch.tensor(seq_lens, device=device, dtype=torch.int))
    assert attn_metadata.seq_lens == seq_lens
    assert attn_metadata.max_prefill_seq_len == 0
    assert attn_metadata.max_decode_seq_len == max(seq_lens)
    # - Encoder attention metadata
    assert attn_metadata.encoder_seq_lens == encoder_seq_lens
    assert torch.equal(
        attn_metadata.encoder_seq_lens_tensor,
        torch.tensor(encoder_seq_lens, device=device, dtype=torch.int))
    assert attn_metadata.max_encoder_seq_len == max(encoder_seq_lens)
    assert attn_metadata.num_encoder_tokens == sum(encoder_seq_lens)

    # Test decoder subquery start locs. Each sequence contributes exactly
    # one query token during decode, so the offsets are 0, 1, ..., num_seqs.
    start_loc = list(range(len(seq_lens) + 1))
    assert torch.equal(
        attn_metadata.query_start_loc,
        torch.tensor(start_loc, dtype=torch.int32, device=device),
    )

    # Test decoder seq start locs. Note that for normal prefill it is
    # equivalent to query_start_loc; here it is the running total of the
    # full sequence lengths.
    seq_start_loc = [0, *itertools.accumulate(seq_lens)]

    # Test seq_start_loc and context lengths

    assert torch.equal(
        attn_metadata.seq_start_loc,
        torch.tensor(seq_start_loc, dtype=torch.int32, device=device),
    )
    assert torch.equal(
        attn_metadata.context_lens_tensor,
        torch.tensor([seq_len - 1 for seq_len in seq_lens],
                     dtype=torch.int,
                     device=device))

    # Verify block tables are correct for the decode phase
    # - Decoder self-attention
    flattened_block_tables = list(block_tables.values())
    expected = torch.tensor(flattened_block_tables *
                            len(seq_group_metadata_list),
                            dtype=torch.int32,
                            device=device)
    assert torch.equal(
        attn_metadata.block_tables,
        expected,
    )
    # - Encoder/decoder cross-attention
    expected = torch.tensor([
        cross_block_table for seq_group_metadata in seq_group_metadata_list
        for _ in range(len(seq_group_metadata.seq_data))
    ],
                            dtype=torch.int32,
                            device=device)
    assert torch.equal(
        attn_metadata.cross_block_tables,
        expected,
    )

    # Model runner's CUDAGraph setting should be propagated to attention
    # metadata.
    assert attn_metadata.use_cuda_graph is False

    # Verify the lengths of input tokens & positions
    # - Decoder
    assert len(input_tokens) == len(seq_lens)
    assert len(input_positions) == len(seq_lens)
    # -- An indirect check that model_input.input_tokens
    #    and model_input.input_positions are correct -
    #    by design of the test, the input tokens are
    #    equal to the input position values, so if
    #    the model_input data structure has the correct
    #    values then these two should be equal
    assert torch.equal(
        input_tokens,
        input_positions,
    )
    # - Encoder: no encoder inputs are expected in the decode phase, for
    #   the tokens and for the positions alike.
    assert len(encoder_input_tokens) == 0
    assert len(encoder_input_positions) == 0
    # -- An indirect check that model_input.encoder_input_tokens
    #    and model_input.encoder_input_positions are correct -
    #    by design of the test, the input tokens are
    #    equal to the input position values, so if
    #    the model_input data structure has the correct
    #    values then these two should be equal
    assert torch.equal(
        encoder_input_tokens,
        encoder_input_positions,
    )

    # Test that vLLM sampling infrastructure chooses the correct
    # sequence positions at which to sample (i.e. the end of
    # each sequence) in the decode phase.
    # Since a single token is decoded per iteration per sequence, the
    # length of the decoded tokens for a given sequence is 1 and the final
    # index offset into a given sequence's generated tokens is 0, i.e. the
    # expected sampling index for a sequence is just its position in the
    # flattened batch.
    expected_selected_token_indices = list(range(len(seq_lens)))

    sampling_metadata = model_input.sampling_metadata
    actual = sampling_metadata.selected_token_indices
    expected = torch.tensor(
        expected_selected_token_indices,
        device=actual.device,
        dtype=actual.dtype,
    )
    assert torch.equal(actual, expected)
|
||||
|
||||
|
||||
@pytest.mark.skipif(condition=current_platform.is_cpu(),
                    reason="CPU backend is currently "
                    "unsupported for encoder/ "
                    "decoder models")
@pytest.mark.parametrize("batch_size", list(range(1, 257)))
@pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False])
def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group):
    """
    Tests that for encoder-decoder models with CUDA Graph capture and replay
    enabled, the tensors used during the decode phase are correctly padded
    for varying input batch sizes.

    Arguments:

    * batch_size: Number of sequence groups in the batch
    * multiple_seqs_per_seq_group: If True, every sequence group holds
      two sequences (ids 0 and 1); otherwise a single sequence (id 0)
    """
    model_runner = _create_model_runner(
        "facebook/bart-base",
        seed=0,
        dtype="float16",
        max_num_batched_tokens=100000,
        max_num_seqs=100000,
        enable_chunked_prefill=False,
        enforce_eager=False,
    )
    block_tables = {
        0: [1],
        1: [3]
    } if multiple_seqs_per_seq_group else {
        0: [1]
    }
    seq_lens: list[int] = []
    encoder_seq_lens: list[int] = []
    seq_group_metadata_list: list[SequenceGroupMetadata] = []

    cross_block_table = [2]
    # Total number of sequences across all groups (groups may hold two).
    expanded_batch_size = 0
    for i in range(batch_size):
        # make sure all tokens fit into one block
        seq_len = i % (model_runner.block_size - 1) + 1
        seq_data = SequenceData.from_seqs(range(seq_len))
        encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1
        encoder_seq_data = SequenceData.from_seqs(range(encoder_seq_len))
        seq_group_metadata = SequenceGroupMetadata(
            request_id=f"test_{i}",
            is_prompt=False,
            seq_data={
                0: seq_data,
                1: seq_data
            } if multiple_seqs_per_seq_group else {0: seq_data},
            sampling_params=SamplingParams(temperature=0),
            block_tables=block_tables,
            encoder_seq_data=encoder_seq_data,
            cross_block_table=cross_block_table,
        )
        assert seq_group_metadata.token_chunk_size == 1
        num_seqs = len(seq_group_metadata.seq_data)
        seq_lens.extend([seq_len] * num_seqs)
        encoder_seq_lens.extend([encoder_seq_len] * num_seqs)
        expanded_batch_size += num_seqs
        seq_group_metadata_list.append(seq_group_metadata)

    model_input = model_runner.prepare_model_input(seq_group_metadata_list)
    input_tokens = model_input.input_tokens
    input_positions = model_input.input_positions
    attn_metadata = model_input.attn_metadata
    return_seq_lens = model_input.seq_lens
    slot_mapping = attn_metadata.slot_mapping
    encoder_input_tokens = model_input.encoder_input_tokens
    encoder_input_positions = model_input.encoder_input_positions
    cross_slot_mapping = attn_metadata.cross_slot_mapping

    # With CUDA Graph capture and replay enabled, the decoder and encoder
    # input sequences will be padded. Create the expected padded tensors
    # accordingly: every padding slot is expected to report a length of 1.
    graph_batch_size = model_runner.vllm_config.pad_for_cudagraph(
        expanded_batch_size)
    cuda_graph_pad_size = graph_batch_size - expanded_batch_size
    padded_seq_lens = seq_lens + [1] * cuda_graph_pad_size
    padded_encoder_seq_lens = encoder_seq_lens + [1] * cuda_graph_pad_size

    assert return_seq_lens == padded_seq_lens
    assert len(slot_mapping) == len(input_tokens)
    assert len(cross_slot_mapping) == len(encoder_input_tokens)

    # Verify attention metadata
    device = model_runner.device
    assert attn_metadata.num_prefills == 0
    assert attn_metadata.num_decode_tokens > 0
    assert torch.equal(
        attn_metadata.seq_lens_tensor,
        torch.tensor(padded_seq_lens, device=device, dtype=torch.int))
    assert attn_metadata.seq_lens == padded_seq_lens
    assert attn_metadata.max_prefill_seq_len == 0
    assert attn_metadata.max_decode_seq_len == max(seq_lens)
    # - Encoder attention metadata
    assert attn_metadata.encoder_seq_lens == padded_encoder_seq_lens
    assert torch.equal(
        attn_metadata.encoder_seq_lens_tensor,
        torch.tensor(padded_encoder_seq_lens, device=device, dtype=torch.int))
    assert attn_metadata.max_encoder_seq_len == max(padded_encoder_seq_lens)
    assert attn_metadata.num_encoder_tokens == sum(padded_encoder_seq_lens)

    # Verify block tables are correct for the decode phase
    # - Decoder self-attention. Pad the block tables as expected: padding
    #   slots get an empty block table.
    # NOTE(review): max_len=64 presumably matches the runner's padded
    # block-table width for this configuration - confirm.
    flattened_block_tables = [
        block_table for _ in range(len(seq_group_metadata_list))
        for block_table in block_tables.values()
    ]
    flattened_block_tables.extend([[] for _ in range(cuda_graph_pad_size)])
    expected = make_tensor_with_pad(
        flattened_block_tables,
        max_len=64,
        pad=0,
        dtype=torch.int32,
        device=device,
    )
    assert torch.equal(
        attn_metadata.block_tables,
        expected,
    )
    # - Encoder/decoder cross-attention. Pad the cross-attention block tables
    #   as expected.
    cross_block_tables = [
        cross_block_table for seq_group_metadata in seq_group_metadata_list
        for _ in range(len(seq_group_metadata.seq_data))
    ]
    cross_block_tables.extend([[] for _ in range(cuda_graph_pad_size)])
    expected = make_tensor_with_pad(
        cross_block_tables,
        max_len=64,
        pad=0,
        dtype=torch.int32,
        device=device,
    )
    assert torch.equal(
        attn_metadata.cross_block_tables,
        expected,
    )

    # Model runner's CUDAGraph setting should be propagated to attention
    # metadata.
    assert attn_metadata.use_cuda_graph is True

    # Verify the lengths of input tokens & positions
    # - Decoder
    assert len(input_tokens) == len(padded_seq_lens)
    assert len(input_positions) == len(padded_seq_lens)
    # -- An indirect check that model_input.input_tokens
    #    and model_input.input_positions are correct -
    #    by design of the test, the input tokens are
    #    equal to the input position values, so if
    #    the model_input data structure has the correct
    #    values then these two should be equal
    assert torch.equal(
        input_tokens,
        input_positions,
    )
    # - Encoder: no encoder inputs are expected in the decode phase, for
    #   the tokens and for the positions alike.
    assert len(encoder_input_tokens) == 0
    assert len(encoder_input_positions) == 0
    # -- An indirect check that model_input.encoder_input_tokens
    #    and model_input.encoder_input_positions are correct -
    #    by design of the test, the input tokens are
    #    equal to the input position values, so if
    #    the model_input data structure has the correct
    #    values then these two should be equal
    assert torch.equal(
        encoder_input_tokens,
        encoder_input_positions,
    )
|
||||
@ -1,5 +1,21 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
print("vLLM is now using 'uv' to disable build isolation for 'torch'.")
|
||||
print("Please instead install vLLM with 'uv pip install -e .' (must use 'uv')")
|
||||
import glob
|
||||
|
||||
# Every requirements file plus pyproject.toml is scanned for torch pins.
requires_files = glob.glob('requirements/*.txt')
requires_files += ["pyproject.toml"]
for file in requires_files:
    print(f">>> cleaning {file}")
    with open(file) as f:
        lines = f.readlines()
    # Only rewrite a file when it mentions torch at all (case-insensitive).
    if "torch" in "".join(lines).lower():
        print("removed:")
        with open(file, 'w') as f:
            for line in lines:
                if 'torch' in line.lower():
                    # Dropped from the file; echo it so the removal is
                    # visible in the output.
                    print(line.strip())
                    continue
                f.write(line)
    print(f"<<< done cleaning {file}")
    print()
|
||||
@ -117,13 +117,14 @@ def paged_attention_rocm(
|
||||
k_scale: torch.Tensor,
|
||||
v_scale: torch.Tensor,
|
||||
fp8_out_scale: Optional[torch.Tensor] = None,
|
||||
mfma_type: str = "fp8" if envs.VLLM_ROCM_FP8_MFMA_PAGE_ATTN else "f16",
|
||||
) -> None:
|
||||
torch.ops._rocm_C.paged_attention(out, exp_sum, max_logits, tmp_out, query,
|
||||
key_cache, value_cache, num_kv_heads,
|
||||
scale, block_tables, seq_lens,
|
||||
query_start_loc, block_size, max_seq_len,
|
||||
alibi_slopes, kv_cache_dtype, k_scale,
|
||||
v_scale, fp8_out_scale)
|
||||
v_scale, fp8_out_scale, mfma_type)
|
||||
|
||||
|
||||
def mla_decode_kvcache_cpu(
|
||||
@ -2010,3 +2011,27 @@ def onednn_scaled_mm(
|
||||
input_zp_adj, bias, dnnl_handler.handler)
|
||||
|
||||
return output
|
||||
|
||||
|
||||
def hadacore_transform(x: torch.Tensor, inplace: bool = True) -> torch.Tensor:
    """
    Perform Hadamard transforms using [Hadacore](https://arxiv.org/abs/2412.08832)
    kernels. Note that these kernels exploit the recursive properties of
    Sylvester Hadamards, and therefore do not require transform weight data.

    Note that Sylvester Hadamard transforms are also symmetric, which means
    that this function also applies the (transpose <=> inverse) transform.

    :param x: value to be transformed
    :param inplace: modify value in place
    :return: value after transformation
    """
    return torch.ops._C.hadacore_transform(x, inplace)
|
||||
|
||||
|
||||
# Only register the fake implementation when the compiled op is present.
if hasattr(torch.ops._C, "hadacore_transform"):

    @register_fake("_C::hadacore_transform")
    def _hadacore_transform_fake(x: torch.Tensor,
                                 inplace: bool) -> torch.Tensor:
        """Fake counterpart registered for ``_C::hadacore_transform``."""
        # In-place calls hand back the input itself; otherwise an
        # uninitialized tensor shaped like the input stands in for the result.
        if inplace:
            return x
        return torch.empty_like(x)
|
||||
|
||||
@ -13,7 +13,7 @@ logger = init_logger(__name__)
|
||||
try:
|
||||
import intel_extension_for_pytorch as ipex
|
||||
except ImportError as e:
|
||||
logger.warning("Import error msg: %s", e.msg)
|
||||
logger.debug("Import error msg: %s", e.msg)
|
||||
|
||||
|
||||
class ipex_ops:
|
||||
|
||||
@ -76,7 +76,7 @@ def video_to_pil_images_list(path: str,
|
||||
return [Image.fromarray(frame) for frame in frames]
|
||||
|
||||
|
||||
def video_get_metadata(path: str) -> dict[str, Any]:
|
||||
def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]:
|
||||
cap = cv2.VideoCapture(path)
|
||||
if not cap.isOpened():
|
||||
raise ValueError(f"Could not open video file {path}")
|
||||
@ -85,11 +85,18 @@ def video_get_metadata(path: str) -> dict[str, Any]:
|
||||
fps = cap.get(cv2.CAP_PROP_FPS)
|
||||
duration = total_frames / fps if fps > 0 else 0
|
||||
|
||||
if num_frames == -1 or num_frames > total_frames:
|
||||
num_frames = total_frames
|
||||
|
||||
metadata = {
|
||||
"total_num_frames": total_frames,
|
||||
"total_num_frames": num_frames,
|
||||
"fps": fps,
|
||||
"duration": duration,
|
||||
"video_backend": "opencv"
|
||||
"video_backend": "opencv",
|
||||
"frames_indices": list(range(num_frames)),
|
||||
# extra field used to control hf processor's video
|
||||
# sampling behavior
|
||||
"do_sample_frames": num_frames == total_frames,
|
||||
}
|
||||
return metadata
|
||||
|
||||
@ -126,7 +133,7 @@ class VideoAsset:
|
||||
|
||||
@property
|
||||
def metadata(self) -> dict[str, Any]:
|
||||
ret = video_get_metadata(self.video_path)
|
||||
ret = video_get_metadata(self.video_path, self.num_frames)
|
||||
return ret
|
||||
|
||||
def get_audio(self, sampling_rate: Optional[float] = None) -> npt.NDArray:
|
||||
|
||||
@ -17,7 +17,6 @@ from vllm.attention.backends.mla.common import (MLACommonBackend,
|
||||
from vllm.attention.ops.flashmla import (flash_mla_with_kvcache,
|
||||
get_mla_metadata,
|
||||
is_flashmla_supported)
|
||||
from vllm.platforms.cuda import CudaPlatform
|
||||
|
||||
|
||||
class FlashMLABackend(MLACommonBackend):
|
||||
@ -179,18 +178,8 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
|
||||
logits_soft_cap, attn_type,
|
||||
kv_sharing_target_layer_name, **mla_args)
|
||||
|
||||
assert is_flashmla_supported(), \
|
||||
"FlashMLA is not supported on this device"
|
||||
|
||||
# disallow FlashMLA on NVIDIA Blackwell (SM 10.0+) GPUs
|
||||
# context:
|
||||
# https://github.com/deepseek-ai/FlashMLA/issues/83
|
||||
# https://github.com/vllm-project/vllm/issues/24513
|
||||
if CudaPlatform.has_device_capability(100):
|
||||
raise NotImplementedError(
|
||||
"FlashMLA is temporarily disabled on Blackwell (SM 10.0). "
|
||||
"Please use CUTLASS_MLA or TRITON_MLA instead. "
|
||||
"Example: `export VLLM_ATTENTION_BACKEND=CUTLASS_MLA`")
|
||||
is_supported, reason = is_flashmla_supported()
|
||||
assert is_supported, reason
|
||||
|
||||
unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
|
||||
if any(unsupported_features):
|
||||
|
||||
@ -25,7 +25,7 @@ from vllm.model_executor.layers.quantization.base_config import (
|
||||
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
|
||||
from vllm.model_executor.models.vision import get_vit_attn_backend
|
||||
from vllm.platforms import _Backend, current_platform
|
||||
from vllm.utils import direct_register_custom_op
|
||||
from vllm.utils import GiB_bytes, direct_register_custom_op
|
||||
|
||||
logger = init_logger(__name__)
|
||||
USE_XFORMERS_OPS = None
|
||||
@ -225,9 +225,26 @@ class Attention(nn.Module, AttentionLayerBase):
|
||||
).parallel_config.pipeline_parallel_size)
|
||||
]
|
||||
|
||||
self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
|
||||
self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
|
||||
self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)
|
||||
try:
|
||||
self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT,
|
||||
dtype=torch.float32)
|
||||
self.k_range = torch.tensor(envs.K_SCALE_CONSTANT,
|
||||
dtype=torch.float32)
|
||||
self.v_range = torch.tensor(envs.V_SCALE_CONSTANT,
|
||||
dtype=torch.float32)
|
||||
except torch.cuda.OutOfMemoryError as e:
|
||||
logger.error(
|
||||
"Failed to initialize attention q/k/v range constants: %s", e)
|
||||
if torch.cuda.is_available():
|
||||
logger.debug("CUDA device: %s", torch.cuda.current_device())
|
||||
logger.debug("Allocated: %.2f GiB",
|
||||
torch.cuda.memory_allocated() / GiB_bytes)
|
||||
logger.debug("Reserved: %.2f GiB",
|
||||
torch.cuda.memory_reserved() / GiB_bytes)
|
||||
raise RuntimeError(
|
||||
"Failed to initialize q/k/v range constants. "
|
||||
"This may be caused by insufficient memory to allocate "
|
||||
"kv cache.") from e
|
||||
|
||||
def forward(
|
||||
self,
|
||||
|
||||
@ -11,6 +11,7 @@ generation. Supported dataset types include:
|
||||
- HuggingFace
|
||||
- VisionArena
|
||||
"""
|
||||
import argparse
|
||||
import ast
|
||||
import base64
|
||||
import io
|
||||
@ -1019,6 +1020,25 @@ class ShareGPTDataset(BenchmarkDataset):
|
||||
return samples
|
||||
|
||||
|
||||
class _ValidateDatasetArgs(argparse.Action):
    """Argparse action to validate dataset name and path compatibility."""

    def __call__(self, parser, namespace, values, option_string=None):
        # Store the parsed value first so the check below sees it.
        setattr(namespace, self.dest, values)

        # The check runs every time either option is parsed, against
        # whatever the namespace holds at that moment.
        # NOTE(review): if --dataset-path is parsed before --dataset-name,
        # the name seen here is presumably still its default - confirm that
        # this ordering is handled as intended.
        path = getattr(namespace, 'dataset_path', None)
        if path is None:
            return
        if getattr(namespace, 'dataset_name', 'random') != "random":
            return

        # A dataset path combined with the 'random' dataset is rejected.
        parser.error(
            "Cannot use 'random' dataset with --dataset-path. "
            "Please specify the appropriate --dataset-name (e.g., "
            "'sharegpt', 'custom', 'sonnet') for your dataset file: "
            f"{path}"
        )
|
||||
|
||||
|
||||
def add_dataset_parser(parser: FlexibleArgumentParser):
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument(
|
||||
@ -1031,6 +1051,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
|
||||
"--dataset-name",
|
||||
type=str,
|
||||
default="random",
|
||||
action=_ValidateDatasetArgs,
|
||||
choices=[
|
||||
"sharegpt", "burstgpt", "sonnet", "random", "random-mm", "hf",
|
||||
"custom", "prefix_repetition", "spec_bench"
|
||||
@ -1046,6 +1067,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
|
||||
"--dataset-path",
|
||||
type=str,
|
||||
default=None,
|
||||
action=_ValidateDatasetArgs,
|
||||
help="Path to the sharegpt/sonnet dataset. "
|
||||
"Or the huggingface dataset ID if using HF dataset.",
|
||||
)
|
||||
|
||||
@ -62,9 +62,6 @@ class NoOpEliminationPass(VllmInductorPass):
|
||||
scaled_mm: "f16[s0, 4096]" = ...
|
||||
at = auto_functionalized(fused_add_rms_norm, input = scaled_mm, ...)
|
||||
out: "f16[s0, 4096]" = at[1]
|
||||
|
||||
TODO(luka): This is currently tested in test_fusion,
|
||||
but separate tests could be good.
|
||||
"""
|
||||
|
||||
def __call__(self, graph: torch.fx.Graph):
|
||||
@ -96,17 +93,19 @@ class NoOpEliminationPass(VllmInductorPass):
|
||||
# Invalid reshape args, skip
|
||||
continue
|
||||
|
||||
if self.all_dims_equivalent(shape, input_shape):
|
||||
if self.reshape_all_dims_equivalent(shape, input_shape):
|
||||
node.replace_all_uses_with(input)
|
||||
graph.erase_node(node)
|
||||
count += 1
|
||||
|
||||
elif is_func(node, torch.ops.aten.slice.Tensor):
|
||||
# python slicing semantics are different from reshape
|
||||
# Don't treat -1 as inferred dimension
|
||||
input, dim_index, start, end = node.args[:4]
|
||||
input_shape = input.meta["val"].shape
|
||||
i_dim = input_shape[dim_index]
|
||||
output_shape = node.meta["val"].shape
|
||||
|
||||
if start == 0 and self.dims_equivalent(end, i_dim):
|
||||
if output_shape == input_shape:
|
||||
node.replace_all_uses_with(input)
|
||||
graph.erase_node(node)
|
||||
count += 1
|
||||
@ -116,14 +115,7 @@ class NoOpEliminationPass(VllmInductorPass):
|
||||
base_shape = base.meta["val"].shape
|
||||
view_shape = view.meta["val"].shape
|
||||
|
||||
view_dim = view_shape[dim_index]
|
||||
|
||||
# Check that view fully covers base and the full view is used
|
||||
# (if the view fully covered the base after slicing but was not
|
||||
# fully used, we could replace slice_scatter with a simple slice
|
||||
# but that's a niche case).
|
||||
if (base_shape == view_shape and start == 0
|
||||
and self.dims_equivalent(end, view_dim)):
|
||||
if base_shape == view_shape:
|
||||
node.replace_all_uses_with(view)
|
||||
graph.erase_node(node)
|
||||
count += 1
|
||||
@ -132,13 +124,9 @@ class NoOpEliminationPass(VllmInductorPass):
|
||||
self.dump_graph(graph, "after_noop_elimination")
|
||||
self.end_and_log()
|
||||
|
||||
def all_dims_equivalent(self, dims: Iterable[Union[int, torch.fx.Node]],
|
||||
i_dims: Iterable[Union[int, SymInt]]):
|
||||
return all(
|
||||
self.dims_equivalent(s, i_s) for s, i_s in zip(dims, i_dims))
|
||||
|
||||
def dims_equivalent(self, dim: Union[int, torch.fx.Node],
|
||||
i_dim: Union[int, SymInt]) -> bool:
|
||||
# ---------------------- Reshape helpers ----------------------
|
||||
def reshape_dims_equivalent(self, dim: Union[int, torch.fx.Node],
|
||||
i_dim: Union[int, SymInt]) -> bool:
|
||||
"""
|
||||
This function checks if two dimensions are equivalent.
|
||||
:param dim: The dimension arg to reshape/slice
|
||||
@ -156,10 +144,18 @@ class NoOpEliminationPass(VllmInductorPass):
|
||||
In case 3, the reshape dimension is a torch.fx.Node,
|
||||
and its value is a SymInt. That value is equal to the
|
||||
input dimension.
|
||||
|
||||
"""
|
||||
# Case 1 and 2
|
||||
if dim == i_dim or dim == -1:
|
||||
return True
|
||||
# Case 3
|
||||
return isinstance(dim, torch.fx.Node) and dim.meta["val"] == i_dim
|
||||
|
||||
def reshape_all_dims_equivalent(
    self,
    dims: Iterable[Union[int, torch.fx.Node]],
    i_dims: Iterable[Union[int, SymInt]],
) -> bool:
    """
    Check reshape dims against input dims, pair by pair.

    :param dims: The dimension args passed to reshape
    :param i_dims: The corresponding dimensions of the input
    :return: False as soon as reshape_dims_equivalent rejects a pair,
        True otherwise. Pairs come from zip, so extra trailing entries in
        the longer iterable are not compared.
    """
    for requested, actual in zip(dims, i_dims):
        if not self.reshape_dims_equivalent(requested, actual):
            return False
    return True
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user