diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 86aae426c258c..92a1bcada3879 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -62,12 +62,8 @@ steps: env: DOCKER_BUILDKIT: "1" - - block: "Build release image (x86)" - depends_on: ~ - key: block-release-image-build - - label: "Build release image (x86)" - depends_on: block-release-image-build + depends_on: ~ id: build-release-image-x86 agents: queue: cpu_queue_postmerge @@ -80,7 +76,7 @@ steps: - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - label: "Build release image (arm64)" - depends_on: block-release-image-build + depends_on: ~ id: build-release-image-arm64 agents: queue: arm64_cpu_queue_postmerge diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index df0bae0c9cbff..c395011a24485 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -164,7 +164,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then --ignore=entrypoints/llm/test_chat.py \ --ignore=entrypoints/llm/test_accuracy.py \ --ignore=entrypoints/llm/test_init.py \ - --ignore=entrypoints/llm/test_generate_multiple_loras.py \ --ignore=entrypoints/llm/test_prompt_validation.py "} fi diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 8b8f0e8c6578d..0f734763f13fd 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . # Run the image, setting --shm-size=4g for tensor parallel. 
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" -docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 function cpu_tests() { set -e @@ -89,17 +89,33 @@ function cpu_tests() { pytest -x -s -v \ tests/lora/test_qwen2vl.py" - # online serving + # online serving: tp+pp docker exec cpu-test-"$NUMA_NODE" bash -c ' set -e VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & + server_pid=$! timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 vllm bench serve \ --backend vllm \ --dataset-name random \ --model meta-llama/Llama-3.2-3B-Instruct \ --num-prompts 20 \ - --endpoint /v1/completions' + --endpoint /v1/completions + kill -s SIGTERM $server_pid &' + + # online serving: tp+dp + docker exec cpu-test-"$NUMA_NODE" bash -c ' + set -e + VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 & + server_pid=$! + timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 + vllm bench serve \ + --backend vllm \ + --dataset-name random \ + --model meta-llama/Llama-3.2-3B-Instruct \ + --num-prompts 20 \ + --endpoint /v1/completions + kill -s SIGTERM $server_pid &' } # All of CPU tests are expected to be finished less than 40 mins. 
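The two serving steps added above bring the server up with `vllm serve meta-llama/Llama-3.2-3B-Instruct` (TP+PP and TP+DP) and poll `localhost:8000/v1/models` with curl before benchmarking. For readers who want to drive the same OpenAI-compatible endpoint from Python, here is a minimal client-side sketch; it assumes a server from one of those steps is already running on port 8000 and that the `openai` package is installed. It is only an illustration, not part of the CI script.

```python
# Hedged sketch: query the OpenAI-compatible server started by the CI step
# above (assumes it is reachable on localhost:8000 and `openai` is installed).
import time

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Equivalent of the `until curl localhost:8000/v1/models` readiness loop.
for _ in range(600):
    try:
        client.models.list()
        break
    except Exception:
        time.sleep(1)

# One request against the same /v1/completions endpoint the benchmark uses.
resp = client.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    prompt="The capital of France is",
    max_tokens=16,
)
print(resp.choices[0].text)
```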
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 24cc57e9dfb97..482808cd07e8c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -109,10 +109,9 @@ steps: - tests/entrypoints/offline_mode commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - label: Entrypoints Test (API Server) # 40min @@ -326,7 +325,7 @@ steps: source_file_dependencies: - vllm/lora - tests/lora - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py + command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py parallelism: 4 - label: PyTorch Compilation Unit Tests @@ -463,8 +462,8 @@ steps: - tests/quantization commands: # temporary install here since we need nightly, will move to requirements/test.in - # after torchao 0.12 release - - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + # after torchao 0.12 release, and pin a working version of torchao nightly here + - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization - label: LM Eval Small Models # 53min @@ -668,6 +667,7 @@ steps: # Quantization - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py @@ -677,6 +677,7 @@ steps: - pytest -v -s tests/compile/test_fusion_all_reduce.py - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py ##### 1 GPU test ##### ##### multi gpus test ##### @@ -805,13 +806,13 @@ steps: # requires multi-GPU testing for validation. 
- pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_multi_loras_with_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py - label: Weight Loading Multiple GPU Test # 33min mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" - num_gpus: 2 + num_gpus: 2 optional: true source_file_dependencies: - vllm/ diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml index 6401d6586cc3d..e0ab3872d8fa3 100644 --- a/.github/workflows/issue_autolabel.yml +++ b/.github/workflows/issue_autolabel.yml @@ -49,6 +49,10 @@ jobs: term: "VLLM_ROCM_", searchIn: "both" }, + { + term: "aiter", + searchIn: "title" + }, { term: "rocm", searchIn: "title" diff --git a/CMakeLists.txt b/CMakeLists.txt index b0eb0f32e03a5..3f1f9a781a07a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1 # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from docker/Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1") -set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0") +set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0") # # Try to find python package with an executable that exactly matches @@ -541,6 +541,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) set(SRCS "csrc/quantization/fp4/nvfp4_quant_kernels.cu" + "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" @@ -559,6 +560,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) set(SRCS "csrc/quantization/fp4/nvfp4_quant_kernels.cu" + "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu" "csrc/quantization/fp4/nvfp4_experts_quant.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu" "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu") diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py index 883f0cf7e55f1..9663503e9baa0 100644 --- a/benchmarks/kernels/bench_block_fp8_gemm.py +++ b/benchmarks/kernels/bench_block_fp8_gemm.py @@ -16,6 +16,7 @@ assert current_platform.is_cuda(), ( # DeepSeek-V3 weight shapes DEEPSEEK_V3_SHAPES = [ (512 + 64, 7168), + (2112, 7168), ((128 + 64) * 128, 7168), (128 * (128 + 128), 512), (7168, 16384), diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index e648a91077fdb..98bde9d83c82d 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -141,6 +141,7 @@ def get_weight_shapes(tp_size): # cannot TP total = [ (512 + 64, 7168), + (2112, 7168), ((128 + 64) * 128, 7168), (128 * (128 + 128), 512), (7168, 16384), diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index fc82a1fa8ed78..fbb022464ef27 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -913,7 +913,6 @@ __global__ void cp_gather_cache( const int32_t split_end = min((split + 1) * split_slots, tot_slots); const bool is_active_split = (split_start < tot_slots); - const bool is_last_split = (split_end == tot_slots); if (!is_active_split) return; diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index f7b75c48373f6..2728aa81f0c9f 100644 --- a/csrc/dispatch_utils.h +++ 
b/csrc/dispatch_utils.h @@ -19,6 +19,13 @@ #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) +#define VLLM_DISPATCH_CASE_HALF_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_HALF_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_HALF_TYPES(__VA_ARGS__)) + // ROCm devices might use either fn or fnuz, so set up dispatch table for both. // A host-based check at runtime will create a preferred FP8 type for ROCm // such that the correct kernel is dispatched. @@ -45,6 +52,15 @@ #define VLLM_DISPATCH_FP8_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FP8_TYPES(__VA_ARGS__)) +#define AT_DISPATCH_BYTE_CASE(enum_type, ...) \ + AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, byte_t, __VA_ARGS__) + +#define VLLM_DISPATCH_CASE_BYTE_TYPES(...) \ + AT_DISPATCH_BYTE_CASE(at::ScalarType::Byte, __VA_ARGS__) + +#define VLLM_DISPATCH_BYTE_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_BYTE_TYPES(__VA_ARGS__)) + #define VLLM_DISPATCH_QUANT_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_QUANT_TYPES(__VA_ARGS__)) diff --git a/csrc/ops.h b/csrc/ops.h index 86fe848e2fd5a..7a176a5c00322 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -130,6 +130,14 @@ void silu_and_mul(torch::Tensor& out, torch::Tensor& input); void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input, torch::Tensor& scale); +#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ + (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) +void silu_and_mul_nvfp4_quant(torch::Tensor& out, + torch::Tensor& output_block_scale, + torch::Tensor& input, + torch::Tensor& input_global_scale); +#endif + void mul_and_silu(torch::Tensor& out, torch::Tensor& input); void gelu_and_mul(torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu new file mode 100644 index 0000000000000..9bbeb0334fb9a --- /dev/null +++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include + +#include +#include + +#include +#include "dispatch_utils.h" + +#include "cuda_utils.h" + +namespace vllm { + +// Get type2 from type or vice versa (applied to half and bfloat16) +template +struct TypeConverter { + using Type = half2; +}; // keep for generality + +template <> +struct TypeConverter { + using Type = c10::Half; +}; + +template <> +struct TypeConverter { + using Type = half2; +}; + +template <> +struct TypeConverter<__nv_bfloat162> { + using Type = c10::BFloat16; +}; + +template <> +struct TypeConverter { + using Type = __nv_bfloat162; +}; + +#define ELTS_PER_THREAD 8 + +constexpr int CVT_FP4_ELTS_PER_THREAD = 8; +constexpr int CVT_FP4_SF_VEC_SIZE = 16; + +// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t). +inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + uint32_t val; + asm volatile( + "{\n" + ".reg .b8 byte0;\n" + ".reg .b8 byte1;\n" + ".reg .b8 byte2;\n" + ".reg .b8 byte3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" + "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" + "}" + : "=r"(val) + : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]), + "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7])); + return val; +#else + return 0; +#endif +} + +// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t). +inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + uint32_t val; + asm volatile( + "{\n" + ".reg .b8 byte0;\n" + ".reg .b8 byte1;\n" + ".reg .b8 byte2;\n" + ".reg .b8 byte3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" + "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" + "}" + : "=r"(val) + : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y), + "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y)); + return val; +#else + return 0; +#endif +} + +// Fast reciprocal. +inline __device__ float reciprocal_approximate_ftz(float a) { + float b; + asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a)); + return b; +} + +template +__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx, + int numCols, + SFType* SFout) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || + CVT_FP4_NUM_THREADS_PER_SF == 2); + + // One pair of threads write one SF to global memory. + // TODO: stage through smem for packed STG.32 + // is it better than STG.8 from 4 threads ? + if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) { + // SF vector index (16 elements share one SF in the K dimension). + int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; + int32_t mIdx = rowIdx; + + // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] + // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx] + + int32_t mTileIdx = mIdx / (32 * 4); + // SF vector size 16. + int factor = CVT_FP4_SF_VEC_SIZE * 4; + int32_t numKTiles = (numCols + factor - 1) / factor; + int64_t mTileStride = numKTiles * 32 * 4 * 4; + + int32_t kTileIdx = (kIdx / 4); + int64_t kTileStride = 32 * 4 * 4; + + // M tile layout [32, 4] is column-major. 
+ int32_t outerMIdx = (mIdx % 32); + int64_t outerMStride = 4 * 4; + + int32_t innerMIdx = (mIdx % (32 * 4)) / 32; + int64_t innerMStride = 4; + + int32_t innerKIdx = (kIdx % 4); + int64_t innerKStride = 1; + + // Compute the global offset. + int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride + + outerMIdx * outerMStride + innerMIdx * innerMStride + + innerKIdx * innerKStride; + + return reinterpret_cast(SFout) + SFOffset; + } +#endif + return nullptr; +} + +// Define a 16 bytes packed data type. +template +struct PackedVec { + typename TypeConverter::Type elts[4]; +}; + +template <> +struct PackedVec<__nv_fp8_e4m3> { + __nv_fp8x2_e4m3 elts[8]; +}; + +template +__inline__ __device__ PackedVec compute_silu(PackedVec& vec, + PackedVec& vec2) { + PackedVec result; +#pragma unroll + for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; ++i) { + if constexpr (std::is_same_v) { + half2 val(0.5f, 0.5f); + half2 t0 = __hmul2(vec.elts[i], val); + half2 t1 = __hfma2(h2tanh(t0), val, val); + half2 t2 = __hmul2(vec.elts[i], t1); + result.elts[i] = __hmul2(t2, vec2.elts[i]); + } else { + __nv_bfloat162 val(0.5f, 0.5f); + __nv_bfloat162 t0 = __hmul2(vec.elts[i], val); + __nv_bfloat162 t1 = __hfma2(h2tanh(t0), val, val); + __nv_bfloat162 t2 = __hmul2(vec.elts[i], t1); + result.elts[i] = __hmul2(t2, vec2.elts[i]); + } + } + return result; +} + +// Quantizes the provided PackedVec into the uint32_t output +template +__device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec& vec, + PackedVec& vec2, + float SFScaleVal, + uint8_t* SFout) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + PackedVec out_silu = compute_silu(vec, vec2); + // Get absolute maximum values among the local 8 values. + auto localMax = __habs2(out_silu.elts[0]); + + // Local maximum value. + #pragma unroll + for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { + localMax = __hmax2(localMax, __habs2(out_silu.elts[i])); + } + + // Get the absolute maximum among all 16 values (two threads). + localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax); + // Get the final absolute maximum values. + float vecMax = float(__hmax(localMax.x, localMax.y)); + + // Get the SF (max value of the vector / max value of e2m1). + // maximum value of e2m1 = 6.0. + // TODO: use half as compute data type. + float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f)); + // 8 bits representation of the SF. + uint8_t fp8SFVal; + // Write the SF to global memory (STG.8). + if constexpr (UE8M0_SF) { + // Extract the 8 exponent bits from float32. + // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits. + uint32_t tmp = reinterpret_cast(SFValue) >> 23; + fp8SFVal = tmp & 0xff; + // Convert back to fp32. + reinterpret_cast(SFValue) = tmp << 23; + } else { + // Here SFValue is always positive, so E4M3 is the same as UE4M3. + __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue); + reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp; + // Convert back to fp32. + SFValue = float(tmp); + } + // Get the output scale. + // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) * + // reciprocal(SFScaleVal)) + float outputScale = + SFValue != 0 ? reciprocal_approximate_ftz( + SFValue * reciprocal_approximate_ftz(SFScaleVal)) + : 0.0f; + + if (SFout) { + // Write the SF to global memory (STG.8). + *SFout = fp8SFVal; + } + + // Convert the input to float. 
+ float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2]; + + #pragma unroll + for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { + if constexpr (std::is_same_v) { + fp2Vals[i] = __half22float2(out_silu.elts[i]); + } else { + fp2Vals[i] = __bfloat1622float2(out_silu.elts[i]); + } + fp2Vals[i].x *= outputScale; + fp2Vals[i].y *= outputScale; + } + + // Convert to e2m1 values. + uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals); + + // Write the e2m1 values to global memory. + return e2m1Vec; +#else + return 0; +#endif +} + +// Use UE4M3 by default. +template +__global__ void +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) +__launch_bounds__(1024, 4) silu_and_cvt_fp16_to_fp4( +#else +silu_and_cvt_fp16_to_fp4( +#endif + int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, + uint32_t* out, uint32_t* SFout) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + using PackedVec = PackedVec; + static constexpr int CVT_FP4_NUM_THREADS_PER_SF = + (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); + static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, + "Vec size is not matched."); + + // Get the global scaling factor, which will be applied to the SF. + // Note SFScale is the same as next GEMM's alpha, which is + // (448.f / (Alpha_A / 6.f)). + float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0]; + + // Input tensor row/col loops. + for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) { + for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD; + colIdx += blockDim.x) { + int64_t inOffset = + rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) + colIdx; + int64_t inOffset2 = rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) + + numCols / CVT_FP4_ELTS_PER_THREAD + colIdx; + PackedVec in_vec = reinterpret_cast(in)[inOffset]; + PackedVec in_vec2 = reinterpret_cast(in)[inOffset2]; + + // Get the output tensor offset. + // Same as inOffset because 8 elements are packed into one uint32_t. 
+ int64_t outOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx; + ; + auto& out_pos = out[outOffset]; + + auto sf_out = + cvt_quant_to_fp4_get_sf_out_offset( + rowIdx, colIdx, numCols, SFout); + + out_pos = silu_and_cvt_warp_fp16_to_fp4( + in_vec, in_vec2, SFScaleVal, sf_out); + } + } +#endif +} + +} // namespace vllm + +void silu_and_mul_nvfp4_quant(torch::Tensor& output, // [..., d] + torch::Tensor& output_sf, + torch::Tensor& input, // [..., 2 * d] + torch::Tensor& input_sf) { + TORCH_CHECK(input.dtype() == torch::kFloat16 || + input.dtype() == torch::kBFloat16); + int32_t m = input.size(0); + int32_t n = input.size(1) / 2; + TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16."); + int multiProcessorCount = + get_device_attribute(cudaDevAttrMultiProcessorCount, -1); + auto input_sf_ptr = static_cast(input_sf.data_ptr()); + auto sf_out = static_cast(output_sf.data_ptr()); + auto output_ptr = static_cast(output.data_ptr()); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + auto stream = at::cuda::getCurrentCUDAStream(input.get_device()); + dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024)); + int const numBlocksPerSM = 2048 / block.x; + dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM)); + VLLM_DISPATCH_HALF_TYPES( + input.scalar_type(), "act_and_mul_quant_kernel", [&] { + auto input_ptr = reinterpret_cast(input.data_ptr()); + VLLM_DISPATCH_BYTE_TYPES( + output.scalar_type(), "fused_act_and_mul_quant_kernel_nvfp4_type", + [&] { + vllm::silu_and_cvt_fp16_to_fp4 + <<>>( + m, n, input_ptr, input_sf_ptr, + reinterpret_cast(output_ptr), + reinterpret_cast(sf_out)); + }); + }); +} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 608b724403076..56626a02c0277 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -115,6 +115,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()"); ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant); +#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ + (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) + ops.def( + "silu_and_mul_nvfp4_quant(Tensor! result, Tensor! result_block_scale, " + "Tensor input, Tensor input_global_scale) -> ()"); + ops.impl("silu_and_mul_nvfp4_quant", torch::kCUDA, &silu_and_mul_nvfp4_quant); +#endif + ops.def("mul_and_silu(Tensor! out, Tensor input) -> ()"); ops.impl("mul_and_silu", torch::kCUDA, &mul_and_silu); diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index b11ccb5c00273..2d8cdcc11fa99 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -175,7 +175,7 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u Known supported models: - Llama4 () -- MiniCPM-V-4 () +- MiniCPM-V-2.5 or above (, ) - Qwen2.5-VL () - Step3 () diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index 21b1f21d60a35..aafdb1058e03c 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -121,3 +121,31 @@ To support a model with interleaving sliding windows, we need to take care of th - In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171). 
With these two steps, interleaved sliding windows should work with the model. + +### How to support models that use Mamba? + +We consider 3 different scenarios: + +1. Models that use Mamba layers (either Mamba-1 or Mamba-2) but do not use attention layers. +2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers. +3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers. + +For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference. +The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config. +For the Mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes. +Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations. +V0-only classes and code will be removed in the very near future. +The model should also be added to the `MODELS_CONFIG_MAP` dictionary in to ensure that the runtime defaults are optimized. + +For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together). +These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol). + +For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](gh-file:vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](gh-file:vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively. +Please follow the same guidelines as case (2) for implementing these models. +We use "mamba-like" to refer to layers that possess a state that is updated in-place, rather than being appended-to (like KV cache for attention). +For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`. +It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers. +Please see [`LinearAttentionMetadata`](gh-file:vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](gh-file:v1/attention/backends/short_conv_attn.py) for examples of this. +Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it. +Please see the calls to `direct_register_custom_op` in or for examples of this.
+The new custom op should then be added to the list `_attention_ops` in to ensure that piecewise CUDA graphs works as intended. diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 9d51f9cf52f50..206ab7a468755 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -13,6 +13,41 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]: - `prompt`: The prompt should follow the format that is documented on HuggingFace. - `multi_modal_data`: This is a dictionary that follows the schema defined in [vllm.multimodal.inputs.MultiModalDataDict][]. +### Stable UUIDs for Caching (multi_modal_uuids) + +When using multi-modal inputs, vLLM normally hashes each media item by content to enable caching across requests. You can optionally pass `multi_modal_uuids` to provide your own stable IDs for each item so caching can reuse work across requests without rehashing the raw content. + +??? code + + ```python + from vllm import LLM + from PIL import Image + + # Qwen2.5-VL example with two images + llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct") + + prompt = "USER: \nDescribe the differences.\nASSISTANT:" + img_a = Image.open("/path/to/a.jpg") + img_b = Image.open("/path/to/b.jpg") + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": [img_a, img_b]}, + # Provide stable IDs for caching. + # Requirements (matched by this example): + # - Include every modality present in multi_modal_data. + # - For lists, provide the same number of entries. + # - Use None to fall back to content hashing for that item. + "multi_modal_uuids": {"image": ["sku-1234-a", None]}, + }) + + for o in outputs: + print(o.outputs[0].text) + ``` + +!!! warning + If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored. + ### Image Inputs You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples: diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index e76ec35e1edcb..7f0ecb2bc0b74 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -96,6 +96,7 @@ Currently, there are no pre-built CPU wheels. - `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`. - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads, can be set as CPU id lists or `auto` (by default). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node respectively. - `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `None`. 
If the value is not set and `auto` thread binding is used, no CPU will be reserved for `world_size == 1`, 1 CPU per rank will be reserved for `world_size > 1`. +- `CPU_VISIBLE_MEMORY_NODES`: specify visible NUMA memory nodes for vLLM CPU workers, similar to `CUDA_VISIBLE_DEVICES`. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. The variable provides more control for the auto thread-binding feature, such as masking nodes and changing the node binding sequence. - `VLLM_CPU_MOE_PREPACK` (x86 only): whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False). - `VLLM_CPU_SGL_KERNEL` (x86 only, Experimental): whether to use small-batch optimized kernels for linear layer and MoE layer, especially for low-latency requirements like online serving. The kernels require AMX instruction set, BFloat16 weight type and weight shapes divisible by 32. Default is `0` (False). @@ -179,7 +180,7 @@ Inference batch size is an important parameter for the performance. Larger batch - Offline Inference: `256 * world_size` - Online Serving: `128 * world_size` -vLLM CPU supports tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use TP and PP together if there are enough CPU sockets and memory nodes. +vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details on tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use DP, TP and PP together if there are enough CPU sockets and memory nodes. ### Which quantization configs does vLLM CPU support? diff --git a/docs/getting_started/installation/cpu/x86.inc.md b/docs/getting_started/installation/cpu/x86.inc.md index 6dc6f94249c34..f7af259ace628 100644 --- a/docs/getting_started/installation/cpu/x86.inc.md +++ b/docs/getting_started/installation/cpu/x86.inc.md @@ -43,7 +43,7 @@ docker build -f docker/Dockerfile.cpu \ # Launching OpenAI server docker run --rm \ - --privileged=true \ + --security-opt seccomp=unconfined \ --shm-size=4g \ -p 8000:8000 \ -e VLLM_CPU_KVCACHE_SPACE= \ diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 01c1090c6fca8..e8fe77e8d6c98 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -335,9 +335,9 @@ th { | `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ | | `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | | ✅︎ | ✅︎ | -| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | | ✅︎ | ✅︎ | -| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3`, etc.
| | ✅︎ | ✅︎ | +| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ | | `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ | diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 20234e7611333..f71805436a6ae 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -107,16 +107,14 @@ to enable simultaneous generation and embedding using the same engine instance i #### Mamba Models Models using selective state-space mechanisms instead of standard transformer attention are supported. -Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. -Please note that prefix caching is not yet supported for these models. +Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`, `FalconMambaForCausalLM`) are supported. -Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, +Hybrid models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, `Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). -Please note that prefix caching is not yet supported for these models. -Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`). -Please note that prefix caching is not yet supported for these models. -It is also necessary to enforce eager mode for these models in V1. +Hybrid models with mechanisms different from Mamba are also supported (e.g., `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`, `Lfm2ForCausalLM`). + +Please note that prefix caching is not yet supported for any of the above models.
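The Mamba support guide added to `docs/contributing/model/basic.md` earlier in this diff names the hooks a Mamba-only model must provide: the `IsAttentionFree` protocol, the `get_mamba_state_dtype_from_config` / `get_mamba_state_shape_from_config` class methods, and `MambaMixer2` for the layers. The skeleton below is only a hypothetical sketch of how those pieces fit together for case (1); the import path for `IsAttentionFree` and the exact method signatures are assumptions, not taken from this diff.

```python
# Hypothetical case (1) skeleton following the Mamba guide above.
# Only the hooks named in the docs are shown; a real model also needs the
# full decoder stack, forward(), and weight loading. Import paths marked
# "assumed" are not confirmed by this diff.
from torch import nn

from vllm.config import VllmConfig  # assumed location of VllmConfig
from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
from vllm.model_executor.models.interfaces import IsAttentionFree  # assumed


class MyMamba2ForCausalLM(nn.Module, IsAttentionFree):
    """Mamba-2-only model: no attention layers, state is updated in place."""

    @classmethod
    def get_mamba_state_dtype_from_config(cls, vllm_config: VllmConfig):
        # dtype(s) of the per-layer conv/SSM state, derived from the config.
        ...

    @classmethod
    def get_mamba_state_shape_from_config(cls, vllm_config: VllmConfig):
        # shape(s) of the per-layer state (conv state, SSM state).
        ...

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
        super().__init__()
        # One MambaMixer2 per decoder layer; do not use the deprecated
        # MambaCacheManager or any V0-only code path.
        ...
```

A hybrid (case 2) model would swap `IsAttentionFree` for `IsHybrid` and interleave attention layers, as the guide describes.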
#### Encoder-Decoder Models diff --git a/pyproject.toml b/pyproject.toml index 013f2a6cd59e4..e63f8aeae2787 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "packaging>=24.2", "setuptools>=77.0.3,<80.0.0", "setuptools-scm>=8.0", - "torch == 2.7.1", + "torch == 2.8.0", "wheel", "jinja2", ] diff --git a/requirements/build.txt b/requirements/build.txt index dd644d621efc1..5f826a1afa144 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -4,7 +4,8 @@ ninja packaging>=24.2 setuptools>=77.0.3,<80.0.0 setuptools-scm>=8 -torch==2.7.1 +torch==2.8.0 wheel jinja2>=3.1.6 regex +build diff --git a/requirements/cpu.txt b/requirements/cpu.txt index f4b95b72898cc..a48cb9fde000c 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -9,17 +9,16 @@ packaging>=24.2 setuptools>=77.0.3,<80.0.0 --extra-index-url https://download.pytorch.org/whl/cpu torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218 -torch==2.7.0; platform_system == "Darwin" -torch==2.7.0; platform_machine == "ppc64le" -torch==2.6.0; platform_machine == "aarch64" # for arm64 CPUs, torch 2.7.0 has a issue: https://github.com/vllm-project/vllm/issues/17960 +torch==2.8.0; platform_system == "Darwin" +torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" -torchaudio==2.7.0; platform_machine == "ppc64le" +torchaudio==2.8.0; platform_machine == "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" -torchvision==0.22.0; platform_machine == "ppc64le" +torchvision==0.23.0; platform_machine == "ppc64le" datasets # for benchmark scripts # Intel Extension for PyTorch, only for x86_64 CPUs diff --git a/requirements/cuda.txt b/requirements/cuda.txt index fb30e493f80b3..3f8b8fca3209a 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -6,9 +6,9 @@ numba == 0.61.2; python_version > '3.9' # Dependencies for NVIDIA GPUs ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1. -torch==2.7.1 -torchaudio==2.7.1 +torch==2.8.0 +torchaudio==2.8.0 # These must be updated alongside torch -torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -# https://github.com/facebookresearch/xformers/releases/tag/v0.0.31 -xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 \ No newline at end of file +torchvision==0.23.0 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1 +xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8 diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index cbae9bbb8a9b3..affe562c24f6b 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -1,10 +1,10 @@ # Common dependencies -r common.txt ---extra-index-url https://download.pytorch.org/whl/rocm6.2.4 -torch==2.7.0 -torchvision==0.22.0 -torchaudio==2.7.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.3 +torch==2.8.0 +torchvision==0.23.0 +torchaudio==2.8.0 triton==3.3.0 cmake>=3.26.1,<4 diff --git a/requirements/test.in b/requirements/test.in index 92c577c501632..5b1688c76c954 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -22,9 +22,9 @@ sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests timm >=1.0.17 # required for internvl and gemma3n-mm test -torch==2.7.1 -torchaudio==2.7.1 -torchvision==0.22.1 +torch==2.8.0 +torchaudio==2.8.0 +torchvision==0.23.0 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test mistral_common[image,audio] >= 1.8.2 # required for voxtral test diff --git a/requirements/test.txt b/requirements/test.txt index 0c27c9bb67e82..0b728ebfb0071 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -541,42 +541,42 @@ numpy==1.26.4 # tritonclient # vocos # xarray -nvidia-cublas-cu12==12.8.3.14 +nvidia-cublas-cu12==12.8.4.1 # via # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch -nvidia-cuda-cupti-cu12==12.8.57 +nvidia-cuda-cupti-cu12==12.8.90 # via torch -nvidia-cuda-nvrtc-cu12==12.8.61 +nvidia-cuda-nvrtc-cu12==12.8.93 # via torch -nvidia-cuda-runtime-cu12==12.8.57 +nvidia-cuda-runtime-cu12==12.8.90 # via torch -nvidia-cudnn-cu12==9.7.1.26 +nvidia-cudnn-cu12==9.10.2.21 # via torch -nvidia-cufft-cu12==11.3.3.41 +nvidia-cufft-cu12==11.3.3.83 # via torch -nvidia-cufile-cu12==1.13.0.11 +nvidia-cufile-cu12==1.13.1.3 # via torch -nvidia-curand-cu12==10.3.9.55 +nvidia-curand-cu12==10.3.9.90 # via torch -nvidia-cusolver-cu12==11.7.2.55 +nvidia-cusolver-cu12==11.7.3.90 # via torch -nvidia-cusparse-cu12==12.5.7.53 +nvidia-cusparse-cu12==12.5.8.93 # via # nvidia-cusolver-cu12 # torch -nvidia-cusparselt-cu12==0.6.3 +nvidia-cusparselt-cu12==0.7.1 # via torch -nvidia-nccl-cu12==2.26.2 +nvidia-nccl-cu12==2.27.3 # via torch -nvidia-nvjitlink-cu12==12.8.61 +nvidia-nvjitlink-cu12==12.8.93 # via # nvidia-cufft-cu12 # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 # torch -nvidia-nvtx-cu12==12.8.55 +nvidia-nvtx-cu12==12.8.90 # via torch omegaconf==2.3.0 # via @@ -1069,7 +1069,7 @@ tomli==2.2.1 # via schemathesis tomli-w==1.2.0 # via schemathesis -torch==2.7.1+cu128 +torch==2.8.0+cu128 # via # -r requirements/test.in # accelerate @@ -1098,7 +1098,7 @@ torch==2.7.1+cu128 # torchvision # vector-quantize-pytorch # vocos -torchaudio==2.7.1+cu128 +torchaudio==2.8.0+cu128 # via # -r requirements/test.in # encodec @@ -1111,7 +1111,7 @@ torchmetrics==1.7.4 # pytorch-lightning # terratorch # torchgeo -torchvision==0.22.1+cu128 +torchvision==0.23.0+cu128 # via # -r requirements/test.in # lightly @@ -1152,7 +1152,7 @@ transformers==4.55.2 # transformers-stream-generator transformers-stream-generator==0.0.5 # via -r requirements/test.in -triton==3.3.1 +triton==3.4.0 # via torch 
tritonclient==2.51.0 # via diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py index 0e1059e654479..fcc2589e42116 100644 --- a/tests/compile/test_silu_mul_quant_fusion.py +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -4,32 +4,41 @@ import pytest import torch import vllm.envs as envs -from vllm.compilation.activation_quant_fusion import ActivationQuantFusionPass -from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe +from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant +# yapf conflicts with isort for this block +# yapf: disable +from vllm.compilation.activation_quant_fusion import ( + FUSED_OPS, SILU_MUL_OP, ActivationQuantFusionPass) +# yapf: enable +from vllm.compilation.fusion import QUANT_OPS from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.config import CompilationConfig, PassConfig, VllmConfig from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.quantization.utils.quant_utils import ( - GroupShape) + GroupShape, kFp8StaticTensorSym, kNvfp4Quant) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( Fp8LinearOp) from vllm.platforms import current_platform from .backend import TestBackend +FP8_DTYPE = current_platform.fp8_dtype() +FP4_DTYPE = torch.uint8 -class TestModel(torch.nn.Module): - def __init__(self, hidden_size: int, force_fp8_e4m3fnuz: bool, *args, - **kwargs): - super().__init__(*args, **kwargs) +def is_nvfp4_supported(): + return current_platform.has_device_capability(100) + + +class TestSiluMulFp8QuantModel(torch.nn.Module): + + def __init__(self, hidden_size: int, force_fp8_e4m3fnuz: bool, **kwargs): + super().__init__() self.silu_and_mul = SiluAndMul() self.wscale = torch.rand(1, dtype=torch.float32) self.scale = torch.rand(1, dtype=torch.float32) - self.w = (torch.rand( - hidden_size, - hidden_size).to(dtype=current_platform.fp8_dtype()).t()) + self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() self.fp8_linear = Fp8LinearOp( force_fp8_e4m3fnuz=force_fp8_e4m3fnuz, @@ -45,14 +54,56 @@ class TestModel(torch.nn.Module): input_scale=self.wscale) return x2 + def ops_in_model_before(self): + return [SILU_MUL_OP, QUANT_OPS[kFp8StaticTensorSym]] -@pytest.mark.parametrize("num_tokens", [256]) -@pytest.mark.parametrize("hidden_size", [64]) + def ops_in_model_after(self): + return [FUSED_OPS[kFp8StaticTensorSym]] + + +class TestSiluMulNvfp4QuantModel(torch.nn.Module): + + def __init__(self, hidden_size: int, **kwargs): + super().__init__() + self.silu_and_mul = SiluAndMul() + self.w = torch.randint(256, (hidden_size, hidden_size // 2), + dtype=FP4_DTYPE) + self.wscale = torch.randn(hidden_size, + hidden_size // 16).to(dtype=FP8_DTYPE) + self.wscale2 = torch.rand(1, dtype=torch.float32) + self.scale = torch.rand(1, dtype=torch.float32) + + def forward(self, x): + y = self.silu_and_mul(x) + y_quant, y_block_scale = scaled_fp4_quant(y, 1 / self.scale) + out = cutlass_scaled_fp4_mm(a=y_quant, + b=self.w, + block_scale_a=y_block_scale, + block_scale_b=self.wscale, + alpha=self.scale * self.wscale2, + out_dtype=y.dtype) + return out + + def ops_in_model_before(self): + return [SILU_MUL_OP, QUANT_OPS[kNvfp4Quant]] + + def ops_in_model_after(self): + return [FUSED_OPS[kNvfp4Quant]] + + +@pytest.mark.parametrize("num_tokens", [64]) +@pytest.mark.parametrize("hidden_size", [128]) +@pytest.mark.parametrize( + "model_class", [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel] + if 
is_nvfp4_supported() else [TestSiluMulFp8QuantModel]) @pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm") -def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, +def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class, force_fp8_e4m3fnuz): + if model_class == TestSiluMulNvfp4QuantModel and force_fp8_e4m3fnuz: + pytest.skip("Duplicate tests for NVFP4") + torch.set_default_device("cuda") torch.set_default_dtype(torch.float16) @@ -63,7 +114,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, fusion_pass = ActivationQuantFusionPass(config) backend = TestBackend(NoOpEliminationPass(config), fusion_pass) - model = TestModel(hidden_size, force_fp8_e4m3fnuz) + model = model_class(hidden_size=hidden_size, + force_fp8_e4m3fnuz=force_fp8_e4m3fnuz) # First dimension dynamic x = torch.rand(num_tokens, hidden_size * 2) @@ -80,17 +132,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, atol=1e-3, rtol=1e-3) - # Check substitution worked - pre_nodes = backend.graph_pre_pass.nodes - post_nodes = backend.graph_post_pass.nodes + # In pre-nodes, quant op should be present and fused kernels should not + backend.check_before_ops(model.ops_in_model_before()) - silu_and_mul_quant = torch.ops._C.silu_and_mul_quant.default - fp8_quant = torch.ops._C.static_scaled_fp8_quant.default - - # In pre-nodes, fp8 quant should be present and fused kernels should not - assert find_auto_fn_maybe(pre_nodes, silu_and_mul_quant) is None - find_auto_fn(pre_nodes, fp8_quant) - - # In post-nodes, fused kernels should be present and fp8 quant should not - find_auto_fn(post_nodes, silu_and_mul_quant) - assert find_auto_fn_maybe(post_nodes, fp8_quant) is None + # In post-nodes, fused kernels should be present and quant op should not + backend.check_after_ops(model.ops_in_model_after()) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 28150d7682378..1afe9ea970c97 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -118,6 +118,8 @@ class PPTestSettings: multi_node_only: bool = False, load_format: Optional[str] = None, ): + vllm_major_versions = ["1"] if runner == "pooling" else ["0"] + return PPTestSettings( parallel_setups=[ ParallelSetup(tp_size=tp_base, @@ -126,7 +128,7 @@ class PPTestSettings: chunked_prefill=False), ], distributed_backends=["mp"], - vllm_major_versions=["0"], + vllm_major_versions=vllm_major_versions, runner=runner, test_options=PPTestOptions(multi_node_only=multi_node_only, load_format=load_format), @@ -213,7 +215,9 @@ TEXT_GENERATION_MODELS = { EMBEDDING_MODELS = { # type: ignore[var-annotated] # [Text-only] "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"), - "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"), + # TODO: re-enable when https://github.com/vllm-project/vllm/issues/23883 + # is fixed + #"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"), "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast( load_format="dummy", runner="pooling" ), diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index 49b8eddecb4a9..c93b436f384b9 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -292,7 +292,7 @@ SP_TEST_MODELS = [ # TODO support other models # [LANGUAGE GENERATION] 
"meta-llama/Llama-3.2-1B-Instruct", - "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8" + "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", ] diff --git a/tests/entrypoints/llm/test_classify.py b/tests/entrypoints/llm/test_classify.py index 57705ff669075..6c0c9cd015801 100644 --- a/tests/entrypoints/llm/test_classify.py +++ b/tests/entrypoints/llm/test_classify.py @@ -16,14 +16,6 @@ MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach" prompts = ["The chef prepared a delicious meal."] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.fixture(scope="module") def llm(): # pytest caches the fixture so we use weakref.proxy to @@ -70,3 +62,9 @@ def test_encode_api(llm: LLM): err_msg = "pooling_task must be one of.+" with pytest.raises(ValueError, match=err_msg): llm.encode(prompts, use_tqdm=False) + + +def test_score_api(llm: LLM): + err_msg = "Score API is only enabled for num_labels == 1." + with pytest.raises(ValueError, match=err_msg): + llm.score("ping", "pong", use_tqdm=False) diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index cb54b16b0b044..eae3e234378f2 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -27,14 +27,6 @@ TOKEN_IDS = [ ] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.fixture(scope="module") def llm(): # pytest caches the fixture so we use weakref.proxy to diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py deleted file mode 100644 index a04f195692e9b..0000000000000 --- a/tests/entrypoints/llm/test_generate_multiple_loras.py +++ /dev/null @@ -1,80 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import weakref - -import pytest -# downloading lora to test lora requests -from huggingface_hub import snapshot_download - -from vllm import LLM -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.lora.request import LoRARequest - -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" - -PROMPTS = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] - -LORA_NAME = "typeof/zephyr-7b-beta-lora" - - -@pytest.fixture(scope="module") -def monkeypatch_module(): - from _pytest.monkeypatch import MonkeyPatch - mpatch = MonkeyPatch() - yield mpatch - mpatch.undo() - - -@pytest.fixture(scope="module", params=[False, True]) -def llm(request, monkeypatch_module): - - use_v1 = request.param - monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') - - # pytest caches the fixture so we use weakref.proxy to - # enable garbage collection - llm = LLM(model=MODEL_NAME, - tensor_parallel_size=1, - max_model_len=8192, - enable_lora=True, - max_loras=4, - max_lora_rank=64, - max_num_seqs=128, - enforce_eager=True) - - yield weakref.proxy(llm) - - del llm - - cleanup_dist_env_and_memory() - - -@pytest.fixture(scope="module") -def zephyr_lora_files(): - return snapshot_download(repo_id=LORA_NAME) - - -@pytest.mark.skip_global_cleanup -def test_multiple_lora_requests(llm: LLM, zephyr_lora_files): - lora_request = [ - LoRARequest(LORA_NAME + str(idx), idx + 1, 
zephyr_lora_files) - for idx in range(len(PROMPTS)) - ] - # Multiple SamplingParams should be matched with each prompt - outputs = llm.generate(PROMPTS, lora_request=lora_request) - assert len(PROMPTS) == len(outputs) - - # Exception raised, if the size of params does not match the size of prompts - with pytest.raises(ValueError): - outputs = llm.generate(PROMPTS, lora_request=lora_request[:1]) - - # Single LoRARequest should be applied to every prompt - single_lora_request = lora_request[0] - outputs = llm.generate(PROMPTS, lora_request=single_lora_request) - assert len(PROMPTS) == len(outputs) diff --git a/tests/entrypoints/llm/test_reward.py b/tests/entrypoints/llm/test_reward.py index de82cf8d40380..2cee3c8d94e36 100644 --- a/tests/entrypoints/llm/test_reward.py +++ b/tests/entrypoints/llm/test_reward.py @@ -16,14 +16,6 @@ MODEL_NAME = "internlm/internlm2-1_8b-reward" prompts = ["The chef prepared a delicious meal."] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.fixture(scope="module") def llm(): # pytest caches the fixture so we use weakref.proxy to diff --git a/tests/entrypoints/llm/test_score.py b/tests/entrypoints/llm/test_score.py index 5a1339b2addf4..f715dacacb8ff 100644 --- a/tests/entrypoints/llm/test_score.py +++ b/tests/entrypoints/llm/test_score.py @@ -14,14 +14,6 @@ from ...models.utils import softmax MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls" -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.fixture(scope="module") def llm(): # pytest caches the fixture so we use weakref.proxy to diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index dd8d63ad319ac..a154bb1059aae 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -32,15 +32,16 @@ MODEL_CONFIGS = [ "tensor_parallel_size": 1, "tokenizer_mode": "mistral", }, - { - "model": "sentence-transformers/all-MiniLM-L12-v2", - "enforce_eager": True, - "gpu_memory_utilization": 0.20, - "max_model_len": 64, - "max_num_batched_tokens": 64, - "max_num_seqs": 64, - "tensor_parallel_size": 1, - }, + # TODO: re-enable once these tests are run with V1 + # { + # "model": "sentence-transformers/all-MiniLM-L12-v2", + # "enforce_eager": True, + # "gpu_memory_utilization": 0.20, + # "max_model_len": 64, + # "max_num_batched_tokens": 64, + # "max_num_seqs": 64, + # "tensor_parallel_size": 1, + # }, ] diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py index 58195f98bd351..0d0ce0be8c5f8 100644 --- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py +++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py @@ -49,8 +49,7 @@ async def transcribe_audio(client, tokenizer, y, sr): return latency, num_output_tokens, transcription.text -async def bound_transcribe(model_name, sem, client, audio, reference): - tokenizer = AutoTokenizer.from_pretrained(model_name) +async def bound_transcribe(sem, client, tokenizer, audio, reference): # Use semaphore to limit concurrent requests. 
async with sem: result = await transcribe_audio(client, tokenizer, *audio) @@ -63,15 +62,19 @@ async def bound_transcribe(model_name, sem, client, audio, reference): async def process_dataset(model, client, data, concurrent_request): sem = asyncio.Semaphore(concurrent_request) + # Load tokenizer once outside the loop + tokenizer = AutoTokenizer.from_pretrained(model) + # Warmup call as the first `librosa.load` server-side is quite slow. audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"] - _ = await bound_transcribe(model, sem, client, (audio, sr), "") + _ = await bound_transcribe(sem, client, tokenizer, (audio, sr), "") tasks: list[asyncio.Task] = [] for sample in data: audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"] task = asyncio.create_task( - bound_transcribe(model, sem, client, (audio, sr), sample["text"])) + bound_transcribe(sem, client, tokenizer, (audio, sr), + sample["text"])) tasks.append(task) return await asyncio.gather(*tasks) diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py index 30078fe90257a..36c96d76c2e5f 100644 --- a/tests/entrypoints/openai/test_classification.py +++ b/tests/entrypoints/openai/test_classification.py @@ -226,3 +226,33 @@ def test_pooling(server: RemoteOpenAIServer, model_name: str): }, ) assert response.json()["error"]["type"] == "BadRequestError" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_score(server: RemoteOpenAIServer, model_name: str): + # score api is only enabled for num_labels == 1. + response = requests.post( + server.url_for("score"), + json={ + "model": model_name, + "text_1": "ping", + "text_2": "pong", + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_rerank(server: RemoteOpenAIServer, model_name: str): + # rerank api is only enabled for num_labels == 1. 
+ response = requests.post( + server.url_for("rerank"), + json={ + "model": model_name, + "query": "ping", + "documents": ["pong"], + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index b20838956d721..9a1c0ea13b54f 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -27,6 +27,28 @@ def serve_parser(): return make_arg_parser(parser) +### Test config parsing +def test_config_arg_parsing(serve_parser, cli_config_file): + args = serve_parser.parse_args([]) + assert args.port == 8000 + args = serve_parser.parse_args(['--config', cli_config_file]) + assert args.port == 12312 + args = serve_parser.parse_args([ + '--config', + cli_config_file, + '--port', + '9000', + ]) + assert args.port == 9000 + args = serve_parser.parse_args([ + '--port', + '9000', + '--config', + cli_config_file, + ]) + assert args.port == 9000 + + ### Tests for LoRA module parsing def test_valid_key_value_format(serve_parser): # Test old format: name=path diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index cf2442a569388..d46ab304ba6d5 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -24,14 +24,6 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + DTYPE = "bfloat16" -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.fixture(scope="module") def server(): args = [ diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index f4801172580c6..818efd825640c 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -47,6 +47,7 @@ class MockModelConfig: allowed_local_media_path: str = "" encoder_config = None generation_config: str = "auto" + skip_tokenizer_init: bool = False def get_diff_sampling_param(self): return self.diff_sampling_param or {} diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py index 73364294cbcdc..ce4d6c5f5d337 100644 --- a/tests/entrypoints/openai/test_rerank.py +++ b/tests/entrypoints/openai/test_rerank.py @@ -14,14 +14,6 @@ MODEL_NAME = "BAAI/bge-reranker-base" DTYPE = "bfloat16" -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.fixture(scope="module") def server(): args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE] diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index cb6ec795ae969..4fafcfb45fa22 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -12,15 +12,6 @@ from vllm.entrypoints.openai.protocol import ScoreResponse from ...utils import RemoteOpenAIServer - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - MODELS = [ { "name": "BAAI/bge-reranker-v2-m3", diff --git 
a/tests/entrypoints/openai/test_token_in_token_out.py b/tests/entrypoints/openai/test_token_in_token_out.py new file mode 100644 index 0000000000000..ed003939c44be --- /dev/null +++ b/tests/entrypoints/openai/test_token_in_token_out.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +import tempfile + +import pytest + +from vllm.model_executor.model_loader.weight_utils import ( + download_weights_from_hf) +from vllm.transformers_utils.tokenizer import get_tokenizer + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "Qwen/Qwen3-0.6B" +MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b") + + +@pytest.fixture(scope="module") +def server(): + global MODEL_PATH + MODEL_PATH = download_weights_from_hf( + MODEL_NAME, + allow_patterns=["*"], + cache_dir=MODEL_PATH, + ignore_patterns=["tokenizer*", "vocab*", "*.safetensors"]) + args = [ + "--max-model-len", + "2048", + "--max-num-seqs", + "128", + "--enforce-eager", + "--skip-tokenizer-init", + "--load-format", + "dummy", + ] + with RemoteOpenAIServer(MODEL_PATH, args) as remote_server: + yield remote_server + + +@pytest.mark.asyncio +async def test_token_in_token_out_and_logprobs(server): + """ + Test token-in-token-out and token_ids align with prompt_logprobs + & logprobs when return_tokens_as_token_ids is enabled. + """ + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + text = "Hello, world! How are you today?" + token_ids = tokenizer.encode(text) + async with server.get_async_client() as client: + # Test with both return_token_ids and return_tokens_as_token_ids enabled + completion = await client.completions.create( + model=MODEL_PATH, + prompt=token_ids, + max_tokens=20, + temperature=0, + echo=True, + extra_body={ + "return_token_ids": True, + }, + ) + + # Verify all fields are present + assert (completion.choices[0].token_ids is not None + and 0 < len(completion.choices[0].token_ids) <= 20) + assert completion.choices[0].prompt_token_ids is not None + + # Decode prompt tokens + if completion.choices[0].prompt_token_ids: + prompt_text = tokenizer.decode( + completion.choices[0].prompt_token_ids) + # The decoded prompt should match or close to original prompt + assert prompt_text == text diff --git a/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py b/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py new file mode 100644 index 0000000000000..969f14cc3fe62 --- /dev/null +++ b/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py @@ -0,0 +1,126 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch + +from tests.kernels.utils import opcheck +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.platforms import current_platform +from vllm.scalar_type import scalar_types + +if not current_platform.has_device_capability(100): + pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.", + allow_module_level=True) + +DTYPES = [torch.float16, torch.bfloat16] +SHAPES = [(128, 64), (128, 128), (256, 64), (256, 128)] +SEEDS = [42] +CUDA_DEVICES = ['cuda:0'] + +FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() +FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max + +BLOCK_SIZE = 16 + + +def ref_impl(silu_and_mul: SiluAndMul, x: torch.Tensor, + global_scale: torch.Tensor, + ref_output_scale: torch.Tensor) -> torch.Tensor: + silu_and_mul_out = silu_and_mul.forward_native(x) + assert not 
current_platform.is_rocm() + assert silu_and_mul_out.ndim >= 1, ( + f'input.ndim needs to be >= 1, but got {silu_and_mul_out.ndim}.') + other_dims = 1 if silu_and_mul_out.ndim == 1 else -1 + silu_and_mul_out = silu_and_mul_out.reshape(other_dims, + silu_and_mul_out.shape[-1]) + m, n = silu_and_mul_out.shape + device = silu_and_mul_out.device + + # Two fp4 values will be packed into an uint8. + out = torch.empty((m, n // 2), device=device, dtype=torch.uint8) + + output_scale = ref_output_scale + + torch.ops._C.scaled_fp4_quant(out, silu_and_mul_out, output_scale, + global_scale) + + return out, output_scale + + +def ops_impl(x: torch.Tensor, global_scale: torch.Tensor, + ref_output_scale: torch.Tensor) -> torch.Tensor: + out_shape = (x.shape[0], x.shape[1] // 4) + output_scale = ref_output_scale + out = torch.empty(out_shape, dtype=torch.uint8, device=x.device) + torch.ops._C.silu_and_mul_nvfp4_quant(out, output_scale, x, global_scale) + return out, output_scale + + +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_quantize_to_fp4( + dtype: torch.dtype, + shape: tuple[int, int], + seed: int, + device: str, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + + m, n = shape + + x = torch.randn((m, n), dtype=dtype) + tensor_amax = torch.abs(x).max().to(torch.float32) + global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax + + block_size = 16 + + assert n % block_size == 0, ( + f'last dim has to be multiple of 16, but got {n}.') + assert x.dtype in (torch.float16, torch.bfloat16), ( + f'input.dtype needs to be fp16 or bf16 but got {x.dtype}.') + + round_up = lambda x, y: (x + y - 1) // y * y + rounded_m = round_up(x.shape[0], 128) + scale_n = x.shape[1] // (2 * block_size) + rounded_n = round_up(scale_n, 4) + output_scale = torch.empty((rounded_m, rounded_n // 4), + device=x.device, + dtype=torch.int32) + + layer = SiluAndMul() + + ref_out, ref_out_scale = ref_impl(layer, x, global_scale, output_scale) + + fusion_out, fusion_out_scale = ops_impl(x, global_scale, output_scale) + + assert ref_out.dtype == torch.uint8 + assert fusion_out.dtype == torch.uint8 + assert ref_out.shape == fusion_out.shape + + assert ref_out_scale.dtype == torch.int32 + assert fusion_out_scale.dtype == torch.int32 + assert ref_out_scale.shape == fusion_out_scale.shape + + # Allow up to 2% of mismatched values since BF16 has accuracy issues. 
+ mis_threshold = 0.02 + atol = 0.4 + rtol = 0.4 + ref_logits = ref_out[-1] + fusion_logits = fusion_out[-1] + + mis_count = torch.sum( + torch.abs(fusion_logits - ref_logits) > (atol + + rtol * torch.abs(ref_logits))) + mis_ratio = mis_count / fusion_logits.numel() + + assert mis_ratio < mis_threshold, \ + f"Mismatch ratio {mis_ratio} exceeds threshold {mis_threshold}" + + torch.testing.assert_close(ref_out_scale, fusion_out_scale) + + opcheck(torch.ops._C.silu_and_mul_nvfp4_quant, + (fusion_out, fusion_out_scale, x, global_scale)) diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index fb00e7b65b04a..5cffb8cfcc26d 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -87,6 +87,9 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files): @multi_gpu_test(num_gpus=4) @create_new_process_for_each_test() def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): + # https://github.com/NVIDIA/nccl/issues/1790, set a lower value for + # gpu_memory_utilization here because NCCL >= 2.26.3 seems to use + # more GPU memory causing vLLM to OOM llm = vllm.LLM(MODEL_PATH, max_model_len=1024, enable_lora=True, @@ -95,7 +98,8 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): tensor_parallel_size=4, trust_remote_code=True, fully_sharded_loras=True, - enable_chunked_prefill=True) + enable_chunked_prefill=True, + gpu_memory_utilization=0.85) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): assert output1[i] == EXPECTED_LORA_OUTPUT[i] diff --git a/tests/lora/test_multi_loras_with_tp.py b/tests/lora/test_llm_with_multi_loras.py similarity index 80% rename from tests/lora/test_multi_loras_with_tp.py rename to tests/lora/test_llm_with_multi_loras.py index fe9bd3f269515..3d8dd512a2019 100644 --- a/tests/lora/test_multi_loras_with_tp.py +++ b/tests/lora/test_llm_with_multi_loras.py @@ -1,8 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ -Script to test multi loras service with tp >= 2 +This script contains: +1. test multi loras service with tp >= 2 +2. 
test multi loras request """ +import pytest + from tests.utils import multi_gpu_test from vllm import LLM, SamplingParams from vllm.lora.request import LoRARequest @@ -156,3 +160,34 @@ def test_multi_loras_with_tp_sync(): output_text = call_llm_get_outputs(prompt, "Alice") check_outputs(output_text, expected_output) + + +def test_multiple_lora_requests(): + llm = LLM( + model=MODEL_PATH, + enable_lora=True, + max_loras=4, + max_lora_rank=LORA_RANK, + max_model_len=512, + gpu_memory_utilization=0.5, + enforce_eager=True, + ) + PROMPTS = ["Hello, my name is"] * 2 + LORA_NAME = "Alice" + lora_request = [ + LoRARequest(LORA_NAME + str(idx), idx + 1, + LORA_NAME_PATH_MAP[LORA_NAME]) + for idx in range(len(PROMPTS)) + ] + # Multiple SamplingParams should be matched with each prompt + outputs = llm.generate(PROMPTS, lora_request=lora_request) + assert len(PROMPTS) == len(outputs) + + # Exception raised, if the size of params does not match the size of prompts + with pytest.raises(ValueError): + outputs = llm.generate(PROMPTS, lora_request=lora_request[:1]) + + # Single LoRARequest should be applied to every prompt + single_lora_request = lora_request[0] + outputs = llm.generate(PROMPTS, lora_request=single_lora_request) + assert len(PROMPTS) == len(outputs) diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index 57382914bfea8..4c4434c94145a 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -92,7 +92,8 @@ AITER_MODEL_LIST = [ pytest.param( "allenai/OLMoE-1B-7B-0924-Instruct", marks=[pytest.mark.cpu_model], - ) + ), + pytest.param("swiss-ai/Apertus-8B"), # apertus ]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index 2dd35c4151580..f918b2b91bcc3 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -10,14 +10,6 @@ from vllm.platforms import current_platform from ...utils import check_embeddings_close, check_transformers_version -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.mark.parametrize( "model", [ @@ -32,21 +24,15 @@ def v1(run_with_both_engines): "intfloat/e5-mistral-7b-instruct", # CPU v1 doesn't support sliding window marks=[pytest.mark.core_model]), - # the qwen models interfere with each other (see PR - # https://github.com/vllm-project/vllm/pull/18720). - # To avoid this problem, for now we skip v0 since it will be - # deprecated anyway. 
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base", - marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]), + marks=[pytest.mark.cpu_model]), # [Encoder-only] pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]), pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("intfloat/multilingual-e5-small"), - pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct", - marks=[pytest.mark.skip_v1]), + pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), # [Cross-Encoder] - pytest.param("sentence-transformers/stsb-roberta-base-v2", - marks=[pytest.mark.skip_v1]), + pytest.param("sentence-transformers/stsb-roberta-base-v2"), ], ) def test_models( diff --git a/tests/models/language/pooling/test_qwen3_reranker.py b/tests/models/language/pooling/test_qwen3_reranker.py index 8c6537f3193f8..5dd2d9eae9115 100644 --- a/tests/models/language/pooling/test_qwen3_reranker.py +++ b/tests/models/language/pooling/test_qwen3_reranker.py @@ -96,8 +96,5 @@ def test_rerank_models_mteb_tp(vllm_runner, "tensor_parallel_size": 2, } - mteb_test_rerank_models(Qwen3RerankerHfRunner, - vllm_runner, - model_info, - vllm_extra_kwargs, - atol=1.2e-2) + mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info, + vllm_extra_kwargs) diff --git a/tests/models/language/pooling/test_reward.py b/tests/models/language/pooling/test_reward.py index beafa0aed9862..08722ac98b7ed 100644 --- a/tests/models/language/pooling/test_reward.py +++ b/tests/models/language/pooling/test_reward.py @@ -13,14 +13,6 @@ from ....conftest import HfRunner from ...utils import check_transformers_version -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.fixture def math_step_prompts(): # ruff: noqa: E501 diff --git a/tests/models/language/pooling/test_scoring.py b/tests/models/language/pooling/test_scoring.py index 6b5ff70681459..ef9d5530cde15 100644 --- a/tests/models/language/pooling/test_scoring.py +++ b/tests/models/language/pooling/test_scoring.py @@ -23,15 +23,6 @@ TEXTS_2 = [ "The capital of Germany is Berlin.", ] - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - DTYPE = "half" diff --git a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py index c53243b42e384..e369416fc49cc 100644 --- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py +++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py @@ -1,12 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Custom input builders for edge-cases in different models.""" -from io import BytesIO from typing import Callable -import requests -from PIL import Image - +from vllm.assets.image import ImageAsset from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import (rescale_video_size, resize_video, sample_frames_from_video) @@ -118,9 +115,9 @@ def different_patch_input_cases_internvl(): def windows_attention_image_qwen2_5_vl(): - # image from regression issue: https://github.com/vllm-project/vllm/issues/15122 - image_url = "https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg" - image = 
Image.open(BytesIO(requests.get(image_url).content)) + + # image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501 + image = ImageAsset("hato").pil_image question = "Describe the image." img_prompt = "<|vision_start|><|image_pad|><|vision_end|>" diff --git a/tests/models/registry.py b/tests/models/registry.py index 85b4c96e3b1c3..a37ffdc311514 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -137,6 +137,9 @@ class _HfExamplesInfo: # yapf: disable _TEXT_GENERATION_EXAMPLE_MODELS = { # [Decoder-only] + "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B", + min_transformers_version="4.56.0", + trust_remote_code=True), "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", trust_remote_code=True), "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", @@ -323,8 +326,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { _EMBEDDING_EXAMPLE_MODELS = { # [Text-only] - "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5", v0_only=True), - "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2", v0_only=True), # noqa: E501 + "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"), + "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"), # noqa: E501 "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"), "GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0", trust_remote_code=True), @@ -337,9 +340,9 @@ _EMBEDDING_EXAMPLE_MODELS = { "LlamaModel": _HfExamplesInfo("llama", is_available_online=False), "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), "ModernBertModel": _HfExamplesInfo("Alibaba-NLP/gte-modernbert-base", - trust_remote_code=True, v0_only=True), + trust_remote_code=True), "NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe", - trust_remote_code=True, v0_only=True), # noqa: E501 + trust_remote_code=True), # noqa: E501 "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"), "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B", max_transformers_version="4.53", @@ -347,9 +350,9 @@ _EMBEDDING_EXAMPLE_MODELS = { "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53", transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"), # noqa: E501 - "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2", v0_only=True), # noqa: E501 - "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1", v0_only=True), # noqa: E501 - "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small", v0_only=True), # noqa: E501 + "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"), # noqa: E501 + "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"), # noqa: E501 + "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small"), # noqa: E501 # [Multimodal] "LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"), "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full", @@ -364,20 +367,19 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = { "GPT2ForSequenceClassification": _HfExamplesInfo("nie3e/sentiment-polish-gpt2-small"), # noqa: E501 # [Cross-encoder] - "BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2", v0_only=True), # noqa: E501 + "BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2"), # noqa: E501 "GteNewForSequenceClassification": 
_HfExamplesInfo("Alibaba-NLP/gte-multilingual-reranker-base", # noqa: E501 trust_remote_code=True, hf_overrides={ "architectures": ["GteNewForSequenceClassification"]}),# noqa: E501 - "ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base", v0_only=True), # noqa: E501 - "RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base", v0_only=True), # noqa: E501 - "XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3", v0_only=True), # noqa: E501 + "ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base"), # noqa: E501 + "RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base"), # noqa: E501 + "XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3"), # noqa: E501 } _AUTOMATIC_CONVERTED_MODELS = { # Use as_seq_cls_model for automatic conversion "GemmaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-gemma", # noqa: E501 - v0_only=True, hf_overrides={"architectures": ["GemmaForSequenceClassification"], # noqa: E501 "classifier_from_token": ["Yes"], # noqa: E501 "method": "no_post_processing"}), # noqa: E501 diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 8769ad45eb93e..36882aba5e941 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -24,6 +24,9 @@ from .registry import HF_EXAMPLE_MODELS @pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs()) def test_registry_imports(model_arch): + # Skip if transformers version is incompatible + model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) + model_info.check_transformers_version(on_fail="skip") # Ensure all model classes can be imported successfully model_cls = ModelRegistry._try_load_model_cls(model_arch) assert model_cls is not None diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py deleted file mode 100644 index 520b88d03ac8e..0000000000000 --- a/tests/samplers/test_sampler.py +++ /dev/null @@ -1,769 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import itertools -import random -from dataclasses import dataclass -from typing import Optional -from unittest.mock import Mock, patch - -import pytest -import torch -from transformers import GenerationConfig, GenerationMixin - -import vllm.envs as envs -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.utils import set_random_seed -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import Counter, is_pin_memory_available - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This file tests V0 internals, so set VLLM_USE_V1=0. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -class MockLogitsSampler(Sampler): - - def __init__(self, fake_logits: torch.Tensor): - super().__init__() - self.fake_logits = fake_logits - - def forward(self, *args, **kwargs): - return super().forward(*args, **kwargs) - - -def _prepare_test( - batch_size: int -) -> tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]: - input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) - fake_logits = torch.full((batch_size, VOCAB_SIZE), - 1e-2, - dtype=input_tensor.dtype) - sampler = MockLogitsSampler(fake_logits) - return input_tensor, fake_logits, sampler - - -VOCAB_SIZE = 32000 -RANDOM_SEEDS = list(range(128)) -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - - -def _do_sample( - batch_size: int, - input_tensor: torch.Tensor, - sampler: MockLogitsSampler, - sampling_params: SamplingParams, - device: str, -): - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - seq_lens: list[int] = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=sampling_params, - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available()) - return sampler(logits=input_tensor, sampling_metadata=sampling_metadata) - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_greedy(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - input_tensor, fake_logits, sampler = _prepare_test(batch_size) - - sampling_params = SamplingParams(temperature=0) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - expected = torch.argmax(fake_logits, dim=-1) - for i, sequence_output in enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == expected[i].item() - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_random(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - for i in range(batch_size): - fake_logits[i, i] = 1e2 - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - ) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - for i, sequence_output in enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == i - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_random_seed(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - for i in range(batch_size): - fake_logits[i, i] = 1e2 - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - seed=random.randint(0, 10000), - ) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - for i, sequence_output in 
enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == i - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_random_seed_deterministic(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - seed=random.randint(0, 10000), - ) - first_sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - second_sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - assert first_sampler_output == second_sampler_output - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_min_tokens_penalty(seed: int, device: str): - seq_id_counter = Counter(start=random.randint(0, 100)) - set_random_seed(seed) - torch.set_default_device(device) - - def create_sampling_params(min_tokens, - eos_token_id=0, - *, - stop_token_ids: Optional[list[int]] = None, - prompt_logprobs: Optional[int] = None): - sampling_params = SamplingParams( - min_tokens=min_tokens, - max_tokens=9999, # keep higher than max of min_tokens - stop_token_ids=stop_token_ids, - # requesting prompt_logprobs changes the structure of `logits` - prompt_logprobs=prompt_logprobs, - ) - sampling_params.all_stop_token_ids.add(eos_token_id) - return sampling_params - - def create_sequence_data(num_input=3, num_generated=0): - seq_data = SequenceData.from_seqs( - random.choices(range(0, VOCAB_SIZE), k=num_input)) - if num_generated > 0: - seq_data.output_token_ids = random.choices(range(0, VOCAB_SIZE), - k=num_generated) - return seq_data - - def generate_test_case(): - # generate multiple seq groups but limit total batch size - batch_size = random.randint(1, 128) - - expected_penalization = [] - sequence_metadata_list: list[SequenceGroupMetadata] = [] - # 20% chance to generate seq group metadata list with all prompts - is_prompt = random.random() < 0.2 - while batch_size > 0: - num_seqs = 1 if is_prompt else random.randint(1, batch_size) - - eos_token_id = random.randint(0, VOCAB_SIZE - 1) - min_tokens = random.randint(0, 50) - num_stop_tokens = random.randint(0, 8) - if num_stop_tokens > 0: - stop_token_ids = random.choices(range(0, VOCAB_SIZE - 1), - k=num_stop_tokens) - else: - stop_token_ids = None - - sampling_params = create_sampling_params( - min_tokens=min_tokens, - eos_token_id=eos_token_id, - stop_token_ids=stop_token_ids) - - seq_data: dict[int, SequenceData] = {} - seq_group_penalization: list[bool] = [] - for _ in range(num_seqs): - num_input = random.randint(1, 100) - num_generated = 0 if is_prompt else random.randint(1, 100) - seq_data[next(seq_id_counter)] = create_sequence_data( - num_input=num_input, num_generated=num_generated) - seq_group_penalization.append(num_generated < min_tokens) - - expected_penalization.extend(seq_group_penalization) - sequence_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{batch_size}", - is_prompt=is_prompt, - seq_data=seq_data, - sampling_params=sampling_params, - block_tables={}, - )) - batch_size -= num_seqs - - return { - "expected_penalization": expected_penalization, - "seq_group_metadata_list": sequence_metadata_list, - } - - # define some explicit test cases for edge case behavior - prompt_without_penalization = { - 
"expected_penalization": [False], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(), - }, - sampling_params=create_sampling_params(0), - block_tables={}, - ), - ] - } - - prompt_with_penalization = { - "expected_penalization": [True], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(), - }, - sampling_params=create_sampling_params(1), - block_tables={}, - ), - ] - } - - prompt_with_penalization_and_prompt_logprobs = { - "expected_penalization": [False, False, True], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(num_input=3), - }, - sampling_params=create_sampling_params(1, prompt_logprobs=3), - block_tables={}, - ), - ] - } - - stop_penalizing_after_min_tokens = { - "expected_penalization": [False], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=False, - seq_data={ - next(seq_id_counter): - create_sequence_data(num_generated=1), - }, - sampling_params=create_sampling_params(1), - block_tables={}, - ) - ] - } - - stop_token_ids = [42, 99, 42, 0] # intentional duplication - prompt_combination = { - "expected_penalization": [False, True, False], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_2", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(num_input=2), - }, - sampling_params=create_sampling_params(1, prompt_logprobs=3), - block_tables={}, - ), - SequenceGroupMetadata( - request_id="test_3", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(), - }, - sampling_params=create_sampling_params( - 0, stop_token_ids=stop_token_ids), - block_tables={}, - ) - ] - } - - stop_token_ids = [1, 999, 37, 37] # intentional duplication - decode_combination = { - "expected_penalization": [True, False, False, True, False], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=False, - seq_data={ - next(seq_id_counter): - create_sequence_data(num_generated=1), - next(seq_id_counter): - create_sequence_data(num_generated=100), - }, - sampling_params=create_sampling_params( - 2, stop_token_ids=stop_token_ids), - block_tables={}, - ), - SequenceGroupMetadata( - request_id="test_2", - is_prompt=False, - seq_data={ - next(seq_id_counter): - create_sequence_data(num_generated=20), - next(seq_id_counter): - create_sequence_data(num_generated=1), - next(seq_id_counter): - create_sequence_data(num_generated=10), - }, - sampling_params=create_sampling_params( - 10, prompt_logprobs=5, stop_token_ids=stop_token_ids), - block_tables={}, - ), - ] - } - - if seed == 0: - test_cases = [ - prompt_without_penalization, - prompt_with_penalization, - prompt_with_penalization_and_prompt_logprobs, - stop_penalizing_after_min_tokens, - prompt_combination, - decode_combination, - ] - else: - test_cases = [generate_test_case()] - - def run_test_case(*, expected_penalization: list[bool], - seq_group_metadata_list: list[SequenceGroupMetadata]): - assert expected_penalization, \ - "Invalid test case, need expected_penalization" - assert seq_group_metadata_list, \ - "Invalid test case, need seq_group_metadata_list" - - batch_size = 0 - seq_lens: list[int] = [] - sampling_params_per_row: list[SamplingParams] = [] - for sgm in seq_group_metadata_list: - 
sampling_params = sgm.sampling_params - - num_rows = len(sgm.seq_data) - if sgm.is_prompt: - # a prompt seq_group has only one sequence - seq_data = next(iter(sgm.seq_data.values())) - prompt_len = seq_data.get_prompt_len() - seq_lens.append(prompt_len) - - assert sgm.sampling_params is not None - if sgm.sampling_params.prompt_logprobs: - # with prompt_logprobs each token in the prompt has a row in - # logits - num_rows = prompt_len - - batch_size += num_rows - sampling_params_per_row.extend( - itertools.repeat(sampling_params, num_rows)) - - assert len( - expected_penalization - ) == batch_size, \ - ("Invalid test case, expected_penalization does not match computed" - "batch size") - - _, fake_logits, sampler = _prepare_test(batch_size) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens=seq_lens if seq_lens else None, - query_lens=seq_lens if seq_lens else [1] * batch_size, - device=device, - pin_memory=is_pin_memory_available()) - # the logits tensor is modified in-place by the sampler - _ = sampler(logits=fake_logits, sampling_metadata=sampling_metadata) - - for logits_idx, (should_penalize, sampling_params) in enumerate( - zip(expected_penalization, sampling_params_per_row)): - - tokens_to_check = sampling_params.all_stop_token_ids - - if should_penalize: - for token_id in tokens_to_check: - assert fake_logits[logits_idx, token_id] == -float( - 'inf' - ), f"Expected token {token_id} for logits row {logits_idx}" - " to be penalized" - # no other tokens should be set to -inf - assert torch.count_nonzero( - fake_logits[logits_idx, :] == -float('inf')) == len( - tokens_to_check - ), f"Expected only {len(tokens_to_check)} to be penalized" - else: - # no tokens should be set to -inf - assert torch.count_nonzero( - fake_logits[logits_idx, :] == - -float('inf')) == 0, "No tokens should have been penalized" - - for test_case in test_cases: - run_test_case(**test_case) - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_mixed(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - input_tensor, fake_logits, sampler = _prepare_test(batch_size) - - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - expected_tokens: list[Optional[list[int]]] = [] - seq_lens: list[int] = [] - for i in range(batch_size): - expected: Optional[list[int]] = None - sampling_type = random.randint(0, 2) - if sampling_type == 0: - sampling_params = SamplingParams(temperature=0) - expected = [int(torch.argmax(fake_logits[i], dim=-1).item())] - elif sampling_type in (1, 2): - n = random.randint(1, 10) - sampling_params = SamplingParams( - temperature=random.random() + 0.1, - top_p=min(random.random() + 0.1, 1), - top_k=random.randint(0, 10), - n=n, - presence_penalty=random.randint(0, 1), - ) - if sampling_type == 2: - sampling_params.seed = random.randint(0, 10000) - else: - for idx in range(n): - fake_logits[i, i + idx] = 1e2 - expected = list(range(i, i + n)) - - expected_tokens.append(expected) - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=sampling_params, - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - generators: dict[str, torch.Generator] = {} - - def test_sampling(): - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - 
query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available(), - generators=generators) - sampler_output = sampler(logits=fake_logits, - sampling_metadata=sampling_metadata) - - for i, (sequence_output, metadata) in enumerate( - zip(sampler_output, seq_group_metadata_list)): - assert metadata.sampling_params is not None - - if (metadata.sampling_params.seed is not None - and expected_tokens[i] is None): - # Record seeded random result to compare with results of - # second invocation - expected_tokens[i] = [ - nth_output.output_token - for nth_output in sequence_output.samples - ] - continue - - expected_tokens_item = expected_tokens[i] - assert expected_tokens_item is not None - - for n, nth_output in enumerate(sequence_output.samples): - assert metadata.sampling_params is not None - - if (metadata.sampling_params.temperature == 0 - or metadata.sampling_params.seed is not None): - # Ensure exact matches for greedy or random with seed - assert nth_output.output_token == expected_tokens_item[n] - else: - # For non-seeded random check that one of the high-logit - # tokens were chosen - assert nth_output.output_token in expected_tokens_item - - # Test batch - test_sampling() - - # Shuffle the batch and resample - target_index = list(range(batch_size)) - for list_to_shuffle in (target_index, seq_group_metadata_list, - expected_tokens, seq_lens): - random.Random(seed).shuffle(list_to_shuffle) - target_index = torch.tensor(target_index) - input_tensor.data = input_tensor.index_select(0, target_index) - fake_logits.data = fake_logits.index_select(0, target_index) - - # This time, results of seeded random samples will be compared with - # the corresponding sample in the pre-shuffled batch - test_sampling() - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_top_k_top_p(seed: int, device: str): - set_random_seed(seed) - batch_size = random.randint(1, 256) - top_k = random.randint(100, 500) - top_p = random.random() * 0.1 - vocab_size = 32000 - input_tensor = torch.rand((batch_size, 1024), - device=device, - dtype=torch.float16) - fake_logits = torch.normal(0, - 5, - size=(batch_size, vocab_size), - device=input_tensor.device, - dtype=input_tensor.dtype) - sampler = MockLogitsSampler(fake_logits) - - generation_model = GenerationMixin() - generation_config = GenerationConfig(top_k=top_k, - top_p=top_p, - do_sample=True) - - @dataclass - class MockConfig: - is_encoder_decoder: bool = False - - generation_model.config = MockConfig() # needed by the following method - generation_model._prepare_special_tokens(generation_config, device=device) - processors = generation_model._get_logits_processor(generation_config, - None, - None, - None, [], - device=device) - assert len(processors) == 2 # top_p and top_k - - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - seq_lens: list[int] = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=SamplingParams( - temperature=1, - top_k=top_k, - top_p=top_p, - ), - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available()) - - sample_probs = None - - def mock_sample(probs, *args, **kwargs): - nonlocal sample_probs - 
sample_probs = probs - return ([[prob.topk(1, dim=-1).indices.tolist(), [0]] - for prob in probs], None) - - # top-k and top-p is only calculated when flashinfer kernel is not available - with patch("vllm.model_executor.layers.sampler._sample", mock_sample), \ - patch("vllm.model_executor.layers.sampler." - "flashinfer_top_k_top_p_sampling", None): - sampler(logits=fake_logits, sampling_metadata=sampling_metadata) - - assert sample_probs is not None - - hf_probs = processors(torch.zeros_like(fake_logits), fake_logits.clone()) - hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) - torch.testing.assert_close(hf_probs, sample_probs, rtol=0.0, atol=1e-5) - assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_flashinfer_fallback(seed: int, device: str): - if not envs.VLLM_USE_FLASHINFER_SAMPLER: - pytest.skip("Flashinfer sampler is disabled") - - pytest.skip("After FlashInfer 0.2.3, sampling will never fail") - - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - def failing_flashinfer_sampling(*_args, **_kwargs): - return None, torch.zeros(batch_size, device=device, dtype=torch.int32) - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - seed=random.randint(0, 10000), - ) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - with patch( - "vllm.model_executor.layers.sampler." - "flashinfer_top_k_top_p_sampling", failing_flashinfer_sampling): - fallback_sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - assert sampler_output == fallback_sampler_output - - -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_repetition_penalty_mixed(device: str): - - vocab_size = 8 - - def test_sampling_params(sampling_params: list[SamplingParams]): - - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - seq_lens: list[int] = [] - for i in range(2): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=sampling_params[i], - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available()) - - fake_logits = torch.full((2, vocab_size), - 1e-2, - device=device, - dtype=torch.float16) - - fake_logits[:, 5] = 1.1e-2 - fake_logits[:, 1] = 1.2e-2 - - sampler = MockLogitsSampler(fake_logits) - - sampler_output = sampler(logits=fake_logits, - sampling_metadata=sampling_metadata) - - generated_tokens = [] - for output in sampler_output: - generated_tokens.append(output.samples[0].output_token) - - return generated_tokens - - # one configuration is greedy with repetition_penalty - sampling_params_rep = SamplingParams( - temperature=0.0, - repetition_penalty=2.0, - ) - - # other configuration is sampling w/o repetition_penalty - sampling_params_sample = SamplingParams( - temperature=1.0, - top_k=1, - seed=42, - ) - - tokens1 = test_sampling_params( - [sampling_params_rep, sampling_params_sample]) - - tokens2 = test_sampling_params( - [sampling_params_sample, sampling_params_rep]) - - assert tokens1[0] == tokens2[1] - assert tokens1[1] == tokens2[0] - - 
-@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_include_gpu_probs_tensor(device: str): - set_random_seed(42) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - sampler.include_gpu_probs_tensor = True - sampler.should_modify_greedy_probs_inplace = False - - sampling_params = SamplingParams(temperature=0) - - mock_inplace = Mock() - with patch( - "vllm.model_executor.layers.sampler._modify_greedy_probs_inplace", - mock_inplace): - - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - mock_inplace.assert_not_called() - - assert sampler_output.sampled_token_probs is not None - assert sampler_output.logprobs is not None - assert sampler_output.sampled_token_ids is not None diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py deleted file mode 100644 index 5a0efd98acc16..0000000000000 --- a/tests/samplers/test_seeded_generate.py +++ /dev/null @@ -1,86 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Verify that seeded random sampling is deterministic. - -Run `pytest tests/samplers/test_seeded_generate.py`. -""" -import copy -import random -from itertools import combinations - -import pytest - -from vllm import SamplingParams -from vllm.model_executor.utils import set_random_seed - -MODEL = "facebook/opt-125m" -RANDOM_SEEDS = list(range(5)) - - -@pytest.fixture -def vllm_model(vllm_runner, monkeypatch): - # This file relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") - with vllm_runner(MODEL, dtype="half") as vllm_model: - yield vllm_model - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -def test_random_sample_with_seed( - vllm_model, - example_prompts, - seed: int, -) -> None: - set_random_seed(seed) - - sampling_params = SamplingParams( - # Parameters to ensure sufficient randomness - temperature=3.0, - top_p=min(random.random() + 0.3, 1), - top_k=random.randint(5, 20), - n=random.randint(1, 10), - presence_penalty=random.randint(0, 1), - max_tokens=8, - ignore_eos=True, - ) - - sampling_params_seed_1 = copy.deepcopy(sampling_params) - sampling_params_seed_1.seed = 100 - sampling_params_seed_2 = copy.deepcopy(sampling_params) - sampling_params_seed_2.seed = 200 - - llm = vllm_model.llm - - for prompt in example_prompts: - for params in ( - sampling_params, - sampling_params_seed_1, - sampling_params_seed_2, - sampling_params, - sampling_params_seed_1, - sampling_params_seed_2, - ): - llm._add_request(prompt, params=params) - - results = llm._run_engine(use_tqdm=False) - all_outputs = [[out.token_ids for out in output.outputs] - for output in results] - - for i in range(0, len(example_prompts), 6): - outputs = all_outputs[i:i + 6] - - # verify all non-seeded requests differ - for output_a, output_b in combinations( - (outputs[0], outputs[1], outputs[2], outputs[3]), - 2, - ): - assert output_a != output_b - - # verify requests with the same seed match - assert outputs[1] == outputs[4] - assert outputs[2] == outputs[5] - - # verify generations within the same parallel sampling group differ - for output in outputs: - for sub_output_a, sub_output_b in combinations(output, 2): - assert sub_output_a != sub_output_b diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index ccafc88461275..ea7ccfbb2b456 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ 
-64,8 +64,6 @@ def _run_incremental_decode(tokenizer, request = EngineCoreRequest("", prompt_token_ids, None, - None, - None, params, None, None, diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index 04195ea0cf92e..66124dd854ee0 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -379,9 +379,9 @@ def test_duplicate_dict_args(caplog_vllm, parser): def test_supports_kw(callable,kw_name,requires_kw_only, allow_var_kwargs,is_supported): assert supports_kw( - callable=callable, - kw_name=kw_name, - requires_kw_only=requires_kw_only, + callable=callable, + kw_name=kw_name, + requires_kw_only=requires_kw_only, allow_var_kwargs=allow_var_kwargs ) == is_supported @@ -948,6 +948,36 @@ def test_join_host_port(): assert join_host_port("::1", 5555) == "[::1]:5555" +def test_json_count_leaves(): + """Test json_count_leaves function from jsontree utility.""" + from vllm.utils.jsontree import json_count_leaves + + # Single leaf values + assert json_count_leaves(42) == 1 + assert json_count_leaves("hello") == 1 + assert json_count_leaves(None) == 1 + + # Empty containers + assert json_count_leaves([]) == 0 + assert json_count_leaves({}) == 0 + assert json_count_leaves(()) == 0 + + # Flat structures + assert json_count_leaves([1, 2, 3]) == 3 + assert json_count_leaves({"a": 1, "b": 2}) == 2 + assert json_count_leaves((1, 2, 3)) == 3 + + # Nested structures + nested_dict = {"a": 1, "b": {"c": 2, "d": 3}} + assert json_count_leaves(nested_dict) == 3 + + nested_list = [1, [2, 3], 4] + assert json_count_leaves(nested_list) == 4 + + mixed_nested = {"list": [1, 2], "dict": {"x": 3}, "value": 4} + assert json_count_leaves(mixed_nested) == 4 + + def test_convert_ids_list_to_tokens(): tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") token_ids = tokenizer.encode("Hello, world!") diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 47c74aff1e753..e738f2bd46472 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -7,7 +7,8 @@ import pytest import torch from vllm.config import ModelConfig, SchedulerConfig, VllmConfig -from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange +from vllm.multimodal.inputs import (MultiModalFeatureSpec, + MultiModalKwargsItem, PlaceholderRange) from vllm.sampling_params import SamplingParams from vllm.utils import GiB_bytes, sha256, sha256_cbor_64bit from vllm.v1.core.kv_cache_manager import KVCacheManager @@ -37,17 +38,20 @@ def make_request( mm_hashes: Optional[list[str]] = None, cache_salt: Optional[str] = None, ): - if mm_positions is None: - mm_kwargs = None - else: - mm_item = MultiModalKwargsItem.dummy("dummy_m") - mm_kwargs = [mm_item] * len(mm_positions) + mm_features = [] + if mm_positions is not None: + for j, position in enumerate(mm_positions): + identifier = mm_hashes[j] if mm_hashes else f"hash_{j}" + mm_feature = MultiModalFeatureSpec( + data=MultiModalKwargsItem.dummy("dummy_m"), + mm_position=position, + identifier=identifier, + modality="image") + mm_features.append(mm_feature) return Request(request_id=request_id, prompt_token_ids=prompt_token_ids, - multi_modal_kwargs=mm_kwargs, - multi_modal_hashes=mm_hashes, - multi_modal_placeholders=mm_positions, + mm_features=mm_features if mm_features else None, sampling_params=SamplingParams(max_tokens=17), pooling_params=None, eos_token_id=100, @@ -597,8 +601,14 @@ def test_unify_kv_cache_configs(): ] unify_kv_cache_configs(need_sort_kv_cache_config) - assert 
need_sort_kv_cache_config[0].num_blocks == 10 - assert need_sort_kv_cache_config[1].num_blocks == 10 + sorted_kv_cache_groups = [ + KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), + KVCacheGroupSpec(["layer2"], new_kv_cache_spec(num_kv_heads=4)), + ] + assert ( + need_sort_kv_cache_config[0].kv_cache_groups == sorted_kv_cache_groups) + assert ( + need_sort_kv_cache_config[1].kv_cache_groups == sorted_kv_cache_groups) diff_kv_cache_config = [ KVCacheConfig( diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 89824768ed909..e7a8f63702b30 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -9,7 +9,8 @@ import pytest import torch from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved -from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange +from vllm.multimodal.inputs import (MultiModalFeatureSpec, + MultiModalKwargsItem, PlaceholderRange) from vllm.sampling_params import SamplingParams from vllm.utils import sha256, sha256_cbor_64bit from vllm.v1.core.block_pool import BlockPool @@ -32,17 +33,20 @@ def make_request( prompt_logprobs: Optional[int] = None, cache_salt: Optional[str] = None, ): - if mm_positions is None: - mm_kwargs = None - else: - mm_item = MultiModalKwargsItem.dummy("dummy_m") - mm_kwargs = [mm_item] * len(mm_positions) + mm_features = [] + if mm_positions is not None: + for j, position in enumerate(mm_positions): + identifier = mm_hashes[j] if mm_hashes else f"hash_{j}" + mm_feature = MultiModalFeatureSpec( + data=MultiModalKwargsItem.dummy("dummy_m"), + mm_position=position, + identifier=identifier, + modality="image") + mm_features.append(mm_feature) return Request(request_id=request_id, prompt_token_ids=prompt_token_ids, - multi_modal_kwargs=mm_kwargs, - multi_modal_hashes=mm_hashes, - multi_modal_placeholders=mm_positions, + mm_features=mm_features if mm_features else None, sampling_params=SamplingParams( max_tokens=17, prompt_logprobs=prompt_logprobs), pooling_params=None, diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 70e8691788045..572d6c9c889f6 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -8,7 +8,8 @@ import torch from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, SchedulerConfig, SpeculativeConfig, VllmConfig) -from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange +from vllm.multimodal.inputs import (MultiModalFeatureSpec, + MultiModalKwargsItem, PlaceholderRange) from vllm.sampling_params import GuidedDecodingParams, SamplingParams from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler @@ -1308,21 +1309,24 @@ def create_requests_with_priority( prompt_logprobs=prompt_logprobs) requests = [] for i in range(num_requests): + mm_features = [] if mm_positions is not None: mm_position = mm_positions[i] - mm_item = MultiModalKwargsItem.dummy("dummy_m") - mm_kwargs = [mm_item] * len(mm_position) - else: - mm_position = None - mm_kwargs = None + for j, position in enumerate(mm_position): + identifier = f"hash{i}_{j}" + mm_feature = MultiModalFeatureSpec( + data=MultiModalKwargsItem.dummy("dummy_m"), + mm_position=position, + identifier=identifier, + modality="image") + mm_features.append(mm_feature) + request = Request( request_id=f"{i + starting_idx}", prompt_token_ids=[i + starting_idx] * num_tokens, sampling_params=sampling_params, pooling_params=None, - 
multi_modal_kwargs=mm_kwargs, - multi_modal_placeholders=mm_position, - multi_modal_hashes=None, + mm_features=mm_features if mm_features else None, eos_token_id=EOS_TOKEN_ID, arrival_time=arrival_times[i], priority=priorities[i], @@ -1801,9 +1805,7 @@ def test_schedule_skip_tokenizer_init_structured_output_request(): request = Request( request_id="0", prompt_token_ids=[0, 1], - multi_modal_kwargs=None, - multi_modal_hashes=None, - multi_modal_placeholders=None, + mm_features=None, sampling_params=sampling_params, pooling_params=None, eos_token_id=EOS_TOKEN_ID, diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 78a71f10a5940..e392c2c336e9b 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -6,7 +6,8 @@ import torch from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, SchedulerConfig, SpeculativeConfig, VllmConfig) -from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange +from vllm.multimodal.inputs import (MultiModalFeatureSpec, + MultiModalKwargsItem, PlaceholderRange) from vllm.sampling_params import SamplingParams from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, init_none_hash) @@ -139,19 +140,20 @@ def create_requests( prompt_logprobs=prompt_logprobs) requests = [] for i in range(num_requests): + mm_features = [] if mm_positions is not None: mm_position = mm_positions[i] - mm_item = MultiModalKwargsItem.dummy("dummy_m") - mm_kwargs = [mm_item] * len(mm_position) - # Dummy hash for each mm item should be unique - # since encoder cache tracks entries by hash - mm_hashes = [ - "hash" + str(i) + "_" + str(j) for j in range(len(mm_position)) - ] - else: - mm_position = None - mm_kwargs = None - mm_hashes = None + for j, position in enumerate(mm_position): + # Dummy hash for each mm item should be unique + # since encoder cache tracks entries by hash + identifier = f"hash{i}_{j}" + mm_feature = MultiModalFeatureSpec( + data=MultiModalKwargsItem.dummy("dummy_m"), + mm_position=position, + identifier=identifier, + modality="image") + mm_features.append(mm_feature) + prompt_token_ids = ([0] * num_tokens if same_prompt else [i] * num_tokens) request = Request( @@ -159,9 +161,7 @@ def create_requests( prompt_token_ids=prompt_token_ids, sampling_params=sampling_params, pooling_params=None, - multi_modal_kwargs=mm_kwargs, - multi_modal_placeholders=mm_position, - multi_modal_hashes=mm_hashes, + mm_features=mm_features if mm_features else None, eos_token_id=EOS_TOKEN_ID, block_hasher=block_hasher, ) diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py index d72e50e5196b8..6bc9b2b1d82d2 100644 --- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py +++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random -from typing import Optional, Union import pytest import torch @@ -10,12 +9,6 @@ import torch from vllm import LLM, SamplingParams from vllm.config import CompilationConfig, CompilationLevel from vllm.distributed import cleanup_dist_env_and_memory -from vllm.forward_context import get_forward_context -from vllm.model_executor.models.gemma3n_mm import ( - Gemma3nForConditionalGeneration) -from vllm.model_executor.models.registry import ModelRegistry -from vllm.model_executor.models.utils import extract_layer_index -from vllm.sequence import IntermediateTensors from ...utils import fork_new_process_for_each_test @@ -23,54 +16,6 @@ from ...utils import fork_new_process_for_each_test 
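[Editor's note] The make_request / create_requests_with_priority / create_requests helpers updated in the hunks above all hand-build one MultiModalFeatureSpec per placeholder. Purely as a reading aid, and not part of this patch, a shared helper along these lines would capture that pattern; the name make_mm_features and its signature are hypothetical, while MultiModalFeatureSpec, MultiModalKwargsItem.dummy and PlaceholderRange are used exactly as in the test code above.

from typing import Optional

from vllm.multimodal.inputs import (MultiModalFeatureSpec,
                                    MultiModalKwargsItem, PlaceholderRange)


def make_mm_features(
        mm_positions: Optional[list[PlaceholderRange]],
        mm_hashes: Optional[list[str]] = None,
) -> Optional[list[MultiModalFeatureSpec]]:
    # Mirror the updated test helpers: one MultiModalFeatureSpec per
    # placeholder, with a unique identifier because the encoder cache
    # tracks entries by hash.
    if mm_positions is None:
        return None
    mm_features = [
        MultiModalFeatureSpec(
            data=MultiModalKwargsItem.dummy("dummy_m"),
            mm_position=position,
            identifier=mm_hashes[j] if mm_hashes else f"hash_{j}",
            modality="image",
        ) for j, position in enumerate(mm_positions)
    ]
    return mm_features if mm_features else None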
SEED = 42 -class TestGemma3nForConditionalGeneration(Gemma3nForConditionalGeneration): - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - **kwargs, - ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = super().forward(input_ids, positions, - intermediate_tensors, inputs_embeds, - **kwargs) - attn_metadata = get_forward_context().attn_metadata - # attn_metadata is None during dummy runs - if (attn_metadata is not None - and self.language_model.cache_config.kv_sharing_fast_prefill): - assert isinstance(attn_metadata, dict) # true in V1 - # Gemma3n-E2B has 30 layers, with last 20 layers being - # cross-decoder layers. Check attention metadata is correct - for layer_name, metadata in attn_metadata.items(): - layer_idx = extract_layer_index(layer_name) - if layer_idx >= 20: - assert hasattr(metadata, 'logits_indices_padded') - assert hasattr(metadata, 'num_logits_indices') - else: - assert not hasattr(metadata, 'logits_indices_padded') - assert not hasattr(metadata, 'num_logits_indices') - - # Last layer will be a KV sharing layer - layer_attn_metadata = attn_metadata[ - self.language_model.model.layers[-1].self_attn.attn.layer_name] - logits_indices_padded = (layer_attn_metadata.logits_indices_padded) - assert logits_indices_padded is not None - num_logits_indices = layer_attn_metadata.num_logits_indices - assert num_logits_indices > 0 - # Reset hidden states to random values and - # only set logits at logits_indices to valid values - # Because logits_indices are the only positions that are used - # for output token sampling, this still produces same outputs - logits_hs = hidden_states[logits_indices_padded] - hidden_states = torch.randn_like(hidden_states) - gen_indices = logits_indices_padded[:num_logits_indices] - hidden_states[gen_indices] = logits_hs[:num_logits_indices] - - return hidden_states - - @pytest.fixture def test_prompts(): """ @@ -119,13 +64,12 @@ def cleanup(llm: LLM, compilation_config: CompilationConfig): @fork_new_process_for_each_test @pytest.mark.parametrize("enforce_eager", [True]) +@pytest.mark.skip(reason="Disable until Gemma3n supports fast prefill") def test_kv_sharing_fast_prefill( monkeypatch: pytest.MonkeyPatch, enforce_eager: bool, test_prompts: list[str], ): - ModelRegistry.register_model("Gemma3nForConditionalGeneration", - TestGemma3nForConditionalGeneration) sampling_params = SamplingParams(temperature=0.0, max_tokens=100) compilation_config = CompilationConfig( # This allows vLLM compilation backend to handle allocating and diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 2ea957a3e230f..98265c6349578 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -35,9 +35,7 @@ def make_request() -> EngineCoreRequest: return EngineCoreRequest( request_id=str(uuid.uuid4()), prompt_token_ids=PROMPT_TOKENS, - mm_kwargs=None, - mm_hashes=None, - mm_placeholders=None, + mm_features=None, sampling_params=SamplingParams(), pooling_params=None, eos_token_id=None, @@ -308,17 +306,17 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): # Schedule Batch 1: (10, req0) assert engine_core.step_with_batch_queue()[0] is None - assert engine_core.batch_queue.qsize() == 1 - scheduler_output = engine_core.batch_queue.queue[-1][1] + assert len(engine_core.batch_queue) == 1 + scheduler_output = engine_core.batch_queue[-1][1] assert 
scheduler_output.num_scheduled_tokens["0"] == 10 # num_computed_tokens should have been updated immediately. assert engine_core.scheduler.requests[ req0.request_id].num_computed_tokens == 10 # Schedule Batch 2: (2, req0), (8, req1) - assert engine_core.step_with_batch_queue()[0] is None - assert engine_core.batch_queue.qsize() == 2 - scheduler_output = engine_core.batch_queue.queue[-1][1] + assert engine_core.step_with_batch_queue()[0] == {} + assert len(engine_core.batch_queue) == 1 + scheduler_output = engine_core.batch_queue[-1][1] assert scheduler_output.num_scheduled_tokens["0"] == 2 assert scheduler_output.num_scheduled_tokens["1"] == 8 # num_computed_tokens should have been updated immediately. @@ -327,42 +325,32 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): assert engine_core.scheduler.get_num_unfinished_requests() == 2 - # Batch queue is full. Finish Batch 1. - engine_core.step_with_batch_queue() - - # Schedule Batch 3: (4, req1). Note that req0 cannot be scheduled + # Finish Batch 1 and schedule Batch 3: (4, req1). + # Note that req0 cannot be scheduled # because it is in the decoding stage now. engine_core.step_with_batch_queue() - assert engine_core.batch_queue.qsize() == 2 - scheduler_output = engine_core.batch_queue.queue[-1][1] + assert len(engine_core.batch_queue) == 1 + scheduler_output = engine_core.batch_queue[-1][1] assert scheduler_output.num_scheduled_tokens["1"] == 4 - # Batch queue is full. Finish Batch 2. Get first token of req0. + # Finish Batch 2. Get first token of req0. + # Schedule Batch 4: (1, req0). output = engine_core.step_with_batch_queue()[0].get(0) assert output is not None assert len(output.outputs) == 1 assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13 - - # Schedule Batch 4: (1, req0). - engine_core.step_with_batch_queue() - assert engine_core.batch_queue.qsize() == 2 - scheduler_output = engine_core.batch_queue.queue[-1][1] + scheduler_output = engine_core.batch_queue[-1][1] assert scheduler_output.num_scheduled_tokens["0"] == 1 - # Batch queue is full. Finish Batch 3. Get first token of req1. + # Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1). output = engine_core.step_with_batch_queue()[0].get(0) assert output is not None assert len(output.outputs) == 1 assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13 - - # Schedule Batch 5: (1, req1). - engine_core.step_with_batch_queue() - assert engine_core.batch_queue.qsize() == 2 - scheduler_output = engine_core.batch_queue.queue[-1][1] + scheduler_output = engine_core.batch_queue[-1][1] assert scheduler_output.num_scheduled_tokens["1"] == 1 # Loop until req0 is finished. - step = 0 req_id = 0 expected_num_tokens = [ engine_core.scheduler.requests["0"].num_tokens + 1, @@ -370,19 +358,14 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): ] while engine_core.scheduler.get_num_unfinished_requests() == 2: output = engine_core.step_with_batch_queue()[0] - if step % 2 == 0: - # Even steps consumes an output. - assert output is not None - assert len(output[0].outputs) == 1 - if req_id in engine_core.scheduler.requests: - assert engine_core.scheduler.requests[ - req_id].num_tokens == expected_num_tokens[req_id] - expected_num_tokens[req_id] += 1 - req_id = (req_id + 1) % 2 - else: - # Odd steps schedules a new batch. - assert output is None - step += 1 + # Every step consumes an output. 
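[Editor's note] The rewritten assertions in this engine-core test reflect two behavior changes: batch_queue is now indexed directly (len(...), [-1]) instead of going through queue.Queue's .qsize()/.queue, and each step_with_batch_queue() call both finishes the oldest in-flight batch and schedules the next one, which is why the old even/odd schedule-then-consume loop is gone. A minimal driver sketch under those assumptions, using only calls that appear in the test (drain_outputs itself is hypothetical):

def drain_outputs(engine_core):
    # Sketch of the stepping contract the rewritten test relies on: after the
    # first priming call, each step returns a dict of client outputs, which is
    # empty when the finished batch produced no new tokens.
    collected = []
    while engine_core.scheduler.get_num_unfinished_requests() > 0:
        outputs = engine_core.step_with_batch_queue()[0]
        if outputs:
            collected.append(outputs)
    return collected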
+ assert output is not None + assert len(output[0].outputs) == 1 + if req_id in engine_core.scheduler.requests: + assert engine_core.scheduler.requests[ + req_id].num_tokens == expected_num_tokens[req_id] + expected_num_tokens[req_id] += 1 + req_id = (req_id + 1) % 2 @multi_gpu_test(num_gpus=2) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 37eb869fe69a3..625a3470e8025 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -52,9 +52,7 @@ def make_request( return EngineCoreRequest( request_id=str(uuid.uuid4()), prompt_token_ids=prompt_tokens_ids, - mm_kwargs=None, - mm_hashes=None, - mm_placeholders=None, + mm_features=None, sampling_params=params, pooling_params=None, eos_token_id=None, diff --git a/tests/v1/engine/test_fast_incdec_prefix_err.py b/tests/v1/engine/test_fast_incdec_prefix_err.py index f028b4ab1d73f..f3d8e13088b03 100644 --- a/tests/v1/engine/test_fast_incdec_prefix_err.py +++ b/tests/v1/engine/test_fast_incdec_prefix_err.py @@ -26,16 +26,14 @@ def test_fast_inc_detok_invalid_utf8_err_case(): prompt_token_ids = [107, 4606, 236787, 107] params = SamplingParams(skip_special_tokens=True) request = EngineCoreRequest( - "test", - prompt_token_ids, - None, - None, - None, - params, - None, - None, - 0.0, - None, + request_id="test", + prompt_token_ids=prompt_token_ids, + mm_features=None, + sampling_params=params, + pooling_params=None, + eos_token_id=None, + arrival_time=0.0, + lora_request=None, cache_salt=None, data_parallel_rank=None, ) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index c113439a70228..6544e8b017e70 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -52,11 +52,9 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind, requests = [ EngineCoreRequest(request_id=f"request-{idx}", prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_kwargs=None, - mm_hashes=None, - mm_placeholders=None, + mm_features=None, eos_token_id=None, + arrival_time=0, lora_request=None, cache_salt=None, data_parallel_rank=None, @@ -401,11 +399,9 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind, requests = [ EngineCoreRequest(request_id=request_id_list[idx], prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_kwargs=None, - mm_hashes=None, - mm_placeholders=None, + mm_features=None, eos_token_id=None, + arrival_time=0, lora_request=None, cache_salt=None, data_parallel_rank=None, @@ -566,11 +562,9 @@ def test_stop_token(include_stop_str_in_output: bool, request = EngineCoreRequest( request_id=request_id, prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_kwargs=None, - mm_hashes=None, - mm_placeholders=None, + mm_features=None, eos_token_id=eos_token_id, + arrival_time=0, lora_request=None, cache_salt=None, data_parallel_rank=None, @@ -665,11 +659,9 @@ def test_stop_string(include_stop_str_in_output: bool, EngineCoreRequest( request_id=request_id_list[idx], prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_kwargs=None, - mm_hashes=None, - mm_placeholders=None, + mm_features=None, eos_token_id=None, + arrival_time=0, lora_request=None, cache_salt=None, data_parallel_rank=None, @@ -781,11 +773,9 @@ def test_iteration_stats(dummy_test_vectors): EngineCoreRequest( request_id=f"request-{idx}", prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_kwargs=None, - mm_hashes=None, - mm_placeholders=None, + mm_features=None, 
eos_token_id=None, + arrival_time=0, lora_request=None, cache_salt=None, data_parallel_rank=None, diff --git a/tests/v1/engine/test_processor_multi_modal_uuids.py b/tests/v1/engine/test_processor_multi_modal_uuids.py new file mode 100644 index 0000000000000..970a59eca8ece --- /dev/null +++ b/tests/v1/engine/test_processor_multi_modal_uuids.py @@ -0,0 +1,229 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig +from vllm.platforms.interface import UnspecifiedPlatform +from vllm.sampling_params import SamplingParams +from vllm.v1.engine import processor as processor_mod +from vllm.v1.engine.processor import Processor + +cherry_pil_image = ImageAsset("cherry_blossom").pil_image +stop_pil_image = ImageAsset("stop_sign").pil_image +baby_reading_np_ndarrays = VideoAsset("baby_reading").np_ndarrays + + +# Mock processor for testing +def _mk_processor(monkeypatch, + *, + mm_cache_gb: float = 4.0, + enable_prefix_caching: bool = True) -> Processor: + """ + Create a Processor instance with minimal configuration suitable for unit + tests without accessing external resources. + """ + monkeypatch.setattr(ModelConfig, + "try_get_generation_config", + lambda self: {}, + raising=True) + monkeypatch.setattr(ModelConfig, + "__post_init__", + lambda self: None, + raising=True) + monkeypatch.setattr(UnspecifiedPlatform, + "is_async_output_supported", + classmethod(lambda cls, enforce_eager: True), + raising=True) + monkeypatch.setattr( + ModelConfig, + "verify_async_output_proc", + lambda self, parallel_config, speculative_config, device_config: None, + raising=True) + monkeypatch.setattr(ModelConfig, + "verify_with_parallel_config", + lambda self, parallel_config: None, + raising=True) + monkeypatch.setattr(processor_mod, + "processor_cache_from_config", + lambda vllm_config, mm_registry: None, + raising=True) + + monkeypatch.setattr(VllmConfig, + "__post_init__", + lambda self: None, + raising=True) + + model_config = ModelConfig( + skip_tokenizer_init=True, + max_model_len=128, + mm_processor_cache_gb=mm_cache_gb, + generation_config="vllm", + tokenizer="dummy", + ) + + # Minimal multimodal_config to satisfy references in + # Processor.process_inputs. + class _MockMMConfig: + + def __init__(self, gb: float): + self.mm_processor_cache_gb = gb + + model_config.multimodal_config = _MockMMConfig( + mm_cache_gb) # type: ignore[attr-defined] + vllm_config = VllmConfig( + model_config=model_config, + cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching), + device_config=DeviceConfig(device="cpu"), + ) + + # Pass tokenizer=None; InputPreprocessor handles None when + # skip_tokenizer_init is True. 
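[Editor's note] The tests that follow exercise the error paths (a UUID list shorter than the data, and a modality missing from multi_modal_uuids). For contrast only, and not as part of this patch, a well-formed request pairs exactly one UUID with each item of a modality; this sketch reuses the module-level image assets defined earlier in the file, the prompt string is copied verbatim from the tests, and the UUID values are arbitrary:

well_formed_prompt = {
    "prompt": "USER: \nDescribe\nASSISTANT:",
    "multi_modal_data": {
        "image": [cherry_pil_image, stop_pil_image],
    },
    # One UUID per image item, so the lengths match and no ValueError
    # is raised by the length check tested below.
    "multi_modal_uuids": {
        "image": ["hash_cherry", "hash_stop"],
    },
}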
+ return Processor(vllm_config, tokenizer=None) # type: ignore[arg-type] + + +def test_multi_modal_uuids_length_mismatch_raises(monkeypatch): + processor = _mk_processor(monkeypatch) + + prompt = { + "prompt": "USER: \nDescribe\nASSISTANT:", + "multi_modal_data": { + "image": [cherry_pil_image, stop_pil_image] + }, + # Mismatch: 2 items but only 1 uuid provided + "multi_modal_uuids": { + "image": ["hash_cherry"] + }, + } + + with pytest.raises(ValueError, match="must have same length as data"): + processor.process_inputs( + request_id="req-1", + prompt=prompt, # type: ignore[arg-type] + params=SamplingParams(), + ) + + +def test_multi_modal_uuids_missing_modality_raises(monkeypatch): + processor = _mk_processor(monkeypatch) + + prompt = { + "prompt": "USER: