diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh index c8db951381b0b..0745da8dc418d 100755 --- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh @@ -2,7 +2,7 @@ # We can use this script to compute baseline accuracy on chartqa for vllm. # # Make sure you have lm-eval-harness installed: -# pip install lm-eval==0.4.9 +# pip install "lm-eval[api]>=0.4.9.2" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh index 897f84d1e360d..5c17a06245bcf 100755 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -2,7 +2,7 @@ # We can use this script to compute baseline accuracy on GSM for transformers. # # Make sure you have lm-eval-harness installed: -# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +# pip install "lm-eval[api]>=0.4.9.2" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh index 792f355c47a51..1b617ff17c41c 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -3,7 +3,7 @@ # We use this for fp8, which HF does not support. 
# # Make sure you have lm-eval-harness installed: -# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +# pip install "lm-eval[api]>=0.4.9.2" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh index d85a1721db9a5..12336d7f85bc9 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh @@ -3,7 +3,7 @@ # We use this for fp8, which HF does not support. # # Make sure you have lm-eval-harness installed: -# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +# pip install "lm-eval[api]>=0.4.9.2" usage() { echo`` diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh index cbb2527a4ff0a..6959f81eab373 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR" echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ - && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ + && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 echo "--- Python dependencies installed ---" diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index f022fa3672eeb..eafc82b98439b 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh 
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR" echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ - && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ + && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 echo "--- Python dependencies installed ---" diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index a4d89a46b01ac..f28785e1ad205 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -162,7 +162,10 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/test_vision_embeds.py + # Need tf32 to avoid conflicting precision issue with terratorch on ROCm. 
+ # TODO: Remove after next torch update + - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s entrypoints/openai/test_vision_embeds.py - pytest -v -s entrypoints/test_chat_utils.py - label: Entrypoints Integration Test (API Server 2) @@ -219,6 +222,9 @@ steps: - tests/v1/engine/test_engine_core_client.py - tests/distributed/test_symm_mem_allreduce.py commands: + # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 + # TODO: Remove when the bug is fixed in a future ROCm release + - export TORCH_NCCL_BLOCKING_WAIT=1 # test with torchrun tp=2 and external_dp=2 - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py # test with torchrun tp=2 and pp=2 @@ -267,9 +273,10 @@ steps: - vllm/v1/executor/uniproc_executor.py - vllm/v1/worker/gpu_worker.py commands: - # https://github.com/NVIDIA/nccl/issues/1838 - #- export NCCL_CUMEM_HOST_ENABLE=0 # test with torchrun tp=2 and dp=4 with ep + # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 + # TODO: Remove when the bug is fixed in a future ROCm release + - export TORCH_NCCL_BLOCKING_WAIT=1 - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - label: EPLB Algorithm Test # 5min @@ -979,7 +986,10 @@ steps: - export MIOPEN_DEBUG_CONV_GEMM=0 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py + # Need tf32 to avoid conflicting precision issue with terratorch on ROCm. 
+ # TODO: Remove after next torch update + - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - label: Multi-Modal Accuracy Eval (Small Models) # 5min @@ -1288,6 +1298,9 @@ steps: - tests/v1/shutdown - tests/v1/worker/test_worker_memory_snapshot.py commands: + # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 + # TODO: Remove when the bug is fixed in a future ROCm release + - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py @@ -1341,7 +1354,9 @@ steps: # end platform plugin tests # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin - pip install -e ./plugins/prithvi_io_processor_plugin - - pytest -v -s plugins_tests/test_io_processor_plugins.py + # Need tf32 to avoid conflicting precision issue with terratorch on ROCm. 
+ # TODO: Remove after next torch update + - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s plugins_tests/test_io_processor_plugins.py - pip uninstall prithvi_io_processor_plugin -y # end io_processor plugins test # begin stat_logger plugins test @@ -1510,7 +1525,7 @@ steps: - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput + - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### diff --git a/csrc/cache.h b/csrc/cache.h index cbe44c09eb624..42ccb589683a9 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -9,16 +9,6 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst, const torch::Tensor& block_mapping); -// Note: the key_caches and value_caches vectors are constant but -// not the Tensors they contain. The vectors need to be const refs -// in order to satisfy pytorch's C++ operator registration code. 
-void copy_blocks(std::vector const& key_caches, - std::vector const& value_caches, - const torch::Tensor& block_mapping); - -void copy_blocks_mla(std::vector const& kv_caches, - const torch::Tensor& block_mapping); - void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index f11c5f24c12ec..a02fcb617910f 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -119,94 +119,6 @@ __global__ void copy_blocks_mla_kernel( } // namespace vllm -// Note: the key_caches and value_caches vectors are constant but -// not the Tensors they contain. The vectors need to be const refs -// in order to satisfy pytorch's C++ operator registration code. -void copy_blocks(std::vector const& key_caches, - std::vector const& value_caches, - const torch::Tensor& block_mapping) { - int num_layers = key_caches.size(); - TORCH_CHECK(num_layers == value_caches.size()); - if (num_layers == 0) { - return; - } - torch::Device cache_device = key_caches[0].device(); - TORCH_CHECK(cache_device.is_cuda()); - - // Create data structures for the kernel. - // Create an array of pointers to the key and value caches. - int64_t key_cache_ptrs[num_layers]; - int64_t value_cache_ptrs[num_layers]; - for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) { - key_cache_ptrs[layer_idx] = - reinterpret_cast(key_caches[layer_idx].data_ptr()); - value_cache_ptrs[layer_idx] = - reinterpret_cast(value_caches[layer_idx].data_ptr()); - } - - // block_mapping is a 2D tensor with shape (num_pairs, 2). - int num_pairs = block_mapping.size(0); - - // Move the data structures to the GPU. - // NOTE: This synchronizes the CPU and GPU. 
- torch::Tensor key_cache_ptrs_tensor = - torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64) - .to(cache_device); - torch::Tensor value_cache_ptrs_tensor = - torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64) - .to(cache_device); - - // Launch the kernel. - const int numel_per_block = key_caches[0][0].numel(); - dim3 grid(num_layers, num_pairs); - dim3 block(std::min(1024, numel_per_block)); - const at::cuda::OptionalCUDAGuard device_guard(cache_device); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( - key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] { - vllm::copy_blocks_kernel<<>>( - key_cache_ptrs_tensor.data_ptr(), - value_cache_ptrs_tensor.data_ptr(), - block_mapping.data_ptr(), numel_per_block); - })); -} - -// copy blocks kernel for MLA (assumes a joint KV-cache) -void copy_blocks_mla(std::vector const& kv_caches, - const torch::Tensor& block_mapping) { - int num_layers = kv_caches.size(); - if (num_layers == 0) { - return; - } - torch::Device cache_device = kv_caches[0].device(); - TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA"); - - std::vector cache_ptrs(num_layers); - for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) { - cache_ptrs[layer_idx] = - reinterpret_cast(kv_caches[layer_idx].data_ptr()); - } - torch::Tensor cache_ptrs_tensor = - torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64) - .to(cache_device); - - int num_pairs = block_mapping.size(0); - // We use the stride instead of numel in case the cache is padded for memory - // alignment reasons, we assume the blocks data (inclusive of any padding) - // is contiguous in memory - int mem_footprint_per_block = kv_caches[0].stride(0); - dim3 grid(num_layers, num_pairs); - dim3 block(std::min(1024, mem_footprint_per_block)); - const at::cuda::OptionalCUDAGuard device_guard(cache_device); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - 
VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( - kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] { - vllm::copy_blocks_mla_kernel<<>>( - cache_ptrs_tensor.data_ptr(), - block_mapping.data_ptr(), mem_footprint_per_block); - })); -} - namespace vllm { // Used to copy/convert one element @@ -539,9 +451,6 @@ __global__ void indexer_k_quant_and_cache_kernel( for (int i = 0; i < VEC_SIZE; i++) { amax = fmaxf(amax, fabsf(float(k_val_ptr[i]))); } -#ifndef USE_ROCM - __syncwarp(); -#endif // Reduced amax for (int mask = 16; mask > 0; mask /= 2) { @@ -551,9 +460,7 @@ __global__ void indexer_k_quant_and_cache_kernel( amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask)); #endif } -#ifndef USE_ROCM - __syncwarp(); -#endif + #if defined(__gfx942__) float scale = fmaxf(amax, 1e-4) / 224.0f; #else diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp index 88bc3c509790c..f2085b73b6a48 100644 --- a/csrc/cpu/utils.cpp +++ b/csrc/cpu/utils.cpp @@ -24,6 +24,8 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { #ifndef VLLM_NUMA_DISABLED std::string init_cpu_threads_env(const std::string& cpu_ids) { bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str()); + TORCH_CHECK(omp_cpu_mask != nullptr, + "Failed to parse CPU string: " + cpu_ids); TORCH_CHECK(omp_cpu_mask->size > 0); std::vector omp_cpu_ids; omp_cpu_ids.reserve(omp_cpu_mask->size); @@ -44,20 +46,12 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { // Memory node binding if (numa_available() != -1) { - int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front()); std::set node_ids; for (const auto& cpu_id : omp_cpu_ids) { int node_id = numa_node_of_cpu(cpu_id); if (node_id != -1) { node_ids.insert(node_id); } - if (node_id != mem_node_id) { - TORCH_WARN("CPU ", cpu_id, " is on NUMA node ", node_id, ", but CPU ", - omp_cpu_ids.front(), " is on NUMA node ", mem_node_id, - ". All CPUs should be on the same NUMA node for optimal " - "performance. 
Memory will be bound to NUMA node ", - mem_node_id, "."); - } } // Concatenate all node_ids into a single comma-separated string if (!node_ids.empty()) { @@ -70,7 +64,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { } bitmask* mask = numa_parse_nodestring(node_ids_str.c_str()); - bitmask* src_mask = numa_get_membind(); + bitmask* src_mask = numa_get_mems_allowed(); int pid = getpid(); @@ -83,15 +77,46 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { std::to_string(errno)); } - // restrict memory allocation node. - numa_set_membind(mask); + // Restrict memory allocation to the selected NUMA node(s). + // Enhances memory locality for the threads bound to those NUMA CPUs. + if (node_ids.size() > 1) { + errno = 0; + numa_set_interleave_mask(mask); + if (errno != 0) { + TORCH_WARN("numa_set_interleave_mask failed. errno: " + + std::to_string(errno)); + } else { + TORCH_WARN( + "NUMA binding: Using INTERLEAVE policy for memory " + "allocation across multiple NUMA nodes (nodes: " + + node_ids_str + + "). Memory allocations will be " + "interleaved across the specified NUMA nodes."); + } + } else { + errno = 0; + numa_set_membind(mask); + if (errno != 0) { + TORCH_WARN("numa_set_membind failed. errno: " + + std::to_string(errno)); + } else { + TORCH_WARN( + "NUMA binding: Using MEMBIND policy for memory " + "allocation on the NUMA nodes (" + + node_ids_str + + "). Memory allocations will be " + "strictly bound to these NUMA nodes."); + } + } + numa_set_strict(1); numa_free_nodemask(mask); numa_free_nodemask(src_mask); } else { - TORCH_WARN("numa_parse_nodestring or numa_get_membind failed. errno: " + - std::to_string(errno)); + TORCH_WARN( + "numa_parse_nodestring or numa_get_run_node_mask failed. 
errno: " + + std::to_string(errno)); } } } diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu index 6acadb4cefd2c..8e38deeb6607f 100644 --- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu +++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu @@ -35,7 +35,7 @@ template __host__ __device__ inline Int round_up(Int x, Int y) { static_assert(std::is_integral_v, "round_up argument must be integral type"); - return (x + y - 1) / y * y; + return ((x + y - 1) / y) * y; } // Compute effective rows for grid configuration with swizzled SF layouts. @@ -61,37 +61,47 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) int sf_m = round_up(numRows, 128); int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE; int sf_n_int = round_up(sf_n_unpadded, 4) / 4; - for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) { - // Each thread writes 4 uint32_t elements. - for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int; - col += blockDim.x * 4) { - SFout[row * sf_n_int + col] = 0x00; - } - } + int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE; // Get the global scaling factor, which will be applied to the SF. // Note SFScale is the same as next GEMM's alpha, which is // (448.f / (Alpha_A / 6.f)). float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0]; - // Input tensor row/col loops. - for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) { - for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD; + // Iterate over all rows and cols including padded ones - + // ensures we visit every single scale factor address to initialize it. 
+ for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) { + for (int colIdx = threadIdx.x; + colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD; colIdx += blockDim.x) { + int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD; + + PackedVec in_vec; int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx; - PackedVec in_vec = reinterpret_cast(in)[inOffset]; - // Get the output tensor offset. - // Same as inOffset because 8 elements are packed into one uint32_t. - int64_t outOffset = inOffset; - auto& out_pos = out[outOffset]; + + // If we are outside valid rows OR outside valid columns -> Use Zeros + if (rowIdx >= numRows || elem_idx >= numCols) { + memset(&in_vec, 0, sizeof(PackedVec)); + + } else { + // Valid Region: Load actual data + in_vec = reinterpret_cast(in)[inOffset]; + } auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( rowIdx, colIdx, numKTiles, SFout); - out_pos = + auto out_val = cvt_warp_fp16_to_fp4(in_vec, global_scale, sf_out); + + // We do NOT write output for padding because the 'out' tensor is not + // padded. + if (rowIdx < numRows && elem_idx < numCols) { + // Same as inOffset because 8 elements are packed into one uint32_t. + out[inOffset] = out_val; + } } } } @@ -134,4 +144,4 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, m, n, input_ptr, input_sf_ptr, reinterpret_cast(output_ptr), reinterpret_cast(sf_out)); }); -} +} \ No newline at end of file diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 461f74ca184fd..6f2c8e915b5cb 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -685,16 +685,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()"); cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks); - // Copy the cache blocks from src to dst. - cache_ops.def( - "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) 
value_caches, " - "Tensor block_mapping) -> ()"); - cache_ops.impl("copy_blocks", torch::kCUDA, ©_blocks); - - cache_ops.def( - "copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()"); - cache_ops.impl("copy_blocks_mla", torch::kCUDA, ©_blocks_mla); - // Reshape the key and value tensors and cache them. cache_ops.def( "reshape_and_cache(Tensor key, Tensor value," diff --git a/docker/Dockerfile b/docker/Dockerfile index e61021b6eeb85..679ffc4a7df5f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -183,7 +183,7 @@ ARG nvcc_threads=8 ENV NVCC_THREADS=$nvcc_threads ARG USE_SCCACHE -ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz +ARG SCCACHE_DOWNLOAD_URL ARG SCCACHE_ENDPOINT ARG SCCACHE_BUCKET_NAME=vllm-build-sccache ARG SCCACHE_REGION_NAME=us-west-2 @@ -201,10 +201,16 @@ ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build" RUN --mount=type=cache,target=/root/.cache/uv \ if [ "$USE_SCCACHE" = "1" ]; then \ echo "Installing sccache..." \ + && case "${TARGETPLATFORM}" in \ + linux/arm64) SCCACHE_ARCH="aarch64" ;; \ + linux/amd64) SCCACHE_ARCH="x86_64" ;; \ + *) echo "Unsupported TARGETPLATFORM for sccache: ${TARGETPLATFORM}" >&2; exit 1 ;; \ + esac \ + && export SCCACHE_DOWNLOAD_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \ && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \ && tar -xzf sccache.tar.gz \ - && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ - && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ + && sudo mv sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \ + && rm -rf sccache.tar.gz sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl \ && if [ ! 
-z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \ && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ diff --git a/docs/deployment/integrations/kserve.md b/docs/deployment/integrations/kserve.md index 37b29aa1a4876..06ad5f29a1a65 100644 --- a/docs/deployment/integrations/kserve.md +++ b/docs/deployment/integrations/kserve.md @@ -2,4 +2,4 @@ vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving. -Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe. +You can use vLLM with KServe's [Hugging Face serving runtime](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) or via [`LLMInferenceService` that uses llm-d](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview). diff --git a/docs/deployment/integrations/llm-d.md b/docs/deployment/integrations/llm-d.md new file mode 100644 index 0000000000000..cccf1773c6be6 --- /dev/null +++ b/docs/deployment/integrations/llm-d.md @@ -0,0 +1,5 @@ +# llm-d + +vLLM can be deployed with [llm-d](https://github.com/llm-d/llm-d), a Kubernetes-native distributed inference serving stack providing well-lit paths for anyone to serve large generative AI models at scale. It helps achieve the fastest "time to state-of-the-art (SOTA) performance" for key OSS models across most hardware accelerators and infrastructure providers. + +You can use vLLM with llm-d directly by following [this guide](https://llm-d.ai/docs/guide) or via [KServe's LLMInferenceService](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview). 
diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md index 05814cbad9bfc..77a159009aa8d 100644 --- a/docs/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -12,6 +12,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following: - [Helm](frameworks/helm.md) - [InftyAI/llmaz](integrations/llmaz.md) +- [llm-d](integrations/llm-d.md) - [KAITO](integrations/kaito.md) - [KServe](integrations/kserve.md) - [Kthena](integrations/kthena.md) diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index d4a6176b236f1..f17ef89a5cbf9 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -84,7 +84,7 @@ Since simple RTN does not require data for weight quantization and the activatio Install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +pip install vllm "lm-eval[api]>=0.4.9.2" ``` Load and run the model in `vllm`: diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index 9752039097d63..049a7ceed079b 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -18,7 +18,7 @@ pip install llmcompressor Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +pip install vllm "lm-eval[api]>=0.4.9.2" ``` ## Quantization Process diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index 701ca6378cb16..8af3e24c7357c 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -23,7 +23,7 @@ pip install llmcompressor Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm 
git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +pip install vllm "lm-eval[api]>=0.4.9.2" ``` ## Quantization Process diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md index c54d7d2251999..bbab97740ff19 100644 --- a/docs/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -20,7 +20,7 @@ for more installation details. Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: ```bash -pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] +pip install vllm "lm-eval[api]>=0.4.9.2" ``` ## Quantization Process diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 90d4ff96c52f7..6838fc227f355 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -490,6 +490,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A | `GteNewModel`C | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | | `ModernBertModel`C | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | | `NomicBertModel`C | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | +| `LlamaBidirectionalModel`C | Llama-based with bidirectional attention | `nvidia/llama-nemotron-embed-1b-v2`, etc. | ✅︎ | ✅︎ | | `LlamaModel`C, `LlamaForCausalLM`C, `MistralModel`C, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | | `Qwen2Model`C, `Qwen2ForCausalLM`C | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | | `Qwen3Model`C, `Qwen3ForCausalLM`C | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. 
| ✅︎ | ✅︎ | @@ -543,8 +544,9 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A | `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | | `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | | `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | | | -| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | -| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | +| `LlamaBidirectionalForSequenceClassification`C | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2` (see note), etc. | ✅︎ | ✅︎ | +| `Qwen2ForSequenceClassification`C | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | +| `Qwen3ForSequenceClassification`C | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | | `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | | `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | | `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | @@ -562,6 +564,11 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A !!! note The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. The name `NewForSequenceClassification` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to specify the use of the `GteNewForSequenceClassification` architecture. +!!! note + `nvidia/llama-nemotron-rerank-1b-v2` require a specific prompt format to work correctly. 
+ + Examples : [offline_using_template.py](../../examples/pooling/score/offline_using_template.py) [online_using_template.py](../../examples/pooling/score/online_using_template.py) + !!! note Load the official original `mxbai-rerank-v2` by using the following command. diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 6a08f872def15..fb4b0b634145b 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -669,6 +669,21 @@ You can find the documentation for cross encoder models at [sbert.net](https://w Code example: [examples/pooling/score/openai_cross_encoder_score.py](../../examples/pooling/score/openai_cross_encoder_score.py) +#### Score Template + +Some scoring models require a specific prompt format to work correctly. You can specify a custom score template using the `--chat-template` parameter (see [Chat Template](#chat-template)). + +Score templates are supported for **cross-encoder** models only. If you are using an **embedding** model for scoring, vLLM does not apply a score template. + +Like chat templates, the score template receives a `messages` list. For scoring, each message has a `role` attribute—either `"query"` or `"document"`. For the usual kind of point-wise cross-encoder, you can expect exactly two messages: one query and one document. To access the query and document content, use Jinja's `selectattr` filter: + +- **Query**: `{{ (messages | selectattr("role", "eq", "query") | first).content }}` +- **Document**: `{{ (messages | selectattr("role", "eq", "document") | first).content }}` + +This approach is more robust than index-based access (`messages[0]`, `messages[1]`) because it selects messages by their semantic role. It also avoids assumptions about message ordering if additional message types are added to `messages` in the future. 
+ +Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja) + #### Single inference You can pass a string to both `text_1` and `text_2`, forming a single sentence pair. diff --git a/examples/pooling/score/offline_using_template.py b/examples/pooling/score/offline_using_template.py new file mode 100644 index 0000000000000..427cbaab6fbc8 --- /dev/null +++ b/examples/pooling/score/offline_using_template.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 +from pathlib import Path + +from vllm import LLM + +model_name = "nvidia/llama-nemotron-rerank-1b-v2" + +# Path to template file +template_path = Path(__file__).parent / "template" / "nemotron-rerank.jinja" +chat_template = template_path.read_text() + +llm = LLM(model=model_name, runner="pooling", trust_remote_code=True) + +query = "how much protein should a female eat?" +documents = [ + "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.", + "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. 
: 3 a meeting or series of meetings between the leaders of two or more governments.", + "Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.", +] + +outputs = llm.score(query, documents, chat_template=chat_template) + +print("-" * 30) +print([output.outputs.score for output in outputs]) +print("-" * 30) diff --git a/examples/pooling/score/online_using_template.py b/examples/pooling/score/online_using_template.py new file mode 100644 index 0000000000000..66b22e0a9563f --- /dev/null +++ b/examples/pooling/score/online_using_template.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 +""" +Example of using the rerank API with template. + +run: + vllm serve nvidia/llama-nemotron-rerank-1b-v2 --runner pooling --trust-remote-code --chat-template examples/pooling/score/template/nemotron-rerank.jinja +""" + +import json + +import requests + +url = "http://127.0.0.1:8000/rerank" + +headers = {"accept": "application/json", "Content-Type": "application/json"} + +query = "how much protein should a female eat?" +documents = [ + "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.", + "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. 
: 3 a meeting or series of meetings between the leaders of two or more governments.", + "Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.", +] + +data = { + "model": "nvidia/llama-nemotron-rerank-1b-v2", + "query": query, + "documents": documents, +} + + +def main(): + response = requests.post(url, headers=headers, json=data) + + # Check the response + if response.status_code == 200: + print("Request successful!") + print(json.dumps(response.json(), indent=2)) + else: + print(f"Request failed with status code: {response.status_code}") + print(response.text) + + +if __name__ == "__main__": + main() diff --git a/examples/pooling/score/template/nemotron-rerank.jinja b/examples/pooling/score/template/nemotron-rerank.jinja new file mode 100644 index 0000000000000..0447d7bcd5d59 --- /dev/null +++ b/examples/pooling/score/template/nemotron-rerank.jinja @@ -0,0 +1,3 @@ +question:{{ (messages | selectattr("role", "eq", "query") | first).content }} + + passage:{{ (messages | selectattr("role", "eq", "document") | first).content }} \ No newline at end of file diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index 7b2c665448a3b..a5f6ac00d1c89 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.8.5 # required for voxtral test num2words # required for smolvlm test opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test -lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test +lm-eval[api]>=0.4.9.2 # required for model evaluation test mteb>=1.38.11, <2 # required for mteb test transformers==4.57.3 tokenizers==0.22.0 diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index 3f0fd235fba50..e4a3dd379d272 
100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -58,7 +58,7 @@ schemathesis==3.39.15 # OpenAI schema test # Evaluation and benchmarking -lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d +lm-eval[api]>=0.4.9.2 jiwer==4.0.0 # Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test diff --git a/requirements/test.in b/requirements/test.in index 55452ce83f232..b3fd733fb1bc0 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -34,8 +34,7 @@ num2words # required for smolvlm test open_clip_torch==2.32.0 # Required for nemotron_vl test opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test -# TODO: Use lm-eval[api]==0.4.10 once released -lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test +lm-eval[api]>=0.4.9.2 # required for model evaluation test mteb[bm25s]>=2, <3 # required for mteb test transformers==4.57.3 tokenizers==0.22.0 diff --git a/requirements/test.txt b/requirements/test.txt index ea2093e4347fe..4012c2d3b212b 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -441,7 +441,7 @@ lightning-utilities==0.14.3 # torchmetrics llvmlite==0.44.0 # via numba -lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d +lm-eval==0.4.9.2 # via -r requirements/test.in lxml==5.3.0 # via diff --git a/setup.py b/setup.py index 6fcb6653bc4a3..581d3c80c3d06 100644 --- a/setup.py +++ b/setup.py @@ -50,15 +50,15 @@ elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin")) sys.platform, ) VLLM_TARGET_DEVICE = "empty" -elif ( - sys.platform.startswith("linux") - and torch.version.cuda is None - and os.getenv("VLLM_TARGET_DEVICE") is None - and torch.version.hip is None -): - # if cuda or hip is not available 
and VLLM_TARGET_DEVICE is not set, - # fallback to cpu - VLLM_TARGET_DEVICE = "cpu" +elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is None: + if torch.version.hip is not None: + VLLM_TARGET_DEVICE = "rocm" + logger.info("Auto-detected ROCm") + elif torch.version.cuda is not None: + VLLM_TARGET_DEVICE = "cuda" + logger.info("Auto-detected CUDA") + else: + VLLM_TARGET_DEVICE = "cpu" def is_sccache_available() -> bool: @@ -108,20 +108,26 @@ class cmake_build_ext(build_ext): num_jobs = os.cpu_count() nvcc_threads = None - if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"): - # `nvcc_threads` is either the value of the NVCC_THREADS - # environment variable (if defined) or 1. - # when it is set, we reduce `num_jobs` to avoid - # overloading the system. - nvcc_threads = envs.NVCC_THREADS - if nvcc_threads is not None: - nvcc_threads = int(nvcc_threads) - logger.info( - "Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads - ) - else: - nvcc_threads = 1 - num_jobs = max(1, num_jobs // nvcc_threads) + if _is_cuda() and CUDA_HOME is not None: + try: + nvcc_version = get_nvcc_cuda_version() + if nvcc_version >= Version("11.2"): + # `nvcc_threads` is either the value of the NVCC_THREADS + # environment variable (if defined) or 1. + # when it is set, we reduce `num_jobs` to avoid + # overloading the system. + nvcc_threads = envs.NVCC_THREADS + if nvcc_threads is not None: + nvcc_threads = int(nvcc_threads) + logger.info( + "Using NVCC_THREADS=%d as the number of nvcc threads.", + nvcc_threads, + ) + else: + nvcc_threads = 1 + num_jobs = max(1, num_jobs // nvcc_threads) + except Exception as e: + logger.warning("Failed to get NVCC version: %s", e) return num_jobs, nvcc_threads @@ -199,9 +205,9 @@ class cmake_build_ext(build_ext): # Default build tool to whatever cmake picks. 
build_tool = [] # Make sure we use the nvcc from CUDA_HOME - if _is_cuda(): + if _is_cuda() and CUDA_HOME is not None: cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"] - elif _is_hip(): + elif _is_hip() and ROCM_HOME is not None: cmake_args += [f"-DROCM_PATH={ROCM_HOME}"] other_cmake_args = os.environ.get("CMAKE_ARGS") @@ -339,6 +345,89 @@ class precompiled_wheel_utils: wheels = json.loads(resp.read().decode("utf-8")) return wheels, repo_url + @staticmethod + def is_rocm_system() -> bool: + """Detect ROCm without relying on torch (for build environment).""" + if os.getenv("ROCM_PATH"): + return True + if os.path.isdir("/opt/rocm"): + return True + if which("rocminfo") is not None: + return True + try: + import torch + + return torch.version.hip is not None + except ImportError: + return False + + @staticmethod + def find_local_rocm_wheel() -> str | None: + """Search for a local vllm wheel in common locations.""" + import glob + + for pattern in ["/vllm-workspace/dist/vllm-*.whl", "./dist/vllm-*.whl"]: + wheels = glob.glob(pattern) + if wheels: + return sorted(wheels)[-1] + return None + + @staticmethod + def fetch_wheel_from_pypi_index(index_url: str, package: str = "vllm") -> str: + """Fetch the latest wheel URL from a PyPI-style simple index.""" + import platform + from html.parser import HTMLParser + from urllib.parse import urljoin + from urllib.request import urlopen + + arch = platform.machine() + + class WheelLinkParser(HTMLParser): + def __init__(self): + super().__init__() + self.wheels = [] + + def handle_starttag(self, tag, attrs): + if tag == "a": + for name, value in attrs: + if name == "href" and value.endswith(".whl"): + self.wheels.append(value) + + simple_url = f"{index_url.rstrip('/')}/{package}/" + print(f"Fetching wheel list from {simple_url}") + with urlopen(simple_url) as resp: + html = resp.read().decode("utf-8") + + parser = WheelLinkParser() + parser.feed(html) + + for wheel in reversed(parser.wheels): + if arch in wheel: + if 
wheel.startswith("http"): + return wheel + return urljoin(simple_url, wheel) + + raise ValueError(f"No compatible wheel found for {arch} at {simple_url}") + + @staticmethod + def determine_wheel_url_rocm() -> tuple[str, str | None]: + """Determine the precompiled wheel for ROCm.""" + # Search for local wheel first + local_wheel = precompiled_wheel_utils.find_local_rocm_wheel() + if local_wheel is not None: + print(f"Found local ROCm wheel: {local_wheel}") + return local_wheel, None + + # Fall back to AMD's PyPI index + index_url = os.getenv( + "VLLM_ROCM_WHEEL_INDEX", "https://pypi.amd.com/vllm-rocm/simple" + ) + print(f"Fetching ROCm precompiled wheel from {index_url}") + wheel_url = precompiled_wheel_utils.fetch_wheel_from_pypi_index(index_url) + download_filename = wheel_url.split("/")[-1].split("#")[0] + print(f"Using ROCm precompiled wheel: {wheel_url}") + return wheel_url, download_filename + @staticmethod def determine_wheel_url() -> tuple[str, str | None]: """ @@ -359,6 +448,11 @@ class precompiled_wheel_utils: print(f"Using user-specified precompiled wheel location: {wheel_location}") return wheel_location, None else: + # ROCm: use local wheel or AMD's PyPI index + # TODO: When we have ROCm nightly wheels, we can update this logic. 
+ if precompiled_wheel_utils.is_rocm_system(): + return precompiled_wheel_utils.determine_wheel_url_rocm() + import platform arch = platform.machine() @@ -465,6 +559,8 @@ class precompiled_wheel_utils: "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", "vllm/cumem_allocator.abi3.so", + # ROCm-specific libraries + "vllm/_rocm_C.abi3.so", ] flash_attn_regex = re.compile( @@ -601,6 +697,8 @@ def get_rocm_version(): # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21 try: + if ROCM_HOME is None: + return None librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so" if not librocm_core_file.is_file(): return None @@ -745,7 +843,9 @@ if _is_hip(): if _is_cuda(): ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C")) - if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"): + if envs.VLLM_USE_PRECOMPILED or ( + CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3") + ): # FA3 requires CUDA 12.3 or later ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C")) # Optional since this doesn't get built (produce an .so file) when diff --git a/tests/conftest.py b/tests/conftest.py index a03f40a9a72ac..30e25294925ca 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -410,7 +410,7 @@ class HfRunner: # don't put this import at the top level # it will call torch.cuda.device_count() - from transformers import AutoProcessor # noqa: F401 + from transformers import AutoProcessor self.processor = AutoProcessor.from_pretrained( model_name, diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 25a5e00cc0e16..5bb5fcea2a94c 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -511,6 +511,16 @@ def test_human_readable_model_len(): args = parser.parse_args(["--max-model-len", "10.2123451234567t"]) assert 
args.max_model_len == 10212345123456 + # Special value -1 for auto-fit to GPU memory + args = parser.parse_args(["--max-model-len", "-1"]) + assert args.max_model_len == -1 + + # 'auto' is an alias for -1 + args = parser.parse_args(["--max-model-len", "auto"]) + assert args.max_model_len == -1 + args = parser.parse_args(["--max-model-len", "AUTO"]) + assert args.max_model_len == -1 + # Invalid (do not allow decimals with binary multipliers) for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]: with pytest.raises(ArgumentError): diff --git a/tests/entrypoints/openai/conftest.py b/tests/entrypoints/openai/conftest.py index b40079d8dc3d5..098a9a72325ba 100644 --- a/tests/entrypoints/openai/conftest.py +++ b/tests/entrypoints/openai/conftest.py @@ -5,6 +5,30 @@ import pytest from vllm.assets.audio import AudioAsset +def add_attention_backend(server_args, attention_config): + """Append attention backend CLI arg if specified. + + Args: + server_args: List of server arguments to extend in-place. + attention_config: Dict with 'backend' key, or None. + """ + if attention_config and "backend" in attention_config: + server_args.extend(["--attention-backend", attention_config["backend"]]) + + +@pytest.fixture(scope="module") +def rocm_aiter_fa_attention(): + """Return attention config for transcription/translation tests on ROCm. + + On ROCm, audio tests require ROCM_AITER_FA attention backend. 
+ """ + from vllm.platforms import current_platform + + if current_platform.is_rocm(): + return {"backend": "ROCM_AITER_FA"} + return None + + @pytest.fixture def mary_had_lamb(): path = AudioAsset("mary_had_lamb").get_local_path() diff --git a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py index 682420a83a442..1d3d110d30271 100644 --- a/tests/entrypoints/openai/test_async_tokenization.py +++ b/tests/entrypoints/openai/test_async_tokenization.py @@ -15,7 +15,7 @@ MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" @pytest.fixture(scope="module") -def server(): # noqa: F811 +def server(): args = [ # use half precision for speed and memory savings in CI environment "--dtype", diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index af5f2fec402ed..9fe1d906d857e 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -8,7 +8,7 @@ import pytest import pytest_asyncio from vllm.assets.audio import AudioAsset -from vllm.multimodal.utils import encode_audio_base64, fetch_audio +from vllm.multimodal.utils import encode_audio_base64, encode_audio_url, fetch_audio from ...utils import RemoteOpenAIServer @@ -53,6 +53,14 @@ def base64_encoded_audio() -> dict[str, str]: } +@pytest.fixture(scope="session") +def url_encoded_audio() -> dict[str, str]: + return { + audio_url: encode_audio_url(*fetch_audio(audio_url)) + for audio_url in TEST_AUDIO_URLS + } + + def dummy_messages_from_audio_url( audio_urls: str | list[str], content_text: str = "What's happening in this audio?", @@ -149,11 +157,9 @@ async def test_single_chat_session_audio_base64encoded( client: openai.AsyncOpenAI, model_name: str, audio_url: str, - base64_encoded_audio: dict[str, str], + url_encoded_audio: dict[str, str], ): - messages = dummy_messages_from_audio_url( - f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" - ) + messages = 
dummy_messages_from_audio_url(url_encoded_audio[audio_url]) # test single completion chat_completion = await client.chat.completions.create( diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index b2909f21e4dd8..ae94c149017e7 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -28,7 +28,7 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") -def server(zephyr_lora_files): # noqa: F811 +def server(zephyr_lora_files): args = [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -254,12 +254,11 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str): {"role": "system", "content": "you are a helpful assistant"}, {"role": "user", "content": "what is 1+1?"}, ] - # test single completion chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_completion_tokens=10, + max_completion_tokens=5, logprobs=True, top_logprobs=5, ) @@ -267,13 +266,14 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str): assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] + assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=37, total_tokens=47 + completion_tokens=5, prompt_tokens=37, total_tokens=42 ) message = choice.message - assert message.content is not None and len(message.content) >= 10 + assert message.content is not None and len(message.content) >= 5 assert message.role == "assistant" messages.append({"role": "assistant", "content": message.content}) @@ -282,7 +282,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str): chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_completion_tokens=10, + max_completion_tokens=5, ) message = chat_completion.choices[0].message assert 
message.content is not None and len(message.content) >= 0 diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py index 7b3092b563030..445fa389d0007 100644 --- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py +++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py @@ -12,7 +12,7 @@ MODEL_NAME = "Qwen/QwQ-32B" @pytest.fixture(scope="module") -def server(): # noqa: F811 +def server(): args = [ "--max-model-len", "8192", diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index 53369f074eca8..c6a5841ec3bfb 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -125,7 +125,7 @@ messages = [ @pytest.fixture(scope="module") -def server(): # noqa: F811 +def server(): args = [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -212,7 +212,7 @@ async def test_function_tool_use( @pytest.fixture(scope="module") -def k2_server(): # noqa: F811 +def k2_server(): args = [ # use half precision for speed and memory savings in CI environment "--dtype", diff --git a/tests/entrypoints/openai/test_default_mm_loras.py b/tests/entrypoints/openai/test_default_mm_loras.py index 818ee2644b547..dd8f9d67d6903 100644 --- a/tests/entrypoints/openai/test_default_mm_loras.py +++ b/tests/entrypoints/openai/test_default_mm_loras.py @@ -23,7 +23,7 @@ ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original @pytest.fixture(scope="module") -def multimodal_server(): # noqa: F811 +def multimodal_server(): args = [ # use half precision for speed and memory savings in CI environment "--dtype", diff --git a/tests/entrypoints/openai/test_enable_force_include_usage.py b/tests/entrypoints/openai/test_enable_force_include_usage.py index 9d527c45c1fae..8e7e34ee2b71b 100644 
--- a/tests/entrypoints/openai/test_enable_force_include_usage.py +++ b/tests/entrypoints/openai/test_enable_force_include_usage.py @@ -8,7 +8,7 @@ from ...utils import RemoteOpenAIServer @pytest.fixture(scope="module") -def chat_server_with_force_include_usage(request): # noqa: F811 +def chat_server_with_force_include_usage(request): args = [ # use half precision for speed and memory savings in CI environment "--dtype", diff --git a/tests/entrypoints/openai/test_messages.py b/tests/entrypoints/openai/test_messages.py index 8de6c4cb6c887..ce8c3ff4a71a5 100644 --- a/tests/entrypoints/openai/test_messages.py +++ b/tests/entrypoints/openai/test_messages.py @@ -11,7 +11,7 @@ MODEL_NAME = "Qwen/Qwen3-0.6B" @pytest.fixture(scope="module") -def server(): # noqa: F811 +def server(): args = [ "--max-model-len", "2048", diff --git a/tests/entrypoints/openai/test_optional_middleware.py b/tests/entrypoints/openai/test_optional_middleware.py index b67d6147937d1..c2c7fbdb01140 100644 --- a/tests/entrypoints/openai/test_optional_middleware.py +++ b/tests/entrypoints/openai/test_optional_middleware.py @@ -39,6 +39,7 @@ def server(request: pytest.FixtureRequest): "2", *passed_params, ] + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 8ef0d7f277d5f..718e0edba8373 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -504,7 +504,11 @@ async def test_web_search(client: OpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_code_interpreter(client: OpenAI, model_name: str): - response = await client.responses.create( + # Code interpreter may need more time for container init + code execution + timeout_value = client.timeout * 3 + client_with_timeout = 
client.with_options(timeout=timeout_value) + + response = await client_with_timeout.responses.create( model=model_name, # TODO: Ideally should be able to set max tool calls # to prevent multi-turn, but it is not currently supported @@ -868,6 +872,7 @@ async def test_output_messages_enabled(client: OpenAI, model_name: str, server): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.flaky(reruns=3) async def test_function_call_with_previous_input_messages( client: OpenAI, model_name: str ): diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py index d4d9a6c5b6120..05a36febad0cc 100644 --- a/tests/entrypoints/openai/test_return_tokens_as_ids.py +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -37,7 +37,7 @@ def default_server_args(qwen3_lora_files): @pytest.fixture(scope="module") -def server_fixture(request, default_server_args): # noqa: F811 +def server_fixture(request, default_server_args): use_server_flag = request.param if use_server_flag: args_with_flag = default_server_args + ["--return-tokens-as-token-ids"] diff --git a/tests/entrypoints/openai/test_serving_tokens.py b/tests/entrypoints/openai/test_serving_tokens.py index 62d843e35b86f..acbbaa659c82b 100644 --- a/tests/entrypoints/openai/test_serving_tokens.py +++ b/tests/entrypoints/openai/test_serving_tokens.py @@ -93,6 +93,7 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages): add_generation_prompt=True, enable_thinking=False, # default with Qwen3 ) + for ignore_eos in [True, False]: payload = { "model": MODEL_NAME, @@ -108,9 +109,8 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages): } generate_resp = await client.post(GEN_ENDPOINT, json=payload) generate_data = generate_resp.json() - generate_res = tokenizer.decode( - generate_data["choices"][0]["token_ids"], skip_special_tokens=True - ) + gen_token_ids = 
generate_data["choices"][0]["token_ids"] + generate_res = tokenizer.decode(gen_token_ids, skip_special_tokens=True) payload = { "model": MODEL_NAME, @@ -119,12 +119,33 @@ "temperature": 0.0, "stream": False, "ignore_eos": ignore_eos, - "chat_template_kwargs": dict(enable_thinking=False), + "chat_template_kwargs": {"enable_thinking": False}, } completions_resp = await client.post("/v1/chat/completions", json=payload) completions_data = completions_resp.json() completions_res = completions_data["choices"][0]["message"]["content"] + if ignore_eos: + # When ignoring EOS, only compare up to the first EOS token + # Post-EOS generation is undefined and may differ + eos_tokens = { + tokenizer.eos_token_id, + *tokenizer.additional_special_tokens_ids, + } + # Find first EOS in generated tokens + eos_pos = None + for i, tid in enumerate(gen_token_ids): + if tid in eos_tokens: + eos_pos = i + break + if eos_pos is not None: + gen_token_ids_truncated = gen_token_ids[:eos_pos] + generate_res = tokenizer.decode( + gen_token_ids_truncated, skip_special_tokens=True + ) + # Truncate completions_res to same length for comparison + completions_res = completions_res[: len(generate_res)] + assert generate_res == completions_res diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py index d75119cb7b43d..a2ac49bcb0b25 100644 --- a/tests/entrypoints/openai/test_shutdown.py +++ b/tests/entrypoints/openai/test_shutdown.py @@ -9,10 +9,16 @@ import time import openai import pytest +from vllm.platforms import current_platform from vllm.utils.network_utils import get_open_port MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM" +# GPU initialization might take longer +_IS_ROCM = current_platform.is_rocm() +_SERVER_STARTUP_TIMEOUT = 120 +_PROCESS_EXIT_TIMEOUT = 15 + @pytest.mark.asyncio async def test_shutdown_on_engine_failure(): @@ -45,9 +51,11 @@ async def 
test_shutdown_on_engine_failure(): "2", "--disable-frontend-multiprocessing", ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, + # ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when + # stdout/stderr pipes are enabled during ROCm GPU initialization. + stdout=None if _IS_ROCM else subprocess.PIPE, + stderr=None if _IS_ROCM else subprocess.PIPE, + text=None if _IS_ROCM else True, preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN), ) @@ -61,7 +69,7 @@ async def test_shutdown_on_engine_failure(): ) # Poll until server is ready - while time.time() - start_time < 30: + while time.time() - start_time < _SERVER_STARTUP_TIMEOUT: try: await client.completions.create( model=MODEL_NAME, prompt="Hello", max_tokens=1 @@ -70,14 +78,18 @@ async def test_shutdown_on_engine_failure(): except Exception: time.sleep(0.5) if proc.poll() is not None: - stdout, stderr = proc.communicate(timeout=1) - pytest.fail( - f"Server died during startup. stdout: {stdout}, stderr: {stderr}" - ) + if _IS_ROCM: + pytest.fail(f"Server died during startup: {proc.returncode}") + else: + stdout, stderr = proc.communicate(timeout=1) + pytest.fail( + f"Server died during startup. 
" + f"stdout: {stdout}, stderr: {stderr}" + ) else: proc.terminate() - proc.wait(timeout=5) - pytest.fail("Server failed to start in 30 seconds") + proc.wait(timeout=_PROCESS_EXIT_TIMEOUT) + pytest.fail(f"Server failed to start in {_SERVER_STARTUP_TIMEOUT} seconds") # Kill server to simulate crash proc.terminate() @@ -89,5 +101,5 @@ async def test_shutdown_on_engine_failure(): model=MODEL_NAME, prompt="This should fail", max_tokens=1 ) - return_code = proc.wait(timeout=5) + return_code = proc.wait(timeout=_PROCESS_EXIT_TIMEOUT) assert return_code is not None diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 8045ab1468d6a..ee8dea4e949bc 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -7,6 +7,7 @@ import json import pytest from ...utils import RemoteOpenAIServer +from .conftest import add_attention_backend MISTRAL_FORMAT_ARGS = [ "--tokenizer_mode", @@ -20,12 +21,14 @@ MISTRAL_FORMAT_ARGS = [ @pytest.mark.asyncio @pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"]) -async def test_basic_audio(mary_had_lamb, model_name): +async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention): server_args = ["--enforce-eager"] if model_name.startswith("mistralai"): server_args += MISTRAL_FORMAT_ARGS + add_attention_backend(server_args, rocm_aiter_fa_attention) + # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. 
with RemoteOpenAIServer(model_name, server_args) as remote_server: client = remote_server.get_async_client() @@ -44,8 +47,13 @@ async def test_basic_audio(mary_had_lamb, model_name): @pytest.mark.asyncio -async def test_basic_audio_with_lora(mary_had_lamb): +async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention): """Ensure STT (transcribe) requests can pass LoRA through to generate.""" + # ROCm SPECIFIC CONFIGURATION: + # To ensure the test passes on ROCm, we modify the max model length to 512. + # We DO NOT apply this to other platforms to maintain strict upstream parity. + from vllm.platforms import current_platform + model_name = "ibm-granite/granite-speech-3.3-2b" lora_model_name = "speech" server_args = [ @@ -56,11 +64,13 @@ async def test_basic_audio_with_lora(mary_had_lamb): "--lora-modules", f"{lora_model_name}={model_name}", "--max-model-len", - "2048", + "512" if current_platform.is_rocm() else "2048", "--max-num-seqs", "1", ] + add_attention_backend(server_args, rocm_aiter_fa_attention) + # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. with RemoteOpenAIServer(model_name, server_args) as remote_server: client = remote_server.get_async_client() @@ -79,12 +89,14 @@ async def test_basic_audio_with_lora(mary_had_lamb): @pytest.mark.asyncio -async def test_basic_audio_gemma(foscolo): +async def test_basic_audio_gemma(foscolo, rocm_aiter_fa_attention): # Gemma accuracy on some of the audio samples we use is particularly bad, # hence we use a different one here. WER is evaluated separately. 
model_name = "google/gemma-3n-E2B-it" server_args = ["--enforce-eager"] + add_attention_backend(server_args, rocm_aiter_fa_attention) + with RemoteOpenAIServer( model_name, server_args, max_wait_seconds=480 ) as remote_server: diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py index 2c577237691ab..cae45872ee6a6 100644 --- a/tests/entrypoints/openai/test_translation_validation.py +++ b/tests/entrypoints/openai/test_translation_validation.py @@ -14,16 +14,26 @@ import pytest_asyncio import soundfile as sf from ...utils import RemoteOpenAIServer +from .conftest import add_attention_backend SERVER_ARGS = ["--enforce-eager"] +def _get_server_args(attention_config): + """Get server args with attention backend if specified.""" + args = SERVER_ARGS.copy() + add_attention_backend(args, attention_config) + return args + + @pytest.fixture( scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"] ) -def server(request): +def server(request, rocm_aiter_fa_attention): # Parametrize over model name - with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server: + with RemoteOpenAIServer( + request.param, _get_server_args(rocm_aiter_fa_attention) + ) as remote_server: yield remote_server, request.param @@ -35,10 +45,12 @@ async def client_and_model(server): @pytest.mark.asyncio -async def test_non_asr_model(foscolo): +async def test_non_asr_model(foscolo, rocm_aiter_fa_attention): # text to text model model_name = "JackFram/llama-68m" - with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server: + with RemoteOpenAIServer( + model_name, _get_server_args(rocm_aiter_fa_attention) + ) as remote_server: client = remote_server.get_async_client() res = await client.audio.translations.create( model=model_name, file=foscolo, temperature=0.0 @@ -49,8 +61,13 @@ async def test_non_asr_model(foscolo): @pytest.mark.asyncio -async def test_basic_audio_with_lora(mary_had_lamb): +async def 
test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention): """Ensure STT (translate) requests can pass LoRA through to generate.""" + # ROCm SPECIFIC CONFIGURATION: + # To ensure the test passes on ROCm, we modify the max model length to 512. + # We DO NOT apply this to other platforms to maintain strict upstream parity. + from vllm.platforms import current_platform + # NOTE - careful to call this test before the module scoped server # fixture, otherwise it'll OOMkill the CI model_name = "ibm-granite/granite-speech-3.3-2b" @@ -63,11 +80,13 @@ async def test_basic_audio_with_lora(mary_had_lamb): "--lora-modules", f"{lora_model_name}={model_name}", "--max-model-len", - "2048", + "512" if current_platform.is_rocm() else "2048", "--max-num-seqs", "1", ] + add_attention_backend(server_args, rocm_aiter_fa_attention) + # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. with RemoteOpenAIServer(model_name, server_args) as remote_server: client = remote_server.get_async_client() diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index 7ecdac518f97f..65bda9e8bc010 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -7,7 +7,8 @@ import openai import pytest import pytest_asyncio -from vllm.multimodal.utils import encode_video_base64, fetch_video +from vllm.multimodal.utils import encode_video_url, fetch_video +from vllm.platforms import current_platform from ...utils import RemoteOpenAIServer @@ -37,7 +38,16 @@ def server(): json.dumps({"video": MAXIMUM_VIDEOS}), ] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + # ROCm: Increase timeouts to handle potential network delays and slower + # video processing when downloading multiple videos from external sources + env_overrides = {} + if current_platform.is_rocm(): + env_overrides = { + "VLLM_VIDEO_FETCH_TIMEOUT": "120", + "VLLM_ENGINE_ITERATION_TIMEOUT_S": "300", + } 
+ + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server: yield remote_server @@ -48,9 +58,9 @@ async def client(server): @pytest.fixture(scope="session") -def base64_encoded_video() -> dict[str, str]: +def url_encoded_video() -> dict[str, str]: return { - video_url: encode_video_base64(fetch_video(video_url)[0]) + video_url: encode_video_url(fetch_video(video_url)[0]) for video_url in TEST_VIDEO_URLS } @@ -175,11 +185,9 @@ async def test_single_chat_session_video_base64encoded( client: openai.AsyncOpenAI, model_name: str, video_url: str, - base64_encoded_video: dict[str, str], + url_encoded_video: dict[str, str], ): - messages = dummy_messages_from_video_url( - f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" - ) + messages = dummy_messages_from_video_url(url_encoded_video[video_url]) # test single completion chat_completion = await client.chat.completions.create( @@ -223,11 +231,9 @@ async def test_single_chat_session_video_base64encoded_beamsearch( client: openai.AsyncOpenAI, model_name: str, video_url: str, - base64_encoded_video: dict[str, str], + url_encoded_video: dict[str, str], ): - messages = dummy_messages_from_video_url( - f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" - ) + messages = dummy_messages_from_video_url(url_encoded_video[video_url]) chat_completion = await client.chat.completions.create( model=model_name, @@ -291,6 +297,11 @@ async def test_chat_streaming_video( @pytest.mark.parametrize( "video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))] ) +@pytest.mark.flaky( + reruns=2, + reruns_delay=5, + condition=current_platform.is_rocm(), +) async def test_multi_video_input( client: openai.AsyncOpenAI, model_name: str, video_urls: list[str] ): diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index ae8860ee877b4..00823ff5f78ca 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ 
-9,7 +9,8 @@ import pytest_asyncio from transformers import AutoProcessor from vllm.multimodal.base import MediaWithBytes -from vllm.multimodal.utils import encode_image_base64, fetch_image +from vllm.multimodal.utils import encode_image_url, fetch_image +from vllm.platforms import current_platform from ...utils import RemoteOpenAIServer @@ -35,7 +36,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [ ], [ "The image shows a Venn diagram with three over", - "The image shows a colorful Venn diagram with", + "The image displays a Venn diagram with three over", ], [ "This image displays a gradient of colors ranging from", @@ -43,6 +44,27 @@ EXPECTED_MM_BEAM_SEARCH_RES = [ ], ] +EXPECTED_MM_BEAM_SEARCH_RES_ROCM = [ + # MultiHeadAttention attn_backend: FLASH_ATTN + # with Triton Attention backend + [ + "The image shows a wooden boardwalk leading through a", + "The image shows a wooden boardwalk extending into a", + ], + [ + "The image shows two parrots perched on", + "The image shows two birds perched on a cur", + ], + [ + "The image shows a Venn diagram with three over", + "The image contains a Venn diagram with three over", + ], + [ + "This image displays a gradient of colors ranging from", + "This image displays a gradient of colors transitioning from", + ], +] + @pytest.fixture(scope="module") def server(): @@ -59,7 +81,16 @@ def server(): json.dumps({"image": MAXIMUM_IMAGES}), ] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + # ROCm: Increase timeouts to handle potential network delays and slower + # image processing when downloading multiple images from external sources + env_overrides = {} + if current_platform.is_rocm(): + env_overrides = { + "VLLM_VIDEO_FETCH_TIMEOUT": "120", + "VLLM_ENGINE_ITERATION_TIMEOUT_S": "300", + } + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server: yield remote_server @@ -70,11 +101,9 @@ async def client(server): @pytest.fixture(scope="session") -def base64_encoded_image(local_asset_server) -> 
dict[str, str]: +def url_encoded_image(local_asset_server) -> dict[str, str]: return { - image_asset: encode_image_base64( - local_asset_server.get_image_asset(image_asset) - ) + image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset)) for image_asset in TEST_IMAGE_ASSETS } @@ -234,11 +263,11 @@ async def test_single_chat_session_image_base64encoded( model_name: str, raw_image_url: str, image_url: str, - base64_encoded_image: dict[str, str], + url_encoded_image: dict[str, str], ): content_text = "What's in this image?" messages = dummy_messages_from_image_url( - f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", + url_encoded_image[raw_image_url], content_text, ) @@ -288,15 +317,20 @@ async def test_single_chat_session_image_base64encoded_beamsearch( client: openai.AsyncOpenAI, model_name: str, image_idx: int, - base64_encoded_image: dict[str, str], + url_encoded_image: dict[str, str], ): + # ROCm: Switch expected results based on platform + from vllm.platforms import current_platform + # NOTE: This test also validates that we pass MM data through beam search raw_image_url = TEST_IMAGE_ASSETS[image_idx] - expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx] - messages = dummy_messages_from_image_url( - f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" - ) + if current_platform.is_rocm(): + expected_res = EXPECTED_MM_BEAM_SEARCH_RES_ROCM[image_idx] + else: + expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx] + + messages = dummy_messages_from_image_url(url_encoded_image[raw_image_url]) chat_completion = await client.chat.completions.create( model=model_name, diff --git a/tests/entrypoints/openai/test_vision_embeds.py b/tests/entrypoints/openai/test_vision_embeds.py index 42d9fe4840bbe..067a00c6b9382 100644 --- a/tests/entrypoints/openai/test_vision_embeds.py +++ b/tests/entrypoints/openai/test_vision_embeds.py @@ -33,6 +33,7 @@ def _terratorch_dummy_messages(): ] +@pytest.mark.asyncio @pytest.mark.parametrize( 
"model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"] ) diff --git a/tests/entrypoints/pooling/basic/test_encode.py b/tests/entrypoints/pooling/basic/test_encode.py index f86ecef2e4744..ab3a0610c3e17 100644 --- a/tests/entrypoints/pooling/basic/test_encode.py +++ b/tests/entrypoints/pooling/basic/test_encode.py @@ -9,11 +9,6 @@ from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - MODEL_NAME = "intfloat/multilingual-e5-small" PROMPTS = [ @@ -35,6 +30,12 @@ TOKEN_IDS = [ @pytest.fixture(scope="module") def llm(): + # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend + # that supports encoder-only models on ROCm. + attention_config = None + if current_platform.is_rocm(): + attention_config = {"backend": "FLEX_ATTENTION"} + # pytest caches the fixture so we use weakref.proxy to # enable garbage collection llm = LLM( @@ -44,6 +45,7 @@ def llm(): gpu_memory_utilization=0.75, enforce_eager=True, seed=0, + attention_config=attention_config, ) yield weakref.proxy(llm) diff --git a/tests/entrypoints/pooling/basic/test_truncation.py b/tests/entrypoints/pooling/basic/test_truncation.py index 0d2d385840402..5d099dd1f4391 100644 --- a/tests/entrypoints/pooling/basic/test_truncation.py +++ b/tests/entrypoints/pooling/basic/test_truncation.py @@ -9,11 +9,6 @@ import pytest_asyncio from tests.utils import RemoteOpenAIServer from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2" max_model_len = 128 @@ -44,6 +39,10 @@ def server(): str(max_model_len), ] + # ROCm: Use Flex Attention to support encoder-only self-attention. 
+ if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/pooling/embed/conftest.py b/tests/entrypoints/pooling/embed/conftest.py new file mode 100644 index 0000000000000..002b85874049c --- /dev/null +++ b/tests/entrypoints/pooling/embed/conftest.py @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Pytest configuration for vLLM pooling embed tests.""" + +import warnings + +import torch + +from vllm.platforms import current_platform + + +def pytest_collection_modifyitems(config, items): + """Configure ROCm-specific settings based on collected tests.""" + if not current_platform.is_rocm(): + return + + # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers + # accuracy issues: https://github.com/vllm-project/vllm/issues/30167 + # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace + torch.backends.cuda.enable_flash_sdp(False) + torch.backends.cuda.enable_mem_efficient_sdp(False) + torch.backends.cuda.enable_math_sdp(True) + warnings.warn( + "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp " + "to avoid HuggingFace Transformers accuracy issues", + UserWarning, + stacklevel=1, + ) diff --git a/tests/entrypoints/pooling/embed/test_correctness_mteb.py b/tests/entrypoints/pooling/embed/test_correctness_mteb.py index 64673534fd32a..4c8d9f0d82a24 100644 --- a/tests/entrypoints/pooling/embed/test_correctness_mteb.py +++ b/tests/entrypoints/pooling/embed/test_correctness_mteb.py @@ -4,7 +4,7 @@ import os import pytest -from tests.models.language.pooling_mteb_test.mteb_utils import ( +from tests.models.language.pooling_mteb_test.mteb_embed_utils import ( MTEB_EMBED_TASKS, MTEB_EMBED_TOL, OpenAIClientMtebEncoder, @@ -13,11 +13,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import ( from 
tests.utils import RemoteOpenAIServer from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" MODEL_NAME = "intfloat/e5-small" @@ -28,6 +23,10 @@ MAIN_SCORE = 0.7422994752439667 def server(): args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"] + # ROCm: Use Flex Attention to support encoder-only self-attention. + if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/pooling/embed/test_offline.py b/tests/entrypoints/pooling/embed/test_offline.py index 12b47b1a08a8b..ea46b7401f6d7 100644 --- a/tests/entrypoints/pooling/embed/test_offline.py +++ b/tests/entrypoints/pooling/embed/test_offline.py @@ -11,11 +11,6 @@ from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - MODEL_NAME = "intfloat/multilingual-e5-small" prompts = ["The chef prepared a delicious meal."] @@ -23,6 +18,12 @@ prompts = ["The chef prepared a delicious meal."] @pytest.fixture(scope="module") def llm(): + # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend + # that supports encoder-only models on ROCm. 
+ attention_config = None + if current_platform.is_rocm(): + attention_config = {"backend": "FLEX_ATTENTION"} + # pytest caches the fixture so we use weakref.proxy to # enable garbage collection llm = LLM( @@ -32,6 +33,7 @@ def llm(): gpu_memory_utilization=0.75, enforce_eager=True, seed=0, + attention_config=attention_config, ) yield weakref.proxy(llm) diff --git a/tests/entrypoints/pooling/embed/test_online.py b/tests/entrypoints/pooling/embed/test_online.py index f96338c47f0be..f5e563daeaa03 100644 --- a/tests/entrypoints/pooling/embed/test_online.py +++ b/tests/entrypoints/pooling/embed/test_online.py @@ -28,16 +28,20 @@ from vllm.utils.serial_utils import ( decode_pooling_output, ) -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - MODEL_NAME = "intfloat/multilingual-e5-small" DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 DTYPE = "bfloat16" +if current_platform.is_rocm(): + # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers + # accuracy issues: https://github.com/vllm-project/vllm/issues/30167 + # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace + torch.backends.cuda.enable_flash_sdp(False) + torch.backends.cuda.enable_mem_efficient_sdp(False) + torch.backends.cuda.enable_math_sdp(True) + + @pytest.fixture(scope="module") def server(): args = [ @@ -53,6 +57,10 @@ def server(): DUMMY_CHAT_TEMPLATE, ] + # ROCm: Use Flex Attention to support encoder-only self-attention. 
+ if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/pooling/embed/test_online_dimensions.py b/tests/entrypoints/pooling/embed/test_online_dimensions.py index 26aa57742b02a..0545b8a0ae2fc 100644 --- a/tests/entrypoints/pooling/embed/test_online_dimensions.py +++ b/tests/entrypoints/pooling/embed/test_online_dimensions.py @@ -14,11 +14,6 @@ from tests.utils import RemoteOpenAIServer from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - MODELS = [ EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), EmbedModelInfo( @@ -62,6 +57,10 @@ def server(model_info, dtype: str): ["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}'] ) + # ROCm: Use Flex Attention to support encoder-only self-attention. 
+ if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + with RemoteOpenAIServer(model_info.name, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/pooling/embed/test_online_long_text.py b/tests/entrypoints/pooling/embed/test_online_long_text.py index 0be7eebc2017d..316a8526404c0 100644 --- a/tests/entrypoints/pooling/embed/test_online_long_text.py +++ b/tests/entrypoints/pooling/embed/test_online_long_text.py @@ -18,11 +18,6 @@ from tests.utils import RemoteOpenAIServer from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - def _generate_random_text(word_count: int) -> str: """Generate random text with approximately the specified word count.""" @@ -228,6 +223,10 @@ def server_with_chunked_processing(): "0.8", ] + # ROCm: Use Flex Attention to support encoder-only self-attention. 
+ if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/pooling/embed/test_online_vision.py b/tests/entrypoints/pooling/embed/test_online_vision.py index eebbcdd2e4396..46b2d8a84d5ae 100644 --- a/tests/entrypoints/pooling/embed/test_online_vision.py +++ b/tests/entrypoints/pooling/embed/test_online_vision.py @@ -10,7 +10,7 @@ from transformers import AutoProcessor from tests.utils import VLLM_PATH, RemoteOpenAIServer from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse from vllm.multimodal.base import MediaWithBytes -from vllm.multimodal.utils import encode_image_base64, fetch_image +from vllm.multimodal.utils import fetch_image MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" MAXIMUM_IMAGES = 2 @@ -48,14 +48,6 @@ def server(): yield remote_server -@pytest.fixture(scope="session") -def base64_encoded_image(local_asset_server) -> dict[str, str]: - return { - image_url: encode_image_base64(local_asset_server.get_image_asset(image_url)) - for image_url in TEST_IMAGE_ASSETS - } - - def get_hf_prompt_tokens(model_name, content, image_url): processor = AutoProcessor.from_pretrained( model_name, trust_remote_code=True, num_crops=4 diff --git a/tests/entrypoints/pooling/score/test_correctness_mteb.py b/tests/entrypoints/pooling/score/test_correctness_mteb.py index 81ad0097187b0..1ee45b44596fa 100644 --- a/tests/entrypoints/pooling/score/test_correctness_mteb.py +++ b/tests/entrypoints/pooling/score/test_correctness_mteb.py @@ -4,7 +4,7 @@ import os import pytest -from tests.models.language.pooling_mteb_test.mteb_utils import ( +from tests.models.language.pooling_mteb_test.mteb_score_utils import ( MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL, @@ -15,11 +15,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import ( from tests.utils import RemoteOpenAIServer from vllm.platforms import current_platform -if 
current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2" @@ -30,6 +25,10 @@ st_main_score = 0.33457 def server(): args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"] + # ROCm: Use Flex Attention to support encoder-only self-attention. + if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/pooling/score/test_offline.py b/tests/entrypoints/pooling/score/test_offline.py index ce36d61cb8476..c02c02cf234a6 100644 --- a/tests/entrypoints/pooling/score/test_offline.py +++ b/tests/entrypoints/pooling/score/test_offline.py @@ -11,16 +11,17 @@ from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls" @pytest.fixture(scope="module") def llm(): + # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend + # that supports encoder-only models on ROCm. 
+ attention_config = None + if current_platform.is_rocm(): + attention_config = {"backend": "FLEX_ATTENTION"} + # pytest caches the fixture so we use weakref.proxy to # enable garbage collection llm = LLM( @@ -30,6 +31,7 @@ def llm(): gpu_memory_utilization=0.75, enforce_eager=True, seed=0, + attention_config=attention_config, ) yield weakref.proxy(llm) diff --git a/tests/entrypoints/pooling/score/test_online_rerank.py b/tests/entrypoints/pooling/score/test_online_rerank.py index f262dd4cb06b6..7f2af611d2e43 100644 --- a/tests/entrypoints/pooling/score/test_online_rerank.py +++ b/tests/entrypoints/pooling/score/test_online_rerank.py @@ -11,11 +11,6 @@ from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse from vllm.entrypoints.pooling.score.protocol import RerankResponse from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - MODEL_NAME = "BAAI/bge-reranker-base" DTYPE = "bfloat16" @@ -24,6 +19,10 @@ DTYPE = "bfloat16" def server(): args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE] + # ROCm: Use Flex Attention to support encoder-only self-attention. 
+ if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/pooling/score/test_online_score.py b/tests/entrypoints/pooling/score/test_online_score.py index 30ef55c8b6756..6c08027ee50b7 100644 --- a/tests/entrypoints/pooling/score/test_online_score.py +++ b/tests/entrypoints/pooling/score/test_online_score.py @@ -12,11 +12,6 @@ from tests.utils import RemoteOpenAIServer from vllm.entrypoints.pooling.score.protocol import ScoreResponse from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - MODELS = [ {"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True}, {"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False}, @@ -44,6 +39,10 @@ def model(request): def server(model: dict[str, Any]): args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE] + # ROCm: Use Flex Attention to support encoder-only self-attention. 
+ if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + with RemoteOpenAIServer(model["name"], args) as remote_server: yield remote_server diff --git a/tests/entrypoints/pooling/score/test_utils.py b/tests/entrypoints/pooling/score/test_utils.py new file mode 100644 index 0000000000000..356fd0ad6678f --- /dev/null +++ b/tests/entrypoints/pooling/score/test_utils.py @@ -0,0 +1,351 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from unittest.mock import patch + +import pytest + +from vllm.config import ModelConfig +from vllm.entrypoints.chat_utils import ChatTemplateResolutionError +from vllm.entrypoints.score_utils import get_score_prompt +from vllm.inputs import TokensPrompt +from vllm.tokenizers import get_tokenizer + +# A cross-encoder model for testing +CROSS_ENCODER_MODEL_ID = "cross-encoder/ms-marco-MiniLM-L-6-v2" + + +def assert_prompt_tokenization_consistent( + tokenizer, full_prompt, engine_prompt, add_special_tokens=True +): + """Verify that engine_prompt token_ids match tokenizing full_prompt.""" + expected_ids = tokenizer(full_prompt, add_special_tokens=add_special_tokens)[ + "input_ids" + ] + actual_ids = engine_prompt["prompt_token_ids"] + assert actual_ids == expected_ids, ( + f"Token IDs don't match.\nExpected: {expected_ids}\nActual: {actual_ids}" + ) + + +@pytest.fixture(scope="module") +def cross_encoder_model_config(): + return ModelConfig( + CROSS_ENCODER_MODEL_ID, + runner="pooling", + ) + + +@pytest.fixture(scope="module") +def cross_encoder_tokenizer(cross_encoder_model_config): + return get_tokenizer( + CROSS_ENCODER_MODEL_ID, + trust_remote_code=cross_encoder_model_config.trust_remote_code, + ) + + +@pytest.fixture(scope="module") +def llm_reranker_model_config(): + """Model config for LLM-as-reranker style (no pad token).""" + config = ModelConfig( + CROSS_ENCODER_MODEL_ID, + runner="pooling", + ) + # use_pad_token is a property that 
reads from hf_config, + # so we set it there to override the default (True) + config.hf_config.use_pad_token = False + return config + + +@pytest.fixture +def tokenization_kwargs(): + """Common tokenization kwargs used across tests.""" + return {"add_special_tokens": True, "return_tensors": None} + + +@pytest.fixture +def mock_model_with_score_template(): + """Mock model class that supports score template and tracks post_process calls.""" + + class MockModelWithScoreTemplate: + supports_score_template = True + post_process_called: list[TokensPrompt] = [] + + @staticmethod + def get_score_template(p1: str, p2: str) -> str: + return f"[QUERY]{p1}[SEP][DOC]{p2}" + + @staticmethod + def post_process_tokens(prompt: TokensPrompt) -> None: + MockModelWithScoreTemplate.post_process_called.append(prompt) + + return MockModelWithScoreTemplate + + +@pytest.fixture +def mock_model_no_score_template(): + """Mock model class that does not support score template.""" + + class MockModelNoScoreTemplate: + supports_score_template = False + + return MockModelNoScoreTemplate + + +class TestGetScorePrompt: + """Tests for the get_score_prompt function.""" + + def test_tokenization_kwargs_passed_through( + self, + llm_reranker_model_config, + cross_encoder_tokenizer, + ): + """Test that tokenization kwargs are properly passed through.""" + data_1 = "Query text" + data_2 = "Document text" + + # Test with truncation - custom kwargs for this test + custom_tokenization_kwargs = { + "add_special_tokens": True, + "return_tensors": None, + "truncation": True, + "max_length": 20, + } + + full_prompt, engine_prompt = get_score_prompt( + llm_reranker_model_config, + cross_encoder_tokenizer, + custom_tokenization_kwargs, + data_1, + data_2, + ) + + assert isinstance(full_prompt, str) + assert "prompt_token_ids" in engine_prompt + # With max_length=20 and truncation, should not exceed this + assert len(engine_prompt["prompt_token_ids"]) <= 20 + # Since truncation was applied, token_ids should be a 
prefix of full encoding + full_ids = cross_encoder_tokenizer(full_prompt, add_special_tokens=True)[ + "input_ids" + ] + actual_ids = engine_prompt["prompt_token_ids"] + assert full_ids[: len(actual_ids)] == actual_ids, ( + f"Token IDs are not a prefix of full encoding.\n" + f"Full IDs: {full_ids}\n" + f"Actual IDs: {actual_ids}" + ) + + def test_model_supports_score_template( + self, + cross_encoder_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + mock_model_with_score_template, + ): + """Test when model supports score template (no score_template arg).""" + with patch( + "vllm.model_executor.model_loader.get_model_cls", + return_value=mock_model_with_score_template, + ): + full_prompt, engine_prompt = get_score_prompt( + cross_encoder_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + "query text", + "document text", + ) + + assert full_prompt == "[QUERY]query text[SEP][DOC]document text" + assert "prompt_token_ids" in engine_prompt + assert len(engine_prompt["prompt_token_ids"]) > 0 + assert_prompt_tokenization_consistent( + cross_encoder_tokenizer, full_prompt, engine_prompt + ) + + def test_model_supports_score_template_but_custom_template_provided( + self, + cross_encoder_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + mock_model_with_score_template, + ): + """Test when model supports score template but custom template is provided.""" + template = ( + 'TEMPLATE_USED {{ messages[0]["content"] }} {{ messages[1]["content"] }}' + ) + with ( + patch( + "vllm.model_executor.model_loader.get_model_cls", + return_value=mock_model_with_score_template, + ), + ): + full_prompt, engine_prompt = get_score_prompt( + cross_encoder_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + "query", + "doc", + score_template=template, # Providing a template + ) + + assert "prompt_token_ids" in engine_prompt + assert full_prompt == "TEMPLATE_USED query doc" + + assert_prompt_tokenization_consistent( + 
cross_encoder_tokenizer, full_prompt, engine_prompt + ) + + def test_not_using_default_template( + self, + llm_reranker_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + mock_model_no_score_template, + ): + # FIXME: For now, we only apply a template when one is explicitly provided. + # We cannot rely on the tokenizer's chat template because many models + # inherit junk templates from their base LLM, which breaks both the models + # and the tests that use them. + with ( + patch( + "vllm.model_executor.model_loader.get_model_cls", + return_value=mock_model_no_score_template, + ), + patch( + "vllm.entrypoints.score_utils.apply_hf_chat_template", + return_value="test querytest doc", + ), + ): + full_prompt, engine_prompt = get_score_prompt( + llm_reranker_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + "test query", + "test doc", + ) + + assert full_prompt == "test querytest doc" + assert "prompt_token_ids" in engine_prompt + assert_prompt_tokenization_consistent( + cross_encoder_tokenizer, full_prompt, engine_prompt + ) + + def test_fallback_with_pad_token( + self, + cross_encoder_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + mock_model_no_score_template, + ): + """Test fallback path when ChatTemplateResolutionError + and use_pad_token=True.""" + with ( + patch( + "vllm.model_executor.model_loader.get_model_cls", + return_value=mock_model_no_score_template, + ), + patch( + "vllm.entrypoints.score_utils.apply_hf_chat_template", + side_effect=ChatTemplateResolutionError("No template"), + ), + ): + full_prompt, engine_prompt = get_score_prompt( + cross_encoder_model_config, # use_pad_token=True + cross_encoder_tokenizer, + tokenization_kwargs, + "query", + "document", + ) + + assert "prompt_token_ids" in engine_prompt + # Should have token_type_ids from text_pair encoding + assert "token_type_ids" in engine_prompt + assert "query" in full_prompt + assert "document" in full_prompt + assert full_prompt != 
"querydocument" + assert ( + engine_prompt["prompt_token_ids"] + == cross_encoder_tokenizer( + "query", text_pair="document", add_special_tokens=True + )["input_ids"] + ) + + # FIXME(?): add_special_tokens=False is needed because in this case + # full_prompt is obtained by decoding the tokenized prompt, which includes + # special tokens and we would get duplicated special tokens otherwise. + # This is inconsistent with other cases. + assert_prompt_tokenization_consistent( + cross_encoder_tokenizer, + full_prompt, + engine_prompt, + add_special_tokens=False, + ) + + def test_fallback_without_pad_token( + self, + llm_reranker_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + mock_model_no_score_template, + ): + """Test fallback path when ChatTemplateResolutionError + and use_pad_token=False.""" + with ( + patch( + "vllm.model_executor.model_loader.get_model_cls", + return_value=mock_model_no_score_template, + ), + patch( + "vllm.entrypoints.score_utils.apply_hf_chat_template", + side_effect=ChatTemplateResolutionError("No template"), + ), + ): + full_prompt, engine_prompt = get_score_prompt( + llm_reranker_model_config, # use_pad_token=False + cross_encoder_tokenizer, + tokenization_kwargs, + "query", + "document", + ) + + assert full_prompt == "querydocument" + assert "prompt_token_ids" in engine_prompt + assert_prompt_tokenization_consistent( + cross_encoder_tokenizer, full_prompt, engine_prompt + ) + + def test_post_process_tokens_called( + self, + cross_encoder_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + mock_model_with_score_template, + ): + """Test that post_process_tokens is called on the engine prompt.""" + # Reset the call tracker + mock_model_with_score_template.post_process_called.clear() + + with ( + patch( + "vllm.model_executor.model_loader.get_model_cls", + return_value=mock_model_with_score_template, + ), + patch( + "vllm.entrypoints.score_utils.apply_hf_chat_template", + 
side_effect=ChatTemplateResolutionError("No template"), + ), + ): + full_prompt, engine_prompt = get_score_prompt( + cross_encoder_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + "query", + "doc", + ) + + # post_process_tokens should have been called once + assert len(mock_model_with_score_template.post_process_called) == 1 + assert mock_model_with_score_template.post_process_called[0] is engine_prompt + assert_prompt_tokenization_consistent( + cross_encoder_tokenizer, full_prompt, engine_prompt + ) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index a87a4c35d3dc7..6df2d26f2f0da 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -25,9 +25,9 @@ from vllm.entrypoints.chat_utils import ( ) from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict from vllm.multimodal.utils import ( - encode_audio_base64, - encode_image_base64, - encode_video_base64, + encode_audio_url, + encode_image_url, + encode_video_url, ) from vllm.tokenizers import get_tokenizer from vllm.tokenizers.mistral import MistralTokenizer @@ -141,22 +141,19 @@ def mistral_model_config(): @pytest.fixture(scope="module") def image_url(): image = ImageAsset("cherry_blossom") - base64 = encode_image_base64(image.pil_image) - return f"data:image/jpeg;base64,{base64}" + return encode_image_url(image.pil_image) @pytest.fixture(scope="module") def video_url(): video = VideoAsset("baby_reading", 1) - base64 = encode_video_base64(video.np_ndarrays) - return f"data:video/jpeg;base64,{base64}" + return encode_video_url(video.np_ndarrays) @pytest.fixture(scope="module") def audio_url(): audio = AudioAsset("mary_had_lamb") - base64 = encode_audio_base64(*audio.audio_and_sample_rate) - return f"data:audio/ogg;base64,{base64}" + return encode_audio_url(*audio.audio_and_sample_rate) def _assert_mm_data_is_image_input( diff --git a/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml 
b/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml new file mode 100644 index 0000000000000..9fae32734d753 --- /dev/null +++ b/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml @@ -0,0 +1,11 @@ +model_name: "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8" +accuracy_threshold: 0.85 +num_questions: 1319 +num_fewshot: 5 +server_args: >- + --max-model-len 4096 + --tensor-parallel-size 2 + --enable-expert-parallel + --async-scheduling +env: + VLLM_USE_FLASHINFER_MOE_FP8: "1" diff --git a/tests/evals/gsm8k/configs/models-blackwell.txt b/tests/evals/gsm8k/configs/models-blackwell.txt index 39978aa6ffbe9..c27031d25fb8c 100644 --- a/tests/evals/gsm8k/configs/models-blackwell.txt +++ b/tests/evals/gsm8k/configs/models-blackwell.txt @@ -4,3 +4,4 @@ Qwen1.5-MoE-W4A16-CT.yaml DeepSeek-V2-Lite-Instruct-FP8.yaml Qwen3-30B-A3B-NVFP4.yaml Qwen3-Next-80B-A3B-NVFP4-EP2.yaml +Qwen3-Next-FP8-EP2.yaml diff --git a/tests/evals/gsm8k/test_gsm8k_correctness.py b/tests/evals/gsm8k/test_gsm8k_correctness.py index ea6715f5cb532..dd0d3ae0cca47 100644 --- a/tests/evals/gsm8k/test_gsm8k_correctness.py +++ b/tests/evals/gsm8k/test_gsm8k_correctness.py @@ -71,6 +71,7 @@ def test_gsm8k_correctness(config_filename): print(f"Number of questions: {eval_config['num_questions']}") print(f"Number of few-shot examples: {eval_config['num_fewshot']}") print(f"Server args: {' '.join(server_args)}") + print(f"Environment variables: {env_dict}") # Launch server and run evaluation with RemoteOpenAIServer( diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py index acf46d75d62eb..3f76033254d32 100644 --- a/tests/kernels/attention/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -40,93 +40,6 @@ KV_CACHE_DTYPE = ["auto", "fp8"] RESHAPE_FLASH_IMPLEMENTATIONS = ["cuda", "triton"] -@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) -@pytest.mark.parametrize("num_layers", NUM_LAYERS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", 
HEAD_SIZES) -@pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) -@torch.inference_mode() -def test_copy_blocks( - kv_cache_factory, - num_mappings: int, - num_layers: int, - num_heads: int, - head_size: int, - block_size: int, - num_blocks: int, - dtype: torch.dtype, - seed: int, - kv_cache_dtype: str, - device: str, -) -> None: - if kv_cache_dtype == "fp8" and head_size % 16: - pytest.skip() - current_platform.seed_everything(seed) - torch.set_default_device(device) - torch.cuda.set_device(device) - # Generate random block mappings where each source block is mapped to two - # destination blocks. - assert 2 * num_mappings <= num_blocks - src_blocks = random.sample(range(num_blocks), num_mappings) - remaining_blocks = list(set(range(num_blocks)) - set(src_blocks)) - dst_blocks = random.sample(remaining_blocks, 2 * num_mappings) - block_mapping: list[tuple[int, int]] = [] - for i in range(num_mappings): - src = src_blocks[i] - dst1 = dst_blocks[2 * i] - dst2 = dst_blocks[2 * i + 1] - block_mapping.append((src, dst1)) - block_mapping.append((src, dst2)) - - # Create the KV caches. - key_caches, value_caches = kv_cache_factory( - num_blocks, - block_size, - num_layers, - num_heads, - head_size, - kv_cache_dtype, - dtype, - seed, - device, - ) - - # Clone the KV caches. - cloned_key_caches = [key_cache.clone() for key_cache in key_caches] - cloned_value_caches = [value_cache.clone() for value_cache in value_caches] - - # Call the copy blocks kernel. 
- block_mapping_tensor = torch.tensor( - block_mapping, dtype=torch.int64, device=device - ).view(-1, 2) - - opcheck( - torch.ops._C_cache_ops.copy_blocks, - (key_caches, value_caches, block_mapping_tensor), - test_utils=DEFAULT_OPCHECK_TEST_UTILS, - cond=(head_size == HEAD_SIZES[0]), - ) - ops.copy_blocks(key_caches, value_caches, block_mapping_tensor) - - # Run the reference implementation. - for src, dst in block_mapping: - for cloned_key_cache in cloned_key_caches: - cloned_key_cache[dst].copy_(cloned_key_cache[src]) - for cloned_value_cache in cloned_value_caches: - cloned_value_cache[dst].copy_(cloned_value_cache[src]) - - # Compare the results. - for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches): - torch.testing.assert_close(key_cache, cloned_key_cache) - for value_cache, cloned_value_cache in zip(value_caches, cloned_value_caches): - torch.testing.assert_close(value_cache, cloned_value_cache) - - @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -763,73 +676,6 @@ def test_concat_and_cache_ds_mla( torch.testing.assert_close(kv_rope, ref_rope, atol=0.001, rtol=0.1) -@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS) -@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS) -@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA) -@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA) -@pytest.mark.parametrize("num_layers", NUM_LAYERS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) -@torch.inference_mode() -def test_copy_blocks_mla( - kv_lora_rank: int, - qk_rope_head_dim: int, - block_size: int, - num_blocks: int, - num_layers: int, - dtype: torch.dtype, - seed: int, - device: str, - kv_cache_dtype: str, -) -> None: - current_platform.seed_everything(seed) - 
torch.set_default_device(device) - torch.cuda.set_device(device) - - entry_size = kv_lora_rank + qk_rope_head_dim - - kv_caches = [] - for _ in range(num_layers): - kv_cache = _create_mla_cache( - num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device - ) - _fill_mla_cache(kv_cache, kv_cache_dtype=kv_cache_dtype) - kv_caches.append(kv_cache) - - ref_caches = [kv_cache.clone() for kv_cache in kv_caches] - - num_mappings = min(2, num_blocks // 2) - src_blocks = random.sample(range(num_blocks), num_mappings) - remaining = list(set(range(num_blocks)) - set(src_blocks)) - dst_blocks = random.sample(remaining, 2 * num_mappings) - block_mapping = [] - for i in range(num_mappings): - src = src_blocks[i] - dst1 = dst_blocks[2 * i] - dst2 = dst_blocks[2 * i + 1] - block_mapping.append((src, dst1)) - block_mapping.append((src, dst2)) - block_mapping_tensor = torch.tensor( - block_mapping, dtype=torch.int64, device=device - ).view(-1, 2) - - for src, dst in block_mapping: - for ref_cache in ref_caches: - ref_cache[dst].copy_(ref_cache[src]) - - opcheck( - torch.ops._C_cache_ops.copy_blocks_mla, - (kv_caches, block_mapping_tensor), - test_utils=DEFAULT_OPCHECK_TEST_UTILS, - ) - ops.copy_blocks_mla(kv_caches, block_mapping_tensor) - - for kv_cache, ref_cache in zip(kv_caches, ref_caches): - torch.testing.assert_close(kv_cache, ref_cache) - - @pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS) @pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS) @pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA) diff --git a/tests/models/language/pooling_mteb_test/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_embed_utils.py similarity index 50% rename from tests/models/language/pooling_mteb_test/mteb_utils.py rename to tests/models/language/pooling_mteb_test/mteb_embed_utils.py index 189cdbae99dcd..a0b469f930644 100644 --- a/tests/models/language/pooling_mteb_test/mteb_utils.py +++ b/tests/models/language/pooling_mteb_test/mteb_embed_utils.py @@ -1,11 
+1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import tempfile - import mteb import numpy as np -import requests import torch from mteb.models import ModelMeta from mteb.types import Array @@ -14,7 +11,6 @@ from torch.utils.data import DataLoader import tests.ci_envs as ci_envs from tests.models.utils import ( EmbedModelInfo, - RerankModelInfo, check_embeddings_close, get_vllm_extra_kwargs, ) @@ -27,10 +23,6 @@ from tests.models.utils import ( MTEB_EMBED_TASKS = ["STS12"] MTEB_EMBED_TOL = 1e-4 -# See #19344 -MTEB_RERANK_TASKS = ["NFCorpus"] -MTEB_RERANK_LANGS = ["eng"] -MTEB_RERANK_TOL = 2e-3 _empty_model_meta = ModelMeta( loader=None, @@ -54,29 +46,9 @@ _empty_model_meta = ModelMeta( ) -class VllmMtebEncoder(mteb.EncoderProtocol): +class MtebEmbedMixin(mteb.EncoderProtocol): mteb_model_meta = _empty_model_meta - def __init__(self, vllm_model): - self.llm = vllm_model - self.rng = np.random.default_rng(seed=42) - - def encode( - self, - inputs: DataLoader[mteb.types.BatchedInput], - *args, - **kwargs, - ) -> np.ndarray: - # Hoping to discover potential scheduling - # issues by randomizing the order. 
- sentences = [text for batch in inputs for text in batch["text"]] - r = self.rng.permutation(len(sentences)) - sentences = [sentences[i] for i in r] - outputs = self.llm.embed(sentences, use_tqdm=False) - embeds = np.array(outputs) - embeds = embeds[np.argsort(r)] - return embeds - def similarity( self, embeddings1: np.ndarray, @@ -102,31 +74,29 @@ class VllmMtebEncoder(mteb.EncoderProtocol): return sim -class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol): - mteb_model_meta = _empty_model_meta - +class VllmMtebEncoder(MtebEmbedMixin): def __init__(self, vllm_model): self.llm = vllm_model self.rng = np.random.default_rng(seed=42) - def predict( + def encode( self, - inputs1: DataLoader[mteb.types.BatchedInput], - inputs2: DataLoader[mteb.types.BatchedInput], + inputs: DataLoader[mteb.types.BatchedInput], *args, **kwargs, ) -> np.ndarray: - queries = [text for batch in inputs1 for text in batch["text"]] - corpus = [text for batch in inputs2 for text in batch["text"]] - - outputs = self.llm.score( - queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False - ) - scores = np.array(outputs) - return scores + # Hoping to discover potential scheduling + # issues by randomizing the order. 
+ sentences = [text for batch in inputs for text in batch["text"]] + r = self.rng.permutation(len(sentences)) + sentences = [sentences[i] for i in r] + outputs = self.llm.embed(sentences, use_tqdm=False) + embeds = np.array(outputs) + embeds = embeds[np.argsort(r)] + return embeds -class OpenAIClientMtebEncoder(VllmMtebEncoder): +class OpenAIClientMtebEncoder(MtebEmbedMixin): def __init__(self, model_name: str, client): self.model_name = model_name self.client = client @@ -153,58 +123,6 @@ class OpenAIClientMtebEncoder(VllmMtebEncoder): return embeds -class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol): - mteb_model_meta = _empty_model_meta - - def __init__(self, model_name: str, url): - self.model_name = model_name - self.url = url - self.rng = np.random.default_rng(seed=42) - - def predict( - self, - inputs1: DataLoader[mteb.types.BatchedInput], - inputs2: DataLoader[mteb.types.BatchedInput], - *args, - **kwargs, - ) -> np.ndarray: - queries = [text for batch in inputs1 for text in batch["text"]] - full_corpus = [text for batch in inputs2 for text in batch["text"]] - - outputs = [] - for query, corpus in zip(queries, full_corpus): - outputs.append(self.get_score(query, corpus)) - - scores = np.array(outputs) - return scores - - def get_score(self, query, corpus): - response = requests.post( - self.url, - json={ - "model": self.model_name, - "text_1": query, - "text_2": corpus, - "truncate_prompt_tokens": -1, - }, - ).json() - return response["data"][0]["score"] - - -class RerankClientMtebEncoder(ScoreClientMtebEncoder): - def get_score(self, query, corpus): - response = requests.post( - self.url, - json={ - "model": self.model_name, - "query": query, - "documents": [corpus], - "truncate_prompt_tokens": -1, - }, - ).json() - return response["results"][0]["relevance_score"] - - def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks): tasks = mteb.get_tasks(tasks=tasks) results = mteb.evaluate( @@ -243,12 +161,21 @@ def mteb_test_embed_models( if 
model_info.architecture: assert model_info.architecture in model_config.architectures - # Confirm whether vllm uses the correct default_pooling_type, which - # relates to whether chunked prefill and prefix caching are enabled - assert ( - model_config._model_info.default_pooling_type - == model_info.default_pooling_type - ) + # Confirm whether the important configs in model_config are correct. + if model_info.pooling_type is not None: + assert model_config.pooler_config.pooling_type == model_info.pooling_type + if model_info.attn_type is not None: + assert model_config.attn_type == model_info.attn_type + if model_info.is_prefix_caching_supported is not None: + assert ( + model_config.is_prefix_caching_supported + == model_info.is_prefix_caching_supported + ) + if model_info.is_chunked_prefill_supported is not None: + assert ( + model_config.is_chunked_prefill_supported + == model_info.is_chunked_prefill_supported + ) vllm_main_score = run_mteb_embed_task( VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS @@ -299,117 +226,3 @@ def mteb_test_embed_models( # We are not concerned that the vllm mteb results are better # than SentenceTransformers, so we only perform one-sided testing. 
assert st_main_score - vllm_main_score < atol - - -def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages): - with tempfile.TemporaryDirectory() as prediction_folder: - bm25s = mteb.get_model("bm25s") - eval_splits = ["test"] - - mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks( - tasks=tasks, languages=languages, eval_splits=eval_splits - ) - - mteb.evaluate( - bm25s, - mteb_tasks, - prediction_folder=prediction_folder, - show_progress_bar=False, - # don't save results for test runs - cache=None, - overwrite_strategy="always", - ) - - second_stage_tasks = [] - for task in mteb_tasks: - second_stage_tasks.append( - task.convert_to_reranking( - prediction_folder, - top_k=10, - ) - ) - - results = mteb.evaluate( - cross_encoder, - second_stage_tasks, - show_progress_bar=False, - cache=None, - ) - main_score = results[0].scores["test"][0]["main_score"] - return main_score - - -def mteb_test_rerank_models_hf( - hf_runner, model_name, hf_dtype="float32", hf_model_callback=None -): - with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model: - if hf_model_callback is not None: - hf_model_callback(hf_model) - - st_main_score = run_mteb_rerank( - hf_model, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS - ) - st_dtype = next(hf_model.model.model.parameters()).dtype - return st_main_score, st_dtype - - -def mteb_test_rerank_models( - hf_runner, - vllm_runner, - model_info: RerankModelInfo, - vllm_extra_kwargs=None, - hf_model_callback=None, - vllm_mteb_encoder=VllmMtebCrossEncoder, - atol=MTEB_RERANK_TOL, -): - vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs) - - with vllm_runner( - model_info.name, - runner="pooling", - max_model_len=None, - max_num_seqs=8, - **vllm_extra_kwargs, - ) as vllm_model: - model_config = vllm_model.llm.llm_engine.model_config - - # Confirm whether vllm is using the correct architecture - if model_info.architecture: - assert model_info.architecture in 
model_config.architectures - - # Score API is only enabled for num_labels == 1 - assert model_config.hf_config.num_labels == 1 - - # Confirm whether vllm uses the correct default_pooling_type, which - # relates to whether chunked prefill and prefix caching are enabled - assert ( - model_config._model_info.default_pooling_type - == model_info.default_pooling_type - ) - - vllm_main_score = run_mteb_rerank( - vllm_mteb_encoder(vllm_model), - tasks=MTEB_RERANK_TASKS, - languages=MTEB_RERANK_LANGS, - ) - vllm_dtype = model_config.dtype - head_dtype = model_config.head_dtype - - # Accelerate mteb test by setting - # SentenceTransformers mteb score to a constant - if model_info.mteb_score is None: - st_main_score, st_dtype = mteb_test_rerank_models_hf( - hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback - ) - else: - st_main_score = model_info.mteb_score - st_dtype = "Constant" - - print("Model:", model_info.name) - print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score) - print("SentenceTransformers:", st_dtype, st_main_score) - print("Difference:", st_main_score - vllm_main_score) - - # We are not concerned that the vllm mteb results are better - # than SentenceTransformers, so we only perform one-sided testing. 
- assert st_main_score - vllm_main_score < atol diff --git a/tests/models/language/pooling_mteb_test/mteb_score_utils.py b/tests/models/language/pooling_mteb_test/mteb_score_utils.py new file mode 100644 index 0000000000000..6c13502317736 --- /dev/null +++ b/tests/models/language/pooling_mteb_test/mteb_score_utils.py @@ -0,0 +1,259 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import tempfile +from pathlib import Path + +import mteb +import numpy as np +import requests +from mteb.models import ModelMeta +from torch.utils.data import DataLoader + +from tests.models.utils import ( + RerankModelInfo, + get_vllm_extra_kwargs, +) + +# See #19344 +MTEB_RERANK_TASKS = ["NFCorpus"] +MTEB_RERANK_LANGS = ["eng"] +MTEB_RERANK_TOL = 2e-3 + +template_home = ( + Path(__file__).parent.parent.parent.parent.parent + / "examples/pooling/score/template" +) + +_empty_model_meta = ModelMeta( + loader=None, + name="vllm/model", + revision="1", + release_date=None, + languages=None, + framework=[], + similarity_fn_name=None, + n_parameters=None, + memory_usage_mb=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=None, + public_training_code=None, + public_training_data=None, + use_instructions=None, + training_datasets=None, + modalities=["text"], # 'image' can be added to evaluate multimodal models +) + + +class MtebCrossEncoderMixin(mteb.CrossEncoderProtocol): + mteb_model_meta = _empty_model_meta + + +class VllmMtebCrossEncoder(MtebCrossEncoderMixin): + def __init__(self, vllm_model): + self.llm = vllm_model + self.rng = np.random.default_rng(seed=42) + self.chat_template: str | None = getattr(vllm_model, "chat_template", None) + + def predict( + self, + inputs1: DataLoader[mteb.types.BatchedInput], + inputs2: DataLoader[mteb.types.BatchedInput], + *args, + **kwargs, + ) -> np.ndarray: + queries = [text for batch in inputs1 for text in batch["text"]] + corpus = [text for batch in inputs2 for 
text in batch["text"]] + + outputs = self.llm.score( + queries, + corpus, + truncate_prompt_tokens=-1, + use_tqdm=False, + chat_template=self.chat_template, + ) + scores = np.array(outputs) + return scores + + +class ScoreClientMtebEncoder(MtebCrossEncoderMixin): + mteb_model_meta = _empty_model_meta + + def __init__(self, model_name: str, url): + self.model_name = model_name + self.url = url + self.rng = np.random.default_rng(seed=42) + + def predict( + self, + inputs1: DataLoader[mteb.types.BatchedInput], + inputs2: DataLoader[mteb.types.BatchedInput], + *args, + **kwargs, + ) -> np.ndarray: + queries = [text for batch in inputs1 for text in batch["text"]] + full_corpus = [text for batch in inputs2 for text in batch["text"]] + + outputs = [] + for query, corpus in zip(queries, full_corpus): + outputs.append(self.get_score(query, corpus)) + + scores = np.array(outputs) + return scores + + def get_score(self, query, corpus): + response = requests.post( + self.url, + json={ + "model": self.model_name, + "text_1": query, + "text_2": corpus, + "truncate_prompt_tokens": -1, + }, + ).json() + return response["data"][0]["score"] + + +class RerankClientMtebEncoder(ScoreClientMtebEncoder): + def get_score(self, query, corpus): + response = requests.post( + self.url, + json={ + "model": self.model_name, + "query": query, + "documents": [corpus], + "truncate_prompt_tokens": -1, + }, + ).json() + return response["results"][0]["relevance_score"] + + +def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages): + with tempfile.TemporaryDirectory() as prediction_folder: + bm25s = mteb.get_model("bm25s") + eval_splits = ["test"] + + mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks( + tasks=tasks, languages=languages, eval_splits=eval_splits + ) + + mteb.evaluate( + bm25s, + mteb_tasks, + prediction_folder=prediction_folder, + show_progress_bar=False, + # don't save results for test runs + cache=None, + overwrite_strategy="always", + ) + + 
second_stage_tasks = [] + for task in mteb_tasks: + second_stage_tasks.append( + task.convert_to_reranking( + prediction_folder, + top_k=10, + ) + ) + + results = mteb.evaluate( + cross_encoder, + second_stage_tasks, + show_progress_bar=False, + cache=None, + ) + main_score = results[0].scores["test"][0]["main_score"] + return main_score + + +def mteb_test_rerank_models_hf( + hf_runner, model_name, hf_dtype="float32", hf_model_callback=None +): + with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model: + if hf_model_callback is not None: + hf_model_callback(hf_model) + + st_main_score = run_mteb_rerank( + hf_model, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS + ) + st_dtype = next(hf_model.model.model.parameters()).dtype + return st_main_score, st_dtype + + +def mteb_test_rerank_models( + hf_runner, + vllm_runner, + model_info: RerankModelInfo, + vllm_extra_kwargs=None, + hf_model_callback=None, + vllm_mteb_encoder=VllmMtebCrossEncoder, + atol=MTEB_RERANK_TOL, +): + vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs) + + with vllm_runner( + model_info.name, + runner="pooling", + max_model_len=None, + max_num_seqs=8, + **vllm_extra_kwargs, + ) as vllm_model: + model_config = vllm_model.llm.llm_engine.model_config + + # Confirm whether vllm is using the correct architecture + if model_info.architecture: + assert model_info.architecture in model_config.architectures + + # Score API is only enabled for num_labels == 1 + assert model_config.hf_config.num_labels == 1 + + # Maybe load chat_template. + chat_template: str | None = None + if model_info.chat_template_name is not None: + chat_template = (template_home / model_info.chat_template_name).read_text() + vllm_model.chat_template = chat_template + + # Confirm whether the important configs in model_config are correct. 
+ if model_info.pooling_type is not None: + assert model_config.pooler_config.pooling_type == model_info.pooling_type + if model_info.attn_type is not None: + assert model_config.attn_type == model_info.attn_type + if model_info.is_prefix_caching_supported is not None: + assert ( + model_config.is_prefix_caching_supported + == model_info.is_prefix_caching_supported + ) + if model_info.is_chunked_prefill_supported is not None: + assert ( + model_config.is_chunked_prefill_supported + == model_info.is_chunked_prefill_supported + ) + + vllm_main_score = run_mteb_rerank( + vllm_mteb_encoder(vllm_model), + tasks=MTEB_RERANK_TASKS, + languages=MTEB_RERANK_LANGS, + ) + vllm_dtype = model_config.dtype + head_dtype = model_config.head_dtype + + # Accelerate mteb test by setting + # SentenceTransformers mteb score to a constant + if model_info.mteb_score is None: + st_main_score, st_dtype = mteb_test_rerank_models_hf( + hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback + ) + else: + st_main_score = model_info.mteb_score + st_dtype = "Constant" + + print("Model:", model_info.name) + print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score) + print("SentenceTransformers:", st_dtype, st_main_score) + print("Difference:", st_main_score - vllm_main_score) + + # We are not concerned that the vllm mteb results are better + # than SentenceTransformers, so we only perform one-sided testing. 
+ assert st_main_score - vllm_main_score < atol diff --git a/tests/models/language/pooling_mteb_test/test_baai.py b/tests/models/language/pooling_mteb_test/test_baai.py index bad13e2457146..2e55622a5d48c 100644 --- a/tests/models/language/pooling_mteb_test/test_baai.py +++ b/tests/models/language/pooling_mteb_test/test_baai.py @@ -4,90 +4,94 @@ import pytest from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.utils import ( - CLSPoolingEmbedModelInfo, - CLSPoolingRerankModelInfo, EmbedModelInfo, - LASTPoolingEmbedModelInfo, RerankModelInfo, ) -from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models +from .mteb_embed_utils import mteb_test_embed_models +from .mteb_score_utils import mteb_test_rerank_models MODELS = [ ########## BertModel - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-base-en", architecture="BertModel", mteb_score=0.779336792, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( - "BAAI/bge-base-zh", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( - "BAAI/bge-small-en", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( - "BAAI/bge-small-zh", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( - "BAAI/bge-large-en", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( - "BAAI/bge-large-zh", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo("BAAI/bge-base-zh", architecture="BertModel", enable_test=False), + EmbedModelInfo("BAAI/bge-small-en", architecture="BertModel", enable_test=False), + EmbedModelInfo("BAAI/bge-small-zh", architecture="BertModel", enable_test=False), + EmbedModelInfo("BAAI/bge-large-en", architecture="BertModel", enable_test=False), + EmbedModelInfo("BAAI/bge-large-zh", 
architecture="BertModel", enable_test=False), + EmbedModelInfo( "BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False ), ########## XLMRobertaModel - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-m3", architecture="XLMRobertaModel", mteb_score=0.787343078, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), ########## Qwen2Model - LASTPoolingEmbedModelInfo( + EmbedModelInfo( "BAAI/bge-code-v1", architecture="Qwen2Model", mteb_score=0.75724465, dtype="float32", + pooling_type="LAST", + attn_type="decoder", + is_prefix_caching_supported=True, + is_chunked_prefill_supported=True, enable_test=True, ), ] RERANK_MODELS = [ ########## XLMRobertaForSequenceClassification - CLSPoolingRerankModelInfo( + RerankModelInfo( "BAAI/bge-reranker-base", architecture="XLMRobertaForSequenceClassification", mteb_score=0.32398, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingRerankModelInfo( + RerankModelInfo( "BAAI/bge-reranker-large", architecture="XLMRobertaForSequenceClassification", enable_test=False, ), - CLSPoolingRerankModelInfo( + RerankModelInfo( 
"BAAI/bge-reranker-v2-m3", architecture="XLMRobertaForSequenceClassification", enable_test=False, diff --git a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py index 6b2e469644926..00f2d33546efc 100644 --- a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py +++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py @@ -9,14 +9,12 @@ import torch from torch.utils.data import DataLoader from tests.conftest import HfRunner -from tests.models.language.pooling_mteb_test.mteb_utils import ( - VllmMtebCrossEncoder, - mteb_test_rerank_models, -) -from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo +from tests.models.utils import RerankModelInfo + +from .mteb_score_utils import VllmMtebCrossEncoder, mteb_test_rerank_models RERANK_MODELS = [ - LASTPoolingRerankModelInfo( + RerankModelInfo( "BAAI/bge-reranker-v2-gemma", architecture="GemmaForSequenceClassification", mteb_score=0.33757, @@ -25,6 +23,10 @@ RERANK_MODELS = [ "classifier_from_token": ["Yes"], "method": "no_post_processing", }, + pooling_type="LAST", + attn_type="decoder", + is_prefix_caching_supported=True, + is_chunked_prefill_supported=True, ), ] diff --git a/tests/models/language/pooling_mteb_test/test_cross_encoder.py b/tests/models/language/pooling_mteb_test/test_cross_encoder.py index 638ffc7a62b0e..8bca49bb5b023 100644 --- a/tests/models/language/pooling_mteb_test/test_cross_encoder.py +++ b/tests/models/language/pooling_mteb_test/test_cross_encoder.py @@ -3,23 +3,29 @@ import pytest from tests.models.utils import ( - CLSPoolingRerankModelInfo, - LASTPoolingRerankModelInfo, RerankModelInfo, ) -from .mteb_utils import mteb_test_rerank_models +from .mteb_score_utils import mteb_test_rerank_models RERANK_MODELS = [ - CLSPoolingRerankModelInfo( + RerankModelInfo( "cross-encoder/ms-marco-TinyBERT-L-2-v2", mteb_score=0.32898, 
architecture="BertForSequenceClassification", + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, ), - LASTPoolingRerankModelInfo( + RerankModelInfo( "tomaarsen/Qwen3-Reranker-0.6B-seq-cls", mteb_score=0.25736, architecture="Qwen3ForSequenceClassification", + pooling_type="LAST", + attn_type="decoder", + is_prefix_caching_supported=True, + is_chunked_prefill_supported=True, ), ] diff --git a/tests/models/language/pooling_mteb_test/test_gte.py b/tests/models/language/pooling_mteb_test/test_gte.py index a22821fd65b5a..3d1d5aa84091e 100644 --- a/tests/models/language/pooling_mteb_test/test_gte.py +++ b/tests/models/language/pooling_mteb_test/test_gte.py @@ -5,36 +5,32 @@ import pytest from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.utils import ( - CLSPoolingEmbedModelInfo, - CLSPoolingRerankModelInfo, EmbedModelInfo, - LASTPoolingEmbedModelInfo, RerankModelInfo, ) -from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models +from .mteb_embed_utils import mteb_test_embed_models +from .mteb_score_utils import mteb_test_rerank_models MODELS = [ ########## BertModel - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "thenlper/gte-large", mteb_score=0.76807651, architecture="BertModel", + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( - "thenlper/gte-base", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( - "thenlper/gte-small", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo("thenlper/gte-base", architecture="BertModel", enable_test=False), + EmbedModelInfo("thenlper/gte-small", architecture="BertModel", enable_test=False), + EmbedModelInfo( "thenlper/gte-large-zh", architecture="BertModel", enable_test=False ), - CLSPoolingEmbedModelInfo( - 
"thenlper/gte-base-zh", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo("thenlper/gte-base-zh", architecture="BertModel", enable_test=False), + EmbedModelInfo( "thenlper/gte-small-zh", architecture="BertModel", enable_test=False ), ########### NewModel @@ -43,48 +39,64 @@ MODELS = [ # - whether to use token_type_embeddings # - whether to use context expansion # So only test one (the most widely used) model - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Alibaba-NLP/gte-multilingual-base", architecture="GteNewModel", mteb_score=0.775074696, hf_overrides={"architectures": ["GteNewModel"]}, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Alibaba-NLP/gte-base-en-v1.5", architecture="GteNewModel", hf_overrides={"architectures": ["GteNewModel"]}, enable_test=False, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Alibaba-NLP/gte-large-en-v1.5", architecture="GteNewModel", hf_overrides={"architectures": ["GteNewModel"]}, enable_test=False, ), ########### Qwen2ForCausalLM - LASTPoolingEmbedModelInfo( + EmbedModelInfo( "Alibaba-NLP/gte-Qwen2-1.5B-instruct", mteb_score=0.758473459018872, architecture="Qwen2ForCausalLM", + pooling_type="LAST", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), ########## ModernBertModel - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Alibaba-NLP/gte-modernbert-base", mteb_score=0.748193353, architecture="ModernBertModel", + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), ########## Qwen3ForCausalLM - LASTPoolingEmbedModelInfo( + EmbedModelInfo( "Qwen/Qwen3-Embedding-0.6B", mteb_score=0.771163695, architecture="Qwen3ForCausalLM", dtype="float32", + pooling_type="LAST", + attn_type="decoder", 
+ is_prefix_caching_supported=True, + is_chunked_prefill_supported=True, enable_test=True, ), - LASTPoolingEmbedModelInfo( + EmbedModelInfo( "Qwen/Qwen3-Embedding-4B", architecture="Qwen3ForCausalLM", dtype="float32", @@ -93,18 +105,26 @@ MODELS = [ ] RERANK_MODELS = [ - CLSPoolingRerankModelInfo( + RerankModelInfo( # classifier_pooling: mean "Alibaba-NLP/gte-reranker-modernbert-base", mteb_score=0.33386, architecture="ModernBertForSequenceClassification", + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingRerankModelInfo( + RerankModelInfo( "Alibaba-NLP/gte-multilingual-reranker-base", mteb_score=0.33062, architecture="GteNewForSequenceClassification", hf_overrides={"architectures": ["GteNewForSequenceClassification"]}, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), ] diff --git a/tests/models/language/pooling_mteb_test/test_intfloat.py b/tests/models/language/pooling_mteb_test/test_intfloat.py index 1d078db69236a..377ab600aa443 100644 --- a/tests/models/language/pooling_mteb_test/test_intfloat.py +++ b/tests/models/language/pooling_mteb_test/test_intfloat.py @@ -3,40 +3,44 @@ import pytest from tests.models.language.pooling.embed_utils import correctness_test_embed_models -from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo +from tests.models.utils import EmbedModelInfo -from .mteb_utils import mteb_test_embed_models +from .mteb_embed_utils import mteb_test_embed_models MODELS = [ ########## BertModel - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "intfloat/e5-small", architecture="BertModel", mteb_score=0.742285423, + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( - "intfloat/e5-base", architecture="BertModel", 
enable_test=False - ), - CLSPoolingEmbedModelInfo( - "intfloat/e5-large", architecture="BertModel", enable_test=False - ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo("intfloat/e5-base", architecture="BertModel", enable_test=False), + EmbedModelInfo("intfloat/e5-large", architecture="BertModel", enable_test=False), + EmbedModelInfo( "intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False ), ########## XLMRobertaModel - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "intfloat/multilingual-e5-base", architecture="XLMRobertaModel", mteb_score=0.779325955, + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "intfloat/multilingual-e5-large", architecture="XLMRobertaModel", enable_test=False, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "intfloat/multilingual-e5-large-instruct", architecture="XLMRobertaModel", enable_test=False, diff --git a/tests/models/language/pooling_mteb_test/test_jina.py b/tests/models/language/pooling_mteb_test/test_jina.py index c2065bcd6eb4c..b98ac91b97573 100644 --- a/tests/models/language/pooling_mteb_test/test_jina.py +++ b/tests/models/language/pooling_mteb_test/test_jina.py @@ -10,30 +10,37 @@ from tests.models.language.pooling.embed_utils import ( matryoshka_fy, ) from tests.models.utils import ( - CLSPoolingEmbedModelInfo, - CLSPoolingRerankModelInfo, EmbedModelInfo, RerankModelInfo, ) from vllm import PoolingParams -from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models +from .mteb_embed_utils import mteb_test_embed_models +from .mteb_score_utils import mteb_test_rerank_models EMBEDDING_MODELS = [ - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "jinaai/jina-embeddings-v3", mteb_score=0.824413164, architecture="XLMRobertaModel", is_matryoshka=True, + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + 
is_chunked_prefill_supported=False, dtype="float32", ) ] RERANK_MODELS = [ - CLSPoolingRerankModelInfo( + RerankModelInfo( "jinaai/jina-reranker-v2-base-multilingual", mteb_score=0.33643, architecture="XLMRobertaForSequenceClassification", + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, ) ] diff --git a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py index a6f2a89b268f1..50dc6a0bd0ad1 100644 --- a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py +++ b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py @@ -6,9 +6,9 @@ import pytest import torch from tests.conftest import HfRunner -from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo +from tests.models.utils import RerankModelInfo -from .mteb_utils import mteb_test_rerank_models +from .mteb_score_utils import mteb_test_rerank_models mxbai_rerank_hf_overrides = { "architectures": ["Qwen2ForSequenceClassification"], @@ -17,14 +17,18 @@ mxbai_rerank_hf_overrides = { } RERANK_MODELS = [ - LASTPoolingRerankModelInfo( + RerankModelInfo( "mixedbread-ai/mxbai-rerank-base-v2", architecture="Qwen2ForSequenceClassification", hf_overrides=mxbai_rerank_hf_overrides, mteb_score=0.273, + pooling_type="LAST", + attn_type="decoder", + is_prefix_caching_supported=True, + is_chunked_prefill_supported=True, enable_test=True, ), - LASTPoolingRerankModelInfo( + RerankModelInfo( "mixedbread-ai/mxbai-rerank-large-v2", architecture="Qwen2ForSequenceClassification", hf_overrides=mxbai_rerank_hf_overrides, diff --git a/tests/models/language/pooling_mteb_test/test_nemotron.py b/tests/models/language/pooling_mteb_test/test_nemotron.py new file mode 100644 index 0000000000000..c91616c9ec01e --- /dev/null +++ b/tests/models/language/pooling_mteb_test/test_nemotron.py @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project + +import pytest + +from tests.models.language.pooling_mteb_test.mteb_embed_utils import ( + mteb_test_embed_models, +) +from tests.models.language.pooling_mteb_test.mteb_score_utils import ( + mteb_test_rerank_models, +) +from tests.models.utils import ( + EmbedModelInfo, + RerankModelInfo, +) + +EMBEDDING_MODELS = [ + EmbedModelInfo( + "nvidia/llama-nemotron-embed-1b-v2", + architecture="LlamaBidirectionalModel", + mteb_score=0.689164662128673, + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, + ) +] + +RERANK_MODELS = [ + RerankModelInfo( + "nvidia/llama-nemotron-rerank-1b-v2", + architecture="LlamaBidirectionalForSequenceClassification", + chat_template_name="nemotron-rerank.jinja", + mteb_score=0.33994, + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, + ), +] + + +@pytest.mark.parametrize("model_info", EMBEDDING_MODELS) +def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: + mteb_test_embed_models(hf_runner, vllm_runner, model_info) + + +@pytest.mark.parametrize("model_info", RERANK_MODELS) +def test_rerank_models_mteb( + hf_runner, vllm_runner, model_info: RerankModelInfo +) -> None: + mteb_test_rerank_models(hf_runner, vllm_runner, model_info) diff --git a/tests/models/language/pooling_mteb_test/test_nomic.py b/tests/models/language/pooling_mteb_test/test_nomic.py index c54a43052483a..06c568026a75a 100644 --- a/tests/models/language/pooling_mteb_test/test_nomic.py +++ b/tests/models/language/pooling_mteb_test/test_nomic.py @@ -4,30 +4,38 @@ import pytest from tests.models.language.pooling.embed_utils import correctness_test_embed_models -from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo +from tests.models.utils import EmbedModelInfo -from .mteb_utils import mteb_test_embed_models +from .mteb_embed_utils import 
mteb_test_embed_models MODELS = [ - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "nomic-ai/nomic-embed-text-v1", architecture="NomicBertModel", mteb_score=0.737568559, enable_test=True, + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "nomic-ai/nomic-embed-text-v1.5", architecture="NomicBertModel", enable_test=False, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "nomic-ai/nomic-embed-text-v2-moe", architecture="NomicBertModel", mteb_score=0.715488912, enable_test=True, + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, ), ] diff --git a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py index 9a1be6c0be1d6..a8e79c8391072 100644 --- a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py +++ b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py @@ -6,10 +6,10 @@ import pytest import torch from tests.conftest import HfRunner -from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo +from tests.models.utils import RerankModelInfo from tests.utils import multi_gpu_test -from .mteb_utils import mteb_test_rerank_models +from .mteb_score_utils import mteb_test_rerank_models qwen3_reranker_hf_overrides = { "architectures": ["Qwen3ForSequenceClassification"], @@ -18,14 +18,18 @@ qwen3_reranker_hf_overrides = { } RERANK_MODELS = [ - LASTPoolingRerankModelInfo( + RerankModelInfo( "Qwen/Qwen3-Reranker-0.6B", architecture="Qwen3ForSequenceClassification", mteb_score=0.25736, hf_overrides=qwen3_reranker_hf_overrides, + pooling_type="LAST", + attn_type="decoder", + is_prefix_caching_supported=True, + is_chunked_prefill_supported=True, 
enable_test=True, ), - LASTPoolingRerankModelInfo( + RerankModelInfo( "Qwen/Qwen3-Reranker-4B", architecture="Qwen3ForSequenceClassification", hf_overrides=qwen3_reranker_hf_overrides, diff --git a/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py index 3c30628aeaa49..37597a7e9ebab 100644 --- a/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py @@ -4,62 +4,82 @@ import pytest from tests.models.language.pooling.embed_utils import correctness_test_embed_models -from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo +from tests.models.utils import EmbedModelInfo -from .mteb_utils import mteb_test_embed_models +from .mteb_embed_utils import mteb_test_embed_models MODELS = [ - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Snowflake/snowflake-arctic-embed-xs", is_matryoshka=False, architecture="BertModel", mteb_score=0.714927797, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Snowflake/snowflake-arctic-embed-s", is_matryoshka=False, architecture="BertModel", enable_test=False, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Snowflake/snowflake-arctic-embed-m", is_matryoshka=False, architecture="BertModel", enable_test=False, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Snowflake/snowflake-arctic-embed-m-long", is_matryoshka=False, architecture="NomicBertModel", mteb_score=0.681146831, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Snowflake/snowflake-arctic-embed-l", is_matryoshka=False, architecture="BertModel", enable_test=False, ), - CLSPoolingEmbedModelInfo( + 
EmbedModelInfo( "Snowflake/snowflake-arctic-embed-m-v1.5", is_matryoshka=True, architecture="BertModel", mteb_score=0.649088363, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Snowflake/snowflake-arctic-embed-l-v2.0", is_matryoshka=True, architecture="XLMRobertaModel", mteb_score=0.712258299, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "Snowflake/snowflake-arctic-embed-m-v2.0", is_matryoshka=True, architecture="GteModel", mteb_score=0.706622444, + pooling_type="CLS", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), ] diff --git a/tests/models/language/pooling_mteb_test/test_st_projector.py b/tests/models/language/pooling_mteb_test/test_st_projector.py index 74fe4b9bcc03f..c1fd61b8e2270 100644 --- a/tests/models/language/pooling_mteb_test/test_st_projector.py +++ b/tests/models/language/pooling_mteb_test/test_st_projector.py @@ -3,25 +3,31 @@ import pytest from tests.models.utils import ( - CLSPoolingEmbedModelInfo, EmbedModelInfo, - LASTPoolingEmbedModelInfo, ) -from .mteb_utils import mteb_test_embed_models +from .mteb_embed_utils import mteb_test_embed_models # ST models with projector (Dense) layers ST_PROJECTOR_MODELS = [ - CLSPoolingEmbedModelInfo( + EmbedModelInfo( "TencentBAC/Conan-embedding-v1", architecture="BertModel", mteb_score=0.688611955, + pooling_type="MEAN", + attn_type="encoder_only", + is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, ), - LASTPoolingEmbedModelInfo( + EmbedModelInfo( "google/embeddinggemma-300m", architecture="Gemma3TextModel", mteb_score=0.7473819294684156, + pooling_type="MEAN", + attn_type="encoder_only", + 
is_prefix_caching_supported=False, + is_chunked_prefill_supported=False, enable_test=True, dtype="float32", ), diff --git a/tests/models/multimodal/conftest.py b/tests/models/multimodal/conftest.py index 4243298cdc896..31d99218c8276 100644 --- a/tests/models/multimodal/conftest.py +++ b/tests/models/multimodal/conftest.py @@ -19,7 +19,7 @@ def pytest_collection_modifyitems(config, items): return # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers - # accuracy issues + # accuracy issues: https://github.com/vllm-project/vllm/issues/30167 # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace torch.backends.cuda.enable_flash_sdp(False) torch.backends.cuda.enable_mem_efficient_sdp(False) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 6640e1ff9474d..299f57f6c4f86 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -513,6 +513,7 @@ VLM_TEST_SETTINGS = { max_model_len=8192, use_tokenizer_eos=True, patch_hf_runner=model_utils.internvl_patch_hf_runner, + num_logprobs=10 if current_platform.is_rocm() else 5, ), "intern_vl-hf": VLMTestInfo( models=["OpenGVLab/InternVL3-1B-hf"], diff --git a/tests/models/multimodal/generation/test_keye.py b/tests/models/multimodal/generation/test_keye.py index 6f98bde1d91ea..4205a8b2d1ac4 100644 --- a/tests/models/multimodal/generation/test_keye.py +++ b/tests/models/multimodal/generation/test_keye.py @@ -8,7 +8,7 @@ from PIL.Image import Image from transformers import AutoProcessor from vllm import LLM, EngineArgs, SamplingParams -from vllm.multimodal.utils import encode_image_base64 +from vllm.multimodal.utils import encode_image_url MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview" @@ -31,10 +31,7 @@ def test_keye_vl( question: str, ): images = [asset.pil_image for asset in image_assets] - - image_urls = [ - f"data:image/jpeg;base64,{encode_image_base64(image)}" for image 
in images - ] + image_urls = [encode_image_url(image) for image in images] engine_args = EngineArgs( model=MODEL_NAME, diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index e1b7dbf99f1fd..d46dd640229d0 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -267,7 +267,7 @@ def run_embedding_input_test( """Inference result should be the same between original image/video input and image/video embeddings input. """ - from transformers import AutoProcessor # noqa: F401 + from transformers import AutoProcessor processor = AutoProcessor.from_pretrained(model) diff --git a/tests/models/multimodal/generation/test_vit_backend_functionality.py b/tests/models/multimodal/generation/test_vit_backend_functionality.py index a4e4ce312ddd4..8cea6135ba6a2 100644 --- a/tests/models/multimodal/generation/test_vit_backend_functionality.py +++ b/tests/models/multimodal/generation/test_vit_backend_functionality.py @@ -15,7 +15,7 @@ from transformers import AutoProcessor from vllm import LLM, EngineArgs, SamplingParams from vllm.attention.backends.registry import AttentionBackendEnum -from vllm.multimodal.utils import encode_image_base64 +from vllm.multimodal.utils import encode_image_url from vllm.multimodal.video import sample_frames_from_video from vllm.platforms import current_platform @@ -178,8 +178,7 @@ def build_dots_ocr_prompt(images, config): """Build Dots.OCR specific prompt with OCR instructions.""" # Use only stop_sign image for Dots.OCR image = images[0] # Already filtered to stop_sign - - image_url = f"data:image/jpeg;base64,{encode_image_base64(image)}" + image_url = encode_image_url(image) placeholders = [{"type": "image_url", "image_url": {"url": image_url}}] messages = [ @@ -204,9 +203,7 @@ def build_processor_prompt(images, config): config["model_name"], trust_remote_code=True ) - image_urls = [ - 
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images - ] + image_urls = [encode_image_url(img) for img in images] placeholders = [{"type": "image", "image": url} for url in image_urls] messages = [ { @@ -225,9 +222,7 @@ def build_processor_prompt(images, config): def build_ovis_prompt(images, config): """Build Ovis2.5 specific prompt with custom format.""" - image_urls = [ - f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images - ] + image_urls = [encode_image_url(img) for img in images] placeholders = "\n".join( f"Image-{i}: \n" for i, _ in enumerate(image_urls, start=1) diff --git a/tests/models/multimodal/generation/test_voxtral.py b/tests/models/multimodal/generation/test_voxtral.py index 0eaef49e2395c..9f8415c0c390c 100644 --- a/tests/models/multimodal/generation/test_voxtral.py +++ b/tests/models/multimodal/generation/test_voxtral.py @@ -111,4 +111,5 @@ async def test_online_serving(client, audio_assets: AudioTestAssets): assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] + assert choice.message.content == "In the first audio clip, you hear a brief" assert choice.finish_reason == "length" diff --git a/tests/models/registry.py b/tests/models/registry.py index 081167c6aedf7..2922414cdaa6a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -488,6 +488,9 @@ _EMBEDDING_EXAMPLE_MODELS = { ), "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"), "LlamaModel": _HfExamplesInfo("llama", is_available_online=False), + "LlamaBidirectionalModel": _HfExamplesInfo( + "nvidia/llama-nemotron-embed-1b-v2", trust_remote_code=True + ), "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), "ModernBertModel": _HfExamplesInfo( "Alibaba-NLP/gte-modernbert-base", trust_remote_code=True @@ -554,6 +557,9 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = { trust_remote_code=True, hf_overrides={"architectures": ["GteNewForSequenceClassification"]}, ), + 
"LlamaBidirectionalForSequenceClassification": _HfExamplesInfo( + "nvidia/llama-nemotron-rerank-1b-v2", trust_remote_code=True + ), "ModernBertForSequenceClassification": _HfExamplesInfo( "Alibaba-NLP/gte-reranker-modernbert-base" ), @@ -854,6 +860,11 @@ _MULTIMODAL_EXAMPLE_MODELS = { # disable this temporarily until we support HF format is_available_online=False, ), + "VoxtralStreamingGeneration": _HfExamplesInfo( + "", + # disable this temporarily until we support HF format + is_available_online=False, + ), # [Encoder-decoder] "WhisperForConditionalGeneration": _HfExamplesInfo( "openai/whisper-large-v3-turbo", diff --git a/tests/models/test_terratorch.py b/tests/models/test_terratorch.py index 15764145bc1a2..24b624e269583 100644 --- a/tests/models/test_terratorch.py +++ b/tests/models/test_terratorch.py @@ -38,7 +38,7 @@ def test_inference( max_num_seqs=32, default_torch_num_threads=1, ) as vllm_model: - vllm_output = vllm_model.llm.encode(prompt) + vllm_output = vllm_model.llm.encode(prompt, pooling_task="plugin") assert torch.equal( torch.isnan(vllm_output[0].outputs.data).any(), torch.tensor(False) ) diff --git a/tests/models/utils.py b/tests/models/utils.py index d84b4b820533e..12544bc96bb5a 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -10,7 +10,7 @@ import torch import torch.nn.functional as F from transformers import PretrainedConfig -from vllm.config.model import ModelConfig, ModelDType, RunnerOption +from vllm.config.model import AttnTypeStr, ModelConfig, ModelDType, RunnerOption from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs from vllm.multimodal.processing import InputProcessingContext from vllm.tokenizers import cached_tokenizer_from_config @@ -375,7 +375,10 @@ class ModelInfo: max_model_len: int | None = None hf_dtype: str = "float32" hf_overrides: dict[str, Any] | None = None - default_pooling_type: str = "" + pooling_type: str | None = None + attn_type: AttnTypeStr | None = None + is_prefix_caching_supported: 
bool | None = None + is_chunked_prefill_supported: bool | None = None enable_test: bool = True @@ -386,29 +389,10 @@ class EmbedModelInfo(ModelInfo): matryoshka_dimensions: list[int] | None = None -@dataclass -class CLSPoolingEmbedModelInfo(EmbedModelInfo): - default_pooling_type: str = "CLS" - - -@dataclass -class LASTPoolingEmbedModelInfo(EmbedModelInfo): - default_pooling_type: str = "LAST" - - @dataclass class RerankModelInfo(ModelInfo): mteb_score: float | None = None - - -@dataclass -class CLSPoolingRerankModelInfo(RerankModelInfo): - default_pooling_type: str = "CLS" - - -@dataclass -class LASTPoolingRerankModelInfo(RerankModelInfo): - default_pooling_type: str = "LAST" + chat_template_name: str | None = None @dataclass diff --git a/tests/standalone_tests/pytorch_nightly_dependency.sh b/tests/standalone_tests/pytorch_nightly_dependency.sh index fd93ad76bed0f..92820b269f9df 100644 --- a/tests/standalone_tests/pytorch_nightly_dependency.sh +++ b/tests/standalone_tests/pytorch_nightly_dependency.sh @@ -4,6 +4,11 @@ set -e set -x +if command -v rocminfo >/dev/null 2>&1; then + echo "Skipping test for ROCm platform" + exit 0 +fi + cd /vllm-workspace/ rm -rf .venv @@ -36,7 +41,7 @@ if diff before.txt after.txt; then echo "torch version not overridden." 
else echo "torch version overridden by nightly_torch_test.txt, \ - if the dependency is not triggered by the pytroch nightly test,\ + if the dependency is not triggered by the pytorch nightly test,\ please add the dependency to the list 'white_list' in tools/pre_commit/generate_nightly_torch_test.py" exit 1 fi diff --git a/tests/tool_parsers/test_mistral_tool_parser.py b/tests/tool_parsers/test_mistral_tool_parser.py index 9400a67267f4c..d2502079d0de9 100644 --- a/tests/tool_parsers/test_mistral_tool_parser.py +++ b/tests/tool_parsers/test_mistral_tool_parser.py @@ -281,6 +281,8 @@ def test_extract_tool_calls_pre_v11_tokenizer( "single_tool_add", "single_tool_weather", "multiple_tool_calls", + "complex", + "wrong_json", ], argnames=["model_output", "expected_tool_calls", "expected_content"], argvalues=[ @@ -326,6 +328,36 @@ def test_extract_tool_calls_pre_v11_tokenizer( ], None, ), + ( + # Complex + """hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')""", # noqa: E501 + [ + ToolCall( + function=FunctionCall( + name="bash", + arguments=json.dumps( + {"command": "print(\"hello world!\")\nre.compile(r'{}')"} + )[:-2], + ) + ) + ], + "hi{hi", + ), + ( + # Wrong json + """hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501 + [ + ToolCall( + function=FunctionCall( + name="bash", + arguments=json.dumps( + {"command": "print(\"hello world!\")\nre.compile(r'{}')"} + ), + ) + ) + ], + "hi{hi", + ), ], ) def test_extract_tool_calls( @@ -673,7 +705,7 @@ def test_extract_tool_calls_streaming( ), ( # Complex - """[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501 + """hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501 [ ToolCall( function=FunctionCall( @@ -684,7 +716,7 @@ def test_extract_tool_calls_streaming( ) ) ], - "", + "hi{hi", ), ], ) diff --git a/tests/utils.py b/tests/utils.py index 
d8102331b3612..1b338e93182a5 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -106,6 +106,7 @@ class RemoteOpenAIServer: env.update(env_dict) serve_cmd = ["vllm", "serve", model, *vllm_serve_args] print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}") + print(f"Environment variables: {env}") self.proc: subprocess.Popen = subprocess.Popen( serve_cmd, env=env, diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index fd5cf6d3e74aa..c84a51b6883dc 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -1798,3 +1798,60 @@ def test_request_with_prompt_embeds_and_mm_inputs(hash_fn: Callable[[Any], bytes ) ) assert block_hashes[1] == expected_hash2 + + +def test_auto_fit_max_model_len(): + """Test that max_model_len=-1 auto-fits to available GPU memory.""" + # Create config with original_max_model_len=-1 to trigger auto-fit + model_config = ModelConfig(max_model_len=1024) + # Simulate the user passing -1 by setting original_max_model_len + model_config.original_max_model_len = -1 + vllm_config = VllmConfig(model_config=model_config) + + mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2 # 16KB per block per layer + kv_cache_specs = { + "layer_1": new_kv_cache_spec(), + "layer_2": new_kv_cache_spec(), + } + + # With enough memory, max_model_len stays at the derived max + large_available_memory = mem_per_block_per_layer * 2 * 1024 # plenty of memory + _kv_cache_configs = get_kv_cache_configs( + vllm_config, [kv_cache_specs], [large_available_memory] + ) + assert vllm_config.model_config.max_model_len == 1024 + + # Reset for next test + model_config = ModelConfig(max_model_len=1024) + model_config.original_max_model_len = -1 + vllm_config = VllmConfig(model_config=model_config) + + # With limited memory, max_model_len should be reduced + # Need memory for at least max_model_len tokens + # 32 blocks worth of memory for 2 layers = can fit 32*16=512 tokens + limited_memory = 
mem_per_block_per_layer * 2 * 32 + _kv_cache_configs = get_kv_cache_configs( + vllm_config, [kv_cache_specs], [limited_memory] + ) + # Should be reduced to fit in memory + assert vllm_config.model_config.max_model_len < 1024 + assert vllm_config.model_config.max_model_len > 0 + + +def test_auto_fit_max_model_len_not_triggered(): + """Test that auto-fit is not triggered when original_max_model_len is not -1.""" + model_config = ModelConfig(max_model_len=16) + # original_max_model_len should be None by default, not -1 + vllm_config = VllmConfig(model_config=model_config) + + mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2 + kv_cache_specs = { + "layer_1": new_kv_cache_spec(), + "layer_2": new_kv_cache_spec(), + } + + # This should work normally without auto-fit + _kv_cache_configs = get_kv_cache_configs( + vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32] + ) + assert vllm_config.model_config.max_model_len == 16 diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 0880a17c78d40..977ec71bcbecf 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -1356,6 +1356,69 @@ def test_kv_cache_events(blocks_to_cache: int): assert len(manager.block_pool.cached_block_hash_to_block) == 0 +def test_null_parent_block_hash(): + block_size = 1 + num_cached_blocks = 2 + num_full_blocks = 4 + + pool = BlockPool( + num_gpu_blocks=8, + enable_caching=True, + hash_block_size=block_size, + enable_kv_cache_events=True, + ) + + req = make_request( + "req_null_parent", + prompt_token_ids=[10, 11, 12, 13], + block_size=block_size, + hash_fn=sha256, + ) + assert len(req.block_hashes) == num_full_blocks + + # Physical parent is `null_block` (no hash), while the logical parent hash + # still exists in `request.block_hashes[num_cached_blocks - 1]`. 
+ assert pool.null_block.block_hash is None + new_blocks = pool.get_new_blocks(num_full_blocks - 1) + blocks = [ + new_blocks[: num_cached_blocks - 1], + pool.null_block, # physical parent + *new_blocks[num_cached_blocks - 1 :], + ] + + pool.cache_full_blocks( + request=req, + blocks=blocks, + num_cached_blocks=num_cached_blocks, + num_full_blocks=num_full_blocks, + block_size=block_size, + kv_cache_group_id=0, + ) + + events = pool.take_events() + assert len(events) == 1 + event = events[0] + assert isinstance(event, BlockStored) + + expected_parent = kv_cache_utils.maybe_convert_block_hash( + req.block_hashes[num_cached_blocks - 1] + ) + assert event.parent_block_hash == expected_parent + assert event.parent_block_hash is not None + + expected_new_hashes = [ + kv_cache_utils.maybe_convert_block_hash(h) + for h in req.block_hashes[num_cached_blocks:num_full_blocks] + ] + assert event.block_hashes == expected_new_hashes + + # Ensure we didn't accidentally assign a hash to the null block. + assert pool.null_block.block_hash is None + # Sanity check: newly cached physical blocks should have hashes assigned. 
+ assert blocks[num_cached_blocks].block_hash is not None + assert blocks[num_full_blocks - 1].block_hash is not None + + @pytest.mark.parametrize("blocks_to_cache", [2, 3, 10]) def test_kv_cache_events_with_lora(blocks_to_cache: int): """Test BlockStored events contain correct lora_id when using LoRA requests.""" diff --git a/tests/v1/ec_connector/integration/test_epd_correctness.py b/tests/v1/ec_connector/integration/test_epd_correctness.py index 616d34441ab8e..eae4b7427240f 100644 --- a/tests/v1/ec_connector/integration/test_epd_correctness.py +++ b/tests/v1/ec_connector/integration/test_epd_correctness.py @@ -31,7 +31,7 @@ import openai import requests from vllm.assets.image import ImageAsset -from vllm.multimodal.utils import encode_image_base64 +from vllm.multimodal.utils import encode_image_url MAX_OUTPUT_LEN = 256 @@ -49,9 +49,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [ "content": [ { "type": "image_url", - "image_url": { - "url": f"data:image;base64,{encode_image_base64(image_1)}" - }, + "image_url": {"url": encode_image_url(image_1)}, }, {"type": "text", "text": "What's in this image?"}, ], @@ -66,9 +64,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [ "content": [ { "type": "image_url", - "image_url": { - "url": f"data:image;base64,{encode_image_base64(image_2)}" - }, + "image_url": {"url": encode_image_url(image_2)}, }, { "type": "image_url", diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 224e5d741024b..11681cfcebca4 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -260,7 +260,7 @@ async def test_multi_abort(output_kind: RequestOutputKind): # Use multi-abort to abort multiple requests at once abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT] - await engine.abort(abort_request_ids) + await engine.abort(abort_request_ids, internal=False) # Wait for all tasks to complete results = await asyncio.gather(*tasks, return_exceptions=True) @@ -609,7 +609,7 @@ async def 
test_abort_final_output(output_kind: RequestOutputKind): await asyncio.sleep(0.5) # Abort the request - await engine.abort(request_id) + await engine.abort(request_id, internal=False) # Wait for generation to complete and return final output final_output = await generated diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 5fa16897b4e0c..4f96ded7ec351 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -40,10 +40,16 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME) PROMPT = "I am Gyoubu Masataka Oniwa" PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids +_REQUEST_COUNTER = 0 + def make_request() -> EngineCoreRequest: + global _REQUEST_COUNTER + _REQUEST_COUNTER += 1 + request_id = f"request-{_REQUEST_COUNTER}" return EngineCoreRequest( - request_id=str(uuid.uuid4()), + request_id=request_id, + external_req_id=f"{request_id}-{uuid.uuid4()}", prompt_token_ids=PROMPT_TOKENS, mm_features=None, sampling_params=SamplingParams(), diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index b4fed6e8f9650..a0e2e5e25a47e 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -45,6 +45,8 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME) PROMPT = "Hello my name is Robert and I love quantization kernels" PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids +_REQUEST_COUNTER = 0 + def make_request( params: SamplingParams, prompt_tokens_ids: list[int] | None = None @@ -52,8 +54,12 @@ def make_request( if not prompt_tokens_ids: prompt_tokens_ids = PROMPT_TOKENS + global _REQUEST_COUNTER + _REQUEST_COUNTER += 1 + request_id = f"request-{_REQUEST_COUNTER}" return EngineCoreRequest( - request_id=str(uuid.uuid4()), + request_id=request_id, + external_req_id=f"{request_id}-{uuid.uuid4()}", prompt_token_ids=prompt_tokens_ids, mm_features=None, sampling_params=params, diff --git 
a/tests/v1/engine/test_fast_incdec_prefix_err.py b/tests/v1/engine/test_fast_incdec_prefix_err.py index 77e67d54e587e..67a3b6b012dcc 100644 --- a/tests/v1/engine/test_fast_incdec_prefix_err.py +++ b/tests/v1/engine/test_fast_incdec_prefix_err.py @@ -27,6 +27,7 @@ def test_fast_inc_detok_invalid_utf8_err_case(): params = SamplingParams(skip_special_tokens=True) request = EngineCoreRequest( request_id="test", + external_req_id="test-ext", prompt_token_ids=prompt_token_ids, mm_features=None, sampling_params=params, diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 990aa9d925855..f1185222f7137 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -58,12 +58,12 @@ def test_incremental_detokenization( output_processor = OutputProcessor( dummy_test_vectors.tokenizer, log_stats=False, stream_interval=stream_interval ) - engine_core = MockEngineCore(tokens_list=dummy_test_vectors.generation_tokens) # Make N requests. requests = [ EngineCoreRequest( - request_id=f"request-{idx}", + request_id=f"request-{idx}-int", + external_req_id=f"request-{idx}", prompt_token_ids=prompt_tokens, mm_features=None, eos_token_id=None, @@ -83,6 +83,11 @@ def test_incremental_detokenization( for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) ] + engine_core = MockEngineCore( + tokens_list=dummy_test_vectors.generation_tokens, + request_ids=[req.request_id for req in requests], + ) + # Add requests to the detokenizer. 
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): output_processor.add_request(request, prompt) @@ -438,15 +443,6 @@ def test_logprobs_processor( dummy_test_vectors, ): output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False) - engine_core = MockEngineCore( - tokens_list=dummy_test_vectors.generation_tokens, - generated_logprobs_raw=None - if num_sample_logprobs is None - else dummy_test_vectors.generation_logprobs, - prompt_logprobs_raw=None - if num_prompt_logprobs is None - else dummy_test_vectors.prompt_logprobs, - ) # Make N requests. request_id_list = [ @@ -454,7 +450,8 @@ def test_logprobs_processor( ] requests = [ EngineCoreRequest( - request_id=request_id_list[idx], + request_id=request_id_list[idx] + "-int", + external_req_id=request_id_list[idx], prompt_token_ids=prompt_tokens, mm_features=None, eos_token_id=None, @@ -476,6 +473,17 @@ def test_logprobs_processor( for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) ] + engine_core = MockEngineCore( + tokens_list=dummy_test_vectors.generation_tokens, + generated_logprobs_raw=None + if num_sample_logprobs is None + else dummy_test_vectors.generation_logprobs, + prompt_logprobs_raw=None + if num_prompt_logprobs is None + else dummy_test_vectors.prompt_logprobs, + request_ids=[req.request_id for req in requests], + ) + # Add requests to the detokenizer. for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): output_processor.add_request(request, prompt) @@ -621,19 +629,12 @@ def test_stop_token( ] prompt_string = dummy_test_vectors.prompt_strings[0] prompt_tokens = dummy_test_vectors.prompt_tokens[0] - engine_core = MockEngineCore( - tokens_list=[generation_tokens], - generated_logprobs_raw=[generation_logprobs] if do_logprobs else None, - prompt_logprobs_raw=None, - eos_token_id=eos_token_id, - stop_token_ids=stop_token_ids, - ignore_eos=ignore_eos, - ) # Make request. 
request_id = "request-0" request = EngineCoreRequest( request_id=request_id, + external_req_id=request_id + "-ext", prompt_token_ids=prompt_tokens, mm_features=None, eos_token_id=eos_token_id, @@ -655,6 +656,16 @@ def test_stop_token( pooling_params=None, ) + engine_core = MockEngineCore( + tokens_list=[generation_tokens], + generated_logprobs_raw=[generation_logprobs] if do_logprobs else None, + prompt_logprobs_raw=None, + eos_token_id=eos_token_id, + stop_token_ids=stop_token_ids, + ignore_eos=ignore_eos, + request_ids=[request.request_id], + ) + # Add request to the detokenizer. output_processor.add_request(request, prompt_string) @@ -720,13 +731,6 @@ def test_stop_string( dummy_test_vectors, ): output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False) - engine_core = MockEngineCore( - tokens_list=dummy_test_vectors.generation_tokens, - generated_logprobs_raw=dummy_test_vectors.generation_logprobs - if num_sample_logprobs - else None, - prompt_logprobs_raw=None, - ) # Make N requests. request_id_list = [ @@ -734,7 +738,8 @@ def test_stop_string( ] requests = [ EngineCoreRequest( - request_id=request_id_list[idx], + request_id=request_id_list[idx] + "-int", + external_req_id=request_id_list[idx], prompt_token_ids=prompt_tokens, mm_features=None, eos_token_id=None, @@ -756,6 +761,15 @@ def test_stop_string( for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) ] + engine_core = MockEngineCore( + tokens_list=dummy_test_vectors.generation_tokens, + generated_logprobs_raw=dummy_test_vectors.generation_logprobs + if num_sample_logprobs + else None, + prompt_logprobs_raw=None, + request_ids=[req.request_id for req in requests], + ) + # Add requests to the detokenizer. 
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): output_processor.add_request(request, prompt) @@ -813,9 +827,12 @@ def test_stop_string( for idx, (ref_gen_str, stop_str) in enumerate( zip(dummy_test_vectors.generation_strings, STOP_STRINGS) ): - # Request should be aborted. + # Request should be aborted (check internal ID in abort list). + internal_request_id = f"request-{idx}-int" + assert internal_request_id in aborted + + # Use external ID for collecting outputs request_id = f"request-{idx}" - assert request_id in aborted # Collected values that were generated. gen_str = gen_strings[request_id] @@ -848,13 +865,13 @@ def test_stop_string( def test_iteration_stats(dummy_test_vectors): output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True) - engine_core = MockEngineCore(dummy_test_vectors.generation_tokens) engine_core_timestamp = time.monotonic() # Make N requests. requests = [ EngineCoreRequest( request_id=f"request-{idx}", + external_req_id=f"request-{idx}-ext", prompt_token_ids=prompt_tokens, mm_features=None, eos_token_id=None, @@ -868,6 +885,11 @@ def test_iteration_stats(dummy_test_vectors): for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) ] + engine_core = MockEngineCore( + dummy_test_vectors.generation_tokens, + request_ids=[req.request_id for req in requests], + ) + # Add all requests except one to the OutputProcessor. 
num_active = len(dummy_test_vectors.generation_tokens) - 1 for request in requests[:num_active]: @@ -922,7 +944,6 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors): output_processor = OutputProcessor( dummy_test_vectors.tokenizer, log_stats=log_stats ) - engine_core = MockEngineCore(dummy_test_vectors.generation_tokens) engine_core_timestamp = time.monotonic() # Create LoRA requests @@ -936,7 +957,8 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors): lora_assignments = [lora1, lora2, None] requests = [ EngineCoreRequest( - request_id=f"request-{idx}", + request_id=f"request-{idx}-int", + external_req_id=f"request-{idx}", prompt_token_ids=prompt_tokens, mm_features=None, eos_token_id=None, @@ -950,6 +972,11 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors): for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) ] + engine_core = MockEngineCore( + dummy_test_vectors.generation_tokens, + request_ids=[req.request_id for req in requests], + ) + # Add all requests to the OutputProcessor for request in requests: output_processor.add_request(request, None) @@ -1015,9 +1042,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors): outputs = EngineCoreOutputs( outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats() ) - # Find and mark request-0 as finished (it uses lora-1) + # Find and mark request-0-int as finished (it uses lora-1) for output in outputs.outputs: - if output.request_id == "request-0": + if output.request_id == "request-0-int": output.finish_reason = FinishReason.LENGTH break @@ -1040,9 +1067,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors): outputs = EngineCoreOutputs( outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats() ) - # Find and mark request-1 as finished (it uses lora-2) + # Find and mark request-1-int as finished (it uses lora-2) for output in outputs.outputs: - if output.request_id == "request-1": + if 
output.request_id == "request-1-int": output.finish_reason = FinishReason.LENGTH break @@ -1064,9 +1091,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors): outputs = EngineCoreOutputs( outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats() ) - # Find and mark request-2 as finished (it has no LoRA) + # Find and mark request-2-int as finished (it has no LoRA) for output in outputs.outputs: - if output.request_id == "request-2": + if output.request_id == "request-2-int": output.finish_reason = FinishReason.LENGTH break @@ -1107,7 +1134,9 @@ async def test_request_output_collector(): for idx in range(NUM_REQS) ] - collector = RequestOutputCollector(RequestOutputKind.DELTA) + collector = RequestOutputCollector( + RequestOutputKind.DELTA, request_id="my-request-id-int" + ) # CASE 1: Put then get. outputs = make_outputs() @@ -1163,7 +1192,9 @@ async def test_request_output_collector(): @pytest.mark.asyncio async def test_cumulative_output_collector_n(): """Test collector correctly handles multiple outputs by index.""" - collector = RequestOutputCollector(RequestOutputKind.CUMULATIVE) + collector = RequestOutputCollector( + RequestOutputKind.CUMULATIVE, request_id="my-request-id-int" + ) outputs = [ RequestOutput( request_id="my-request-id", @@ -1242,11 +1273,13 @@ async def test_cumulative_output_collector_n(): @pytest.mark.parametrize("runner", ["generate", "pooling"]) -def test_abort_requests(runner: str, dummy_test_vectors): +@pytest.mark.parametrize("abort_by", ["internal", "external"]) +def test_abort_requests(runner: str, abort_by: str, dummy_test_vectors): output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True) requests = [ EngineCoreRequest( request_id=f"request-{idx}", + external_req_id=f"external-{idx}", prompt_token_ids=prompt_tokens, mm_features=None, eos_token_id=None, @@ -1265,8 +1298,13 @@ def test_abort_requests(runner: str, dummy_test_vectors): output_kind = request.sampling_params.output_kind 
else: output_kind = request.pooling_params.output_kind - queue = RequestOutputCollector(output_kind=output_kind) + queue = RequestOutputCollector( + output_kind=output_kind, request_id=request.request_id + ) output_processor.add_request(request, None, queue=queue) for request in requests: - output_processor.abort_requests([request.request_id]) + if abort_by == "internal": + output_processor.abort_requests([request.request_id], internal=True) + else: + output_processor.abort_requests([request.external_req_id], internal=False) diff --git a/tests/v1/engine/test_parallel_sampling.py b/tests/v1/engine/test_parallel_sampling.py index 736c0e54837fe..fe6f15df20982 100644 --- a/tests/v1/engine/test_parallel_sampling.py +++ b/tests/v1/engine/test_parallel_sampling.py @@ -4,11 +4,12 @@ from vllm import SamplingParams from vllm.outputs import CompletionOutput from vllm.sampling_params import RequestOutputKind +from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.parallel_sampling import ParentRequest def test_parent_request_to_output_stream() -> None: - parent_request = ParentRequest("parent_id", SamplingParams(n=2)) + parent_request = ParentRequest(make_request(SamplingParams(n=2))) parent_request.child_requests = {"child_id_0", "child_id_1"} output_0 = CompletionOutput( index=0, text="child 0", token_ids=[], cumulative_logprob=None, logprobs=None @@ -17,51 +18,31 @@ def test_parent_request_to_output_stream() -> None: index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None ) # Request not finished - assert ("parent_id", [output_0], False) == parent_request.get_outputs( - "child_id_0", output_0 - ) - assert ("parent_id", [output_1], False) == parent_request.get_outputs( - "child_id_1", output_1 - ) - assert ("parent_id", [output_0], False) == parent_request.get_outputs( - "child_id_0", output_0 - ) - assert ("parent_id", [output_1], False) == parent_request.get_outputs( - "child_id_1", output_1 - ) + assert ([output_0], False) == 
parent_request.get_outputs("child_id_0", output_0) + assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1) + assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0) + assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1) # output_1 finished output_1.finish_reason = "ended" - assert ("parent_id", [output_0], False) == parent_request.get_outputs( - "child_id_0", output_0 - ) - assert ("parent_id", [output_1], False) == parent_request.get_outputs( - "child_id_1", output_1 - ) + assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0) + assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1) # Finished output_1 had already returned, DO NOT returned again - assert ("parent_id", [output_0], False) == parent_request.get_outputs( - "child_id_0", output_0 - ) - assert parent_request.get_outputs("child_id_1", output_1) == ( - "parent_id", - [], - False, - ) + assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0) + assert parent_request.get_outputs("child_id_1", output_1) == ([], False) # output_0 finished output_0.finish_reason = "ended" - assert ("parent_id", [output_0], True) == parent_request.get_outputs( - "child_id_0", output_0 - ) - assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True) + assert ([output_0], True) == parent_request.get_outputs("child_id_0", output_0) + assert parent_request.get_outputs("child_id_1", output_1) == ([], True) # Finished output_0 had already returned, DO NOT returned again - assert parent_request.get_outputs("child_id_0", output_0) == ("parent_id", [], True) - assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True) + assert parent_request.get_outputs("child_id_0", output_0) == ([], True) + assert parent_request.get_outputs("child_id_1", output_1) == ([], True) def test_parent_request_to_output_final_only() -> None: parent_request 
= ParentRequest( - "parent_id", SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY) + make_request(SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY)) ) parent_request.child_requests = {"child_id_0", "child_id_1"} output_0 = CompletionOutput( @@ -71,33 +52,33 @@ def test_parent_request_to_output_final_only() -> None: index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None ) # Request not finished, return nothing - assert parent_request.get_outputs("child_id_0", output_0) == ( - "parent_id", - [], - False, - ) - assert parent_request.get_outputs("child_id_1", output_1) == ( - "parent_id", - [], - False, - ) + assert parent_request.get_outputs("child_id_0", output_0) == ([], False) + assert parent_request.get_outputs("child_id_1", output_1) == ([], False) # output_1 finished, but outputs won't be returned until all child requests finished output_1.finish_reason = "ended" - assert parent_request.get_outputs("child_id_0", output_0) == ( - "parent_id", - [], - False, - ) - assert parent_request.get_outputs("child_id_1", output_1) == ( - "parent_id", - [], - False, - ) + assert parent_request.get_outputs("child_id_0", output_0) == ([], False) + assert parent_request.get_outputs("child_id_1", output_1) == ([], False) # output_0 finished, as all child requests finished, the output would be returned output_0.finish_reason = "ended" - assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs( + assert ([output_0, output_1], True) == parent_request.get_outputs( "child_id_0", output_0 ) - assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs( + assert ([output_0, output_1], True) == parent_request.get_outputs( "child_id_1", output_1 ) + + +def make_request(sampling_params: SamplingParams) -> EngineCoreRequest: + return EngineCoreRequest( + request_id="parent_id", + external_req_id="ext_parent_id", + prompt_token_ids=None, + mm_features=None, + sampling_params=sampling_params, + 
pooling_params=None, + eos_token_id=None, + arrival_time=0.0, + lora_request=None, + cache_salt=None, + data_parallel_rank=None, + ) diff --git a/tests/v1/engine/test_process_multi_modal_uuids.py b/tests/v1/engine/test_process_multi_modal_uuids.py index 1b11b8af49d17..1a16e391316f1 100644 --- a/tests/v1/engine/test_process_multi_modal_uuids.py +++ b/tests/v1/engine/test_process_multi_modal_uuids.py @@ -6,6 +6,7 @@ import pytest from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig +from vllm.multimodal import MultiModalUUIDDict from vllm.sampling_params import SamplingParams from vllm.v1.engine import input_processor as input_processor_mod from vllm.v1.engine.input_processor import InputProcessor @@ -166,7 +167,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch): monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False ) - captured: dict[str, object] = {} + captured: dict[str, MultiModalUUIDDict] = {} def fake_preprocess( prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None @@ -196,7 +197,16 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch): ) # Expect request-id-based overrides are passed through - assert captured["mm_uuids"] == { - "image": [f"{request_id}-image-0", f"{request_id}-image-1"], - "video": [f"{request_id}-video-0"], - } + mm_uuids = captured["mm_uuids"] + assert set(mm_uuids.keys()) == {"image", "video"} + assert len(mm_uuids["image"]) == 2 + assert len(mm_uuids["video"]) == 1 + assert mm_uuids["image"][0].startswith(f"{request_id}-image-") and mm_uuids[ + "image" + ][0].endswith("-0") + assert mm_uuids["image"][1].startswith(f"{request_id}-image-") and mm_uuids[ + "image" + ][1].endswith("-1") + assert mm_uuids["video"][0].startswith(f"{request_id}-video-") and mm_uuids[ + "video" + ][0].endswith("-0") diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index 
3541ef89bfc14..d14775668147e 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -343,6 +343,7 @@ class MockEngineCore: eos_token_id: int | None = None, stop_token_ids: list[int] | None = None, ignore_eos: bool = False, + request_ids: list[str] | None = None, ) -> None: self.num_requests = len(tokens_list) self.tokens_list = tokens_list @@ -355,6 +356,11 @@ class MockEngineCore: self.eos_token_id = eos_token_id self.stop_token_ids = stop_token_ids self.ignore_eos = ignore_eos + self.request_ids = ( + request_ids + if request_ids is not None + else [f"request-{i}" for i in range(self.num_requests)] + ) def get_outputs(self) -> list[EngineCoreOutput]: do_logprobs = self.do_logprobs @@ -386,7 +392,7 @@ class MockEngineCore: prompt_logprobs = None new_token_id = token_ids[token_idx] output = EngineCoreOutput( - request_id=f"request-{req_idx}", + request_id=self.request_ids[req_idx], new_token_ids=[new_token_id], new_logprobs=logprobs, new_prompt_logprobs_tensors=prompt_logprobs, diff --git a/tests/v1/entrypoints/openai/serving_responses/test_image.py b/tests/v1/entrypoints/openai/serving_responses/test_image.py index be5693bbf2736..644d8ce00686e 100644 --- a/tests/v1/entrypoints/openai/serving_responses/test_image.py +++ b/tests/v1/entrypoints/openai/serving_responses/test_image.py @@ -8,7 +8,7 @@ import pytest import pytest_asyncio from tests.utils import RemoteOpenAIServer -from vllm.multimodal.utils import encode_image_base64 +from vllm.multimodal.utils import encode_image_url # Use a small vision model for testing MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct" @@ -52,9 +52,9 @@ async def client(image_server): @pytest.fixture(scope="session") -def base64_encoded_image(local_asset_server) -> dict[str, str]: +def url_encoded_image(local_asset_server) -> dict[str, str]: return { - image_url: encode_image_base64(local_asset_server.get_image_asset(image_url)) + image_url: encode_image_url(local_asset_server.get_image_asset(image_url)) for image_url in 
TEST_IMAGE_ASSETS } @@ -95,7 +95,7 @@ async def test_single_chat_session_image_base64encoded( client: openai.AsyncOpenAI, model_name: str, raw_image_url: str, - base64_encoded_image: dict[str, str], + url_encoded_image: dict[str, str], ): content_text = "What's in this image?" messages = [ @@ -104,7 +104,7 @@ async def test_single_chat_session_image_base64encoded( "content": [ { "type": "input_image", - "image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", # noqa: E501 + "image_url": url_encoded_image[raw_image_url], "detail": "auto", }, {"type": "input_text", "text": content_text}, diff --git a/tests/v1/kv_connector/unit/test_example_connector.py b/tests/v1/kv_connector/unit/test_example_connector.py index 75edb79fb4af4..d415608c95faa 100644 --- a/tests/v1/kv_connector/unit/test_example_connector.py +++ b/tests/v1/kv_connector/unit/test_example_connector.py @@ -9,7 +9,7 @@ from PIL import Image from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.image import ImageAsset from vllm.config import KVTransferConfig -from vllm.multimodal.utils import encode_image_base64 +from vllm.multimodal.utils import encode_image_url from vllm.platforms import current_platform MODEL_NAME = "RedHatAI/Qwen2.5-VL-3B-Instruct-quantized.w8a8" @@ -74,7 +74,7 @@ def process_prompt(processor, llm: LLM, question: str, image_urls: list[Image]): placeholders = [ { "type": "image_url", - "image_url": {"url": f"data:image;base64,{encode_image_base64(image_pil)}"}, + "image_url": {"url": encode_image_url(image_pil)}, } for image_pil in image_urls ] @@ -145,7 +145,7 @@ def test_shared_storage_connector_hashes(tmp_path): # don't put this import at the top level # it will call torch.cuda.device_count() - from transformers import AutoProcessor # noqa: F401 + from transformers import AutoProcessor # Create processor to handle the chat prompt processor = AutoProcessor.from_pretrained(MODEL_NAME) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py 
b/tests/v1/kv_connector/unit/test_nixl_connector.py index 20ef566416b8f..f4389a405196f 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -41,10 +41,13 @@ from vllm.distributed.kv_transfer.kv_transfer_state import ( has_kv_transfer_group, ) from vllm.forward_context import ForwardContext +from vllm.outputs import RequestOutput from vllm.platforms import current_platform from vllm.platforms.interface import Platform from vllm.sampling_params import SamplingParams from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput from vllm.v1.request import RequestStatus @@ -1265,6 +1268,22 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend): run_test_and_cleanup() +class RequestIdMapper: + """Helper class to map external request IDs to internal request IDs.""" + + def __init__(self, output_processor: OutputProcessor): + self.req_id_mapping: dict[str, str] = {} + self.original_add_request = output_processor.add_request + output_processor.add_request = self._add_request + + def _add_request(self, request: EngineCoreRequest, *args, **kwargs): + self.req_id_mapping[request.external_req_id] = request.request_id + return self.original_add_request(request, *args, **kwargs) + + def __call__(self, external_req_id: str) -> str: + return self.req_id_mapping[external_req_id] + + def _run_abort_timeout_test(llm: LLM, timeout: int): """Helper function to run the abort timeout test logic.""" remote_prefill_opts = { @@ -1286,24 +1305,34 @@ def _run_abort_timeout_test(llm: LLM, timeout: int): 0 ].req_to_blocks + id_mapper = RequestIdMapper(llm.llm_engine.output_processor) + + def req_id(outputs: list[RequestOutput]) -> str: + assert len(outputs) == 1 + return id_mapper(outputs[0].request_id) + padding = "Just 
making this request a little longer so that we're sure " "we're not hitting the small-request lower bound beneath which we don't " "actually trigger the whole kv transfer, but rather just recompute the " "blocks on D." - _ = llm.generate([f"What is the capital of Japan? {padding}"], sampling_params) + req0_id = req_id( + llm.generate([f"What is the capital of Japan? {padding}"], sampling_params) + ) # Request finished but not freed - assert "0" in scheduler.finished_req_ids and "0" in req_to_blocks + assert req0_id in scheduler.finished_req_ids and req0_id in req_to_blocks # Some other request, 0 still not freed - _ = llm.generate([f"What is the capital of Italy? {padding}"], sampling_params) - assert "0" in req_to_blocks - assert "1" in scheduler.finished_req_ids and "1" in req_to_blocks + req1_id = req_id( + llm.generate([f"What is the capital of Italy? {padding}"], sampling_params) + ) + assert req0_id in req_to_blocks + assert req1_id in scheduler.finished_req_ids and req1_id in req_to_blocks # Wait for timeout and trigger another scheduler loop time.sleep(timeout) _ = llm.generate([f"What is the capital of France? {padding}"], sampling_params) # Request-0 times out and is cleared! 
- assert "0" not in req_to_blocks + assert req0_id not in req_to_blocks # Need to shutdown the background thread to release NIXL side channel port llm.llm_engine.engine_core.shutdown() diff --git a/tests/v1/tpu/test_multimodal.py b/tests/v1/tpu/test_multimodal.py index 5bf823417d4dc..3caa7c14b393b 100644 --- a/tests/v1/tpu/test_multimodal.py +++ b/tests/v1/tpu/test_multimodal.py @@ -4,7 +4,7 @@ import openai import pytest -from vllm.multimodal.utils import encode_image_base64 +from vllm.multimodal.utils import encode_image_url from vllm.platforms import current_platform from ...entrypoints.openai.test_vision import TEST_IMAGE_ASSETS @@ -12,11 +12,9 @@ from ...utils import RemoteOpenAIServer @pytest.fixture(scope="session") -def base64_encoded_image(local_asset_server) -> dict[str, str]: +def url_encoded_image(local_asset_server) -> dict[str, str]: return { - image_asset: encode_image_base64( - local_asset_server.get_image_asset(image_asset) - ) + image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset)) for image_asset in TEST_IMAGE_ASSETS } @@ -24,19 +22,16 @@ def base64_encoded_image(local_asset_server) -> dict[str, str]: @pytest.mark.asyncio @pytest.mark.skipif(not current_platform.is_tpu(), reason="This test needs a TPU") @pytest.mark.parametrize("model_name", ["llava-hf/llava-1.5-7b-hf"]) -async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, str]): +async def test_basic_vision(model_name: str, url_encoded_image: dict[str, str]): pytest.skip("Skip this test until it's fixed.") - def whats_in_this_image_msg(b64): + def whats_in_this_image_msg(url): return [ { "role": "user", "content": [ {"type": "text", "text": "What's in this image?"}, - { - "type": "image_url", - "image_url": {"url": f"data:image/jpeg;base64,{b64}"}, - }, + {"type": "image_url", "image_url": {"url": url}}, ], } ] @@ -63,14 +58,14 @@ async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, str # Other requests now should be much 
faster for image_url in TEST_IMAGE_ASSETS: - image_base64 = base64_encoded_image[image_url] - chat_completion_from_base64 = await client.chat.completions.create( + image_url = url_encoded_image[image_url] + chat_completion_from_url = await client.chat.completions.create( model=model_name, - messages=whats_in_this_image_msg(image_base64), + messages=whats_in_this_image_msg(image_url), max_completion_tokens=24, temperature=0.0, ) - result = chat_completion_from_base64 + result = chat_completion_from_url assert result choice = result.choices[0] assert choice.finish_reason == "length" diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index 299c8219120ae..5820832ed4860 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -380,6 +380,31 @@ def _rocm_aiter_gemm_a8w8_fake( return Y +def _rocm_aiter_triton_gemm_a8w8_blockscale_impl( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + output_dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + from aiter.ops.triton.gemm_a8w8_blockscale import gemm_a8w8_blockscale + + return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype) + + +def _rocm_aiter_triton_gemm_a8w8_blockscale_fake( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + output_dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + m = A.shape[0] + n = B.shape[0] + Y = torch.empty(m, n, dtype=output_dtype, device=A.device) + return Y + + def _rocm_aiter_gemm_a8w8_blockscale_impl( A: torch.Tensor, B: torch.Tensor, @@ -964,6 +989,12 @@ class rocm_aiter_ops: dispatch_key=current_platform.dispatch_key, ) + direct_register_custom_op( + op_name="rocm_aiter_triton_gemm_a8w8_blockscale", + op_func=_rocm_aiter_triton_gemm_a8w8_blockscale_impl, + fake_impl=_rocm_aiter_triton_gemm_a8w8_blockscale_fake, + ) + direct_register_custom_op( op_name="rocm_aiter_gemm_a8w8_blockscale", op_func=_rocm_aiter_gemm_a8w8_blockscale_impl, @@ -1102,6 +1133,19 @@ class rocm_aiter_ops: ) -> torch.Tensor: return 
torch.ops.vllm.rocm_aiter_gemm_a8w8(A, B, As, Bs, bias, output_dtype) + @staticmethod + def triton_gemm_a8w8_blockscale( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + block_size: list[int], + output_dtype: torch.dtype = torch.float16, + ) -> torch.Tensor: + return torch.ops.vllm.rocm_aiter_triton_gemm_a8w8_blockscale( + A, B, As, Bs, output_dtype + ) + @staticmethod def gemm_a8w8_blockscale( A: torch.Tensor, @@ -1373,19 +1417,6 @@ class rocm_aiter_ops: config=config, ) - @staticmethod - def triton_gemm_a8w8_blockscale( - A: torch.Tensor, - B: torch.Tensor, - As: torch.Tensor, - Bs: torch.Tensor, - block_size: list[int], - output_dtype: torch.dtype = torch.float16, - ) -> torch.Tensor: - from aiter.ops.triton.gemm_a8w8_blockscale import gemm_a8w8_blockscale - - return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype) - @staticmethod def group_fp8_quant( input_2d: torch.Tensor, diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 78bd8d4e64115..c1519fc177250 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -2328,18 +2328,6 @@ def concat_and_cache_mla( ) -def copy_blocks( - key_caches: list[torch.Tensor], - value_caches: list[torch.Tensor], - block_mapping: torch.Tensor, -) -> None: - torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) - - -def copy_blocks_mla(kv_caches: list[torch.Tensor], block_mapping: torch.Tensor) -> None: - torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping) - - def swap_blocks( src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor ) -> None: diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 95c17cb331f67..239f5376eb462 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -383,18 +383,6 @@ class ipex_ops: ) return None - @staticmethod - def copy_blocks( - key_caches: list[torch.Tensor], - value_caches: list[torch.Tensor], - block_mapping: torch.Tensor, - ) -> None: - torch.xpu.copy_blocks( # type: ignore - key_caches, - 
value_caches, - block_mapping, - ) - @staticmethod def swap_blocks( src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor diff --git a/vllm/attention/layers/mm_encoder_attention.py b/vllm/attention/layers/mm_encoder_attention.py index 25f54cc867b5a..1c1623b13f55a 100644 --- a/vllm/attention/layers/mm_encoder_attention.py +++ b/vllm/attention/layers/mm_encoder_attention.py @@ -136,7 +136,7 @@ class MMEncoderAttention(CustomOp): cu_seqlens=cu_seqlens, ) if is_reshaped: - output = output.view(bsz, q_len, -1) + output = output.reshape(bsz, q_len, -1) return output def _forward_fa( @@ -174,7 +174,7 @@ class MMEncoderAttention(CustomOp): fa_version=self._fa_version, ) if is_reshaped: - output = output.view(bsz, q_len, -1) + output = output.reshape(bsz, q_len, -1) return output def forward_native( diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 49ee0faf049d1..067e31f4303b6 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1847,7 +1847,6 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: random_seed=args.seed, dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle, - prefix_len=args.common_prefix_len, ).sample( tokenizer=tokenizer, num_requests=args.num_prompts, diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 12756d1700c9f..f10f50834e4c9 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1281,12 +1281,6 @@ def add_cli_args(parser: argparse.ArgumentParser): help="Repetition penalty sampling parameter. 
Only has effect on " "openai-compatible backends.", ) - sampling_group.add_argument( - "--common-prefix-len", - type=int, - default=None, - help="Common prefix length shared by all prompts (used by random dataset)", - ) parser.add_argument( "--served-model-name", diff --git a/vllm/config/model.py b/vllm/config/model.py index dd2b7b9d7a786..a730aa8ad1b9c 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -11,7 +11,6 @@ import torch from pydantic import ConfigDict, Field, field_validator, model_validator from pydantic.dataclasses import dataclass from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE -from transformers.configuration_utils import ALLOWED_LAYER_TYPES import vllm.envs as envs from vllm.attention.backends.registry import AttentionBackendEnum @@ -29,6 +28,7 @@ from vllm.transformers_utils.config import ( get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, + is_rope_parameters_nested, try_get_dense_modules, try_get_generation_config, try_get_safetensors_metadata, @@ -164,7 +164,7 @@ class ModelConfig: """The specific revision to use for the tokenizer on the Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" - max_model_len: int = Field(default=None, gt=0) + max_model_len: int = Field(default=None, ge=-1) """Model context length (prompt and output). If unspecified, will be automatically derived from the model config. @@ -172,7 +172,10 @@ class ModelConfig: format. Examples:\n - 1k -> 1000\n - 1K -> 1024\n - - 25.6k -> 25,600""" + - 25.6k -> 25,600\n + - -1 or 'auto' -> Automatically choose the maximum model length that fits in + GPU memory. 
This will use the model's maximum context length if it fits, + otherwise it will find the largest length that can be accommodated.""" spec_target_max_model_len: int | None = None """Specify the maximum length for spec decoding draft models.""" quantization: QuantizationMethods | str | None = None @@ -592,7 +595,7 @@ class ModelConfig: # Avoid running try_verify_and_update_config multiple times self.config_updated = False - + self._try_verify_and_update_model_config() self._verify_quantization() self._verify_cuda_graph() self._verify_bnb_config() @@ -1005,6 +1008,23 @@ class ModelConfig: "when expert parallelism is enabled." ) + def _try_verify_and_update_model_config(self): + # Avoid running try_verify_and_update_config multiple times + if getattr(self, "config_updated", False): + return + + architecture = self.architecture + if architecture is None: + return + + from vllm.model_executor.models.config import ( + MODELS_CONFIG_MAP, + ) + + cls = MODELS_CONFIG_MAP.get(architecture, None) + if cls is not None: + cls.verify_and_update_model_config(self) + def verify_dual_chunk_attention_config( self, load_config: LoadConfig, @@ -1094,11 +1114,10 @@ class ModelConfig: # The size of inputs_embeds is usually identical to the size # of the hidden states, however there are exceptions, such as # embedding models like CLIP and SigLIP - for target_attr in ("projection_dim", "projection_size"): - if hasattr(self.hf_text_config, target_attr): - return getattr(self.hf_text_config, target_attr) - - return self.get_hidden_size() + names = ("projection_dim", "projection_size") + return getattr_iter( + self.hf_text_config, names, default_factory=self.get_hidden_size + ) @property def is_deepseek_mla(self) -> bool: @@ -1231,14 +1250,12 @@ class ModelConfig: # For ChatGLM: "multi_query_group_num", ] - for attr in attributes: - num_kv_heads = getattr(self.hf_text_config, attr, None) - if num_kv_heads is not None: - return num_kv_heads - # For non-grouped-query attention models, the 
number of KV heads is # equal to the number of attention heads. - return self.hf_text_config.num_attention_heads + default_factory = lambda: self.hf_text_config.num_attention_heads + return getattr_iter( + self.hf_text_config, attributes, default_factory=default_factory + ) def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int: """Returns the number of KV heads per GPU.""" @@ -1542,6 +1559,10 @@ class ModelConfig: def is_multimodal_raw_input_only_model(self) -> bool: return self._model_info.supports_multimodal_raw_input_only + @property + def requires_raw_input_tokens(self) -> bool: + return self._model_info.requires_raw_input_tokens + @property def is_cross_encoder(self) -> bool: return ( @@ -2125,9 +2146,7 @@ def _get_and_verify_max_len( # In Transformers v5 rope_parameters could be TypedDict or dict[str, TypedDict]. # To simplify the verification, we convert it to dict[str, TypedDict]. rope_parameters = getattr(hf_config, "rope_parameters", None) - if rope_parameters and not set(rope_parameters.keys()).issubset( - ALLOWED_LAYER_TYPES - ): + if rope_parameters and not is_rope_parameters_nested(rope_parameters): rope_parameters = {"": rope_parameters} # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE @@ -2152,9 +2171,10 @@ def _get_and_verify_max_len( if encoder_config and "max_seq_length" in encoder_config: derived_max_model_len = encoder_config["max_seq_length"] - # If the user didn't specify `max_model_len`, then use that derived from - # the model config as a default value. - if max_model_len is None: + # If the user didn't specify `max_model_len` or specified -1 (auto-fit), + # then use that derived from the model config as a default value. + # When -1 is specified, the engine will later auto-fit to available memory. 
+ if max_model_len is None or max_model_len == -1: # For LongRoPE, default to original_max_position_embeddings to avoid # performance degradation for shorter sequences if rope_parameters is not None and any( diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 7fb3bef34b77a..11504fb083558 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -465,6 +465,7 @@ class ParallelConfig: # Derived/runtime topology, networking, or launch details "data_parallel_rank", "data_parallel_rank_local", + "data_parallel_size_local", "data_parallel_backend", "data_parallel_external_lb", "data_parallel_hybrid_lb", diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 470296517deb1..614373782d12f 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -9,7 +9,7 @@ import inspect import json import pathlib import textwrap -from collections.abc import Iterable, Mapping, Sequence, Set +from collections.abc import Callable, Iterable, Mapping, Sequence, Set from dataclasses import MISSING, Field, dataclass, field, fields, is_dataclass, replace from itertools import pairwise from typing import TYPE_CHECKING, Any, Protocol, TypeVar @@ -74,7 +74,11 @@ def get_field(cls: ConfigType, name: str) -> Field: def getattr_iter( - object: object, names: Iterable[str], default: Any, warn: bool = False + object: object, + names: Iterable[str], + default: Any | None = None, + default_factory: Callable[[], Any] | None = None, + warn: bool = False, ) -> Any: """ A helper function that retrieves an attribute from an object which may @@ -96,7 +100,7 @@ def getattr_iter( names[0], ) return getattr(object, name) - return default + return default_factory() if default_factory is not None else default def contains_object_print(text: str) -> bool: diff --git a/vllm/distributed/ec_transfer/ec_connector/example_connector.py b/vllm/distributed/ec_transfer/ec_connector/example_connector.py index 3518044ce2e00..48a7d41908fd4 100644 --- 
a/vllm/distributed/ec_transfer/ec_connector/example_connector.py +++ b/vllm/distributed/ec_transfer/ec_connector/example_connector.py @@ -81,10 +81,7 @@ class ECExampleConnector(ECConnectorBase): assert encoder_cache is not None if metadata is None: logger.warning( - ( - "In connector.start_load_caches, ", - "but the connector metadata is None", - ) + "In connector.start_load_caches, but the connector metadata is None" ) return # Load the EC for each mm data diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py index 705960aebe2da..9a15d3fa6ed09 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py @@ -408,7 +408,13 @@ class MooncakeConnectorWorker: self.engine = TransferEngine() self.hostname = get_ip() - ret_value = self.engine.initialize(self.hostname, "P2PHANDSHAKE", "rdma", "") + protocol = self.vllm_config.kv_transfer_config.kv_connector_extra_config.get( # type: ignore[union-attr] + "mooncake_protocol", "rdma" + ) + logger.info( + "The Mooncake Transfer Engine is using %s as its protocol.", protocol + ) + ret_value = self.engine.initialize(self.hostname, "P2PHANDSHAKE", protocol, "") if ret_value != 0: raise RuntimeError("Mooncake Transfer Engine initialization failed.") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a4d262d5e1183..1442c83a1504a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -297,16 +297,14 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]: elif contains_type(type_hints, set): kwargs[name].update(collection_to_kwargs(type_hints, set)) elif contains_type(type_hints, int): - kwargs[name]["type"] = int - # Special case for large integers - human_readable_ints = { - "max_model_len", - "max_num_batched_tokens", - "kv_cache_memory_bytes", - } - if name in human_readable_ints: + if name == 
"max_model_len": + kwargs[name]["type"] = human_readable_int_or_auto + kwargs[name]["help"] += f"\n\n{human_readable_int_or_auto.__doc__}" + elif name in ("max_num_batched_tokens", "kv_cache_memory_bytes"): kwargs[name]["type"] = human_readable_int kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}" + else: + kwargs[name]["type"] = int elif contains_type(type_hints, float): kwargs[name]["type"] = float elif contains_type(type_hints, dict) and ( @@ -2042,7 +2040,7 @@ def _raise_unsupported_error(feature_name: str): raise NotImplementedError(msg) -def human_readable_int(value): +def human_readable_int(value: str) -> int: """Parse human-readable integers like '1k', '2M', etc. Including decimal values with decimal multipliers. @@ -2052,6 +2050,7 @@ def human_readable_int(value): - '25.6k' -> 25,600 """ value = value.strip() + match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value) if match: decimal_multiplier = { @@ -2085,3 +2084,22 @@ def human_readable_int(value): # Regular plain number. return int(value) + + +def human_readable_int_or_auto(value: str) -> int: + """Parse human-readable integers like '1k', '2M', etc. + Including decimal values with decimal multipliers. + Also accepts -1 or 'auto' as a special value for auto-detection. + + Examples: + - '1k' -> 1,000 + - '1K' -> 1,024 + - '25.6k' -> 25,600 + - '-1' or 'auto' -> -1 (special value for auto-detection) + """ + value = value.strip() + + if value == "-1" or value.lower() == "auto": + return -1 + + return human_readable_int(value) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index ab055dfb1fb0e..5e31f60ad0ca8 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -67,6 +67,15 @@ else: logger = init_logger(__name__) + +class ChatTemplateResolutionError(ValueError): + """Raised when chat template resolution fails. + + This is a subclass of ValueError for backward compatibility with + existing exception handlers. 
+ """ + + MODALITY_PLACEHOLDERS_MAP = { "image": "<##IMAGE##>", "audio": "<##AUDIO##>", @@ -1814,7 +1823,7 @@ def apply_hf_chat_template( ) if hf_chat_template is None: - raise ValueError( + raise ChatTemplateResolutionError( "As of transformers v4.44, default chat template is no longer " "allowed, so you must provide a chat template if the tokenizer " "does not define one." diff --git a/vllm/entrypoints/cli/benchmark/main.py b/vllm/entrypoints/cli/benchmark/main.py index 2ff98577c3634..48f34fce1d44c 100644 --- a/vllm/entrypoints/cli/benchmark/main.py +++ b/vllm/entrypoints/cli/benchmark/main.py @@ -32,6 +32,7 @@ class BenchmarkSubcommand(CLISubcommand): ) -> FlexibleArgumentParser: bench_parser = subparsers.add_parser( self.name, + help=self.help, description=self.help, usage=f"vllm {self.name} [options]", ) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 96608f360e17b..77c7253aef06e 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -66,7 +66,11 @@ class ServeSubcommand(CLISubcommand): self, subparsers: argparse._SubParsersAction ) -> FlexibleArgumentParser: serve_parser = subparsers.add_parser( - self.name, description=DESCRIPTION, usage="vllm serve [model_tag] [options]" + self.name, + help="Launch a local OpenAI-compatible API server to serve LLM " + "completions via HTTP.", + description=DESCRIPTION, + usage="vllm serve [model_tag] [options]", ) serve_parser = make_arg_parser(serve_parser) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 2768e267f4837..6be1f1a126f55 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1280,6 +1280,7 @@ class LLM: pooling_params: PoolingParams | None = None, lora_request: list[LoRARequest] | LoRARequest | None = None, tokenization_kwargs: dict[str, Any] | None = None, + score_template: str | None = None, ) -> list[ScoringRequestOutput]: model_config = self.model_config @@ -1313,6 +1314,7 @@ class LLM: data_2=d, 
tokenizer=tokenizer, tokenization_kwargs=tokenization_kwargs, + score_template=score_template, ) if token_type_ids := engine_prompt.pop("token_type_ids", None): @@ -1347,6 +1349,7 @@ class LLM: use_tqdm: bool | Callable[..., tqdm] = True, pooling_params: PoolingParams | None = None, lora_request: list[LoRARequest] | LoRARequest | None = None, + chat_template: str | None = None, ) -> list[ScoringRequestOutput]: """Generate similarity scores for all pairs `` or ``. @@ -1379,6 +1382,8 @@ class LLM: lora_request: LoRA request to use for generation, if any. pooling_params: The pooling parameters for pooling. If None, we use the default pooling parameters. + chat_template: The chat template to use for the scoring. If None, we + use the model's default chat template. Returns: A list of `ScoringRequestOutput` objects containing the generated scores in the same order as the input prompts. @@ -1406,6 +1411,11 @@ class LLM: ): raise ValueError("Score API is only enabled for num_labels == 1.") + if not model_config.is_cross_encoder and chat_template is not None: + raise ValueError( + "chat_template is only supported for cross-encoder models." 
+ ) + # the tokenizer for models such as # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing # lists of tokens to the `text` and `text_pair` kwargs @@ -1475,6 +1485,7 @@ class LLM: use_tqdm, pooling_params, lora_request, + score_template=chat_template, ) else: return self._embedding_score( @@ -1610,7 +1621,7 @@ class LLM: added_request_ids.append(request_id) except Exception as e: if added_request_ids: - self.llm_engine.abort_request(added_request_ids) + self.llm_engine.abort_request(added_request_ids, internal=True) raise e def _validate_mm_data_and_uuids( @@ -1720,7 +1731,7 @@ class LLM: priority=priority, prompt_text=prompt_text, ) - return request_id + return engine_request.request_id def _run_engine( self, *, use_tqdm: bool | Callable[..., tqdm] = True diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index d45773f5364e3..bc8855a76e2a2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -909,6 +909,16 @@ def build_app(args: Namespace) -> FastAPI: @app.exception_handler(RequestValidationError) async def validation_exception_handler(_: Request, exc: RequestValidationError): + from vllm.entrypoints.openai.protocol import VLLMValidationError + + param = None + for error in exc.errors(): + if "ctx" in error and "error" in error["ctx"]: + ctx_error = error["ctx"]["error"] + if isinstance(ctx_error, VLLMValidationError): + param = ctx_error.parameter + break + exc_str = str(exc) errors_str = str(exc.errors()) @@ -922,6 +932,7 @@ def build_app(args: Namespace) -> FastAPI: message=message, type=HTTPStatus.BAD_REQUEST.phrase, code=HTTPStatus.BAD_REQUEST, + param=param, ) ) return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST) @@ -1145,6 +1156,7 @@ async def init_app_state( engine_client, state.openai_serving_models, request_logger=request_logger, + score_template=resolved_chat_template, log_error_stack=args.log_error_stack, ) if ("embed" in 
supported_tasks or "score" in supported_tasks) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index a3c347cb1bd3f..982f5533ad7f9 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -131,6 +131,36 @@ class ErrorResponse(OpenAIBaseModel): error: ErrorInfo +class VLLMValidationError(ValueError): + """vLLM-specific validation error for request validation failures. + + Args: + message: The error message describing the validation failure. + parameter: Optional parameter name that failed validation. + value: Optional value that was rejected during validation. + """ + + def __init__( + self, + message: str, + *, + parameter: str | None = None, + value: Any = None, + ) -> None: + super().__init__(message) + self.parameter = parameter + self.value = value + + def __str__(self): + base = super().__str__() + extras = [] + if self.parameter is not None: + extras.append(f"parameter={self.parameter}") + if self.value is not None: + extras.append(f"value={self.value}") + return f"{base} ({', '.join(extras)})" if extras else base + + class ModelPermission(OpenAIBaseModel): id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}") object: str = "model_permission" @@ -466,7 +496,9 @@ class ResponsesRequest(OpenAIBaseModel): @model_validator(mode="before") def validate_prompt(cls, data): if data.get("prompt") is not None: - raise ValueError("prompt template is not supported") + raise VLLMValidationError( + "prompt template is not supported", parameter="prompt" + ) return data @model_validator(mode="before") @@ -850,7 +882,10 @@ class ChatCompletionRequest(OpenAIBaseModel): @classmethod def validate_stream_options(cls, data): if data.get("stream_options") and not data.get("stream"): - raise ValueError("Stream options can only be defined when `stream=True`.") + raise VLLMValidationError( + "Stream options can only be defined when `stream=True`.", + parameter="stream_options", + ) return data @@ 
-859,19 +894,29 @@ class ChatCompletionRequest(OpenAIBaseModel): def check_logprobs(cls, data): if (prompt_logprobs := data.get("prompt_logprobs")) is not None: if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1): - raise ValueError( - "`prompt_logprobs` are not available when `stream=True`." + raise VLLMValidationError( + "`prompt_logprobs` are not available when `stream=True`.", + parameter="prompt_logprobs", ) if prompt_logprobs < 0 and prompt_logprobs != -1: - raise ValueError("`prompt_logprobs` must be a positive value or -1.") + raise VLLMValidationError( + "`prompt_logprobs` must be a positive value or -1.", + parameter="prompt_logprobs", + value=prompt_logprobs, + ) if (top_logprobs := data.get("top_logprobs")) is not None: if top_logprobs < 0 and top_logprobs != -1: - raise ValueError("`top_logprobs` must be a positive value or -1.") + raise VLLMValidationError( + "`top_logprobs` must be a positive value or -1.", + parameter="top_logprobs", + value=top_logprobs, + ) if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"): - raise ValueError( - "when using `top_logprobs`, `logprobs` must be set to true." + raise VLLMValidationError( + "when using `top_logprobs`, `logprobs` must be set to true.", + parameter="top_logprobs", ) return data @@ -1285,9 +1330,10 @@ class CompletionRequest(OpenAIBaseModel): for k in ("json", "regex", "choice") ) if count > 1: - raise ValueError( + raise VLLMValidationError( "You can only use one kind of constraints for structured " - "outputs ('json', 'regex' or 'choice')." + "outputs ('json', 'regex' or 'choice').", + parameter="structured_outputs", ) return data @@ -1296,14 +1342,23 @@ class CompletionRequest(OpenAIBaseModel): def check_logprobs(cls, data): if (prompt_logprobs := data.get("prompt_logprobs")) is not None: if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1): - raise ValueError( - "`prompt_logprobs` are not available when `stream=True`." 
+ raise VLLMValidationError( + "`prompt_logprobs` are not available when `stream=True`.", + parameter="prompt_logprobs", ) if prompt_logprobs < 0 and prompt_logprobs != -1: - raise ValueError("`prompt_logprobs` must be a positive value or -1.") + raise VLLMValidationError( + "`prompt_logprobs` must be a positive value or -1.", + parameter="prompt_logprobs", + value=prompt_logprobs, + ) if (logprobs := data.get("logprobs")) is not None and logprobs < 0: - raise ValueError("`logprobs` must be a positive value.") + raise VLLMValidationError( + "`logprobs` must be a positive value.", + parameter="logprobs", + value=logprobs, + ) return data @@ -1311,7 +1366,10 @@ class CompletionRequest(OpenAIBaseModel): @classmethod def validate_stream_options(cls, data): if data.get("stream_options") and not data.get("stream"): - raise ValueError("Stream options can only be defined when `stream=True`.") + raise VLLMValidationError( + "Stream options can only be defined when `stream=True`.", + parameter="stream_options", + ) return data @@ -2138,7 +2196,15 @@ class TranscriptionRequest(OpenAIBaseModel): stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"] stream = data.get("stream", False) if any(bool(data.get(so, False)) for so in stream_opts) and not stream: - raise ValueError("Stream options can only be defined when `stream=True`.") + # Find which specific stream option was set + invalid_param = next( + (so for so in stream_opts if data.get(so, False)), + "stream_include_usage", + ) + raise VLLMValidationError( + "Stream options can only be defined when `stream=True`.", + parameter=invalid_param, + ) return data @@ -2351,7 +2417,15 @@ class TranslationRequest(OpenAIBaseModel): stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"] stream = data.get("stream", False) if any(bool(data.get(so, False)) for so in stream_opts) and not stream: - raise ValueError("Stream options can only be defined when `stream=True`.") + # Find which specific stream 
option was set + invalid_param = next( + (so for so in stream_opts if data.get(so, False)), + "stream_include_usage", + ) + raise VLLMValidationError( + "Stream options can only be defined when `stream=True`.", + parameter=invalid_param, + ) return data diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 837e742e6be49..2cdb6a6f8eea2 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -495,6 +495,7 @@ async def run_batch( engine_client, openai_serving_models, request_logger=request_logger, + score_template=None, ) if ("embed" in supported_tasks or enable_serving_reranking) else None diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 422a8c18e8e98..690fb22e2274d 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -417,8 +417,7 @@ class OpenAIServingChat(OpenAIServing): generators.append(generator) except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) assert len(generators) == 1 (result_generator,) = generators @@ -448,8 +447,7 @@ class OpenAIServingChat(OpenAIServing): except GenerationError as e: return self._convert_generation_error_to_response(e) except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) def get_chat_request_role(self, request: ChatCompletionRequest) -> str: if request.add_generation_prompt: @@ -682,7 +680,7 @@ class OpenAIServingChat(OpenAIServing): tool_parsers = [None] * num_choices except Exception as e: logger.exception("Error in tool parser creation.") - data = self.create_streaming_error_response(str(e)) + data = self.create_streaming_error_response(e) yield f"data: {data}\n\n" yield "data: [DONE]\n\n" return @@ -1328,9 +1326,8 @@ class 
OpenAIServingChat(OpenAIServing): except GenerationError as e: yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n" except Exception as e: - # TODO: Use a vllm-specific Validation Error logger.exception("Error in chat completion stream generator.") - data = self.create_streaming_error_response(str(e)) + data = self.create_streaming_error_response(e) yield f"data: {data}\n\n" # Send the final done message after all response.n are finished yield "data: [DONE]\n\n" @@ -1354,8 +1351,7 @@ class OpenAIServingChat(OpenAIServing): except asyncio.CancelledError: return self.create_error_response("Client disconnected") except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) assert final_res is not None diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 265ca9915e5db..d9a8ccb9f851d 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -23,6 +23,7 @@ from vllm.entrypoints.openai.protocol import ( PromptTokenUsageInfo, RequestResponseMetadata, UsageInfo, + VLLMValidationError, ) from vllm.entrypoints.openai.serving_engine import ( GenerationError, @@ -247,8 +248,7 @@ class OpenAIServingCompletion(OpenAIServing): generators.append(generator) except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) result_generator = merge_async_iterators(*generators) @@ -308,8 +308,7 @@ class OpenAIServingCompletion(OpenAIServing): except GenerationError as e: return self._convert_generation_error_to_response(e) except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) # When user requests streaming but we don't stream, we still need to # return a streaming response 
with a single event. @@ -510,9 +509,8 @@ class OpenAIServingCompletion(OpenAIServing): except GenerationError as e: yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n" except Exception as e: - # TODO: Use a vllm-specific Validation Error logger.exception("Error in completion stream generator.") - data = self.create_streaming_error_response(str(e)) + data = self.create_streaming_error_response(e) yield f"data: {data}\n\n" yield "data: [DONE]\n\n" @@ -660,8 +658,11 @@ class OpenAIServingCompletion(OpenAIServing): token = f"token_id:{token_id}" else: if tokenizer is None: - raise ValueError( - "Unable to get tokenizer because `skip_tokenizer_init=True`" + raise VLLMValidationError( + "Unable to get tokenizer because " + "`skip_tokenizer_init=True`", + parameter="skip_tokenizer_init", + value=True, ) token = tokenizer.decode(token_id) @@ -720,6 +721,15 @@ class OpenAIServingCompletion(OpenAIServing): request: CompletionRequest, max_input_length: int | None = None, ) -> RenderConfig: + # Validate max_tokens before using it + if request.max_tokens is not None and request.max_tokens > self.max_model_len: + raise VLLMValidationError( + f"'max_tokens' ({request.max_tokens}) cannot be greater than " + f"the model's maximum context length ({self.max_model_len}).", + parameter="max_tokens", + value=request.max_tokens, + ) + max_input_tokens_len = self.max_model_len - (request.max_tokens or 0) return RenderConfig( max_length=max_input_tokens_len, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index b9771963c6d4c..5ea2a7a572650 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -57,6 +57,7 @@ from vllm.entrypoints.openai.protocol import ( TranscriptionRequest, TranscriptionResponse, TranslationRequest, + VLLMValidationError, ) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.pooling.classify.protocol 
import ( @@ -322,8 +323,10 @@ class OpenAIServing: input_processor = self.input_processor tokenizer = input_processor.tokenizer if tokenizer is None: - raise ValueError( - "You cannot use beam search when `skip_tokenizer_init=True`" + raise VLLMValidationError( + "You cannot use beam search when `skip_tokenizer_init=True`", + parameter="skip_tokenizer_init", + value=True, ) eos_token_id: int = tokenizer.eos_token_id # type: ignore @@ -706,8 +709,7 @@ class OpenAIServing: return None except Exception as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) async def _collect_batch( self, @@ -738,14 +740,43 @@ class OpenAIServing: return None except Exception as e: - return self.create_error_response(str(e)) + return self.create_error_response(e) def create_error_response( self, - message: str, + message: str | Exception, err_type: str = "BadRequestError", status_code: HTTPStatus = HTTPStatus.BAD_REQUEST, + param: str | None = None, ) -> ErrorResponse: + exc: Exception | None = None + + if isinstance(message, Exception): + exc = message + + from vllm.entrypoints.openai.protocol import VLLMValidationError + + if isinstance(exc, VLLMValidationError): + err_type = "BadRequestError" + status_code = HTTPStatus.BAD_REQUEST + param = exc.parameter + elif isinstance(exc, (ValueError, TypeError, RuntimeError)): + # Common validation errors from user input + err_type = "BadRequestError" + status_code = HTTPStatus.BAD_REQUEST + param = None + elif exc.__class__.__name__ == "TemplateError": + # jinja2.TemplateError (avoid importing jinja2) + err_type = "BadRequestError" + status_code = HTTPStatus.BAD_REQUEST + param = None + else: + err_type = "InternalServerError" + status_code = HTTPStatus.INTERNAL_SERVER_ERROR + param = None + + message = str(exc) + if self.log_error_stack: exc_type, _, _ = sys.exc_info() if exc_type is not None: @@ -753,18 +784,27 @@ class OpenAIServing: else: 
traceback.print_stack() return ErrorResponse( - error=ErrorInfo(message=message, type=err_type, code=status_code.value) + error=ErrorInfo( + message=message, + type=err_type, + code=status_code.value, + param=param, + ) ) def create_streaming_error_response( self, - message: str, + message: str | Exception, err_type: str = "BadRequestError", status_code: HTTPStatus = HTTPStatus.BAD_REQUEST, + param: str | None = None, ) -> str: json_str = json.dumps( self.create_error_response( - message=message, err_type=err_type, status_code=status_code + message=message, + err_type=err_type, + status_code=status_code, + param=param, ).model_dump() ) return json_str @@ -825,6 +865,7 @@ class OpenAIServing: message=f"The model `{request.model}` does not exist.", err_type="NotFoundError", status_code=HTTPStatus.NOT_FOUND, + param="model", ) def _get_active_default_mm_loras(self, request: AnyRequest) -> LoRARequest | None: @@ -991,11 +1032,13 @@ class OpenAIServing: ClassificationChatRequest: "classification", } operation = operations.get(type(request), "embedding generation") - raise ValueError( + raise VLLMValidationError( f"This model's maximum context length is " f"{self.max_model_len} tokens. However, you requested " f"{token_num} tokens in the input for {operation}. " - f"Please reduce the length of the input." + f"Please reduce the length of the input.", + parameter="input_tokens", + value=token_num, ) return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids) @@ -1017,20 +1060,24 @@ class OpenAIServing: # Note: input length can be up to model context length - 1 for # completion-like requests. if token_num >= self.max_model_len: - raise ValueError( + raise VLLMValidationError( f"This model's maximum context length is " f"{self.max_model_len} tokens. However, your request has " f"{token_num} input tokens. Please reduce the length of " - "the input messages." 
+ "the input messages.", + parameter="input_tokens", + value=token_num, ) if max_tokens is not None and token_num + max_tokens > self.max_model_len: - raise ValueError( + raise VLLMValidationError( "'max_tokens' or 'max_completion_tokens' is too large: " f"{max_tokens}. This model's maximum context length is " f"{self.max_model_len} tokens and your request has " f"{token_num} input tokens ({max_tokens} > {self.max_model_len}" - f" - {token_num})." + f" - {token_num}).", + parameter="max_tokens", + value=max_tokens, ) return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids) diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 1f9b5704624ab..e9eaaa49275d3 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -94,6 +94,7 @@ from vllm.entrypoints.openai.protocol import ( ResponsesResponse, ResponseUsage, StreamingResponsesResponse, + VLLMValidationError, ) from vllm.entrypoints.openai.serving_engine import ( GenerationError, @@ -271,6 +272,7 @@ class OpenAIServingResponses(OpenAIServing): err_type="invalid_request_error", message=error_message, status_code=HTTPStatus.BAD_REQUEST, + param="input", ) return None @@ -282,6 +284,7 @@ class OpenAIServingResponses(OpenAIServing): err_type="invalid_request_error", message="logprobs are not supported with gpt-oss models", status_code=HTTPStatus.BAD_REQUEST, + param="logprobs", ) if request.store and not self.enable_store and request.background: return self.create_error_response( @@ -294,6 +297,7 @@ class OpenAIServingResponses(OpenAIServing): "the vLLM server." 
), status_code=HTTPStatus.BAD_REQUEST, + param="background", ) if request.previous_input_messages and request.previous_response_id: return self.create_error_response( @@ -301,6 +305,7 @@ class OpenAIServingResponses(OpenAIServing): message="Only one of `previous_input_messages` and " "`previous_response_id` can be set.", status_code=HTTPStatus.BAD_REQUEST, + param="previous_response_id", ) return None @@ -457,8 +462,7 @@ class OpenAIServingResponses(OpenAIServing): ) generators.append(generator) except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) assert len(generators) == 1 (result_generator,) = generators @@ -546,7 +550,7 @@ class OpenAIServingResponses(OpenAIServing): except GenerationError as e: return self._convert_generation_error_to_response(e) except Exception as e: - return self.create_error_response(str(e)) + return self.create_error_response(e) async def _make_request( self, @@ -630,8 +634,7 @@ class OpenAIServingResponses(OpenAIServing): except asyncio.CancelledError: return self.create_error_response("Client disconnected") except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) # NOTE: Implementation of stauts is still WIP, but for now # we guarantee that if the status is not "completed", it is accurate. 
@@ -1074,7 +1077,7 @@ class OpenAIServingResponses(OpenAIServing): response = self._convert_generation_error_to_response(e) except Exception as e: logger.exception("Background request failed for %s", request.request_id) - response = self.create_error_response(str(e)) + response = self.create_error_response(e) finally: new_event_signal.set() @@ -1099,7 +1102,7 @@ class OpenAIServingResponses(OpenAIServing): response = self._convert_generation_error_to_response(e) except Exception as e: logger.exception("Background request failed for %s", request.request_id) - response = self.create_error_response(str(e)) + response = self.create_error_response(e) if isinstance(response, ErrorResponse): # If the request has failed, update the status to "failed". @@ -1116,7 +1119,11 @@ class OpenAIServingResponses(OpenAIServing): starting_after: int | None = None, ) -> AsyncGenerator[StreamingResponsesResponse, None]: if response_id not in self.event_store: - raise ValueError(f"Unknown response_id: {response_id}") + raise VLLMValidationError( + f"Unknown response_id: {response_id}", + parameter="response_id", + value=response_id, + ) event_deque, new_event_signal = self.event_store[response_id] start_index = 0 if starting_after is None else starting_after + 1 @@ -1172,6 +1179,7 @@ class OpenAIServingResponses(OpenAIServing): return self.create_error_response( err_type="invalid_request_error", message="Cannot cancel a synchronous response.", + param="response_id", ) # Update the status to "cancelled". @@ -1191,6 +1199,7 @@ class OpenAIServingResponses(OpenAIServing): err_type="invalid_request_error", message=f"Response with id '{response_id}' not found.", status_code=HTTPStatus.NOT_FOUND, + param="response_id", ) def _make_store_not_supported_error(self) -> ErrorResponse: @@ -1203,6 +1212,7 @@ class OpenAIServingResponses(OpenAIServing): "starting the vLLM server." 
), status_code=HTTPStatus.BAD_REQUEST, + param="store", ) async def _process_simple_streaming_events( diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index 3e648f44f380b..22da46902da14 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -30,6 +30,7 @@ from vllm.entrypoints.openai.protocol import ( TranslationSegment, TranslationStreamResponse, UsageInfo, + VLLMValidationError, ) from vllm.entrypoints.openai.serving_engine import OpenAIServing, SpeechToTextRequest from vllm.entrypoints.openai.serving_models import OpenAIServingModels @@ -259,7 +260,11 @@ class OpenAISpeechToText(OpenAIServing): ) if len(audio_data) / 1024**2 > self.max_audio_filesize_mb: - raise ValueError("Maximum file size exceeded.") + raise VLLMValidationError( + "Maximum file size exceeded", + parameter="audio_filesize_mb", + value=len(audio_data) / 1024**2, + ) with io.BytesIO(audio_data) as bytes_: # NOTE resample to model SR here for efficiency. 
This is also a @@ -287,12 +292,18 @@ class OpenAISpeechToText(OpenAIServing): ) if request.response_format == "verbose_json": if not isinstance(prompt, dict): - raise ValueError(f"Expected prompt to be a dict,got {type(prompt)}") + raise VLLMValidationError( + "Expected prompt to be a dict", + parameter="prompt", + value=type(prompt).__name__, + ) prompt_dict = cast(dict, prompt) decoder_prompt = prompt.get("decoder_prompt") if not isinstance(decoder_prompt, str): - raise ValueError( - f"Expected decoder_prompt to bestr, got {type(decoder_prompt)}" + raise VLLMValidationError( + "Expected decoder_prompt to be str", + parameter="decoder_prompt", + value=type(decoder_prompt).__name__, ) prompt_dict["decoder_prompt"] = decoder_prompt.replace( "<|notimestamps|>", "<|0.00|>" @@ -412,7 +423,7 @@ class OpenAISpeechToText(OpenAIServing): except ValueError as e: logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(str(e)) + return self.create_error_response(e) list_result_generator: list[AsyncGenerator[RequestOutput, None]] | None = None try: @@ -448,8 +459,7 @@ class OpenAISpeechToText(OpenAIServing): for i, prompt in enumerate(prompts) ] except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) if request.stream: return stream_generator_method( @@ -523,8 +533,7 @@ class OpenAISpeechToText(OpenAIServing): except asyncio.CancelledError: return self.create_error_response("Client disconnected") except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) async def _speech_to_text_stream_generator( self, @@ -634,9 +643,8 @@ class OpenAISpeechToText(OpenAIServing): ) except Exception as e: - # TODO: Use a vllm-specific Validation Error logger.exception("Error in %s stream generator.", self.task_type) - data = self.create_streaming_error_response(str(e)) 
+ data = self.create_streaming_error_response(e) yield f"data: {data}\n\n" # Send the final done message after all response.n are finished yield "data: [DONE]\n\n" diff --git a/vllm/entrypoints/pooling/embed/conftest.py b/vllm/entrypoints/pooling/embed/conftest.py new file mode 100644 index 0000000000000..002b85874049c --- /dev/null +++ b/vllm/entrypoints/pooling/embed/conftest.py @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Pytest configuration for vLLM pooling embed tests.""" + +import warnings + +import torch + +from vllm.platforms import current_platform + + +def pytest_collection_modifyitems(config, items): + """Configure ROCm-specific settings based on collected tests.""" + if not current_platform.is_rocm(): + return + + # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers + # accuracy issues: https://github.com/vllm-project/vllm/issues/30167 + # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace + torch.backends.cuda.enable_flash_sdp(False) + torch.backends.cuda.enable_mem_efficient_sdp(False) + torch.backends.cuda.enable_math_sdp(True) + warnings.warn( + "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp " + "to avoid HuggingFace Transformers accuracy issues", + UserWarning, + stacklevel=1, + ) diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py index edbfcd03ac92c..9762b23639853 100644 --- a/vllm/entrypoints/pooling/score/serving.py +++ b/vllm/entrypoints/pooling/score/serving.py @@ -52,6 +52,7 @@ class ServingScores(OpenAIServing): models: OpenAIServingModels, *, request_logger: RequestLogger | None, + score_template: str | None = None, log_error_stack: bool = False, ) -> None: super().__init__( @@ -60,6 +61,7 @@ class ServingScores(OpenAIServing): request_logger=request_logger, log_error_stack=log_error_stack, ) + self.score_template = score_template async def _embedding_score( 
self, @@ -169,6 +171,7 @@ class ServingScores(OpenAIServing): data_2=data_2, tokenizer=tokenizer, tokenization_kwargs=tokenization_kwargs, + score_template=self.score_template, ) self._validate_input(request, engine_prompt["prompt_token_ids"], full_prompt) if request.mm_processor_kwargs is not None: diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index 0f89c840be80f..ca2e27fa4428b 100644 --- a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -12,6 +12,7 @@ import torch from pydantic import Field from vllm.config import ModelConfig +from vllm.entrypoints.openai.protocol import VLLMValidationError from vllm.inputs.data import EmbedsPrompt, TextPrompt, TokensPrompt from vllm.inputs.parse import get_prompt_components, parse_raw_prompts from vllm.tokenizers import TokenizerLike @@ -162,8 +163,9 @@ class BaseRenderer(ABC): ) -> list[EmbedsPrompt]: """Load and validate base64-encoded embeddings into prompt objects.""" if not self.model_config.enable_prompt_embeds: - raise ValueError( - "You must set `--enable-prompt-embeds` to input `prompt_embeds`." + raise VLLMValidationError( + "You must set `--enable-prompt-embeds` to input `prompt_embeds`.", + parameter="prompt_embeds", ) def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt: @@ -396,10 +398,12 @@ class CompletionRenderer(BaseRenderer): ) -> TokensPrompt: """Create validated TokensPrompt.""" if max_length is not None and len(token_ids) > max_length: - raise ValueError( + raise VLLMValidationError( f"This model's maximum context length is {max_length} tokens. " f"However, your request has {len(token_ids)} input tokens. " - "Please reduce the length of the input messages." 
+ "Please reduce the length of the input messages.", + parameter="input_tokens", + value=len(token_ids), ) tokens_prompt = TokensPrompt(prompt_token_ids=token_ids) diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index 072ddd4c90b16..d7c31cddffad6 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -11,9 +11,11 @@ from vllm.entrypoints.chat_utils import ( ChatCompletionContentPartImageEmbedsParam, ChatCompletionContentPartImageParam, ChatCompletionContentPartTextParam, + ChatTemplateResolutionError, MultiModalItemTracker, _ContentPart, _parse_chat_message_content_part, + apply_hf_chat_template, ) from vllm.inputs import TokensPrompt from vllm.model_executor.models.interfaces import supports_score_template @@ -139,10 +141,8 @@ def _parse_score_content( return next(iter(mm_placeholder_storage.values()))[0] -def apply_score_template( - model_config: ModelConfig, - prompt_1: str, - prompt_2: str, +def _apply_model_score_template( + model_config: ModelConfig, prompt_1: str, prompt_2: str ) -> str: # NOTE(Simon): lazy import to avoid bring in all dependencies (e.g. gguf) from vllm.model_executor.model_loader import get_model_cls @@ -181,6 +181,7 @@ def get_score_prompt( tokenization_kwargs: dict[str, Any], data_1: str | ScoreContentPartParam, data_2: str | ScoreContentPartParam, + score_template: str | None = None, ) -> tuple[str, TokensPrompt]: prompt_1, prompt_2, mm_data = parse_score_data( data_1, @@ -190,19 +191,48 @@ def get_score_prompt( from vllm.model_executor.model_loader import get_model_cls model = get_model_cls(model_config) - if supports_score_template(model): - full_prompt = apply_score_template(model_config, prompt_1, prompt_2) - prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) - elif model_config.use_pad_token: - # cross_encoder models defaults to using pad_token. 
- prompt_inputs = tokenizer( - text=prompt_1, text_pair=prompt_2, **tokenization_kwargs - ) - full_prompt = tokenizer.decode(prompt_inputs["input_ids"]) + + def default_tokenizer_encode(): + if supports_score_template(model): + full_prompt = _apply_model_score_template(model_config, prompt_1, prompt_2) + prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) + else: + if model_config.use_pad_token: + # cross_encoder models defaults to using pad_token. + prompt_inputs = tokenizer( + text=prompt_1, text_pair=prompt_2, **tokenization_kwargs + ) + full_prompt = tokenizer.decode(prompt_inputs["input_ids"]) + else: + # `llm as reranker` models defaults to not using pad_token. + full_prompt = prompt_1 + prompt_2 + prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs) + return full_prompt, prompt_inputs + + # FIXME: For now, we only apply a template when one is explicitly provided. + # We cannot rely on the tokenizer's chat template because many models + # inherit junk templates from their base LLM, which breaks both the models + # and the tests that use them. + if score_template is None: + full_prompt, prompt_inputs = default_tokenizer_encode() else: - # `llm as reranker` models defaults to not using pad_token. - full_prompt = prompt_1 + prompt_2 - prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs) + # FIXME: Try applying a score template from the CLI arg or tokenizer_config.json + # If that fails because there is no such template, + # fall back to the default implementation. 
+ try: + full_prompt = apply_hf_chat_template( + tokenizer, + [ + {"role": "query", "content": prompt_1}, + {"role": "document", "content": prompt_2}, + ], + score_template, + tools=None, + model_config=model_config, + ) + prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) + except ChatTemplateResolutionError: + full_prompt, prompt_inputs = default_tokenizer_encode() engine_prompt = TokensPrompt(prompt_token_ids=prompt_inputs["input_ids"]) diff --git a/vllm/entrypoints/serve/elastic_ep/api_router.py b/vllm/entrypoints/serve/elastic_ep/api_router.py index 21d5d2e60778a..e5adb81051ffd 100644 --- a/vllm/entrypoints/serve/elastic_ep/api_router.py +++ b/vllm/entrypoints/serve/elastic_ep/api_router.py @@ -43,7 +43,7 @@ async def scale_elastic_ep(raw_request: Request): try: body = await raw_request.json() except json.JSONDecodeError as e: - raise HTTPException(status_code=400, detail="Invalid JSON format") from e # noqa: B904 + raise HTTPException(status_code=400, detail="Invalid JSON format") from e new_data_parallel_size = body.get("new_data_parallel_size") drain_timeout = body.get("drain_timeout", 120) # Default 2 minutes diff --git a/vllm/model_executor/layers/mamba/short_conv.py b/vllm/model_executor/layers/mamba/short_conv.py index 0bbad17d7ebc7..c9a80e9f7317d 100644 --- a/vllm/model_executor/layers/mamba/short_conv.py +++ b/vllm/model_executor/layers/mamba/short_conv.py @@ -118,6 +118,7 @@ class ShortConv(MambaBase, CustomOp): conv_state = self_kv_cache[0].transpose(-1, -2) state_indices_tensor = attn_metadata.state_indices_tensor has_initial_states_p = attn_metadata.has_initial_states_p + query_start_loc_p = attn_metadata.query_start_loc_p BCx, _ = self.in_proj(hidden_states) @@ -165,11 +166,6 @@ class ShortConv(MambaBase, CustomOp): [num_decodes, num_prefills], dim=0, ) - query_start_loc_p = ( - attn_metadata.query_start_loc[-num_prefills - 1 :] - num_decodes - if has_prefill - else None - ) conv_output_list = [] diff --git 
a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 15ea9f7d60fff..8e4dde324f397 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -625,8 +625,9 @@ def silu_mul_per_token_group_quant_fp8_colmajor( M, N = input.size() N_2 = N // 2 + fp8_dtype = current_platform.fp8_dtype() if output is None: - output = torch.empty((M, N_2), dtype=torch.float8_e4m3fn, device=input.device) + output = torch.empty((M, N_2), dtype=fp8_dtype, device=input.device) output_scales = torch.empty( ((N_2 // GROUP_SIZE), M), dtype=torch.float32, device=input.device @@ -637,9 +638,12 @@ def silu_mul_per_token_group_quant_fp8_colmajor( assert M % BLOCK_M == 0 assert N_2 % BLOCK_N == 0 - finfo = torch.finfo(torch.float8_e4m3fn) - fp8_min = finfo.min - fp8_max = finfo.max + # Using the default value (240.0) from pytorch will cause accuracy + # issue on dynamic quantization models. Here use 224.0 for fnuz on ROCm + # platforms that use the torch.float8_e4m3fnuz dtype. + finfo = torch.finfo(fp8_dtype) + fp8_min = -224.0 if current_platform.is_fp8_fnuz() else finfo.min + fp8_max = 224.0 if current_platform.is_fp8_fnuz() else finfo.max # Force even division so we can avoid edgecases within the kernel. 
assert M % BLOCK_M == 0 diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 97c7a20bc4d5a..aa020645021ea 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# ruff: noqa: SIM117 import fnmatch import glob import itertools @@ -59,7 +58,7 @@ def is_moe_model(model: torch.nn.Module) -> bool: class BitsAndBytesModelLoader(BaseModelLoader): - """Model loader to load model weights with BitAndBytes quantization.""" + """Model loader to load model weights with BitsAndBytes quantization.""" possible_config_file_names = ["adapter_config.json"] diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py index 93da07c550195..fb33d3c6448bd 100644 --- a/vllm/model_executor/model_loader/runai_streamer_loader.py +++ b/vllm/model_executor/model_loader/runai_streamer_loader.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# ruff: noqa: SIM117 import os from collections.abc import Generator diff --git a/vllm/model_executor/models/audioflamingo3.py b/vllm/model_executor/models/audioflamingo3.py index 0ca5f2c4e0a75..3609cc26a4c6b 100644 --- a/vllm/model_executor/models/audioflamingo3.py +++ b/vllm/model_executor/models/audioflamingo3.py @@ -111,7 +111,7 @@ class AudioFlamingo3EmbeddingInputs(TensorSchema): audio_embeds: Annotated[ list[torch.Tensor], - TensorShape("bn", "naf", "hs"), + TensorShape("bn", "naf", "hs", dynamic_dims={"naf"}), ] diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index ccac8a6066429..10fd599f9e5f8 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ 
-13,7 +13,7 @@ from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec if TYPE_CHECKING: - from vllm.config import VllmConfig + from vllm.config import ModelConfig, VllmConfig logger = init_logger(__name__) @@ -21,20 +21,24 @@ logger = init_logger(__name__) class VerifyAndUpdateConfig: @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: - raise NotImplementedError + return - -class Gemma3TextModelConfig: @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - hf_config = vllm_config.model_config.hf_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + return + + +class Gemma3TextModelConfig(VerifyAndUpdateConfig): + @staticmethod + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + hf_config = model_config.hf_config hf_config.is_causal = not hf_config.use_bidirectional_attention class GteNewModelConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - config = vllm_config.model_config.hf_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + config = model_config.hf_config assert config.__class__.__name__ == "NewConfig" assert config.hidden_act == "gelu" @@ -53,16 +57,15 @@ class GteNewModelConfig(VerifyAndUpdateConfig): class JambaForSequenceClassificationConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - pooler_config = vllm_config.model_config.pooler_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + pooler_config = model_config.pooler_config if pooler_config.use_activation is None: pooler_config.use_activation = False class JinaRobertaModelConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - model_config = 
vllm_config.model_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: config = model_config.hf_config if config.position_embedding_type == "rotary": @@ -88,6 +91,26 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig): } +class LlamaBidirectionalConfig(VerifyAndUpdateConfig): + @staticmethod + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + from vllm.config.pooler import PoolingTypeStr + + hf_config = model_config.hf_config + hf_config.is_causal = False + + pooling_type_map: dict[str, PoolingTypeStr] = { + "avg": "MEAN", + "cls": "CLS", + "last": "LAST", + } + + pooling_type = pooling_type_map.get(hf_config.pooling, None) + if pooling_type is None: + raise ValueError(f"pool_type {hf_config.pooling} not supported") + model_config.pooler_config.pooling_type = pooling_type + + class NomicBertModelConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: @@ -184,8 +207,8 @@ class NomicBertModelConfig(VerifyAndUpdateConfig): class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - pooler_config = vllm_config.model_config.pooler_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + pooler_config = model_config.pooler_config if pooler_config.step_tag_id is None: pooler_config.step_tag_id = 151651 @@ -193,8 +216,8 @@ class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig): class Qwen2ForRewardModelConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - pooler_config = vllm_config.model_config.pooler_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + pooler_config = model_config.pooler_config if pooler_config.softmax is None: pooler_config.softmax = False @@ -202,8 +225,8 @@ class Qwen2ForRewardModelConfig(VerifyAndUpdateConfig): class 
Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - config = vllm_config.model_config.hf_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + config = model_config.hf_config is_original_qwen3_reranker = getattr( config, "is_original_qwen3_reranker", False @@ -217,23 +240,23 @@ class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig): "Try loading the original Qwen3 Reranker?, see: " "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/offline_reranker.py" ) - vllm_config.model_config.hf_config.method = "from_2_way_softmax" + model_config.hf_config.method = "from_2_way_softmax" class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - config = vllm_config.model_config.hf_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + config = model_config.hf_config config.num_labels = 1 - pooler_config = vllm_config.model_config.pooler_config + pooler_config = model_config.pooler_config if pooler_config.logit_bias is None: pooler_config.logit_bias = 2.65 class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - config = vllm_config.model_config.hf_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + config = model_config.hf_config assert config.__class__.__name__ == "GteConfig" assert config.hidden_act == "gelu" @@ -509,6 +532,8 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "GteNewModel": GteNewModelConfig, "GteNewForSequenceClassification": GteNewModelConfig, "Gemma3TextModel": Gemma3TextModelConfig, + "LlamaBidirectionalForSequenceClassification": LlamaBidirectionalConfig, + "LlamaBidirectionalModel": LlamaBidirectionalConfig, "NomicBertModel": NomicBertModelConfig, 
"Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig, "Qwen2ForRewardModel": Qwen2ForRewardModelConfig, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 22d43a4bae18a..b22cdb6d6c80c 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -878,11 +878,14 @@ class Indexer(nn.Module): ) q_pe, k_pe = rotary_emb(positions, q_pe, k_pe.unsqueeze(1)) - # `rotary_emb` is shape-preserving; `q_pe` is already - # [num_tokens, n_head, rope_dim]. + # Note: RoPE (NeoX) can introduce extra leading dimensions during compilation + # so we need to reshape back to token-flattened shapes + q_pe = q_pe.reshape(-1, self.n_head, self.rope_dim) + k_pe = k_pe.reshape(-1, 1, self.rope_dim) + q = torch.cat([q_pe, q_nope], dim=-1) # `k_pe` is [num_tokens, 1, rope_dim] (MQA). - k = torch.cat([k_pe.squeeze(1), k_nope], dim=-1) + k = torch.cat([k_pe.squeeze(-2), k_nope], dim=-1) # we only quant q here since k quant is fused with cache insertion q = q.view(-1, self.head_dim) @@ -1595,7 +1598,11 @@ class DeepseekV2ForCausalLM( # Determine split axis based on op type # gate/up: ColumnParallel → split along dim 0 # down: RowParallel → split along dim 1 - split_dim = 1 if "down_proj.weight" in name else 0 + split_dim = ( + 1 + if ("down_proj.weight" in name and loaded_weight.ndim > 1) + else 0 + ) total = loaded_weight.shape[split_dim] assert total % num_chunks == 0, ( f"Shared expert weight dim {total} " @@ -1608,14 +1615,13 @@ class DeepseekV2ForCausalLM( weight_to_load = loaded_weight if is_fusion_moe_shared_experts_layer: - if split_dim == 0: - weight_to_load = loaded_weight[ - j * chunk_size : (j + 1) * chunk_size, : - ] + chunk_slice = slice(j * chunk_size, (j + 1) * chunk_size) + if loaded_weight.ndim == 1: + weight_to_load = loaded_weight[chunk_slice] + elif split_dim == 0: + weight_to_load = loaded_weight[chunk_slice, :] else: - weight_to_load = loaded_weight[ - :, j * 
chunk_size : (j + 1) * chunk_size - ] + weight_to_load = loaded_weight[:, chunk_slice] # Synthesize an expert-style name so expert mapping # can route it chunk_name = name.replace( diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 67c65a44dcf7f..f8288b92ebfae 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -94,6 +94,12 @@ class SupportsMultiModal(Protocol): `multimodal_config.mm_encoder_tp_mode="data"`. """ + requires_raw_input_tokens: ClassVar[bool] = False + """ + A flag that indicates this model processes input id tokens + in their raw form and not input embeddings. + """ + merge_by_field_config: ClassVar[bool | None] = None """ [DEPRECATED] A flag that indicates which implementation of @@ -306,6 +312,10 @@ def supports_multimodal_raw_input_only(model: type[object] | object) -> bool: return getattr(model, "supports_multimodal_raw_input_only", False) +def requires_raw_input_tokens(model: type[object] | object) -> bool: + return getattr(model, "requires_raw_input_tokens", False) + + def supports_multimodal_encoder_tp_data(model: type[object] | object) -> bool: return getattr(model, "supports_encoder_tp_data", False) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 3507a2bc66c17..f0f2983f84637 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -57,7 +57,13 @@ from vllm.model_executor.model_loader.weight_utils import ( ) from vllm.sequence import IntermediateTensors -from .interfaces import SupportsEagle, SupportsEagle3, SupportsLoRA, SupportsPP +from .adapters import as_embedding_model, as_seq_cls_model +from .interfaces import ( + SupportsEagle, + SupportsEagle3, + SupportsLoRA, + SupportsPP, +) from .utils import ( AutoWeightsLoader, PPMissingLayer, @@ -698,3 +704,15 @@ class LlamaForCausalLM( name = name.replace(item, mapping[item]) return name, loaded_weight + + +class 
LlamaBidirectionalForSequenceClassification(as_seq_cls_model(LlamaForCausalLM)): + # This class sets the correct attention type and pooling type + # through LlamaBidirectionalConfig. + pass + + +class LlamaBidirectionalModel(as_embedding_model(LlamaForCausalLM)): + # This class sets the correct attention type and pooling type + # through LlamaBidirectionalConfig. + pass diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index c45bdf95e7487..930ff737bcdac 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -139,7 +139,7 @@ class MiniCPMVImageEmbeddingInputs(TensorSchema): type: Literal["image_embeds"] image_embeds: Annotated[ torch.Tensor | list[torch.Tensor], - TensorShape("bn", "ns", "hs"), + TensorShape("bn", "ns", "hs", dynamic_dims={"ns"}), ] diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index f84ddfa84f6ab..c97e6873e0d17 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -101,7 +101,7 @@ class Qwen2AudioEmbeddingInputs(TensorSchema): audio_embeds: Annotated[ list[torch.Tensor], - TensorShape("bn", "naf", "hs"), + TensorShape("bn", "naf", "hs", dynamic_dims={"naf"}), ] diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 5ca6b3d852ac3..07c5499d0b194 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -118,7 +118,7 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor): output_lengths = ( ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13 ) - return feat_lengths, output_lengths + return output_lengths class Qwen3_VisionPatchEmbed(nn.Module): @@ -921,13 +921,11 @@ class Qwen3OmniMoeThinkerMultiModalProcessor( if audio_feature_lengths is None and feature_attention_mask is None: 
audio_output_lengths = [] elif audio_feature_lengths is not None: - _, audio_output_lens = _get_feat_extract_output_lengths( - audio_feature_lengths - ) + audio_output_lens = _get_feat_extract_output_lengths(audio_feature_lengths) audio_output_lengths = audio_output_lens.tolist() elif feature_attention_mask is not None: assert isinstance(feature_attention_mask, torch.Tensor) - _, audio_output_lens = _get_feat_extract_output_lengths( + audio_output_lens = _get_feat_extract_output_lengths( feature_attention_mask.sum(-1) ) audio_output_lengths = audio_output_lens.tolist() @@ -1111,18 +1109,16 @@ class Qwen3OmniMoeConditionalGenerationMixin(Qwen2_5OmniConditionalGenerationMix audio_input: Qwen2_5OmniAudioFeatureInputs, audio_hashes: list[str] | None = None, cached_audio_features: torch.Tensor | None = None, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, ...]: input_features = audio_input["input_features"] audio_feature_lengths = audio_input["audio_feature_lengths"] - audio_feat_lengths, audio_output_lengths = _get_feat_extract_output_lengths( - audio_feature_lengths - ) + audio_output_lengths = _get_feat_extract_output_lengths(audio_feature_lengths) audio_outputs = self.audio_tower( input_features.to(self.audio_tower.dtype), feature_lens=audio_feature_lengths, - aftercnn_lens=audio_feat_lengths, + aftercnn_lens=audio_output_lengths, ) audio_features = audio_outputs.last_hidden_state return audio_features.split(audio_output_lengths.tolist()) @@ -1579,7 +1575,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration( + st_idx ) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - _, audio_len = _get_feat_extract_output_lengths( + audio_len = _get_feat_extract_output_lengths( audio_feature_lengths[audio_idx] ) llm_pos_ids = ( @@ -1700,7 +1696,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration( llm_pos_ids_list.append(bos_block) llm_pos_ids_list.append(bos_block) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - _, audio_len = 
_get_feat_extract_output_lengths( + audio_len = _get_feat_extract_output_lengths( audio_feature_lengths[audio_idx] ) audio_llm_pos_ids = ( diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 3ba61b52cfdf1..fd39afe259ae3 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -46,6 +46,7 @@ from .interfaces import ( has_noops, is_attention_free, is_hybrid, + requires_raw_input_tokens, supports_cross_encoding, supports_mamba_prefix_caching, supports_multimodal, @@ -203,6 +204,7 @@ _EMBEDDING_MODELS = { "GteNewModel": ("bert_with_rope", "GteNewModel"), "InternLM2ForRewardModel": ("internlm2", "InternLM2ForRewardModel"), "JambaForSequenceClassification": ("jamba", "JambaForSequenceClassification"), # noqa: E501 + "LlamaBidirectionalModel": ("llama", "LlamaBidirectionalModel"), "LlamaModel": ("llama", "LlamaForCausalLM"), **{ # Multiple models share the same architecture, so we include them all @@ -246,6 +248,11 @@ _CROSS_ENCODER_MODELS = { "bert_with_rope", "GteNewForSequenceClassification", ), + "JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), + "LlamaBidirectionalForSequenceClassification": ( + "llama", + "LlamaBidirectionalForSequenceClassification", + ), "ModernBertForSequenceClassification": ( "modernbert", "ModernBertForSequenceClassification", @@ -259,8 +266,6 @@ _CROSS_ENCODER_MODELS = { "roberta", "RobertaForSequenceClassification", ), - # [Auto-converted (see adapters.py)] - "JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501, } _MULTIMODAL_MODELS = { @@ -418,6 +423,7 @@ _MULTIMODAL_MODELS = { ), "UltravoxModel": ("ultravox", "UltravoxModel"), "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501 + "VoxtralStreamingGeneration": ("voxtral_streaming", "VoxtralStreamingGeneration"), # noqa: E501 # [Encoder-decoder] "WhisperForConditionalGeneration": ("whisper", 
"WhisperForConditionalGeneration"), # noqa: E501 } @@ -535,6 +541,7 @@ class _ModelInfo: supports_cross_encoding: bool supports_multimodal: bool supports_multimodal_raw_input_only: bool + requires_raw_input_tokens: bool supports_multimodal_encoder_tp_data: bool supports_pp: bool has_inner_state: bool @@ -558,6 +565,7 @@ class _ModelInfo: supports_multimodal_raw_input_only=supports_multimodal_raw_input_only( model ), + requires_raw_input_tokens=requires_raw_input_tokens(model), supports_multimodal_encoder_tp_data=supports_multimodal_encoder_tp_data( model ), diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py index efdee255ab5eb..15d0ff30ed9bb 100644 --- a/vllm/model_executor/models/siglip2navit.py +++ b/vllm/model_executor/models/siglip2navit.py @@ -163,8 +163,10 @@ def apply_rotary_pos_emb( enable_fp32_compute=True, ) - if is_flash_attn_backend and not current_platform.is_cuda(): + if is_flash_attn_backend and current_platform.is_cuda(): apply_rotary_emb_func = apply_rotary_emb.forward_cuda + elif is_flash_attn_backend and current_platform.is_rocm(): + apply_rotary_emb_func = apply_rotary_emb.forward_hip else: apply_rotary_emb_func = apply_rotary_emb.forward_native diff --git a/vllm/model_executor/models/transformers/utils.py b/vllm/model_executor/models/transformers/utils.py index b807f45b5d52b..c7844381eb633 100644 --- a/vllm/model_executor/models/transformers/utils.py +++ b/vllm/model_executor/models/transformers/utils.py @@ -22,7 +22,6 @@ from typing import TYPE_CHECKING, Literal import torch from torch import nn -from transformers.configuration_utils import ALLOWED_LAYER_TYPES from vllm.config.utils import getattr_iter from vllm.logger import init_logger @@ -32,6 +31,7 @@ from vllm.model_executor.layers.linear import ( ReplicatedLinear, RowParallelLinear, ) +from vllm.transformers_utils.config import is_rope_parameters_nested if TYPE_CHECKING: from vllm.config import VllmConfig @@ -207,7 +207,7 @@ def 
can_enable_torch_compile(vllm_config: "VllmConfig") -> bool: rope_parameters: dict | None = getattr(text_config, "rope_parameters", None) or {} if rope_parameters: # Nest rope_parameters if not nested already to simplify logic - if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + if not is_rope_parameters_nested(rope_parameters): rope_parameters = {"": rope_parameters} return all(rp["rope_type"] != "dynamic" for rp in rope_parameters.values()) return True diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 331f0c54ecfbc..cbba1af89190c 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import inspect import math from collections.abc import Iterable, Mapping, Sequence from functools import cached_property @@ -116,10 +117,7 @@ class VoxtralProcessorAdapter: self, audio_length: int, ) -> int: - pad_audio_length = self._audio_processor.next_multiple_of_chunk_frames( - audio_length, self.sampling_rate - ) - return ceil(pad_audio_length / (self.sampling_rate // self.frame_rate)) + return ceil(audio_length / (self.sampling_rate // self.frame_rate)) def __call__( self, @@ -158,7 +156,14 @@ class VoxtralProcessorAdapter: assert audio.ndim == 1 # pad if necessary - audio = self._audio_processor.pad(audio, self.sampling_rate) + # TODO(Patrick) - remove once mistral-common is bumped + sig = inspect.signature(self._audio_processor.pad) + if "is_online_streaming" in sig.parameters: + audio = self._audio_processor.pad( + audio, self.sampling_rate, is_online_streaming=False + ) + else: + audio = self._audio_processor.pad(audio, self.sampling_rate) audio_tokens = [self.begin_audio_token_id] + [ self.audio_token_id @@ -510,6 +515,7 @@ class VoxtralForConditionalGeneration( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: 
remapping_rules = [ + (r"mm_streams_embeddings.embedding_module\.(.*)", r"\1"), (r"mm_whisper_embeddings\.(.*)", r"\1"), (r"audio_language_projection\.(.*)", r"audio_language_adapter.\1"), ( @@ -535,13 +541,16 @@ class VoxtralForConditionalGeneration( def llm_weights_generator(): nonlocal loaded_weights for name, w in weights: - is_encoder = ( - name.startswith("mm_whisper_embeddings") - and not name.startswith("mm_whisper_embeddings.tok_embeddings") - and not name.startswith( - "mm_whisper_embeddings.audio_language_projection" + is_encoder = False + for k in [ + "mm_whisper_embeddings", + "mm_streams_embeddings.embedding_module", + ]: + is_encoder |= ( + name.startswith(k) + and not name.startswith(f"{k}.tok_embeddings") + and not name.startswith(f"{k}.audio_language_projection") ) - ) for pattern, repl in remapping_rules: if re.fullmatch(pattern, name): @@ -676,6 +685,7 @@ class VoxtralEncoderModel(nn.Module): packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]} mistral_remapping = [ + (r"mm_streams_embeddings.embedding_module\.(.*)", r"\1"), ( r"whisper_encoder\.conv_layers\.0\.(weight|bias)", r"whisper_encoder.conv1.\1", @@ -684,6 +694,14 @@ class VoxtralEncoderModel(nn.Module): r"whisper_encoder\.conv_layers\.1\.(weight|bias)", r"whisper_encoder.conv2.\1", ), + ( + r"whisper_encoder\.conv_layers\.0\.conv\.(weight|bias)", + r"whisper_encoder.conv1.\1", + ), # noqa: E501 + ( + r"whisper_encoder\.conv_layers\.1\.conv\.(weight|bias)", + r"whisper_encoder.conv2.\1", + ), # noqa: E501 ( r"whisper_encoder\.transformer\.layers\.(\d+)\.attention\.w([qkv])\.(weight|bias)", # noqa: E501 r"whisper_encoder.layers.\1.self_attn.\2_proj.\3", diff --git a/vllm/model_executor/models/voxtral_streaming.py b/vllm/model_executor/models/voxtral_streaming.py new file mode 100644 index 0000000000000..2e79e24e6f194 --- /dev/null +++ b/vllm/model_executor/models/voxtral_streaming.py @@ -0,0 +1,243 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project + +import math +from collections.abc import Mapping + +import torch + +from vllm.config.vllm import VllmConfig +from vllm.logger import init_logger +from vllm.model_executor.models.interfaces import MultiModalEmbeddings +from vllm.model_executor.models.voxtral import ( + VoxtralDummyInputsBuilder, + VoxtralForConditionalGeneration, + VoxtralMultiModalProcessor, + VoxtralProcessingInfo, +) +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import _I, BaseMultiModalProcessorCache +from vllm.multimodal.inputs import ( + MultiModalKwargsOptionalItems, +) +from vllm.multimodal.parse import MultiModalDataItems +from vllm.multimodal.processing import ( + MultiModalPromptUpdates, + PlaceholderFeaturesInfo, +) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors + +from .utils import ( + _flatten_embeddings, +) + +logger = init_logger(__name__) + + +class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor): + def __init__( + self, + info: _I, + dummy_inputs: BaseDummyInputsBuilder[_I], + *, + cache: BaseMultiModalProcessorCache | None = None, + ) -> None: + # streaming can't make use of a cache yet + super().__init__(info, dummy_inputs, cache=None) + + def _maybe_apply_prompt_updates( + self, + mm_items: MultiModalDataItems, + prompt_ids: list[int], + mm_kwargs: MultiModalKwargsOptionalItems, + mm_prompt_updates: MultiModalPromptUpdates, + is_update_applied: bool, + ) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]: + # there are no placeholder audio tokens for streaming + # so we need to build the place placeholder positions manually + + # in streaming there is always only one audio input + audios = mm_kwargs.get("audio", []) + assert len(audios) == 1, ( + f"Expected only one audio input for streaming, got {mm_kwargs=}" + ) + tokenizer = self.info.get_tokenizer() + audio_config = 
tokenizer.instruct.audio_encoder.audio_config + + num_audio_samples = audios[0]["audio_arrays"].data.shape[0] + length = audio_config.num_audio_tokens(num_audio_samples) + + features_info = PlaceholderFeaturesInfo( + modality="audio", + item_idx=0, + start_idx=0, + tokens=length + * [0], # only used for length computation, so we can take dummy inputs + is_embed=None, + ) + return prompt_ids, {"audio": [features_info]} + + +class TimeEmbedding(torch.nn.Module): + """Sinusoidal Embedding for encoding time""" + + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + self.dim = dim + self.theta = theta + inv_freq = torch.exp( + -math.log(self.theta) + * torch.arange(self.dim // 2).float() + / (self.dim // 2) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + def forward(self, t: torch.Tensor) -> torch.Tensor: + t = t[..., None] # (B,) -> (B, 1) or (B, T) -> (B, T, 1) + inv_freq = self.inv_freq.to(device=t.device, dtype=t.dtype) + emb = ( + t * inv_freq + ) # (B, 1) x (D/2,) -> (B, D/2) or (B, T, 1) x (D/2,) -> (B, T, D/2) + return torch.cat((emb.cos(), emb.sin()), dim=-1) # (B, D) or (B, T, D) + + +@MULTIMODAL_REGISTRY.register_processor( + VoxtralStreamingMultiModalProcessor, + info=VoxtralProcessingInfo, + dummy_inputs=VoxtralDummyInputsBuilder, +) +class VoxtralStreamingGeneration(VoxtralForConditionalGeneration): + requires_raw_input_tokens = True + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + self.time_embedding: TimeEmbedding = TimeEmbedding( + dim=self.config.text_config.hidden_size + ) + + audio_config = self.tokenizer.instruct.audio_encoder.audio_config + _n_delay_tokens = ( + audio_config.frame_rate * audio_config.transcription_delay_ms / 1000 + ) + assert _n_delay_tokens.is_integer(), ( + f"n_delay_tokens must be integer, got {_n_delay_tokens}" + ) + + self.n_delay_tokens = int(_n_delay_tokens) + + @property + def 
audio_config(self): + return self.tokenizer.instruct.audio_encoder.audio_config + + def embed_input_ids( + self, + input_ids: torch.Tensor, + multimodal_embeddings: MultiModalEmbeddings | None = None, + *, + is_multimodal: torch.Tensor | None = None, + # Multi-modal token ID may exceed vocab size + handle_oov_mm_token: bool = True, + ) -> torch.Tensor: + """Pass post-conv embeddings directly as input""" + # for streaming we simply flatten the multimodal embeddings + # to be in tensor format, we treat the input ids later + assert multimodal_embeddings is not None + assert len(multimodal_embeddings) > 0, ( + "For streaming you must provide a multimodal_embedding at every step." + ) + mm_embeds_flat = _flatten_embeddings(multimodal_embeddings) + return mm_embeds_flat + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: object, + ) -> torch.Tensor | IntermediateTensors: + assert inputs_embeds is not None + assert input_ids is not None + + pool_size = self.config.audio_config.block_pool_size + inputs_embeds = inputs_embeds.view( + inputs_embeds.shape[0] * pool_size, inputs_embeds.shape[1] // pool_size + ) + + audio_hidden_states = self.whisper_encoder.whisper_encoder.forward_layers( + inputs_embeds + ) + + num_tokens, audio_hidden_size = audio_hidden_states.shape + assert num_tokens % self.downsample_factor == 0 + audio_hidden_states = audio_hidden_states.reshape( + num_tokens // self.downsample_factor, + audio_hidden_size * self.downsample_factor, + ) + audio_text_embeds = self.audio_language_adapter(audio_hidden_states) + + text_embeds = self.language_model.embed_input_ids(input_ids) + + # sum pool text and audio embeddings + inputs_embeds = audio_text_embeds + text_embeds + + time_tensor = torch.tensor( + [self.n_delay_tokens], + device=inputs_embeds.device, + dtype=inputs_embeds.dtype, + ) + inputs_embeds = inputs_embeds + 
self.time_embedding(time_tensor) + + hidden_states = self.language_model.model( + input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds + ) + + return hidden_states + + def embed_multimodal( + self, **kwargs + ) -> list[torch.Tensor] | torch.Tensor | tuple[torch.Tensor, ...] | None: + """Transform audio waveforms -> initial whisper post-conv embeddings""" + audio_inputs = self._parse_and_validate_audio_arrays(**kwargs) + + assert audio_inputs is not None, ( + "For streaming you must provide an audio input at every step." + ) + + multiple_of = self.audio_config.raw_audio_length_per_tok + assert all( + (this_audio := audio.shape[0]) % multiple_of == 0 for audio in audio_inputs + ), ( + f"Every input audio waveform has to be a multiple of {multiple_of}, but" + f" one is {this_audio} with {(this_audio / multiple_of)=}." + ) + + mel_features = [ + self.whisper_encoder.compute_whisper_melspec(audio).to( + self.whisper_encoder.dtype + ) + for audio in audio_inputs + ] + seq_lens = [mel.shape[1] for mel in mel_features] + # [total_num_20ms_frames, hidden_size] + audio_embeddings = self.whisper_encoder.whisper_encoder.forward_conv( + mel_features + )[0] + conv_stride = self.whisper_encoder.whisper_encoder.total_stride + audio_embeddings_per_sample = audio_embeddings.split( + [s // conv_stride for s in seq_lens], dim=0 + ) + + # audio_embeddings per sample need to be divisible by 4 + pool_size = self.config.audio_config.block_pool_size + assert all( + (this_shape := sample.shape[0]) % pool_size == 0 + for sample in audio_embeddings_per_sample + ), f"Every audio embedding has to be a multiple of 4, but one is {this_shape}." 
+ + audio_embeddings_per_sample = [ + e.view(e.shape[0] // pool_size, e.shape[1] * pool_size) + for e in audio_embeddings_per_sample + ] + return audio_embeddings_per_sample diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index f5a1e75d99617..f1bae28debad2 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -1,9 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import enum import math from collections.abc import Iterable, Mapping, Sequence from contextlib import nullcontext +from functools import partial from typing import Annotated, Literal, cast import numpy as np @@ -16,7 +18,10 @@ from transformers import ( ) from transformers.models.whisper.modeling_whisper import sinusoids -from vllm.attention.layer import Attention, AttentionType +from vllm.attention.backends.abstract import ( + AttentionType, +) +from vllm.attention.layer import Attention from vllm.attention.layers.cross_attention import CrossAttention from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig @@ -34,6 +39,11 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.whisper_utils import ( + ISO639_1_SUPPORTED_LANGS, + WhisperAttentionWithBlockPooling, + WhisperCausalConv1d, +) from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( MultiModalDataDict, @@ -64,67 +74,11 @@ from .utils import ( logger = init_logger(__name__) -# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages -ISO639_1_SUPPORTED_LANGS = { - "af": 
"Afrikaans", - "ar": "Arabic", - "hy": "Armenian", - "az": "Azerbaijani", - "be": "Belarusian", - "bs": "Bosnian", - "bg": "Bulgarian", - "ca": "Catalan", - "zh": "Chinese", - "hr": "Croatian", - "cs": "Czech", - "da": "Danish", - "nl": "Dutch", - "en": "English", - "et": "Estonian", - "fi": "Finnish", - "fr": "French", - "gl": "Galician", - "de": "German", - "el": "Greek", - "he": "Hebrew", - "hi": "Hindi", - "hu": "Hungarian", - "is": "Icelandic", - "id": "Indonesian", - "it": "Italian", - "ja": "Japanese", - "kn": "Kannada", - "kk": "Kazakh", - "ko": "Korean", - "lv": "Latvian", - "lt": "Lithuanian", - "mk": "Macedonian", - "ms": "Malay", - "mr": "Marathi", - "mi": "Maori", - "ne": "Nepali", - "no": "Norwegian", - "fa": "Persian", - "pl": "Polish", - "pt": "Portuguese", - "ro": "Romanian", - "ru": "Russian", - "sr": "Serbian", - "sk": "Slovak", - "sl": "Slovenian", - "es": "Spanish", - "sw": "Swahili", - "sv": "Swedish", - "tl": "Tagalog", - "ta": "Tamil", - "th": "Thai", - "tr": "Turkish", - "uk": "Ukrainian", - "ur": "Urdu", - "vi": "Vietnamese", - "cy": "Welsh", -} +class WhisperPosEmbedType(enum.Enum): + SINUSOIDAL = "sinusoidal" + NOPE = "nope" + LEARNED = "learned" class WhisperAudioInputs(TensorSchema): @@ -184,6 +138,8 @@ class WhisperAttention(nn.Module): num_heads: int, bias: bool = True, attn_type: AttentionType = AttentionType.DECODER, + per_layer_sliding_window: int | None = None, + block_pool_size: int = 1, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -242,7 +198,14 @@ class WhisperAttention(nn.Module): attn_type=self.attn_type, ) else: # AttentionType.DECODER (regular decoder self-attention) - self.attn = Attention( + if block_pool_size > 1: + attn_cls = partial( + WhisperAttentionWithBlockPooling, block_pool_size=block_pool_size + ) + else: + attn_cls = Attention + + self.attn = attn_cls( self.num_heads, self.head_dim, self.scaling, @@ -251,6 +214,7 @@ class 
WhisperAttention(nn.Module): quant_config=quant_config, prefix=f"{prefix}.attn", attn_type=self.attn_type, + per_layer_sliding_window=per_layer_sliding_window, ) def _init_qkv( @@ -386,6 +350,9 @@ class WhisperEncoderLayer(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config + is_causal = getattr(config, "is_causal", False) + sliding_window = getattr(config, "sliding_window", None) + block_pool_size = getattr(config, "block_pool_size", 1) cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config @@ -393,7 +360,9 @@ class WhisperEncoderLayer(nn.Module): self.self_attn = WhisperAttention( embed_dim=self.embed_dim, num_heads=config.encoder_attention_heads, - attn_type=AttentionType.ENCODER, + attn_type=AttentionType.DECODER if is_causal else AttentionType.ENCODER, + block_pool_size=block_pool_size, + per_layer_sliding_window=sliding_window, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", @@ -492,12 +461,21 @@ class WhisperEncoder(nn.Module): super().__init__() config = vllm_config.model_config.hf_config embed_dim = config.d_model + + self.pos_embed_type = WhisperPosEmbedType( + getattr(config, "pos_embed", "sinusoidal") + ) self.num_mel_bins = config.num_mel_bins self.max_source_positions = config.max_source_positions self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 - self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1) - self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1) + is_causal = getattr(config, "is_causal", False) + Conv1d = WhisperCausalConv1d if is_causal else partial(nn.Conv1d, padding=1) + + self.conv1 = Conv1d(self.num_mel_bins, embed_dim, kernel_size=3) + self.conv2 = Conv1d(embed_dim, embed_dim, stride=2, kernel_size=3) + + self.total_stride = self.conv1.stride[0] * self.conv2.stride[0] self.start_layer, self.end_layer, 
self.layers = make_layers( config.encoder_layers, lambda prefix: WhisperEncoderLayer( @@ -507,29 +485,54 @@ class WhisperEncoder(nn.Module): ) self.layer_norm = nn.LayerNorm(config.d_model) - maybe_fp32_init_ctx = ( - set_default_torch_dtype(torch.float32) if init_in_fp32 else nullcontext() - ) - - with ( - torch.no_grad(), - maybe_fp32_init_ctx, + if is_causal and self.pos_embed_type != WhisperPosEmbedType.NOPE: + raise ValueError( + "Only NOPE position embeddings are supported " + f"for causal models, but got {self.pos_embed_type}" + ) + elif self.pos_embed_type in ( + WhisperPosEmbedType.SINUSOIDAL, + WhisperPosEmbedType.LEARNED, ): - self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim) - self.embed_positions.weight.copy_( - sinusoids(*self.embed_positions.weight.shape) + maybe_fp32_init_ctx = ( + set_default_torch_dtype(torch.float32) + if init_in_fp32 + else nullcontext() ) - def forward(self, input_features: torch.Tensor | list[torch.Tensor]): + with ( + torch.no_grad(), + maybe_fp32_init_ctx, + ): + self.embed_positions = nn.Embedding( + self.max_source_positions, embed_dim + ) + self.embed_positions.weight.copy_( + sinusoids(*self.embed_positions.weight.shape) + ) + + def forward_conv( + self, input_features: torch.Tensor | list[torch.Tensor] + ) -> torch.Tensor: hidden_states = [] input_is_batched = False for features in input_features: embeds = nn.functional.gelu(self.conv1(features)) embeds = nn.functional.gelu(self.conv2(embeds)) - embeds = embeds.transpose(-1, -2) - embeds = (embeds + self.embed_positions.weight[: embeds.size(-2), :]).to( - embeds.dtype - ) + + if self.pos_embed_type in ( + WhisperPosEmbedType.SINUSOIDAL, + WhisperPosEmbedType.LEARNED, + ): + embeds = embeds.transpose(-1, -2) + embeds = ( + embeds + self.embed_positions.weight[: embeds.size(-2), :] + ).to(embeds.dtype) + elif self.pos_embed_type == WhisperPosEmbedType.NOPE: + embeds = embeds.transpose(-1, -2).to(embeds.dtype) + else: + raise ValueError(f"Unknown 
pos_embed_type: {self.pos_embed_type}") + hidden_states.append(embeds) input_is_batched = embeds.ndim > 2 # Input to MHA must be B x T x D @@ -539,12 +542,19 @@ class WhisperEncoder(nn.Module): else: hidden_states = torch.stack(hidden_states, dim=0) + return hidden_states + + def forward_layers(self, hidden_states: torch.Tensor) -> torch.Tensor: for encoder_layer in self.layers: hidden_states = encoder_layer(hidden_states) hidden_states = self.layer_norm(hidden_states) return hidden_states + def forward(self, input_features: torch.Tensor | list[torch.Tensor]): + hidden_states = self.forward_conv(input_features) + return self.forward_layers(hidden_states) + class WhisperDecoder(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/whisper_utils.py b/vllm/model_executor/models/whisper_utils.py new file mode 100644 index 0000000000000..077b4aff6fec9 --- /dev/null +++ b/vllm/model_executor/models/whisper_utils.py @@ -0,0 +1,299 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import copy +import functools +import math +from dataclasses import replace + +import torch +import torch.nn.functional as F +from torch import nn + +from vllm.attention.backends.abstract import ( + AttentionBackend, + AttentionMetadata, + AttentionType, +) +from vllm.attention.layer import Attention +from vllm.attention.selector import get_attn_backend +from vllm.config import CacheConfig, VllmConfig +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend +from vllm.v1.attention.backends.utils import ( + CommonAttentionMetadata, + subclass_attention_backend_with_overrides, +) +from vllm.v1.kv_cache_interface import AttentionSpec + +# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages +ISO639_1_SUPPORTED_LANGS = { + "af": "Afrikaans", + "ar": "Arabic", + 
"hy": "Armenian", + "az": "Azerbaijani", + "be": "Belarusian", + "bs": "Bosnian", + "bg": "Bulgarian", + "ca": "Catalan", + "zh": "Chinese", + "hr": "Croatian", + "cs": "Czech", + "da": "Danish", + "nl": "Dutch", + "en": "English", + "et": "Estonian", + "fi": "Finnish", + "fr": "French", + "gl": "Galician", + "de": "German", + "el": "Greek", + "he": "Hebrew", + "hi": "Hindi", + "hu": "Hungarian", + "is": "Icelandic", + "id": "Indonesian", + "it": "Italian", + "ja": "Japanese", + "kn": "Kannada", + "kk": "Kazakh", + "ko": "Korean", + "lv": "Latvian", + "lt": "Lithuanian", + "mk": "Macedonian", + "ms": "Malay", + "mr": "Marathi", + "mi": "Maori", + "ne": "Nepali", + "no": "Norwegian", + "fa": "Persian", + "pl": "Polish", + "pt": "Portuguese", + "ro": "Romanian", + "ru": "Russian", + "sr": "Serbian", + "sk": "Slovak", + "sl": "Slovenian", + "es": "Spanish", + "sw": "Swahili", + "sv": "Swedish", + "tl": "Tagalog", + "ta": "Tamil", + "th": "Thai", + "tr": "Turkish", + "uk": "Ukrainian", + "ur": "Urdu", + "vi": "Vietnamese", + "cy": "Welsh", +} + + +def _pad1d( + x: torch.Tensor, + paddings: tuple[int, int], + mode: str = "constant", + value: float = 0.0, +) -> torch.Tensor: + """Tiny wrapper around F.pad, just to allow for + reflect padding on small input. + If this is the case, we insert extra 0 padding + to the right before the reflection happen. 
+ """ + length = x.shape[-1] + padding_left, padding_right = paddings + assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right) + if mode == "reflect": + max_pad = max(padding_left, padding_right) + extra_pad = 0 + if length <= max_pad: + extra_pad = max_pad - length + 1 + x = F.pad(x, (0, extra_pad)) + padded = F.pad(x, paddings, mode, value) + end = padded.shape[-1] - extra_pad + return padded[..., :end] + else: + return F.pad(x, paddings, mode, value) + + +class WhisperCausalConv1d(nn.Conv1d): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + padding: int = 0, + bias: bool = True, + ) -> None: + super().__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + bias=bias, + ) + self._stride = self.stride[0] + self._effective_kernel_size = (kernel_size - 1) * self.dilation[0] + 1 + self._padding_total = self._effective_kernel_size - self._stride + + def forward(self, x: torch.Tensor) -> torch.Tensor: + n_frames = ( + x.shape[-1] - self._effective_kernel_size + self._padding_total + ) / self._stride + 1 + target_length = (math.ceil(n_frames) - 1) * self._stride + ( + self._effective_kernel_size - self._padding_total + ) + extra_padding = target_length - x.shape[-1] + x = _pad1d(x, (self._padding_total, extra_padding), mode="constant") + return super().forward(x) + + +@functools.lru_cache +def create_whisper_attention_backend_with_block_pooling( + underlying_attn_backend: AttentionBackend, block_pool_size: int +) -> type[AttentionBackend]: + prefix = "WhisperAttentionWithBlockPooling_" + underlying_builder = underlying_attn_backend.get_builder_cls() + + class WhisperAttentionWithBlockPoolingBuilder(underlying_builder): # type: ignore + def __init__( + self, + kv_cache_spec: AttentionSpec, + layer_names: list[str], + vllm_config: VllmConfig, + device: torch.device, + ): + assert kv_cache_spec.num_kv_heads % block_pool_size == 0 + kv_cache_spec = replace( 
+ kv_cache_spec, + block_size=kv_cache_spec.block_size * block_pool_size, + num_kv_heads=kv_cache_spec.num_kv_heads // block_pool_size, + ) + super().__init__(kv_cache_spec, layer_names, vllm_config, device) + + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> AttentionMetadata: + new_common_attn_metadata = copy.deepcopy(common_attn_metadata) + new_common_attn_metadata.query_start_loc *= block_pool_size + new_common_attn_metadata.query_start_loc_cpu *= block_pool_size + new_common_attn_metadata.seq_lens *= block_pool_size + new_common_attn_metadata._seq_lens_cpu *= block_pool_size + new_common_attn_metadata._num_computed_tokens_cpu *= block_pool_size + new_common_attn_metadata.num_actual_tokens *= block_pool_size + new_common_attn_metadata.max_query_len *= block_pool_size + new_common_attn_metadata.max_seq_len *= block_pool_size + original_slot_mapping = common_attn_metadata.slot_mapping + common_prefix_len *= block_pool_size + new_common_attn_metadata.slot_mapping = ( + ( + original_slot_mapping.unsqueeze(1) * block_pool_size + + torch.arange(block_pool_size, device=original_slot_mapping.device) + ) + .flatten() + .clamp(min=-1) + ) + return super().build( + common_prefix_len, new_common_attn_metadata, fast_build + ) + + if not issubclass(underlying_attn_backend, FlashAttentionBackend): + raise NotImplementedError( + f"{underlying_attn_backend} is not yet supported." + "Contributions to support more backends are much " + "appreciated." 
+ ) + + attn_backend = subclass_attention_backend_with_overrides( + name_prefix=prefix, + attention_backend_cls=underlying_attn_backend, + overrides={ + "get_builder_cls": lambda: WhisperAttentionWithBlockPoolingBuilder, + "get_kv_cache_shape": lambda num_blocks, + block_size, + num_kv_heads, + head_size, + cache_dtype_str: ( + 2, + num_blocks, + # we stretch each block by `block_pool_size` + block_size * block_pool_size, + num_kv_heads // block_pool_size, + head_size, + ), # TODO: generalize to other backends + }, + ) + + return attn_backend + + +class WhisperAttentionWithBlockPooling(Attention): + """Attention layer with block pooling.""" + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int | None = None, + alibi_slopes: list[float] | None = None, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + logits_soft_cap: float | None = None, + per_layer_sliding_window: int | None = None, + prefix: str = "", + attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: str | None = None, + block_pool_size: int = 1, + attn_backend: type[AttentionBackend] | None = None, + **extra_impl_args, + ) -> None: + self.block_pool_size = block_pool_size + dtype = torch.get_default_dtype() + + if cache_config is not None: + kv_cache_dtype = cache_config.cache_dtype + block_size = cache_config.block_size + else: + kv_cache_dtype = "auto" + block_size = 16 + + underlying_attn_backend = get_attn_backend( + head_size, + dtype, + kv_cache_dtype, + block_size, + attn_type=attn_type, + ) + attn_backend = create_whisper_attention_backend_with_block_pooling( + underlying_attn_backend, block_pool_size + ) + + super().__init__( + num_heads=num_heads, + head_size=head_size, + scale=scale, + num_kv_heads=num_kv_heads, + alibi_slopes=alibi_slopes, + cache_config=cache_config, + quant_config=quant_config, + logits_soft_cap=logits_soft_cap, + per_layer_sliding_window=per_layer_sliding_window, + 
prefix=prefix, + attn_type=attn_type, + kv_sharing_target_layer_name=kv_sharing_target_layer_name, + attn_backend=attn_backend, + **extra_impl_args, + ) + + def get_kv_cache_spec(self, vllm_config: VllmConfig): + kv_cache_spec = super().get_kv_cache_spec(vllm_config) + assert isinstance(kv_cache_spec, AttentionSpec) + kv_cache_spec = replace( + kv_cache_spec, + num_kv_heads=self.block_pool_size * kv_cache_spec.num_kv_heads, + ) + return kv_cache_spec diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index 51b8f77f29088..57e7be6344cd1 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -111,11 +111,16 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]: return librosa.load(filepath, sr=None) - def encode_base64(self, media: tuple[npt.NDArray, int]) -> str: + def encode_base64( + self, + media: tuple[npt.NDArray, int], + *, + audio_format: str = "WAV", + ) -> str: audio, sr = media with BytesIO() as buffer: - soundfile.write(buffer, audio, sr, format="WAV") + soundfile.write(buffer, audio, sr, format=audio_format) data = buffer.getvalue() return base64.b64encode(data).decode("utf-8") diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 1506ecb8c7aa0..8e1178bc7ea44 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -8,8 +8,12 @@ import pybase64 import torch from PIL import Image +from vllm.logger import init_logger + from .base import MediaIO, MediaWithBytes +logger = init_logger(__file__) + def rescale_image_size( image: Image.Image, size_factor: float, transpose: int = -1 @@ -104,8 +108,17 @@ class ImageMediaIO(MediaIO[Image.Image]): self, media: Image.Image, *, - image_format: str = "JPEG", + image_format: str | None = None, ) -> str: + if image_format is None: + logger.warning_once( + "The default format of `ImageMediaIO.encode_base64` will be changed " + 'from "JPEG" to "PNG" in v0.15 to avoid lossy compression. 
' + "To continue using the old default, " + 'pass `format="JPEG"` explicitly to silence this warning.' + ) + image_format = "JPEG" + image = media with BytesIO() as buffer: diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 7fd05af583b0a..b2b0d1734727c 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -3,6 +3,7 @@ import asyncio import atexit +import mimetypes from collections.abc import Generator, Set from concurrent.futures import ThreadPoolExecutor from itertools import groupby @@ -357,17 +358,31 @@ class MediaConnector: def encode_audio_base64( audio: np.ndarray, sampling_rate: int, + *, + format: str = "WAV", ) -> str: """Encode audio as base64.""" audio_io = AudioMediaIO() - return audio_io.encode_base64((audio, sampling_rate)) + return audio_io.encode_base64((audio, sampling_rate), audio_format=format) + + +def encode_audio_url( + audio: np.ndarray, + sampling_rate: int, + *, + format: str = "WAV", +) -> str: + """Encode audio as a data URL.""" + audio_b64 = encode_audio_base64(audio, sampling_rate, format=format) + mimetype = mimetypes.types_map.get("." + format.lower(), "audio") + return f"data:{mimetype};base64,{audio_b64}" def encode_image_base64( image: Image.Image, *, image_mode: str = "RGB", - format: str = "JPEG", + format: str | None = None, ) -> str: """ Encode a pillow image to base64 format. @@ -378,10 +393,45 @@ def encode_image_base64( return image_io.encode_base64(image, image_format=format) -def encode_video_base64(frames: npt.NDArray) -> str: +def encode_image_url( + image: Image.Image, + *, + image_mode: str = "RGB", + format: str = "PNG", +) -> str: + """ + Encode a pillow image as a data URL. + + By default, the image is converted into RGB format before being encoded. + """ + image_b64 = encode_image_base64(image, image_mode=image_mode, format=format) + mimetype = mimetypes.types_map.get("." 
+ format.lower(), "image") + return f"data:{mimetype};base64,{image_b64}" + + +def encode_video_base64( + frames: npt.NDArray, + *, + format: str = "JPEG", +) -> str: image_io = ImageMediaIO() video_io = VideoMediaIO(image_io) - return video_io.encode_base64(frames) + return video_io.encode_base64(frames, video_format=format) + + +def encode_video_url( + frames: npt.NDArray, + *, + format: str = "JPEG", +) -> str: + video_b64 = encode_video_base64(frames, format=format) + + if format.lower() == "jpeg": + mimetype = "video/jpeg" + else: + mimetype = mimetypes.types_map.get("." + format.lower(), "video") + + return f"data:{mimetype};base64,{video_b64}" def argsort_mm_positions( diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 2e39a216a10a0..8fecd9e3e65d6 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Optional import torch -import vllm.envs as envs from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger @@ -168,32 +167,6 @@ class XPUPlatform(Platform): if vllm_config.kv_transfer_config is not None: vllm_config.kv_transfer_config.enable_permute_local_kv = True - if parallel_config.distributed_executor_backend is None: - if parallel_config.world_size > 1: - parallel_config.distributed_executor_backend = "ray" - else: - parallel_config.distributed_executor_backend = "uni" - elif parallel_config.distributed_executor_backend == "mp": - # FIXME(kunshang): - # spawn needs calling `if __name__ == '__main__':` - # fork is not supported for xpu start new process. - if envs.VLLM_WORKER_MULTIPROC_METHOD != "spawn": - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - logger.warning( - "Please use spawn as start method if you want to use mp." 
- ) - elif ( - parallel_config.distributed_executor_backend != "ray" - and parallel_config.distributed_executor_backend != "uni" - and parallel_config.distributed_executor_backend != "external_launcher" - ): - logger.warning( - "%s is not supported on XPU, fallback to ray distributed" - " executor backend.", - parallel_config.distributed_executor_backend, - ) - parallel_config.distributed_executor_backend = "ray" - if model_config and model_config.use_mla: logger.info( "MLA is enabled on a non-GPU platform; forcing chunked " diff --git a/vllm/tool_parsers/mistral_tool_parser.py b/vllm/tool_parsers/mistral_tool_parser.py index 49a175f69f434..35b853b0ad7e1 100644 --- a/vllm/tool_parsers/mistral_tool_parser.py +++ b/vllm/tool_parsers/mistral_tool_parser.py @@ -131,78 +131,105 @@ class MistralToolParser(ToolParser): request: ChatCompletionRequest, ) -> ExtractedToolCallInformation: """ - Extract the tool calls from a complete model response. Requires - find-and-replacing single quotes with double quotes for JSON parsing, - make sure your tool call arguments don't ever include quotes! + Extract the tool calls from a complete model response. + + Content and tool calls formatting depends on the Mistral's tokenizer version + used to train the model: + + - < v11: `content[BOT] [{tool_call1},{tool_call2}]` + - >= v11: `content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2}` + + with [BOT] the tool call token. + + Note: + For tokenizer versions >= v11, tool calls with arguments wrongly formatted + are still returned as tool calls. This is to allow the model to know it + tried to make a tool call. It reduces chance of another failure and + prevents that the context is filled with tool calls wrongly placed in + assistant message contents. 
""" - # case -- if a tool call token is not present, return a text response + # If the tool call token is not present, return a text response if self.bot_token not in model_output: return ExtractedToolCallInformation( tools_called=False, tool_calls=[], content=model_output ) - # first remove the BOT token - tool_content = model_output.replace(self.bot_token, "").strip() + content_and_raw_tool_calls = model_output.split(self.bot_token) + content = content_and_raw_tool_calls[0] + raw_tool_calls = content_and_raw_tool_calls[1:] - try: + # >= v11: content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2} + if not self._is_pre_v11: + tool_calls = [] + for raw_tool_call in raw_tool_calls: + if "{" not in raw_tool_call: + continue + + end_name = raw_tool_call.find("{") + tool_name, args = ( + raw_tool_call[:end_name], + raw_tool_call[end_name:], + ) + + tool_calls.append({"name": tool_name, "arguments": args}) + + # < v11: content[BOT] [{tool_call1},{tool_call2}] + else: + if len(raw_tool_calls) != 1: + raise ValueError( + "Only one BOT token should have been outputted, " + f"but got {model_output}." + ) + stringified_tool_calls = raw_tool_calls[0].strip() try: - if not self._is_pre_v11: - function_call_arr = [] - for single_tool_content in model_output.split(self.bot_token): - if "{" not in single_tool_content: - continue - - end_name = single_tool_content.find("{") - fn_name, args = ( - single_tool_content[:end_name], - single_tool_content[end_name:], - ) - - # fn_name is encoded outside serialized json dump - # only arguments are serialized - function_call_arr.append( - {"name": fn_name, "arguments": json.loads(args)} - ) - else: - function_call_arr = json.loads(tool_content) + tool_calls = json.loads(stringified_tool_calls) except json.JSONDecodeError: # use a regex to find the part corresponding to the tool call. # NOTE: This use case should not happen if the model is trained # correctly. 
It's an easy possible fix so it's included, but # can be brittle for very complex / highly nested tool calls - raw_tool_call = self.tool_call_regex.findall(tool_content)[0] - function_call_arr = json.loads(raw_tool_call) - - # Tool Call - tool_calls: list[MistralToolCall] = [ - MistralToolCall( - type="function", - function=FunctionCall( - name=raw_function_call["name"], - # function call args are JSON but as a string - arguments=json.dumps( - raw_function_call["arguments"], ensure_ascii=False + try: + raw_tool_call = self.tool_call_regex.findall( + stringified_tool_calls + )[0] + tool_calls = json.loads(raw_tool_call) + except (IndexError, json.JSONDecodeError): + logger.exception("Error in extracting tool call from response: {e}") + # If raw decoding and decoding post regex rule fails, then just + # return content. + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=stringified_tool_calls, + ) + else: + tool_calls = [ + { + "name": tool_call["name"], + "arguments": json.dumps( + tool_call["arguments"], ensure_ascii=False ), - ), - ) - for raw_function_call in function_call_arr - ] + } + for tool_call in tool_calls + ] - # get any content before the tool call - content = model_output.split(self.bot_token)[0] - return ExtractedToolCallInformation( - tools_called=True, - tool_calls=tool_calls, - content=content if len(content) > 0 else None, + mistral_tool_calls: list[MistralToolCall] = [ + MistralToolCall( + type="function", + function=FunctionCall( + name=tool_call["name"], + arguments=tool_call["arguments"], + ), ) + for tool_call in tool_calls + ] - except Exception: - logger.exception("Error in extracting tool call from response.") - # return information to just treat the tool call as regular JSON - return ExtractedToolCallInformation( - tools_called=False, tool_calls=[], content=tool_content - ) + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=mistral_tool_calls, + content=content if len(content) > 0 
else None, + ) def extract_tool_calls_streaming( self, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 887f936a2d8ae..ecb9849bb3b5e 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -15,7 +15,6 @@ from huggingface_hub import ( ) from packaging.version import Version from transformers import GenerationConfig, PretrainedConfig -from transformers.configuration_utils import ALLOWED_LAYER_TYPES from transformers.models.auto.image_processing_auto import get_image_processor_config from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, @@ -44,6 +43,16 @@ from .repo_utils import ( with_retry, ) +try: + # Transformers v5 + from transformers.configuration_utils import ALLOWED_ATTENTION_LAYER_TYPES +except ImportError: + # Transformers v4 + from transformers.configuration_utils import ( + ALLOWED_LAYER_TYPES as ALLOWED_ATTENTION_LAYER_TYPES, + ) + + if envs.VLLM_USE_MODELSCOPE: from modelscope import AutoConfig else: @@ -104,6 +113,14 @@ _AUTO_CONFIG_KWARGS_OVERRIDES: dict[str, dict[str, Any]] = { } +def is_rope_parameters_nested(rope_parameters: dict[str, Any]) -> bool: + """Check if rope_parameters is nested by layer types.""" + # Cannot be nested if rope_parameters is empty + if not rope_parameters: + return False + return set(rope_parameters.keys()).issubset(ALLOWED_ATTENTION_LAYER_TYPES) + + class HFConfigParser(ConfigParserBase): def parse( self, @@ -313,19 +330,25 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: rope_theta = getattr_iter(config, names, None, warn=True) names = ["partial_rotary_factor", "rotary_pct", "rotary_emb_fraction"] partial_rotary_factor = getattr_iter(config, names, None, warn=True) + ompe = getattr(config, "original_max_position_embeddings", None) if Version(version("transformers")) < Version("5.0.0.dev0"): # Transformers v4 installed, legacy config fields may be present if (rope_scaling := getattr(config, 
"rope_scaling", None)) is not None: config.rope_parameters = rope_scaling if ( - rope_theta is not None or partial_rotary_factor is not None + rope_theta is not None + or partial_rotary_factor is not None + or ompe is not None ) and not getattr(config, "rope_parameters", None): config.rope_parameters = {"rope_type": "default"} + # Patch legacy fields into rope_parameters if rope_theta is not None: config.rope_parameters["rope_theta"] = rope_theta if partial_rotary_factor is not None: config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor + if ompe is not None: + config.rope_parameters["original_max_position_embeddings"] = ompe elif rope_theta is not None or getattr(config, "rope_parameters", None): # Transformers v5 installed # Patch these fields in case they used non-standard names @@ -341,12 +364,8 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: if getattr(config, "rope_parameters", None) is None: return - # Add original_max_position_embeddings if present - if ompe := getattr(config, "original_max_position_embeddings", None): - config.rope_parameters["original_max_position_embeddings"] = ompe - # Handle nested rope_parameters in interleaved sliding attention models - if set(config.rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + if is_rope_parameters_nested(config.rope_parameters): for rope_parameters_layer_type in config.rope_parameters.values(): patch_rope_parameters_dict(rope_parameters_layer_type) else: diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py index d59169d95f0c9..4776c892eb722 100644 --- a/vllm/transformers_utils/configs/mistral.py +++ b/vllm/transformers_utils/configs/mistral.py @@ -184,18 +184,42 @@ def _remap_mistral_audio_args(config: dict) -> dict: whisper_args = config["multimodal"].pop("whisper_model_args") encoder_args = whisper_args["encoder_args"] downsample_args = whisper_args["downsample_args"] + downsample_factor = 
downsample_args["downsample_factor"] + + # make sure that k/v blocks can be allocated with + # unified k/v cache class and pool whisper k/v cache blocks + # with downsample_factor:1 ratio + if encoder_args.get("causal"): + block_pool_size = downsample_factor + config["projection_size"] = downsample_factor * encoder_args["dim"] + else: + block_pool_size = 1 + + _maybe_sliding_window = encoder_args.get("ragged_attention", None) + if _maybe_sliding_window is None: + sliding_window = None + elif _maybe_sliding_window.isdigit(): + sliding_window = int(_maybe_sliding_window) + else: + raise NotImplementedError(f"Unsupported: {_maybe_sliding_window=}") + + architecture = ( + "VoxtralStreamingGeneration" + if encoder_args.get("causal") + else "VoxtralForConditionalGeneration" + ) quant_config = config.get("quantization_config") config = { - "model_type": "whixtral", - "architectures": ["VoxtralForConditionalGeneration"], + "model_type": "voxtral", + "architectures": [architecture], "text_config": PretrainedConfig.from_dict(config), "audio_config": WhisperConfig( num_mel_bins=encoder_args["audio_encoding_args"]["num_mel_bins"], window_size=encoder_args["audio_encoding_args"]["window_size"], sampling_rate=encoder_args["audio_encoding_args"]["sampling_rate"], hop_length=encoder_args["audio_encoding_args"]["hop_length"], - downsample_factor=downsample_args["downsample_factor"], + downsample_factor=downsample_factor, d_model=encoder_args["dim"], encoder_layers=encoder_args["n_layers"], encoder_ffn_dim=encoder_args["hidden_dim"], @@ -203,6 +227,10 @@ def _remap_mistral_audio_args(config: dict) -> dict: vocab_size=encoder_args["vocab_size"], max_source_positions=encoder_args["max_source_positions"], is_encoder_decoder=False, # Override WhisperConfig default + is_causal=encoder_args.get("causal", False), + sliding_window=sliding_window, + block_pool_size=block_pool_size, + pos_embed=encoder_args.get("pos_embed", "sinusoidal"), ), } if quant_config: diff --git 
a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py index fcda6134016ba..47dd44601377b 100644 --- a/vllm/v1/attention/backends/mamba1_attn.py +++ b/vllm/v1/attention/backends/mamba1_attn.py @@ -3,17 +3,11 @@ from dataclasses import dataclass -import torch - from vllm.attention.backends.abstract import AttentionBackend -from vllm.attention.backends.utils import PAD_SLOT_ID -from vllm.config import VllmConfig -from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder -from vllm.v1.attention.backends.utils import ( - CommonAttentionMetadata, - split_decodes_and_prefills, +from vllm.v1.attention.backends.mamba_attn import ( + BaseMambaAttentionMetadata, + BaseMambaAttentionMetadataBuilder, ) -from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec class Mamba1AttentionBackend(AttentionBackend): @@ -23,137 +17,12 @@ class Mamba1AttentionBackend(AttentionBackend): @dataclass -class Mamba1AttentionMetadata: - query_start_loc_p: torch.Tensor - state_indices_tensor: torch.Tensor - has_initial_states_p: torch.Tensor | None - num_prefills: int - num_prefill_tokens: int - num_decodes: int - num_decode_tokens: int - - block_idx_last_scheduled_token: torch.Tensor # shape: [batch,] - block_idx_first_scheduled_token_p: torch.Tensor # shape: [batch,] - block_idx_last_computed_token: torch.Tensor # shape: [batch,] - num_computed_tokens_p: torch.Tensor # shape: [batch,] +class Mamba1AttentionMetadata(BaseMambaAttentionMetadata): + pass class Mamba1AttentionMetadataBuilder( BaseMambaAttentionMetadataBuilder[Mamba1AttentionMetadata] ): - def __init__( - self, - kv_cache_spec: AttentionSpec, - layer_names: list[str], - vllm_config: VllmConfig, - device: torch.device, - ): - super().__init__(kv_cache_spec, layer_names, vllm_config, device) - assert isinstance(kv_cache_spec, MambaSpec) - - def build( - self, - common_prefix_len: int, - common_attn_metadata: CommonAttentionMetadata, - fast_build: bool = False, - ) -> 
Mamba1AttentionMetadata: - num_reqs = common_attn_metadata.num_reqs - - num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( - split_decodes_and_prefills( - common_attn_metadata, decode_threshold=self.reorder_batch_threshold - ) - ) - - has_initial_states_p = None - query_start_loc_p = None - num_computed_tokens, num_computed_tokens_p = None, None - block_idx_first_scheduled_token = None - block_idx_first_scheduled_token_p = None - - # TODO(@Josephasafg) Mamba1 and Mamba2 have a lot of code in common here. - # We should consolidate this code - if self.vllm_config.cache_config.enable_prefix_caching: - # Return a tensor of shape (#requests, #max blocks) - state_indices_tensor = common_attn_metadata.block_table_tensor - mamba_block_size = self.kv_cache_spec.block_size - num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to( - self.device - ) - ( - block_idx_last_computed_token, - block_idx_first_scheduled_token, - block_idx_last_scheduled_token, - ) = self._compute_prefix_caching_block_indices( - common_attn_metadata, mamba_block_size - ) - else: - # Always return just a single block per each request: - state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] - block_idx_last_scheduled_token = None - block_idx_last_computed_token = None - - if num_prefills > 0: - query_start_loc_p = ( - common_attn_metadata.query_start_loc[-num_prefills - 1 :] - - num_decode_tokens - ) - has_initial_states_cpu = ( - common_attn_metadata.num_computed_tokens_cpu[ - num_reqs - num_prefills : num_reqs - ] - > 0 - ) - has_initial_states_p = has_initial_states_cpu.to( - common_attn_metadata.query_start_loc.device - ) - - if self.vllm_config.cache_config.enable_prefix_caching: - assert num_computed_tokens is not None - num_computed_tokens_p = num_computed_tokens[ - num_reqs - num_prefills : num_reqs - ] - assert block_idx_first_scheduled_token is not None - block_idx_first_scheduled_token_p = block_idx_first_scheduled_token[ - num_reqs - 
num_prefills : num_reqs - ] - - elif ( - num_decodes > 0 - and num_decodes <= self.decode_cudagraph_max_bs - and self.compilation_config.cudagraph_mode.has_full_cudagraphs() - ): - self.state_indices_tensor[:num_decodes].copy_( - state_indices_tensor, non_blocking=True - ) - state_indices_tensor = self.state_indices_tensor[:num_decode_tokens] - state_indices_tensor[num_decodes:] = PAD_SLOT_ID - - if self.vllm_config.cache_config.enable_prefix_caching: - self.block_idx_last_scheduled_token[:num_decodes].copy_( - block_idx_last_scheduled_token, non_blocking=True - ) - block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[ - :num_decode_tokens - ] - - self.block_idx_last_computed_token[:num_decodes].copy_( - block_idx_last_computed_token, non_blocking=True - ) - block_idx_last_computed_token = self.block_idx_last_computed_token[ - :num_decode_tokens - ] - - return Mamba1AttentionMetadata( - query_start_loc_p=query_start_loc_p, - has_initial_states_p=has_initial_states_p, - state_indices_tensor=state_indices_tensor, - num_prefills=num_prefills, - num_prefill_tokens=num_prefill_tokens, - num_decodes=num_decodes, - num_decode_tokens=num_decode_tokens, - block_idx_last_scheduled_token=block_idx_last_scheduled_token, - block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p, - block_idx_last_computed_token=block_idx_last_computed_token, - num_computed_tokens_p=num_computed_tokens_p, - ) + metadata_cls = Mamba1AttentionMetadata + supports_update_block_table: bool = False diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py index f923371283aa0..b526f0a329972 100644 --- a/vllm/v1/attention/backends/mamba2_attn.py +++ b/vllm/v1/attention/backends/mamba2_attn.py @@ -1,19 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import copy import itertools -from dataclasses import dataclass +from dataclasses import dataclass, replace import torch 
from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig from vllm.utils.math_utils import cdiv -from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder +from vllm.v1.attention.backends.mamba_attn import ( + BaseMambaAttentionMetadata, + BaseMambaAttentionMetadataBuilder, +) from vllm.v1.attention.backends.utils import ( CommonAttentionMetadata, - compute_causal_conv1d_metadata, - split_decodes_and_prefills, ) from vllm.v1.kv_cache_interface import AttentionSpec @@ -94,48 +94,26 @@ class Mamba2AttentionBackend(AttentionBackend): @dataclass -class Mamba2AttentionMetadata: - num_prefills: int - num_prefill_tokens: int - num_decodes: int - num_decode_tokens: int - query_start_loc_p: torch.Tensor - seq_lens: torch.Tensor - - prep_initial_states: bool - chunk_size: int - - # The following tensors only contain prefill requests and will be None if - # the batch has no prefill request. - has_initial_states_p: torch.Tensor | None - seq_idx_p: torch.Tensor | None +class Mamba2AttentionMetadata(BaseMambaAttentionMetadata): + prep_initial_states: bool = False + chunk_size: int = 0 + # Chunk-related metadata (only for prefill) + seq_idx_p: torch.Tensor | None = None # cu_chunk_seqlen_p is a tensor of shape (nchunks+1,) that contains, for # each chunk, its offests into the varlen sequence dimension. It is defined # such that the i-th chunk contains tokens from cu_chunk_seqlen_p[i] to # cu_chunk_seqlen_p[i+1]. - cu_chunk_seqlen_p: torch.Tensor | None - + cu_chunk_seqlen_p: torch.Tensor | None = None # last_chunk_indices_p is a tensor of shape (batch,) that contains the # index of the last chunk for every sequence in the (prefill) batch. 
- last_chunk_indices_p: torch.Tensor | None - - state_indices_tensor: torch.Tensor # shape: [batch,] - block_idx_last_scheduled_token: torch.Tensor # shape: [batch,] - block_idx_first_scheduled_token_p: torch.Tensor # shape: [batch,] - block_idx_last_computed_token: torch.Tensor # shape: [batch,] - num_computed_tokens_p: torch.Tensor # shape: [batch,] - - # The following attributes are for triton implementation of causal_conv1d - nums_dict: dict | None = None - batch_ptr: torch.Tensor | None = None - token_chunk_offset_ptr: torch.Tensor | None = None + last_chunk_indices_p: torch.Tensor | None = None class Mamba2AttentionMetadataBuilder( BaseMambaAttentionMetadataBuilder[Mamba2AttentionMetadata] ): - supports_update_block_table: bool = True + metadata_cls = Mamba2AttentionMetadata def __init__( self, @@ -150,87 +128,93 @@ class Mamba2AttentionMetadataBuilder( "chunk_size needs to be set in the model config for Mamba2 models" ) + def _compute_chunk_metadata( + self, + num_prefills: int, + num_computed_tokens_p_cpu: torch.Tensor, + query_start_loc_p_cpu: torch.Tensor, + ) -> tuple[list[int], list[int], list[int]]: + """ + Compute chunk-specific metadata for Mamba2. + + The code below carefully constructs the chunks such that: + 1. Chunks contain tokens from a *single* sequence only. + 2. For every sequence, we are guaranteed that we can + retrieve the mamba state *every* chunk_size tokens. + Constraint (1) dramatically simplifies the mamba2 kernels. + Constraint (2) dramatically simplifies the implementation + of prefix caching for mamba2 (wip). We need to take care + of the interaction with chunked prefill in order to + satisfy constraint (2). + """ + # TODO (tdoublep): This code could probably be optimized. 
+ cu_chunk_seqlen = [] + seq_idx = [] + last_chunk_indices = [] + seqlen_pos = 0 + + for req_idx in range(num_prefills): + this_num_computed = num_computed_tokens_p_cpu[req_idx].item() + this_new_tokens = ( + query_start_loc_p_cpu[req_idx + 1].item() + - query_start_loc_p_cpu[req_idx].item() + ) + + # if computed tokens are not chunk-aligned, use the first + # chunk to finish it off + if this_num_computed % self.chunk_size != 0: + seq_idx.append(req_idx) + cu_chunk_seqlen.append(seqlen_pos) + # how many tokens to finish the chunk? + chunk_len = ( + cdiv(this_num_computed, self.chunk_size) * self.chunk_size + - this_num_computed + ) + # we can only use at most this_new_tokens + chunk_len = min(chunk_len, this_new_tokens) + seqlen_pos += chunk_len + this_new_tokens -= chunk_len + + n_chunks = cdiv(this_new_tokens, self.chunk_size) + for chunk in range(n_chunks): + seq_idx.append(req_idx) + cu_chunk_seqlen.append(seqlen_pos) + chunk_len = min(self.chunk_size, this_new_tokens) + seqlen_pos += chunk_len + this_new_tokens -= chunk_len + + assert this_new_tokens == 0 + last_chunk_indices.append(len(cu_chunk_seqlen) - 1) + + cu_chunk_seqlen.append(seqlen_pos) + + return cu_chunk_seqlen, seq_idx, last_chunk_indices + def build( self, common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata, fast_build: bool = False, ) -> Mamba2AttentionMetadata: - num_reqs = common_attn_metadata.num_reqs - seq_lens = common_attn_metadata.seq_lens + common = self._compute_common_metadata(common_attn_metadata) - query_start_loc_p = None seq_idx_p = None cu_chunk_seqlen_p = None last_chunk_indices_p = None - - # Need flags to indicate if there are initial states - has_initial_states_p = None prep_initial_states = False - # for causal_conv1d - nums_dict, batch_ptr, token_chunk_offset_ptr = None, None, None - - num_computed_tokens, num_computed_tokens_p = None, None - block_idx_first_scheduled_token = None - block_idx_first_scheduled_token_p = None - - if 
self.vllm_config.cache_config.enable_prefix_caching: - # Return a tensor of shape (#requests, #max blocks) - state_indices_tensor = common_attn_metadata.block_table_tensor - # Additional cache-related varaiables: - mamba_block_size = self.kv_cache_spec.block_size - num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to( - self.device - ) - ( - block_idx_last_computed_token, - block_idx_first_scheduled_token, - block_idx_last_scheduled_token, - ) = self._compute_prefix_caching_block_indices( - common_attn_metadata, mamba_block_size - ) - else: - # Always return just a single block per each request: - state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] - # Additional cache-related varaiables: - block_idx_last_scheduled_token = None - block_idx_last_computed_token = None - - num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( - split_decodes_and_prefills( - common_attn_metadata, decode_threshold=self.reorder_batch_threshold - ) - ) - # Compute seq_idx for prefill only - if num_prefills > 0: - # [batch,] - has_initial_states_cpu = ( - common_attn_metadata.num_computed_tokens_cpu[ - num_reqs - num_prefills : num_reqs - ] - > 0 - ) - prep_initial_states = torch.any(has_initial_states_cpu).item() - has_initial_states_p = has_initial_states_cpu.to( - common_attn_metadata.query_start_loc.device + if common.num_prefills > 0: + prep_initial_states = ( + torch.any(common.has_initial_states_p).item() + if common.has_initial_states_p is not None + else False ) - query_start_loc_p = ( - common_attn_metadata.query_start_loc[-num_prefills - 1 :] - - num_decode_tokens - ) + num_reqs = common.num_reqs + num_prefills = common.num_prefills + num_decode_tokens = common.num_decode_tokens - if self.vllm_config.cache_config.enable_prefix_caching: - assert num_computed_tokens is not None - num_computed_tokens_p = num_computed_tokens[ - num_reqs - num_prefills : num_reqs - ] - assert block_idx_first_scheduled_token is not None - 
block_idx_first_scheduled_token_p = block_idx_first_scheduled_token[ - num_reqs - num_prefills : num_reqs - ] num_computed_tokens_p_cpu = common_attn_metadata.num_computed_tokens_cpu[ num_reqs - num_prefills : num_reqs ] @@ -239,137 +223,33 @@ class Mamba2AttentionMetadataBuilder( - num_decode_tokens ) - # The code below carefully constructs the chunks such that: - # 1. Chunks contain tokens from a *single* sequence only. - # 2. For every sequence, we are guaranteed that we can - # retrieve the mamba state *every* chunk_size tokens. - # Constraint (1) dramatically simplifies the mamba2 kernels. - # Constraint (2) dramatically simplifies the implementation - # of prefix caching for mamba2 (wip). We need to take care - # of the interaction with chunked prefill in order to - # satisfy constraint (2). - # TODO (tdoublep): This code could probably be optimized. - cu_chunk_seqlen = [] - seq_idx = [] - last_chunk_indices = [] - seqlen_pos = 0 - for req_idx in range(num_prefills): - this_num_computed = num_computed_tokens_p_cpu[req_idx].item() - this_new_tokens = ( - query_start_loc_p_cpu[req_idx + 1].item() - - query_start_loc_p_cpu[req_idx].item() - ) - - # if computed tokens are not chunk-aligned, use the first - # chunk to finish it off - if this_num_computed % self.chunk_size != 0: - seq_idx.append(req_idx) - cu_chunk_seqlen.append(seqlen_pos) - # how many tokens to finish the chunk? 
- chunk_len = ( - cdiv(this_num_computed, self.chunk_size) * self.chunk_size - - this_num_computed - ) - # we can only use at most this_new_tokens - chunk_len = min(chunk_len, this_new_tokens) - seqlen_pos += chunk_len - this_new_tokens -= chunk_len - - n_chunks = cdiv(this_new_tokens, self.chunk_size) - for chunk in range(n_chunks): - seq_idx.append(req_idx) - cu_chunk_seqlen.append(seqlen_pos) - chunk_len = min(self.chunk_size, this_new_tokens) - seqlen_pos += chunk_len - this_new_tokens -= chunk_len - - assert this_new_tokens == 0 - last_chunk_indices.append(len(cu_chunk_seqlen) - 1) - - cu_chunk_seqlen.append(seqlen_pos) + cu_chunk_seqlen, seq_idx, last_chunk_indices = self._compute_chunk_metadata( + num_prefills, + num_computed_tokens_p_cpu, + query_start_loc_p_cpu, + ) seq_idx_p = torch.as_tensor( - seq_idx, device=query_start_loc_p.device, dtype=torch.int32 + seq_idx, + device=common_attn_metadata.query_start_loc.device, + dtype=torch.int32, ) cu_chunk_seqlen_p = torch.as_tensor( - cu_chunk_seqlen, device=query_start_loc_p.device, dtype=torch.int32 + cu_chunk_seqlen, + device=common_attn_metadata.query_start_loc.device, + dtype=torch.int32, ) last_chunk_indices_p = torch.as_tensor( - last_chunk_indices, device=query_start_loc_p.device, dtype=torch.int32 + last_chunk_indices, + device=common_attn_metadata.query_start_loc.device, + dtype=torch.int32, ) - nums_dict, batch_ptr, token_chunk_offset_ptr = ( - compute_causal_conv1d_metadata(query_start_loc_p) - ) - - elif ( - num_decodes <= self.decode_cudagraph_max_bs - and self.compilation_config.cudagraph_mode.has_full_cudagraphs() - ): - self.state_indices_tensor[:num_decodes].copy_( - state_indices_tensor, non_blocking=True - ) - state_indices_tensor = self.state_indices_tensor[:num_decode_tokens] - - if self.vllm_config.cache_config.enable_prefix_caching: - self.block_idx_last_scheduled_token[:num_decodes].copy_( - block_idx_last_scheduled_token, non_blocking=True - ) - block_idx_last_scheduled_token = 
self.block_idx_last_scheduled_token[ - :num_decode_tokens - ] - - self.block_idx_last_computed_token[:num_decodes].copy_( - block_idx_last_computed_token, non_blocking=True - ) - block_idx_last_computed_token = self.block_idx_last_computed_token[ - :num_decode_tokens - ] - - attn_metadata = Mamba2AttentionMetadata( - num_prefills=num_prefills, - num_prefill_tokens=num_prefill_tokens, - num_decodes=num_decodes, - num_decode_tokens=num_decode_tokens, - query_start_loc_p=query_start_loc_p, - seq_lens=seq_lens, + return replace( + common, prep_initial_states=prep_initial_states, chunk_size=self.chunk_size, - has_initial_states_p=has_initial_states_p, seq_idx_p=seq_idx_p, - state_indices_tensor=state_indices_tensor, cu_chunk_seqlen_p=cu_chunk_seqlen_p, last_chunk_indices_p=last_chunk_indices_p, - nums_dict=nums_dict, - batch_ptr=batch_ptr, - token_chunk_offset_ptr=token_chunk_offset_ptr, - block_idx_last_scheduled_token=block_idx_last_scheduled_token, - block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p, - block_idx_last_computed_token=block_idx_last_computed_token, - num_computed_tokens_p=num_computed_tokens_p, ) - return attn_metadata - - def update_block_table( - self, - metadata: Mamba2AttentionMetadata, - blk_table: torch.Tensor, - slot_mapping: torch.Tensor, - ) -> Mamba2AttentionMetadata: - new_metadata = copy.copy(metadata) - prefix_caching = self.vllm_config.cache_config.enable_prefix_caching - state_indices_t = blk_table if prefix_caching else blk_table[:, 0] - num_reqs = blk_table.shape[0] - - # For CUDA graphs, copy to persistent buffer - if ( - metadata.num_prefills == 0 - and num_reqs <= self.decode_cudagraph_max_bs - and self.compilation_config.cudagraph_mode.has_full_cudagraphs() - ): - persistent_state_indices_t = self.state_indices_tensor[:num_reqs] - persistent_state_indices_t.copy_(state_indices_t, non_blocking=True) - state_indices_t = persistent_state_indices_t - - new_metadata.state_indices_tensor = state_indices_t - return 
new_metadata diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py index a9705db59f19d..4f876d66da147 100644 --- a/vllm/v1/attention/backends/mamba_attn.py +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import abc +import copy +from dataclasses import dataclass from typing import ClassVar, TypeVar import torch @@ -9,20 +11,52 @@ import torch from vllm.config import VllmConfig from vllm.utils.math_utils import cdiv from vllm.v1.attention.backends.utils import ( + PAD_SLOT_ID, AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, + compute_causal_conv1d_metadata, + split_decodes_and_prefills, ) from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec -M = TypeVar("M") +M = TypeVar("M", bound="BaseMambaAttentionMetadata") + + +@dataclass +class BaseMambaAttentionMetadata: + num_prefills: int + num_prefill_tokens: int + num_decodes: int + num_decode_tokens: int + num_reqs: int + + # The following tensors only contain prefill requests and will be None if + # the batch has no prefill request. 
+ has_initial_states_p: torch.Tensor | None + query_start_loc_p: torch.Tensor | None + num_computed_tokens_p: torch.Tensor | None + + state_indices_tensor: torch.Tensor + + # The following tensors are only used for prefix caching and are None if disabled + block_idx_last_scheduled_token: torch.Tensor | None + block_idx_first_scheduled_token_p: torch.Tensor | None + block_idx_last_computed_token: torch.Tensor | None + + # The following attributes are for triton implementation of causal_conv1d + nums_dict: dict | None = None + batch_ptr: torch.Tensor | None = None + token_chunk_offset_ptr: torch.Tensor | None = None class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC): + metadata_cls: type[M] reorder_batch_threshold: int = 1 _cudagraph_support: ClassVar[AttentionCGSupport] = ( AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE ) + supports_update_block_table: bool = True def __init__( self, @@ -87,6 +121,18 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC): return self.build(0, m) + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> M: + """ + Default build implementation for Mamba-like attention backends. + Subclasses (e.g., Mamba2) can override to add additional metadata. + """ + return self._compute_common_metadata(common_attn_metadata) + def _compute_prefix_caching_block_indices( self, common_attn_metadata: CommonAttentionMetadata, @@ -115,3 +161,147 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC): block_idx_first_scheduled_token, block_idx_last_scheduled_token, ) + + def _compute_common_metadata( + self, + common_attn_metadata: CommonAttentionMetadata, + ) -> M: + """ + Compute metadata common to both Mamba1 and Mamba2. 
+ """ + num_reqs = common_attn_metadata.num_reqs + + num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( + split_decodes_and_prefills( + common_attn_metadata, decode_threshold=self.reorder_batch_threshold + ) + ) + + # Need flags to indicate if there are initial states + has_initial_states_p = None + query_start_loc_p = None + num_computed_tokens = None + num_computed_tokens_p = None + + # for prefix caching + block_idx_first_scheduled_token = None + block_idx_first_scheduled_token_p = None + block_idx_last_computed_token = None + block_idx_last_scheduled_token = None + + # for causal_conv1d + nums_dict, batch_ptr, token_chunk_offset_ptr = None, None, None + + if self.vllm_config.cache_config.enable_prefix_caching: + # Return a tensor of shape (#requests, #max blocks) + state_indices_tensor = common_attn_metadata.block_table_tensor + # Additional cache-related varaiables: + mamba_block_size = self.kv_cache_spec.block_size + num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to( + self.device + ) + ( + block_idx_last_computed_token, + block_idx_first_scheduled_token, + block_idx_last_scheduled_token, + ) = self._compute_prefix_caching_block_indices( + common_attn_metadata, mamba_block_size + ) + else: + # Always return just a single block per each request: + state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] + + if num_prefills > 0: + query_start_loc_p = ( + common_attn_metadata.query_start_loc[-num_prefills - 1 :] + - num_decode_tokens + ) + has_initial_states_cpu = ( + common_attn_metadata.num_computed_tokens_cpu[ + num_reqs - num_prefills : num_reqs + ] + > 0 + ) + has_initial_states_p = has_initial_states_cpu.to( + common_attn_metadata.query_start_loc.device + ) + + nums_dict, batch_ptr, token_chunk_offset_ptr = ( + compute_causal_conv1d_metadata(query_start_loc_p) + ) + + if self.vllm_config.cache_config.enable_prefix_caching: + assert num_computed_tokens is not None + num_computed_tokens_p = 
num_computed_tokens[ + num_reqs - num_prefills : num_reqs + ] + assert block_idx_first_scheduled_token is not None + block_idx_first_scheduled_token_p = block_idx_first_scheduled_token[ + num_reqs - num_prefills : num_reqs + ] + elif ( + num_decodes <= self.decode_cudagraph_max_bs + and self.compilation_config.cudagraph_mode.has_full_cudagraphs() + ): + self.state_indices_tensor[:num_decodes].copy_( + state_indices_tensor, non_blocking=True + ) + state_indices_tensor = self.state_indices_tensor[:num_decode_tokens] + state_indices_tensor[num_decodes:] = PAD_SLOT_ID + + if self.vllm_config.cache_config.enable_prefix_caching: + self.block_idx_last_scheduled_token[:num_decodes].copy_( + block_idx_last_scheduled_token, non_blocking=True + ) + block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[ + :num_decode_tokens + ] + + self.block_idx_last_computed_token[:num_decodes].copy_( + block_idx_last_computed_token, non_blocking=True + ) + block_idx_last_computed_token = self.block_idx_last_computed_token[ + :num_decode_tokens + ] + + return self.metadata_cls( + num_prefills=num_prefills, + num_prefill_tokens=num_prefill_tokens, + num_decodes=num_decodes, + num_decode_tokens=num_decode_tokens, + query_start_loc_p=query_start_loc_p, + has_initial_states_p=has_initial_states_p, + state_indices_tensor=state_indices_tensor, + block_idx_last_scheduled_token=block_idx_last_scheduled_token, + block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p, + block_idx_last_computed_token=block_idx_last_computed_token, + num_computed_tokens_p=num_computed_tokens_p, + num_reqs=num_reqs, + nums_dict=nums_dict, + batch_ptr=batch_ptr, + token_chunk_offset_ptr=token_chunk_offset_ptr, + ) + + def update_block_table( + self, + metadata: M, + blk_table: torch.Tensor, + slot_mapping: torch.Tensor, + ) -> M: + new_metadata = copy.copy(metadata) + prefix_caching = self.vllm_config.cache_config.enable_prefix_caching + state_indices_t = blk_table if prefix_caching else 
blk_table[:, 0] + num_reqs = blk_table.shape[0] + + # For CUDA graphs, copy to persistent buffer + if ( + metadata.num_prefills == 0 + and num_reqs <= self.decode_cudagraph_max_bs + and self.compilation_config.cudagraph_mode.has_full_cudagraphs() + ): + persistent_state_indices_t = self.state_indices_tensor[:num_reqs] + persistent_state_indices_t.copy_(state_indices_t, non_blocking=True) + state_indices_t = persistent_state_indices_t + + new_metadata.state_indices_tensor = state_indices_t + return new_metadata diff --git a/vllm/v1/attention/backends/short_conv_attn.py b/vllm/v1/attention/backends/short_conv_attn.py index c8fe0faf71088..e2fae37f5619d 100644 --- a/vllm/v1/attention/backends/short_conv_attn.py +++ b/vllm/v1/attention/backends/short_conv_attn.py @@ -2,15 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -import torch - from vllm.attention.backends.abstract import AttentionBackend -from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder -from vllm.v1.attention.backends.utils import ( - PAD_SLOT_ID, - CommonAttentionMetadata, - compute_causal_conv1d_metadata, - split_decodes_and_prefills, +from vllm.v1.attention.backends.mamba_attn import ( + BaseMambaAttentionMetadata, + BaseMambaAttentionMetadataBuilder, ) @@ -21,84 +16,11 @@ class ShortConvAttentionBackend(AttentionBackend): @dataclass -class ShortConvAttentionMetadata: - num_prefills: int - num_prefill_tokens: int - num_decodes: int - num_decode_tokens: int - - query_start_loc: torch.Tensor - state_indices_tensor: torch.Tensor - has_initial_states_p: torch.Tensor | None - - # For causal_conv1d - nums_dict: dict | None = None - batch_ptr: torch.Tensor | None = None - token_chunk_offset_ptr: torch.Tensor | None = None +class ShortConvAttentionMetadata(BaseMambaAttentionMetadata): + pass class ShortConvAttentionMetadataBuilder( BaseMambaAttentionMetadataBuilder[ShortConvAttentionMetadata] ): - def build( - self, - 
common_prefix_len: int, - common_attn_metadata: CommonAttentionMetadata, - fast_build: bool = False, - ) -> ShortConvAttentionMetadata: - num_reqs = common_attn_metadata.num_reqs - query_start_loc = common_attn_metadata.query_start_loc - state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] - - # for causal_conv1d - nums_dict, batch_ptr, token_chunk_offset_ptr = None, None, None - - num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( - split_decodes_and_prefills( - common_attn_metadata, decode_threshold=self.reorder_batch_threshold - ) - ) - - has_initial_states_p = None - if num_prefills > 0: - has_initial_states_cpu = ( - common_attn_metadata.num_computed_tokens_cpu[ - num_reqs - num_prefills : num_reqs - ] - > 0 - ) - has_initial_states_p = has_initial_states_cpu.to(query_start_loc.device) - - query_start_loc_p = ( - common_attn_metadata.query_start_loc[-num_prefills - 1 :] - - num_decode_tokens - ) - - nums_dict, batch_ptr, token_chunk_offset_ptr = ( - compute_causal_conv1d_metadata(query_start_loc_p) - ) - - elif ( - num_decodes > 0 - and num_decodes <= self.decode_cudagraph_max_bs - and self.compilation_config.cudagraph_mode.has_full_cudagraphs() - ): - self.state_indices_tensor[:num_decodes].copy_( - state_indices_tensor, non_blocking=True - ) - state_indices_tensor = self.state_indices_tensor[:num_decode_tokens] - state_indices_tensor[num_decodes:] = PAD_SLOT_ID - - attn_metadata = ShortConvAttentionMetadata( - query_start_loc=query_start_loc, - state_indices_tensor=state_indices_tensor, - has_initial_states_p=has_initial_states_p, - num_prefills=num_prefills, - num_prefill_tokens=num_prefill_tokens, - num_decodes=num_decodes, - num_decode_tokens=num_decode_tokens, - nums_dict=nums_dict, - batch_ptr=batch_ptr, - token_chunk_offset_ptr=token_chunk_offset_ptr, - ) - return attn_metadata + metadata_cls = ShortConvAttentionMetadata diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 
56763f4b52539..6b94f786a26b2 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -835,6 +835,15 @@ def subclass_attention_backend( ) +def subclass_attention_backend_with_overrides( + name_prefix: str, + attention_backend_cls: type[AttentionBackend], + overrides: dict[str, Any], +) -> type[AttentionBackend]: + name: str = name_prefix + attention_backend_cls.__name__ # type: ignore + return type(name, (attention_backend_cls,), overrides) + + def split_decodes_prefills_and_extends( common_attn_metadata: CommonAttentionMetadata, decode_threshold: int = 1, diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 33d0b795b3115..7da0cce482b27 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -270,10 +270,8 @@ class BlockPool: if num_cached_blocks == 0: parent_block_hash: ExternalBlockHash | None = None else: - parent_block = blocks[num_cached_blocks - 1] - assert parent_block.block_hash is not None parent_block_hash = maybe_convert_block_hash( - get_block_hash(parent_block.block_hash) + block_hashes[num_cached_blocks - 1] ) self.kv_event_queue.append( diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index e4360de3717d1..1480a1f798ea0 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -606,6 +606,43 @@ def get_request_block_hasher( return request_block_hasher +def _check_enough_kv_cache_memory( + available_memory: int, + get_needed_memory: Callable[[], int], + max_model_len: int, + estimate_max_model_len: Callable[[int], int], +): + if available_memory <= 0: + raise ValueError( + "No available memory for the cache blocks. " + "Try increasing `gpu_memory_utilization` when initializing the engine. " + "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ " + "for more details." 
+ ) + + needed_memory = get_needed_memory() + + if needed_memory > available_memory: + estimated_max_len = estimate_max_model_len(available_memory) + estimated_msg = "" + if estimated_max_len > 0: + estimated_msg = ( + "Based on the available memory, " + f"the estimated maximum model length is {estimated_max_len}. " + ) + + raise ValueError( + f"To serve at least one request with the models's max seq len " + f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV " + f"cache is needed, which is larger than the available KV cache " + f"memory ({available_memory / GiB_bytes:.2f} GiB). {estimated_msg}" + f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` " + f"when initializing the engine. " + f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ " + f"for more details." + ) + + def max_memory_usage_bytes( vllm_config: VllmConfig, kv_cache_specs: Iterable[KVCacheSpec] ) -> int: @@ -624,6 +661,9 @@ def estimate_max_model_len( Estimates the maximum model length that can fit in the available memory using binary search. + This function temporarily modifies max_model_len during estimation but + restores the original value before returning, ensuring no side effects. + Args: vllm_config: The global VllmConfig kv_cache_spec: The kv cache spec of each attention layer in the model @@ -632,33 +672,38 @@ def estimate_max_model_len( Returns: The estimated maximum model length that can fit in the available memory. 
""" + # Save the original max_model_len to restore after estimation + original_max_model_len = vllm_config.model_config.max_model_len # Define a function to check if a given model length fits in memory def fits_in_memory(model_len: int) -> bool: - # Modify the max_model_len for this calculation + # Temporarily modify the max_model_len for this calculation vllm_config.model_config.max_model_len = model_len # Calculate memory needed for the given model length memory_needed = max_memory_usage_bytes(vllm_config, kv_cache_spec.values()) return memory_needed <= available_memory - # Binary search for the maximum model length - current_max = vllm_config.model_config.max_model_len - left, right = 1, current_max + try: + # Binary search for the maximum model length + left, right = 1, original_max_model_len - # If even the smallest model length doesn't fit, return 0 - if not fits_in_memory(left): - return 0 + # If even the smallest model length doesn't fit, return 0 + if not fits_in_memory(left): + return 0 - # Binary search for the maximum model length that fits - result = 1 - while left <= right: - mid = (left + right) // 2 - if fits_in_memory(mid): - result = mid - left = mid + 1 - else: - right = mid - 1 - return result + # Binary search for the maximum model length that fits + result = 1 + while left <= right: + mid = (left + right) // 2 + if fits_in_memory(mid): + result = mid + left = mid + 1 + else: + right = mid - 1 + return result + finally: + # Always restore the original max_model_len to avoid side effects + vllm_config.model_config.max_model_len = original_max_model_len def check_enough_kv_cache_memory( @@ -680,43 +725,12 @@ def check_enough_kv_cache_memory( """ # No need to check for available memory if the kv_cache_spec is empty - if not kv_cache_spec: - return - - if available_memory <= 0: - raise ValueError( - "No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine. 
" - "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ " - "for more details." - ) - - max_model_len = vllm_config.model_config.max_model_len - needed_memory = max_memory_usage_bytes(vllm_config, kv_cache_spec.values()) - - if needed_memory > available_memory: - # Estimate the maximum model length that can fit in the available memory - estimated_max_len = estimate_max_model_len( - vllm_config, kv_cache_spec, available_memory - ) - estimated_msg = "" - if estimated_max_len > 0: - estimated_msg = ( - "Based on the available memory, " - f"the estimated maximum model length is {estimated_max_len}." - ) - - raise ValueError( - f"To serve at least one request with the models's max seq len " - f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV " - f"cache is needed, which is larger than the available KV cache " - f"memory ({available_memory / GiB_bytes:.2f} GiB). " - f"{estimated_msg} " - f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` " - f"when initializing the engine. " - f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ " - f"for more details." + if kv_cache_spec: + _check_enough_kv_cache_memory( + available_memory, + lambda: max_memory_usage_bytes(vllm_config, kv_cache_spec.values()), + vllm_config.model_config.max_model_len, + lambda am: estimate_max_model_len(vllm_config, kv_cache_spec, am), ) @@ -1301,6 +1315,140 @@ def _report_kv_cache_config( ) +def _max_memory_usage_bytes_from_groups( + vllm_config: VllmConfig, + kv_cache_groups: list[KVCacheGroupSpec], +) -> int: + """ + Calculate maximum memory usage in bytes from KV cache groups. + + This correctly accounts for padding in hybrid models. For example, if a + model has 8 full attention layers and 9 sliding window layers, they will + be padded to 9 full + 9 sliding window for uniform group sizes. 
+ """ + if not kv_cache_groups: + return 0 + + # UniformTypeKVCacheSpecs special case (single group, per-layer specs) + if len(kv_cache_groups) == 1 and isinstance( + kv_cache_groups[0].kv_cache_spec, UniformTypeKVCacheSpecs + ): + per_layer_specs = kv_cache_groups[0].kv_cache_spec.kv_cache_specs + return sum( + spec.max_memory_usage_bytes(vllm_config) + for spec in per_layer_specs.values() + ) + + # General case: group_size pools, each shared by one layer per group + # Memory = group_size * page_size * blocks_for_max_len + group_size = max(len(group.layer_names) for group in kv_cache_groups) + page_size = get_uniform_page_size( + [group.kv_cache_spec for group in kv_cache_groups] + ) + any_spec = kv_cache_groups[0].kv_cache_spec + blocks_needed = cdiv(any_spec.max_memory_usage_bytes(vllm_config), page_size) + + return group_size * page_size * blocks_needed + + +def _estimate_max_model_len_from_groups( + vllm_config: VllmConfig, + kv_cache_groups: list[KVCacheGroupSpec], + available_memory: int, +) -> int: + """ + Binary search for the maximum model length that fits in available memory. + Returns 0 if even 1 token doesn't fit. + """ + original_max = vllm_config.model_config.max_model_len + + def fits(model_len: int) -> bool: + vllm_config.model_config.max_model_len = model_len + return ( + _max_memory_usage_bytes_from_groups(vllm_config, kv_cache_groups) + <= available_memory + ) + + try: + left, right = 1, original_max + if not fits(left): + return 0 + result = 1 + while left <= right: + mid = (left + right) // 2 + if fits(mid): + result = mid + left = mid + 1 + else: + right = mid - 1 + return result + finally: + vllm_config.model_config.max_model_len = original_max + + +def _auto_fit_max_model_len( + vllm_config: VllmConfig, + kv_cache_groups: list[KVCacheGroupSpec], + available_memory: list[int], +) -> None: + """ + When max_model_len is set to -1, this function estimates the largest + context length that can be supported with the available GPU memory. 
+ It uses binary search to find the maximum length that fits across all + workers. + + Args: + vllm_config: The global VllmConfig (will be modified in-place) + kv_cache_groups: The global KV cache groups (from get_kv_cache_groups). + This correctly accounts for padding in hybrid models. + available_memory: Memory available for KV cache in bytes for each + worker. + """ + original_max = vllm_config.model_config.max_model_len + + if not kv_cache_groups: + # All workers have empty specs (attention-free model) + logger.info_once( + "Auto-fit max_model_len: attention-free model, " + "using derived max_model_len=%d", + original_max, + scope="local", + ) + return + + # Use minimum available memory across all workers + min_available_memory = min(available_memory) + auto_fit_max = _estimate_max_model_len_from_groups( + vllm_config, kv_cache_groups, min_available_memory + ) + + if auto_fit_max <= 0: + raise ValueError( + "Cannot auto-fit max_model_len: not enough GPU memory available " + "to serve even a single token. Try increasing `gpu_memory_utilization`." + ) + + if auto_fit_max >= original_max: + # The model's full context length fits in memory + logger.info_once( + "Auto-fit max_model_len: full model context length %d fits in " + "available GPU memory", + original_max, + scope="local", + ) + else: + # Need to reduce max_model_len to fit in memory + vllm_config.model_config.max_model_len = auto_fit_max + logger.info_once( + "Auto-fit max_model_len: reduced from %d to %d to fit in " + "available GPU memory (%.2f GiB available for KV cache)", + original_max, + auto_fit_max, + min_available_memory / GiB_bytes, + scope="local", + ) + + def get_kv_cache_configs( vllm_config: VllmConfig, kv_cache_specs: list[dict[str, KVCacheSpec]], @@ -1317,10 +1465,12 @@ def get_kv_cache_configs( 1. Merge the KV cache specs of all workers to get the KVCacheSpecs for the whole model. 2. Generate the KV cache groups based on the layer ratio of the whole model. - 3. 
Generate the KV cache configs for each worker based on the KV cache + This also handles spec unification for hybrid models. + 3. Handle auto-fit max_model_len and memory checks using the unified specs. + 4. Generate the KV cache configs for each worker based on the KV cache grouping strategy. (This is reasonable because the layer ratio of different PP stages are similar.) - 4. Change the num_blocks of each worker to the smallest among all workers + 5. Change the num_blocks of each worker to the smallest among all workers and shrink tensor sizes proportionally to avoid allocating unused memory. Args: @@ -1333,14 +1483,6 @@ def get_kv_cache_configs( The generated KVCacheConfigs for each worker. """ - # Check if the available memory is enough for each worker. - for kv_cache_spec_one_worker, available_memory_one_worker in zip( - kv_cache_specs, available_memory - ): - check_enough_kv_cache_memory( - vllm_config, kv_cache_spec_one_worker, available_memory_one_worker - ) - # Merge the KV cache specs of all workers. Different PP stages may have # different layer names, and different TP ranks of the same PP stage should # have the same KV cache spec. @@ -1354,8 +1496,32 @@ def get_kv_cache_configs( "The KV cache specs for the same layer are different " "across workers. This is not supported yet." ) + + # Get global KV cache groups. This also handles spec unification for + # hybrid models when disable_hybrid_kv_cache_manager is enabled. + # After this call, merged_kv_cache_specs may be modified in-place. global_kv_cache_groups = get_kv_cache_groups(vllm_config, merged_kv_cache_specs) + # If original_max_model_len was -1, automatically + # determine the maximum model length that fits in available GPU memory. + # We use the global groups here to correctly account for padding. 
+ if vllm_config.model_config.original_max_model_len == -1: + _auto_fit_max_model_len(vllm_config, global_kv_cache_groups, available_memory) + + # Check if the available memory is enough (using min across all workers). + # We use the global groups to correctly account for padding. + if global_kv_cache_groups: + _check_enough_kv_cache_memory( + min(available_memory), + lambda: _max_memory_usage_bytes_from_groups( + vllm_config, global_kv_cache_groups + ), + vllm_config.model_config.max_model_len, + lambda am: _estimate_max_model_len_from_groups( + vllm_config, global_kv_cache_groups, am + ), + ) + kv_cache_configs: list[KVCacheConfig] = [] for kv_cache_spec_one_worker, available_memory_one_worker in zip( kv_cache_specs, available_memory diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 4f54d12f4b8d0..27d34f1c60da8 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -75,6 +75,12 @@ class EngineCoreRequest( trace_headers: Mapping[str, str] | None = None + # The user-provided request ID. This field is set internally, + # copied from the provided request_id that's originally assigned + # to the request_id field, see InputProcessor.assign_request_id(). + # Used in outputs and to support abort(req_id, internal=False). + external_req_id: str | None = None + @property def params(self) -> SamplingParams | PoolingParams: """Return the processed params (sampling or pooling).""" diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 1cbe4718f2e5c..87b700d13e9d8 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -290,12 +290,15 @@ class AsyncLLM(EngineClient): is_pooling = isinstance(params, PoolingParams) - # Create a new output collector for the request. - queue = RequestOutputCollector(output_kind=params.output_kind) - # Convert Input --> Request. 
if isinstance(prompt, EngineCoreRequest): request = prompt + if request_id != request.request_id: + logger.warning_once( + "AsyncLLM.add_request() was passed a request_id parameter that " + "does not match the EngineCoreRequest.request_id attribute. The " + "latter will be used, and the former will be ignored." + ) else: assert prompt_text is None request = self.input_processor.process_inputs( @@ -314,6 +317,11 @@ class AsyncLLM(EngineClient): elif isinstance(prompt, Mapping): prompt_text = cast(str | None, prompt.get("prompt")) + self.input_processor.assign_request_id(request) + + # Create a new output collector for the request. + queue = RequestOutputCollector(params.output_kind, request.request_id) + # Use cloned params that may have been updated in process_inputs() params = request.params @@ -325,7 +333,7 @@ class AsyncLLM(EngineClient): assert isinstance(parent_params, SamplingParams) # Fan out child requests (for n>1). - parent_request = ParentRequest(request_id, parent_params) + parent_request = ParentRequest(request) for idx in range(parent_params.n): request_id, child_params = parent_request.get_child_info(idx) child_request = request if idx == parent_params.n - 1 else copy(request) @@ -396,6 +404,7 @@ class AsyncLLM(EngineClient): "prompt logprobs" ) + q: RequestOutputCollector | None = None try: # We start the output_handler on the first call to generate() so # we can call __init__ before the event loop, which enables us @@ -446,7 +455,8 @@ class AsyncLLM(EngineClient): # is cancelled or the generator is garbage collected. So, # we abort the request if we end up here. except (asyncio.CancelledError, GeneratorExit): - await self.abort(request_id) + if q is not None: + await self.abort(q.request_id, internal=True) if self.log_requests: logger.info("Request %s aborted.", request_id) raise @@ -465,7 +475,8 @@ class AsyncLLM(EngineClient): # Unexpected error in the generate() task (possibly recoverable). 
except Exception as e: - await self.abort(request_id) + if q is not None: + await self.abort(q.request_id, internal=True) if self.log_requests: logger.info("Request %s failed.", request_id) raise EngineGenerateError() from e @@ -541,13 +552,15 @@ class AsyncLLM(EngineClient): self.output_handler = asyncio.create_task(output_handler()) - async def abort(self, request_id: str | Iterable[str]) -> None: + async def abort( + self, request_id: str | Iterable[str], internal: bool = False + ) -> None: """Abort RequestId in OutputProcessor and EngineCore.""" request_ids = ( (request_id,) if isinstance(request_id, str) else as_list(request_id) ) - all_request_ids = self.output_processor.abort_requests(request_ids) + all_request_ids = self.output_processor.abort_requests(request_ids, internal) await self.engine_core.abort_requests_async(all_request_ids) if self.log_requests: @@ -581,7 +594,7 @@ class AsyncLLM(EngineClient): if not wait_for_inflight_requests: request_ids = list(self.output_processor.request_states.keys()) if request_ids: - await self.abort(request_ids) + await self.abort(request_ids, internal=True) # Wait for running requests to drain before clearing cache. if self.output_processor.has_unfinished_requests(): @@ -633,6 +646,7 @@ class AsyncLLM(EngineClient): TODO: Remove truncate_prompt_tokens in v0.15. """ + q: RequestOutputCollector | None = None try: # We start the output_handler on the first call to generate() so # we can call __init__ before the event loop, which enables us @@ -687,7 +701,8 @@ class AsyncLLM(EngineClient): # If the request is disconnected by the client, generate() # is cancelled. So, we abort the request if we end up here. except asyncio.CancelledError: - await self.abort(request_id) + if q is not None: + await self.abort(q.request_id, internal=True) if self.log_requests: logger.info("Request %s aborted.", request_id) raise @@ -706,7 +721,8 @@ class AsyncLLM(EngineClient): # Unexpected error in the generate() task (possibly recoverable). 
except Exception as e: - await self.abort(request_id) + if q is not None: + await self.abort(q.request_id, internal=True) if self.log_requests: logger.info("Request %s failed.", request_id) raise EngineGenerateError() from e diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 40c3e9a515e18..5f8883c164b3e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -247,9 +247,20 @@ class EngineCore: assert len(kv_cache_specs) == len(available_gpu_memory) + # Track max_model_len before KV cache config to detect auto-fit changes + max_model_len_before = vllm_config.model_config.max_model_len + kv_cache_configs = get_kv_cache_configs( vllm_config, kv_cache_specs, available_gpu_memory ) + + # If auto-fit reduced max_model_len, sync the new value to workers. + # This is needed because workers were spawned before memory profiling + # and have the original (larger) max_model_len cached. + max_model_len_after = vllm_config.model_config.max_model_len + if max_model_len_after != max_model_len_before: + self.collective_rpc("update_max_model_len", args=(max_model_len_after,)) + scheduler_kv_cache_config = generate_scheduler_kv_cache_config(kv_cache_configs) num_gpu_blocks = scheduler_kv_cache_config.num_blocks num_cpu_blocks = 0 diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 29293877cb69d..1d43a8253843f 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -21,7 +21,7 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.tokenizers import TokenizerLike from vllm.tokenizers.mistral import MistralTokenizer -from vllm.utils import length_from_prompt_token_ids_or_embeds +from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid from vllm.v1.engine import EngineCoreRequest from vllm.v1.metrics.stats import MultiModalCacheStats from vllm.v1.structured_output.backend_guidance import ( @@ -406,6 +406,19 @@ class 
InputProcessor: mm_uuids[modality] = [f"{request_id}-{modality}-{i}" for i in range(n)] return mm_uuids + @staticmethod + def assign_request_id(request: EngineCoreRequest): + """Replace the externally supplied request ID with an internal request ID + that adds 8 random characters in order to ensure uniqueness. + """ + if request.external_req_id is not None: + raise ValueError( + "The external_req_id field should not be set on EngineCoreRequests" + " passed to vLLM; use the request_id field." + ) + request.external_req_id = request.request_id + request.request_id = f"{request.external_req_id}-{random_uuid():.8}" + def process_inputs( self, request_id: str, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 1011317b706d3..33fc34b67af6f 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -213,10 +213,10 @@ class LLMEngine: def get_supported_tasks(self) -> tuple[SupportedTask, ...]: return self.engine_core.get_supported_tasks() - def abort_request(self, request_ids: list[str]) -> None: + def abort_request(self, request_ids: list[str], internal: bool = False) -> None: """Remove request_ids from EngineCore and Detokenizer.""" - request_ids = self.output_processor.abort_requests(request_ids) + request_ids = self.output_processor.abort_requests(request_ids, internal) self.engine_core.abort_requests(request_ids) def add_request( @@ -238,6 +238,12 @@ class LLMEngine: # Process raw inputs into the request. if isinstance(prompt, EngineCoreRequest): request = prompt + if request_id != request.request_id: + logger.warning_once( + "AsyncLLM.add_request() was passed a request_id parameter that " + "does not match the EngineCoreRequest.request_id attribute. The " + "latter will be used, and the former will be ignored."
+ ) else: assert prompt_text is None request = self.input_processor.process_inputs( @@ -255,6 +261,8 @@ class LLMEngine: elif isinstance(prompt, Mapping): prompt_text = cast(str | None, prompt.get("prompt")) + self.input_processor.assign_request_id(request) + # Use cloned params that may have been updated in process_inputs() params = request.params @@ -268,7 +276,7 @@ class LLMEngine: return # Fan out child requests (for n>1). - parent_req = ParentRequest(request_id, params) + parent_req = ParentRequest(request) for idx in range(n): request_id, child_params = parent_req.get_child_info(idx) child_request = request if idx == n - 1 else copy(request) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 8f7d8a71f1a2e..e8717e15198a7 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +from collections import defaultdict from collections.abc import Iterable from dataclasses import dataclass from typing import Any, cast @@ -40,8 +41,9 @@ class RequestOutputCollector: producer gets ahead of the consumer. 
""" - def __init__(self, output_kind: RequestOutputKind): + def __init__(self, output_kind: RequestOutputKind, request_id: str): self.aggregate = output_kind == RequestOutputKind.DELTA + self.request_id = request_id self.output: RequestOutput | PoolingRequestOutput | Exception | None = None self.ready = asyncio.Event() @@ -92,6 +94,7 @@ class RequestState: def __init__( self, request_id: str, + external_req_id: str, parent_req: ParentRequest | None, request_index: int, lora_request: LoRARequest | None, @@ -111,6 +114,7 @@ class RequestState: temperature: float | None = None, ): self.request_id = request_id + self.external_req_id = external_req_id self.parent_req = parent_req self.request_index = request_index self.lora_request = lora_request @@ -176,8 +180,10 @@ class RequestState: assert request.pooling_params is not None output_kind = request.pooling_params.output_kind + assert request.external_req_id is not None return cls( request_id=request.request_id, + external_req_id=request.external_req_id, parent_req=parent_req, request_index=request_index, lora_request=request.lora_request, @@ -235,10 +241,13 @@ class RequestState: ] self.sent_tokens_offset = len(self.detokenizer.output_token_ids) - request_id = self.request_id + external_req_id = self.external_req_id + if pooling_output is not None: return self._new_request_output( - request_id, [self._new_pooling_output(pooling_output)], finished + external_req_id, + [self._new_pooling_output(pooling_output)], + finished, ) output = self._new_completion_output(new_token_ids, finish_reason, stop_reason) @@ -246,19 +255,18 @@ class RequestState: if self.parent_req is None: outputs = [output] else: - request_id, outputs, finished = self.parent_req.get_outputs( - request_id, output - ) + outputs, finished = self.parent_req.get_outputs(self.request_id, output) if not outputs: return None + external_req_id = self.parent_req.external_req_id return self._new_request_output( - request_id, outputs, finished, kv_transfer_params + 
external_req_id, outputs, finished, kv_transfer_params ) def _new_request_output( self, - request_id: str, + external_req_id: str, outputs: list[CompletionOutput] | list[PoolingOutput], finished: bool, kv_transfer_params: dict[str, Any] | None = None, @@ -269,7 +277,7 @@ class RequestState: # Prompt embeddings are currently not supported by pooling requests. assert self.prompt_token_ids is not None return PoolingRequestOutput( - request_id=request_id, + request_id=external_req_id, outputs=first_output, num_cached_tokens=self.num_cached_tokens, prompt_token_ids=self.prompt_token_ids, @@ -288,7 +296,7 @@ class RequestState: prompt_token_ids = [0] * len(self.prompt_embeds) return RequestOutput( - request_id=request_id, + request_id=external_req_id, # request_id is what was provided externally lora_request=self.lora_request, prompt=self.prompt, prompt_token_ids=prompt_token_ids, @@ -352,6 +360,7 @@ class OutputProcessor: self.stream_interval = stream_interval self.request_states: dict[str, RequestState] = {} self.parent_requests: dict[str, ParentRequest] = {} + self.external_req_ids: defaultdict[str, list[str]] = defaultdict(list) self.lora_states = LoRARequestStates(log_stats) self.tracer: Tracer | None = None self._requests_drained = asyncio.Event() @@ -375,12 +384,41 @@ class OutputProcessor: assert state.queue is not None state.queue.put(e) - def abort_requests( - self, - request_ids: Iterable[str], - ) -> list[str]: - request_ids_to_abort = [] + def abort_requests(self, request_ids: Iterable[str], internal: bool) -> list[str]: + """Abort a list of requests. + + The request_ids may be either external request IDs (those passed to + InputProcessor.process_inputs()) or internal request IDs (those randomly + generated when creating the EngineCoreRequest). + + If an external request ID is provided, and that external request ID + was used for multiple requests, all requests associated with that external + request ID are aborted. 
+ + In the case of parallel sampling, a request ID may be used to identify + a parent request, in which case the associated child requests are aborted + also. + """ + + internal_req_ids = [] for request_id in request_ids: + if internal: + # Internal ID - this may be a parent request + internal_req_ids.append(request_id) + + # Remove internal ID from the external->internal mapping + if req_state := self.request_states.get(request_id): + external_req_id = req_state.external_req_id + internal_ids = self.external_req_ids[external_req_id] + internal_ids.remove(request_id) + if not internal_ids: + del self.external_req_ids[external_req_id] + elif internal_ids := self.external_req_ids.pop(request_id, []): + # External ID - abort all requests in the external->internal mapping + internal_req_ids.extend(internal_ids) + + request_ids_to_abort = [] + for request_id in internal_req_ids: req_state = self.request_states.pop(request_id, None) if req_state is not None: self.lora_states.request_finished(request_id, req_state.lora_name) @@ -404,7 +442,7 @@ class OutputProcessor: # Abort children prior to removing the parent. if parent.child_requests: child_reqs = list(parent.child_requests) - child_reqs = self.abort_requests(child_reqs) + child_reqs = self.abort_requests(child_reqs, internal=True) request_ids_to_abort.extend(child_reqs) self.parent_requests.pop(request_id, None) if not self.request_states: @@ -439,6 +477,9 @@ class OutputProcessor: if parent_req: self.parent_requests[parent_req.request_id] = parent_req + # Track the external_req_id -> [internal_req_id, ...] mapping + self.external_req_ids[req_state.external_req_id].append(request_id) + def process_outputs( self, engine_core_outputs: list[EngineCoreOutput], @@ -522,6 +563,12 @@ class OutputProcessor: # Free completed requests. 
if finish_reason is not None: self.request_states.pop(req_id) + + internal_ids = self.external_req_ids[req_state.external_req_id] + internal_ids.remove(req_id) + if not internal_ids: + del self.external_req_ids[req_state.external_req_id] + # Remove parent request if applicable. parent_req = req_state.parent_req if parent_req and not parent_req.child_requests: @@ -597,7 +644,9 @@ class OutputProcessor: ) # meta - span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, req_state.request_id) + span.set_attribute( + SpanAttributes.GEN_AI_REQUEST_ID, req_state.external_req_id + ) if req_state.top_p: span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, req_state.top_p) if req_state.max_tokens_param: diff --git a/vllm/v1/engine/parallel_sampling.py b/vllm/v1/engine/parallel_sampling.py index 59aacd1963076..b7761970ba92f 100644 --- a/vllm/v1/engine/parallel_sampling.py +++ b/vllm/v1/engine/parallel_sampling.py @@ -6,6 +6,7 @@ from typing import Optional, cast from vllm.outputs import CompletionOutput from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.v1.engine import EngineCoreRequest from vllm.v1.metrics.stats import IterationStats @@ -17,6 +18,7 @@ class ParentRequest: """ request_id: str + external_req_id: str sampling_params: SamplingParams # To track the completion of child requests @@ -31,8 +33,11 @@ class ParentRequest: # To efficiently obtain child sampling params cached_child_sampling_params: SamplingParams | None - def __init__(self, request_id: str, sampling_params: SamplingParams) -> None: - self.request_id = request_id + def __init__(self, request: EngineCoreRequest) -> None: + assert request.external_req_id is not None + sampling_params = request.params + self.request_id = request.request_id + self.external_req_id = request.external_req_id self.sampling_params = sampling_params self.child_requests = set() @@ -96,7 +101,7 @@ class ParentRequest: self, child_request_id: str, completion_output: CompletionOutput, - ) -> tuple[str, 
list[CompletionOutput], bool]: + ) -> tuple[list[CompletionOutput], bool]: already_finished_and_returned: bool = False if completion_output.finished(): if child_request_id in self.child_requests: @@ -118,7 +123,7 @@ class ParentRequest: outputs = [] if self.child_requests else self.output_aggregator finished = not self.child_requests - return self.request_id, outputs, finished + return outputs, finished def observe_num_generation_tokens(self, num_generation_tokens: int): self.max_num_generation_tokens = max( diff --git a/vllm/v1/worker/ec_connector_model_runner_mixin.py b/vllm/v1/worker/ec_connector_model_runner_mixin.py index 08a41532ea8e1..1a347a0b98ab2 100644 --- a/vllm/v1/worker/ec_connector_model_runner_mixin.py +++ b/vllm/v1/worker/ec_connector_model_runner_mixin.py @@ -6,9 +6,7 @@ Define EC connector functionality mixin for model runners. from collections.abc import Generator from contextlib import AbstractContextManager, contextmanager, nullcontext -from typing import ( - TYPE_CHECKING, # noqa: UP035 -) +from typing import TYPE_CHECKING import torch diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 455406394d3ec..16fc9fd7cb4d8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -61,6 +61,7 @@ from vllm.model_executor.layers.rotary_embedding import ( ) from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader from vllm.model_executor.models.interfaces import ( + MultiModalEmbeddings, SupportsMRoPE, SupportsMultiModal, SupportsXDRoPE, @@ -78,11 +79,7 @@ from vllm.model_executor.models.interfaces_base import ( is_text_generation_model, ) from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import ( - BatchedTensorInputs, - MultiModalKwargsItem, - PlaceholderRange, -) +from vllm.multimodal.inputs import BatchedTensorInputs, MultiModalKwargsItem from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.pooling_params import 
PoolingParams from vllm.sampling_params import SamplingType @@ -2097,28 +2094,27 @@ class GPUModelRunner( ] return logits_indices_padded - def _batch_mm_kwargs_from_scheduler( + def _batch_mm_inputs_from_scheduler( self, scheduler_output: "SchedulerOutput", - ) -> tuple[list[MultiModalKwargsItem], list[tuple[str, PlaceholderRange]]]: - """Batch multimodal kwargs from scheduled encoder inputs. + ) -> tuple[list[str], list[MultiModalKwargsItem]]: + """Batch multimodal inputs from scheduled encoder inputs. Args: scheduler_output: The scheduler output containing scheduled encoder inputs. Returns: - A tuple of (mm_kwargs, req_ids_pos) where: - - mm_kwargs: List of multimodal kwargs items to be batched - - mm_hashes_pos: List of (mm_hash, position_info) tuples + A tuple of (mm_hashes, mm_kwargs) where: + - mm_hashes: List of multimodal hashes for each item + - mm_kwargs: List of multimodal kwargs for each item """ scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs if not scheduled_encoder_inputs: return [], [] - # Batch the multi-modal inputs. + + mm_hashes = list[str]() mm_kwargs = list[MultiModalKwargsItem]() - # list of tuple (mm_hash, position_info) - mm_hashes_pos = list[tuple[str, PlaceholderRange]]() for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): req_state = self.requests[req_id] @@ -2126,19 +2122,16 @@ class GPUModelRunner( mm_feature = req_state.mm_features[mm_input_id] if mm_feature.data is None: continue - mm_hash = mm_feature.identifier - mm_kwargs.append(mm_feature.data) - mm_hashes_pos.append((mm_hash, mm_feature.mm_position)) - return mm_kwargs, mm_hashes_pos + mm_hashes.append(mm_feature.identifier) + mm_kwargs.append(mm_feature.data) + + return mm_hashes, mm_kwargs def _execute_mm_encoder( self, scheduler_output: "SchedulerOutput" ) -> list[torch.Tensor]: - # Batch the multi-modal inputs using the helper method. 
- mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler( - scheduler_output - ) + mm_hashes, mm_kwargs = self._batch_mm_inputs_from_scheduler(scheduler_output) if not mm_kwargs: return [] @@ -2157,7 +2150,7 @@ class GPUModelRunner( device=self.device, pin_memory=self.pin_memory, ): - curr_group_outputs: list[torch.Tensor] = [] + curr_group_outputs: MultiModalEmbeddings # EVS-related change. # (ekhvedchenia): Temporary hack to limit peak memory usage when @@ -2173,6 +2166,7 @@ class GPUModelRunner( and modality == "video" and num_items > 1 ): + curr_group_outputs_lst = list[torch.Tensor]() for video_mm_kwargs_item in filter( lambda item: item.modality == "video", mm_kwargs ): @@ -2188,7 +2182,9 @@ class GPUModelRunner( **micro_batch_mm_inputs ) - curr_group_outputs.extend(micro_batch_outputs) + curr_group_outputs_lst.extend(micro_batch_outputs) + + curr_group_outputs = curr_group_outputs_lst else: # Run the encoder. # `curr_group_outputs` is either of the following: @@ -2197,7 +2193,7 @@ class GPUModelRunner( # 2. A list or tuple (length: num_items) of tensors, # each of shape (feature_size, hidden_size) in case the feature # size is dynamic depending on the input multimodal items. 
- curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) # type: ignore[assignment] + curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) sanity_check_mm_encoder_outputs( curr_group_outputs, @@ -2206,7 +2202,7 @@ class GPUModelRunner( encoder_outputs.extend(curr_group_outputs) # Cache the encoder outputs by mm_hash - for (mm_hash, pos_info), output in zip(mm_hashes_pos, encoder_outputs): + for mm_hash, output in zip(mm_hashes, encoder_outputs): self.encoder_cache[mm_hash] = output logger.debug("Finish execute for mm hash %s", mm_hash) self.maybe_save_ec_to_connector(self.encoder_cache, mm_hash) @@ -2457,6 +2453,17 @@ class GPUModelRunner( return round_up(num_scheduled_tokens, tp_size) return num_scheduled_tokens + def _prepare_mm_inputs( + self, num_tokens: int + ) -> tuple[torch.Tensor | None, torch.Tensor]: + if self.model.requires_raw_input_tokens: + input_ids = self.input_ids.gpu[:num_tokens] + else: + input_ids = None + + inputs_embeds = self.inputs_embeds.gpu[:num_tokens] + return input_ids, inputs_embeds + def _preprocess( self, scheduler_output: "SchedulerOutput", @@ -2499,8 +2506,7 @@ class GPUModelRunner( # TODO(woosuk): Avoid the copy. Optimize. 
self.inputs_embeds.gpu[:num_scheduled_tokens].copy_(inputs_embeds_scheduled) - input_ids = None - inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens] + input_ids, inputs_embeds = self._prepare_mm_inputs(num_input_tokens) model_kwargs = { **self._init_model_kwargs(num_scheduled_tokens), **self._extract_mm_kwargs(scheduler_output), @@ -4220,8 +4226,8 @@ class GPUModelRunner( assert num_tokens_padded <= self.max_num_tokens model_kwargs = self._init_model_kwargs(num_tokens_padded) if self.supports_mm_inputs and not self.model_config.is_encoder_decoder: - input_ids = None - inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded] + input_ids, inputs_embeds = self._prepare_mm_inputs(num_tokens_padded) + model_kwargs = { **model_kwargs, **self._dummy_mm_kwargs(num_reqs), diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 4747388e22b3d..68fe0853370f7 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -387,6 +387,19 @@ class Worker(WorkerBase): def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: return self.model_runner.get_kv_cache_spec() + def update_max_model_len(self, max_model_len: int) -> None: + """Update max_model_len after auto-fit to GPU memory. + + This is called when max_model_len=-1 is used and the engine + automatically determines the maximum context length that fits + in GPU memory. Workers need to update their cached max_model_len + to match the engine's decision. 
+ """ + self.model_config.max_model_len = max_model_len + if self.model_runner is not None: + self.model_runner.max_model_len = max_model_len + logger.debug("Updated max_model_len to %d", max_model_len) + def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: """Allocate GPU KV cache with the specified kv_cache_config.""" diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index 2bcc87b63bcdf..7bb4ebe476ecf 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -7,9 +7,7 @@ Define KV connector functionality mixin for model runners. import copy from collections.abc import Generator from contextlib import AbstractContextManager, contextmanager, nullcontext -from typing import ( - TYPE_CHECKING, # noqa: UP035 -) +from typing import TYPE_CHECKING import torch