mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-01 04:47:05 +08:00
Merge branch 'main' into fix-blockstored-kvevent
This commit is contained in:
commit
62d79499de
@ -2,7 +2,7 @@
|
||||
# We can use this script to compute baseline accuracy on chartqa for vllm.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install lm-eval==0.4.9
|
||||
# pip install "lm-eval[api]>=0.4.9.2"
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# We can use this script to compute baseline accuracy on GSM for transformers.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
||||
# pip install "lm-eval[api]>=0.4.9.2"
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
# We use this for fp8, which HF does not support.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
||||
# pip install "lm-eval[api]>=0.4.9.2"
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
# We use this for fp8, which HF does not support.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
||||
# pip install "lm-eval[api]>=0.4.9.2"
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
|
||||
@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
|
||||
echo "--- Installing Python dependencies ---"
|
||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
||||
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
|
||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||
echo "--- Python dependencies installed ---"
|
||||
|
||||
|
||||
@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
|
||||
echo "--- Installing Python dependencies ---"
|
||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
||||
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
|
||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||
echo "--- Python dependencies installed ---"
|
||||
|
||||
|
||||
@ -162,7 +162,10 @@ steps:
|
||||
- tests/entrypoints/test_chat_utils
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/test_vision_embeds.py
|
||||
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
|
||||
# TODO: Remove after next torch update
|
||||
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s entrypoints/openai/test_vision_embeds.py
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
|
||||
- label: Entrypoints Integration Test (API Server 2)
|
||||
@ -219,6 +222,9 @@ steps:
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
- tests/distributed/test_symm_mem_allreduce.py
|
||||
commands:
|
||||
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
|
||||
# TODO: Remove when the bug is fixed in a future ROCm release
|
||||
- export TORCH_NCCL_BLOCKING_WAIT=1
|
||||
# test with torchrun tp=2 and external_dp=2
|
||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with torchrun tp=2 and pp=2
|
||||
@ -267,9 +273,10 @@ steps:
|
||||
- vllm/v1/executor/uniproc_executor.py
|
||||
- vllm/v1/worker/gpu_worker.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
#- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
# test with torchrun tp=2 and dp=4 with ep
|
||||
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
|
||||
# TODO: Remove when the bug is fixed in a future ROCm release
|
||||
- export TORCH_NCCL_BLOCKING_WAIT=1
|
||||
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
||||
|
||||
- label: EPLB Algorithm Test # 5min
|
||||
@ -979,7 +986,10 @@ steps:
|
||||
- export MIOPEN_DEBUG_CONV_GEMM=0
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py
|
||||
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
|
||||
# TODO: Remove after next torch update
|
||||
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 5min
|
||||
@ -1288,6 +1298,9 @@ steps:
|
||||
- tests/v1/shutdown
|
||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||
commands:
|
||||
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
|
||||
# TODO: Remove when the bug is fixed in a future ROCm release
|
||||
- export TORCH_NCCL_BLOCKING_WAIT=1
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
@ -1341,7 +1354,9 @@ steps:
|
||||
# end platform plugin tests
|
||||
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
|
||||
- pip install -e ./plugins/prithvi_io_processor_plugin
|
||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
|
||||
# TODO: Remove after next torch update
|
||||
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||
- pip uninstall prithvi_io_processor_plugin -y
|
||||
# end io_processor plugins test
|
||||
# begin stat_logger plugins test
|
||||
@ -1510,7 +1525,7 @@ steps:
|
||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### B200 test #####
|
||||
|
||||
10
csrc/cache.h
10
csrc/cache.h
@ -9,16 +9,6 @@
|
||||
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
const torch::Tensor& block_mapping);
|
||||
|
||||
// Note: the key_caches and value_caches vectors are constant but
|
||||
// not the Tensors they contain. The vectors need to be const refs
|
||||
// in order to satisfy pytorch's C++ operator registration code.
|
||||
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
|
||||
std::vector<torch::Tensor> const& value_caches,
|
||||
const torch::Tensor& block_mapping);
|
||||
|
||||
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
|
||||
const torch::Tensor& block_mapping);
|
||||
|
||||
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
||||
torch::Tensor& key_cache, torch::Tensor& value_cache,
|
||||
torch::Tensor& slot_mapping,
|
||||
|
||||
@ -119,94 +119,6 @@ __global__ void copy_blocks_mla_kernel(
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
// Note: the key_caches and value_caches vectors are constant but
|
||||
// not the Tensors they contain. The vectors need to be const refs
|
||||
// in order to satisfy pytorch's C++ operator registration code.
|
||||
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
|
||||
std::vector<torch::Tensor> const& value_caches,
|
||||
const torch::Tensor& block_mapping) {
|
||||
int num_layers = key_caches.size();
|
||||
TORCH_CHECK(num_layers == value_caches.size());
|
||||
if (num_layers == 0) {
|
||||
return;
|
||||
}
|
||||
torch::Device cache_device = key_caches[0].device();
|
||||
TORCH_CHECK(cache_device.is_cuda());
|
||||
|
||||
// Create data structures for the kernel.
|
||||
// Create an array of pointers to the key and value caches.
|
||||
int64_t key_cache_ptrs[num_layers];
|
||||
int64_t value_cache_ptrs[num_layers];
|
||||
for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
|
||||
key_cache_ptrs[layer_idx] =
|
||||
reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
|
||||
value_cache_ptrs[layer_idx] =
|
||||
reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
|
||||
}
|
||||
|
||||
// block_mapping is a 2D tensor with shape (num_pairs, 2).
|
||||
int num_pairs = block_mapping.size(0);
|
||||
|
||||
// Move the data structures to the GPU.
|
||||
// NOTE: This synchronizes the CPU and GPU.
|
||||
torch::Tensor key_cache_ptrs_tensor =
|
||||
torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64)
|
||||
.to(cache_device);
|
||||
torch::Tensor value_cache_ptrs_tensor =
|
||||
torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64)
|
||||
.to(cache_device);
|
||||
|
||||
// Launch the kernel.
|
||||
const int numel_per_block = key_caches[0][0].numel();
|
||||
dim3 grid(num_layers, num_pairs);
|
||||
dim3 block(std::min(1024, numel_per_block));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(cache_device);
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
|
||||
key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
|
||||
vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
||||
key_cache_ptrs_tensor.data_ptr<int64_t>(),
|
||||
value_cache_ptrs_tensor.data_ptr<int64_t>(),
|
||||
block_mapping.data_ptr<int64_t>(), numel_per_block);
|
||||
}));
|
||||
}
|
||||
|
||||
// copy blocks kernel for MLA (assumes a joint KV-cache)
|
||||
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
|
||||
const torch::Tensor& block_mapping) {
|
||||
int num_layers = kv_caches.size();
|
||||
if (num_layers == 0) {
|
||||
return;
|
||||
}
|
||||
torch::Device cache_device = kv_caches[0].device();
|
||||
TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA");
|
||||
|
||||
std::vector<int64_t> cache_ptrs(num_layers);
|
||||
for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
|
||||
cache_ptrs[layer_idx] =
|
||||
reinterpret_cast<int64_t>(kv_caches[layer_idx].data_ptr());
|
||||
}
|
||||
torch::Tensor cache_ptrs_tensor =
|
||||
torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64)
|
||||
.to(cache_device);
|
||||
|
||||
int num_pairs = block_mapping.size(0);
|
||||
// We use the stride instead of numel in case the cache is padded for memory
|
||||
// alignment reasons, we assume the blocks data (inclusive of any padding)
|
||||
// is contiguous in memory
|
||||
int mem_footprint_per_block = kv_caches[0].stride(0);
|
||||
dim3 grid(num_layers, num_pairs);
|
||||
dim3 block(std::min(1024, mem_footprint_per_block));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(cache_device);
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
|
||||
kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] {
|
||||
vllm::copy_blocks_mla_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
||||
cache_ptrs_tensor.data_ptr<int64_t>(),
|
||||
block_mapping.data_ptr<int64_t>(), mem_footprint_per_block);
|
||||
}));
|
||||
}
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Used to copy/convert one element
|
||||
@ -539,9 +451,6 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
||||
for (int i = 0; i < VEC_SIZE; i++) {
|
||||
amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
|
||||
}
|
||||
#ifndef USE_ROCM
|
||||
__syncwarp();
|
||||
#endif
|
||||
|
||||
// Reduced amax
|
||||
for (int mask = 16; mask > 0; mask /= 2) {
|
||||
@ -551,9 +460,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
||||
amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
|
||||
#endif
|
||||
}
|
||||
#ifndef USE_ROCM
|
||||
__syncwarp();
|
||||
#endif
|
||||
|
||||
#if defined(__gfx942__)
|
||||
float scale = fmaxf(amax, 1e-4) / 224.0f;
|
||||
#else
|
||||
|
||||
@ -24,6 +24,8 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
||||
#ifndef VLLM_NUMA_DISABLED
|
||||
std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
||||
bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
|
||||
TORCH_CHECK(omp_cpu_mask != nullptr,
|
||||
"Failed to parse CPU string: " + cpu_ids);
|
||||
TORCH_CHECK(omp_cpu_mask->size > 0);
|
||||
std::vector<int> omp_cpu_ids;
|
||||
omp_cpu_ids.reserve(omp_cpu_mask->size);
|
||||
@ -44,20 +46,12 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
||||
|
||||
// Memory node binding
|
||||
if (numa_available() != -1) {
|
||||
int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front());
|
||||
std::set<int> node_ids;
|
||||
for (const auto& cpu_id : omp_cpu_ids) {
|
||||
int node_id = numa_node_of_cpu(cpu_id);
|
||||
if (node_id != -1) {
|
||||
node_ids.insert(node_id);
|
||||
}
|
||||
if (node_id != mem_node_id) {
|
||||
TORCH_WARN("CPU ", cpu_id, " is on NUMA node ", node_id, ", but CPU ",
|
||||
omp_cpu_ids.front(), " is on NUMA node ", mem_node_id,
|
||||
". All CPUs should be on the same NUMA node for optimal "
|
||||
"performance. Memory will be bound to NUMA node ",
|
||||
mem_node_id, ".");
|
||||
}
|
||||
}
|
||||
// Concatenate all node_ids into a single comma-separated string
|
||||
if (!node_ids.empty()) {
|
||||
@ -70,7 +64,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
||||
}
|
||||
|
||||
bitmask* mask = numa_parse_nodestring(node_ids_str.c_str());
|
||||
bitmask* src_mask = numa_get_membind();
|
||||
bitmask* src_mask = numa_get_mems_allowed();
|
||||
|
||||
int pid = getpid();
|
||||
|
||||
@ -83,15 +77,46 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
||||
std::to_string(errno));
|
||||
}
|
||||
|
||||
// restrict memory allocation node.
|
||||
numa_set_membind(mask);
|
||||
// Restrict memory allocation to the selected NUMA node(s).
|
||||
// Enhances memory locality for the threads bound to those NUMA CPUs.
|
||||
if (node_ids.size() > 1) {
|
||||
errno = 0;
|
||||
numa_set_interleave_mask(mask);
|
||||
if (errno != 0) {
|
||||
TORCH_WARN("numa_set_interleave_mask failed. errno: " +
|
||||
std::to_string(errno));
|
||||
} else {
|
||||
TORCH_WARN(
|
||||
"NUMA binding: Using INTERLEAVE policy for memory "
|
||||
"allocation across multiple NUMA nodes (nodes: " +
|
||||
node_ids_str +
|
||||
"). Memory allocations will be "
|
||||
"interleaved across the specified NUMA nodes.");
|
||||
}
|
||||
} else {
|
||||
errno = 0;
|
||||
numa_set_membind(mask);
|
||||
if (errno != 0) {
|
||||
TORCH_WARN("numa_set_membind failed. errno: " +
|
||||
std::to_string(errno));
|
||||
} else {
|
||||
TORCH_WARN(
|
||||
"NUMA binding: Using MEMBIND policy for memory "
|
||||
"allocation on the NUMA nodes (" +
|
||||
node_ids_str +
|
||||
"). Memory allocations will be "
|
||||
"strictly bound to these NUMA nodes.");
|
||||
}
|
||||
}
|
||||
|
||||
numa_set_strict(1);
|
||||
|
||||
numa_free_nodemask(mask);
|
||||
numa_free_nodemask(src_mask);
|
||||
} else {
|
||||
TORCH_WARN("numa_parse_nodestring or numa_get_membind failed. errno: " +
|
||||
std::to_string(errno));
|
||||
TORCH_WARN(
|
||||
"numa_parse_nodestring or numa_get_run_node_mask failed. errno: " +
|
||||
std::to_string(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -35,7 +35,7 @@ template <typename Int>
|
||||
__host__ __device__ inline Int round_up(Int x, Int y) {
|
||||
static_assert(std::is_integral_v<Int>,
|
||||
"round_up argument must be integral type");
|
||||
return (x + y - 1) / y * y;
|
||||
return ((x + y - 1) / y) * y;
|
||||
}
|
||||
|
||||
// Compute effective rows for grid configuration with swizzled SF layouts.
|
||||
@ -61,37 +61,47 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
int sf_m = round_up<int>(numRows, 128);
|
||||
int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
|
||||
int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
|
||||
for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) {
|
||||
// Each thread writes 4 uint32_t elements.
|
||||
for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int;
|
||||
col += blockDim.x * 4) {
|
||||
SFout[row * sf_n_int + col] = 0x00;
|
||||
}
|
||||
}
|
||||
int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE;
|
||||
|
||||
// Get the global scaling factor, which will be applied to the SF.
|
||||
// Note SFScale is the same as next GEMM's alpha, which is
|
||||
// (448.f / (Alpha_A / 6.f)).
|
||||
float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
|
||||
|
||||
// Input tensor row/col loops.
|
||||
for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
|
||||
for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
|
||||
// Iterate over all rows and cols including padded ones -
|
||||
// ensures we visit every single scale factor address to initialize it.
|
||||
for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) {
|
||||
for (int colIdx = threadIdx.x;
|
||||
colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD;
|
||||
colIdx += blockDim.x) {
|
||||
int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
|
||||
|
||||
PackedVec in_vec;
|
||||
int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
|
||||
PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
|
||||
// Get the output tensor offset.
|
||||
// Same as inOffset because 8 elements are packed into one uint32_t.
|
||||
int64_t outOffset = inOffset;
|
||||
auto& out_pos = out[outOffset];
|
||||
|
||||
// If we are outside valid rows OR outside valid columns -> Use Zeros
|
||||
if (rowIdx >= numRows || elem_idx >= numCols) {
|
||||
memset(&in_vec, 0, sizeof(PackedVec));
|
||||
|
||||
} else {
|
||||
// Valid Region: Load actual data
|
||||
in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
|
||||
}
|
||||
|
||||
auto sf_out =
|
||||
cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
|
||||
CVT_FP4_NUM_THREADS_PER_SF>(
|
||||
rowIdx, colIdx, numKTiles, SFout);
|
||||
|
||||
out_pos =
|
||||
auto out_val =
|
||||
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
|
||||
|
||||
// We do NOT write output for padding because the 'out' tensor is not
|
||||
// padded.
|
||||
if (rowIdx < numRows && elem_idx < numCols) {
|
||||
// Same as inOffset because 8 elements are packed into one uint32_t.
|
||||
out[inOffset] = out_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -134,4 +144,4 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
|
||||
m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr),
|
||||
reinterpret_cast<uint32_t*>(sf_out));
|
||||
});
|
||||
}
|
||||
}
|
||||
@ -685,16 +685,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
||||
"swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
|
||||
cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks);
|
||||
|
||||
// Copy the cache blocks from src to dst.
|
||||
cache_ops.def(
|
||||
"copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
|
||||
"Tensor block_mapping) -> ()");
|
||||
cache_ops.impl("copy_blocks", torch::kCUDA, ©_blocks);
|
||||
|
||||
cache_ops.def(
|
||||
"copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()");
|
||||
cache_ops.impl("copy_blocks_mla", torch::kCUDA, ©_blocks_mla);
|
||||
|
||||
// Reshape the key and value tensors and cache them.
|
||||
cache_ops.def(
|
||||
"reshape_and_cache(Tensor key, Tensor value,"
|
||||
|
||||
@ -183,7 +183,7 @@ ARG nvcc_threads=8
|
||||
ENV NVCC_THREADS=$nvcc_threads
|
||||
|
||||
ARG USE_SCCACHE
|
||||
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
|
||||
ARG SCCACHE_DOWNLOAD_URL
|
||||
ARG SCCACHE_ENDPOINT
|
||||
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
|
||||
ARG SCCACHE_REGION_NAME=us-west-2
|
||||
@ -201,10 +201,16 @@ ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
echo "Installing sccache..." \
|
||||
&& case "${TARGETPLATFORM}" in \
|
||||
linux/arm64) SCCACHE_ARCH="aarch64" ;; \
|
||||
linux/amd64) SCCACHE_ARCH="x86_64" ;; \
|
||||
*) echo "Unsupported TARGETPLATFORM for sccache: ${TARGETPLATFORM}" >&2; exit 1 ;; \
|
||||
esac \
|
||||
&& export SCCACHE_DOWNLOAD_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
|
||||
&& curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
|
||||
&& tar -xzf sccache.tar.gz \
|
||||
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
|
||||
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
|
||||
&& sudo mv sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
|
||||
&& rm -rf sccache.tar.gz sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl \
|
||||
&& if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
|
||||
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
|
||||
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
|
||||
|
||||
@ -2,4 +2,4 @@
|
||||
|
||||
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
|
||||
|
||||
Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe.
|
||||
You can use vLLM with KServe's [Hugging Face serving runtime](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) or via [`LLMInferenceService` that uses llm-d](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).
|
||||
|
||||
5
docs/deployment/integrations/llm-d.md
Normal file
5
docs/deployment/integrations/llm-d.md
Normal file
@ -0,0 +1,5 @@
|
||||
# llm-d
|
||||
|
||||
vLLM can be deployed with [llm-d](https://github.com/llm-d/llm-d), a Kubernetes-native distributed inference serving stack providing well-lit paths for anyone to serve large generative AI models at scale. It helps achieve the fastest "time to state-of-the-art (SOTA) performance" for key OSS models across most hardware accelerators and infrastructure providers.
|
||||
|
||||
You can use vLLM with llm-d directly by following [this guide](https://llm-d.ai/docs/guide) or via [KServe's LLMInferenceService](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).
|
||||
@ -12,6 +12,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
||||
|
||||
- [Helm](frameworks/helm.md)
|
||||
- [InftyAI/llmaz](integrations/llmaz.md)
|
||||
- [llm-d](integrations/llm-d.md)
|
||||
- [KAITO](integrations/kaito.md)
|
||||
- [KServe](integrations/kserve.md)
|
||||
- [Kthena](integrations/kthena.md)
|
||||
|
||||
@ -84,7 +84,7 @@ Since simple RTN does not require data for weight quantization and the activatio
|
||||
Install `vllm` and `lm-evaluation-harness` for evaluation:
|
||||
|
||||
```bash
|
||||
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
||||
pip install vllm "lm-eval[api]>=0.4.9.2"
|
||||
```
|
||||
|
||||
Load and run the model in `vllm`:
|
||||
|
||||
@ -18,7 +18,7 @@ pip install llmcompressor
|
||||
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
|
||||
|
||||
```bash
|
||||
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
||||
pip install vllm "lm-eval[api]>=0.4.9.2"
|
||||
```
|
||||
|
||||
## Quantization Process
|
||||
|
||||
@ -23,7 +23,7 @@ pip install llmcompressor
|
||||
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
|
||||
|
||||
```bash
|
||||
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
||||
pip install vllm "lm-eval[api]>=0.4.9.2"
|
||||
```
|
||||
|
||||
## Quantization Process
|
||||
|
||||
@ -20,7 +20,7 @@ for more installation details.
|
||||
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
|
||||
|
||||
```bash
|
||||
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
||||
pip install vllm "lm-eval[api]>=0.4.9.2"
|
||||
```
|
||||
|
||||
## Quantization Process
|
||||
|
||||
@ -490,6 +490,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
|
||||
| `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | |
|
||||
| `ModernBertModel`<sup>C</sup> | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | |
|
||||
| `NomicBertModel`<sup>C</sup> | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | |
|
||||
| `LlamaBidirectionalModel`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-embed-1b-v2`, etc. | ✅︎ | ✅︎ |
|
||||
| `LlamaModel`<sup>C</sup>, `LlamaForCausalLM`<sup>C</sup>, `MistralModel`<sup>C</sup>, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2Model`<sup>C</sup>, `Qwen2ForCausalLM`<sup>C</sup> | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3Model`<sup>C</sup>, `Qwen3ForCausalLM`<sup>C</sup> | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ |
|
||||
@ -543,8 +544,9 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
|
||||
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | |
|
||||
| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ |
|
||||
| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | | |
|
||||
| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ |
|
||||
| `LlamaBidirectionalForSequenceClassification`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2` (see note), etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2ForSequenceClassification`<sup>C</sup> | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3ForSequenceClassification`<sup>C</sup> | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ |
|
||||
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | |
|
||||
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | |
|
||||
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
|
||||
@ -562,6 +564,11 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
|
||||
!!! note
|
||||
The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. The name `NewForSequenceClassification` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to specify the use of the `GteNewForSequenceClassification` architecture.
|
||||
|
||||
!!! note
|
||||
`nvidia/llama-nemotron-rerank-1b-v2` require a specific prompt format to work correctly.
|
||||
|
||||
Examples : [offline_using_template.py](../../examples/pooling/score/offline_using_template.py) [online_using_template.py](../../examples/pooling/score/online_using_template.py)
|
||||
|
||||
!!! note
|
||||
Load the official original `mxbai-rerank-v2` by using the following command.
|
||||
|
||||
|
||||
@ -669,6 +669,21 @@ You can find the documentation for cross encoder models at [sbert.net](https://w
|
||||
|
||||
Code example: [examples/pooling/score/openai_cross_encoder_score.py](../../examples/pooling/score/openai_cross_encoder_score.py)
|
||||
|
||||
#### Score Template
|
||||
|
||||
Some scoring models require a specific prompt format to work correctly. You can specify a custom score template using the `--chat-template` parameter (see [Chat Template](#chat-template)).
|
||||
|
||||
Score templates are supported for **cross-encoder** models only. If you are using an **embedding** model for scoring, vLLM does not apply a score template.
|
||||
|
||||
Like chat templates, the score template receives a `messages` list. For scoring, each message has a `role` attribute—either `"query"` or `"document"`. For the usual kind of point-wise cross-encoder, you can expect exactly two messages: one query and one document. To access the query and document content, use Jinja's `selectattr` filter:
|
||||
|
||||
- **Query**: `{{ (messages | selectattr("role", "eq", "query") | first).content }}`
|
||||
- **Document**: `{{ (messages | selectattr("role", "eq", "document") | first).content }}`
|
||||
|
||||
This approach is more robust than index-based access (`messages[0]`, `messages[1]`) because it selects messages by their semantic role. It also avoids assumptions about message ordering if additional message types are added to `messages` in the future.
|
||||
|
||||
Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja)
|
||||
|
||||
#### Single inference
|
||||
|
||||
You can pass a string to both `text_1` and `text_2`, forming a single sentence pair.
|
||||
|
||||
27
examples/pooling/score/offline_using_template.py
Normal file
27
examples/pooling/score/offline_using_template.py
Normal file
@ -0,0 +1,27 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# ruff: noqa: E501
|
||||
from pathlib import Path
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
model_name = "nvidia/llama-nemotron-rerank-1b-v2"
|
||||
|
||||
# Path to template file
|
||||
template_path = Path(__file__).parent / "template" / "nemotron-rerank.jinja"
|
||||
chat_template = template_path.read_text()
|
||||
|
||||
llm = LLM(model=model_name, runner="pooling", trust_remote_code=True)
|
||||
|
||||
query = "how much protein should a female eat?"
|
||||
documents = [
|
||||
"As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
|
||||
"Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
|
||||
"Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.",
|
||||
]
|
||||
|
||||
outputs = llm.score(query, documents, chat_template=chat_template)
|
||||
|
||||
print("-" * 30)
|
||||
print([output.outputs.score for output in outputs])
|
||||
print("-" * 30)
|
||||
46
examples/pooling/score/online_using_template.py
Normal file
46
examples/pooling/score/online_using_template.py
Normal file
@ -0,0 +1,46 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# ruff: noqa: E501
|
||||
"""
|
||||
Example of using the rerank API with template.
|
||||
|
||||
run:
|
||||
vllm serve nvidia/llama-nemotron-rerank-1b-v2 --runner pooling --trust-remote-code --chat-template examples/pooling/score/template/nemotron-rerank.jinja
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
import requests
|
||||
|
||||
url = "http://127.0.0.1:8000/rerank"
|
||||
|
||||
headers = {"accept": "application/json", "Content-Type": "application/json"}
|
||||
|
||||
query = "how much protein should a female eat?"
|
||||
documents = [
|
||||
"As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
|
||||
"Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
|
||||
"Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.",
|
||||
]
|
||||
|
||||
data = {
|
||||
"model": "nvidia/llama-nemotron-rerank-1b-v2",
|
||||
"query": query,
|
||||
"documents": documents,
|
||||
}
|
||||
|
||||
|
||||
def main():
|
||||
response = requests.post(url, headers=headers, json=data)
|
||||
|
||||
# Check the response
|
||||
if response.status_code == 200:
|
||||
print("Request successful!")
|
||||
print(json.dumps(response.json(), indent=2))
|
||||
else:
|
||||
print(f"Request failed with status code: {response.status_code}")
|
||||
print(response.text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
3
examples/pooling/score/template/nemotron-rerank.jinja
Normal file
3
examples/pooling/score/template/nemotron-rerank.jinja
Normal file
@ -0,0 +1,3 @@
|
||||
question:{{ (messages | selectattr("role", "eq", "query") | first).content }}
|
||||
|
||||
passage:{{ (messages | selectattr("role", "eq", "document") | first).content }}
|
||||
@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.8.5 # required for voxtral test
|
||||
num2words # required for smolvlm test
|
||||
opencv-python-headless >= 4.11.0 # required for video test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
|
||||
lm-eval[api]>=0.4.9.2 # required for model evaluation test
|
||||
mteb>=1.38.11, <2 # required for mteb test
|
||||
transformers==4.57.3
|
||||
tokenizers==0.22.0
|
||||
|
||||
@ -58,7 +58,7 @@ schemathesis==3.39.15
|
||||
# OpenAI schema test
|
||||
|
||||
# Evaluation and benchmarking
|
||||
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
|
||||
lm-eval[api]>=0.4.9.2
|
||||
jiwer==4.0.0
|
||||
|
||||
# Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
|
||||
|
||||
@ -34,8 +34,7 @@ num2words # required for smolvlm test
|
||||
open_clip_torch==2.32.0 # Required for nemotron_vl test
|
||||
opencv-python-headless >= 4.11.0 # required for video test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
# TODO: Use lm-eval[api]==0.4.10 once released
|
||||
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
|
||||
lm-eval[api]>=0.4.9.2 # required for model evaluation test
|
||||
mteb[bm25s]>=2, <3 # required for mteb test
|
||||
transformers==4.57.3
|
||||
tokenizers==0.22.0
|
||||
|
||||
@ -441,7 +441,7 @@ lightning-utilities==0.14.3
|
||||
# torchmetrics
|
||||
llvmlite==0.44.0
|
||||
# via numba
|
||||
lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
|
||||
lm-eval==0.4.9.2
|
||||
# via -r requirements/test.in
|
||||
lxml==5.3.0
|
||||
# via
|
||||
|
||||
152
setup.py
152
setup.py
@ -50,15 +50,15 @@ elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin"))
|
||||
sys.platform,
|
||||
)
|
||||
VLLM_TARGET_DEVICE = "empty"
|
||||
elif (
|
||||
sys.platform.startswith("linux")
|
||||
and torch.version.cuda is None
|
||||
and os.getenv("VLLM_TARGET_DEVICE") is None
|
||||
and torch.version.hip is None
|
||||
):
|
||||
# if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
|
||||
# fallback to cpu
|
||||
VLLM_TARGET_DEVICE = "cpu"
|
||||
elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is None:
|
||||
if torch.version.hip is not None:
|
||||
VLLM_TARGET_DEVICE = "rocm"
|
||||
logger.info("Auto-detected ROCm")
|
||||
elif torch.version.cuda is not None:
|
||||
VLLM_TARGET_DEVICE = "cuda"
|
||||
logger.info("Auto-detected CUDA")
|
||||
else:
|
||||
VLLM_TARGET_DEVICE = "cpu"
|
||||
|
||||
|
||||
def is_sccache_available() -> bool:
|
||||
@ -108,20 +108,26 @@ class cmake_build_ext(build_ext):
|
||||
num_jobs = os.cpu_count()
|
||||
|
||||
nvcc_threads = None
|
||||
if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"):
|
||||
# `nvcc_threads` is either the value of the NVCC_THREADS
|
||||
# environment variable (if defined) or 1.
|
||||
# when it is set, we reduce `num_jobs` to avoid
|
||||
# overloading the system.
|
||||
nvcc_threads = envs.NVCC_THREADS
|
||||
if nvcc_threads is not None:
|
||||
nvcc_threads = int(nvcc_threads)
|
||||
logger.info(
|
||||
"Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads
|
||||
)
|
||||
else:
|
||||
nvcc_threads = 1
|
||||
num_jobs = max(1, num_jobs // nvcc_threads)
|
||||
if _is_cuda() and CUDA_HOME is not None:
|
||||
try:
|
||||
nvcc_version = get_nvcc_cuda_version()
|
||||
if nvcc_version >= Version("11.2"):
|
||||
# `nvcc_threads` is either the value of the NVCC_THREADS
|
||||
# environment variable (if defined) or 1.
|
||||
# when it is set, we reduce `num_jobs` to avoid
|
||||
# overloading the system.
|
||||
nvcc_threads = envs.NVCC_THREADS
|
||||
if nvcc_threads is not None:
|
||||
nvcc_threads = int(nvcc_threads)
|
||||
logger.info(
|
||||
"Using NVCC_THREADS=%d as the number of nvcc threads.",
|
||||
nvcc_threads,
|
||||
)
|
||||
else:
|
||||
nvcc_threads = 1
|
||||
num_jobs = max(1, num_jobs // nvcc_threads)
|
||||
except Exception as e:
|
||||
logger.warning("Failed to get NVCC version: %s", e)
|
||||
|
||||
return num_jobs, nvcc_threads
|
||||
|
||||
@ -199,9 +205,9 @@ class cmake_build_ext(build_ext):
|
||||
# Default build tool to whatever cmake picks.
|
||||
build_tool = []
|
||||
# Make sure we use the nvcc from CUDA_HOME
|
||||
if _is_cuda():
|
||||
if _is_cuda() and CUDA_HOME is not None:
|
||||
cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
|
||||
elif _is_hip():
|
||||
elif _is_hip() and ROCM_HOME is not None:
|
||||
cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]
|
||||
|
||||
other_cmake_args = os.environ.get("CMAKE_ARGS")
|
||||
@ -339,6 +345,89 @@ class precompiled_wheel_utils:
|
||||
wheels = json.loads(resp.read().decode("utf-8"))
|
||||
return wheels, repo_url
|
||||
|
||||
@staticmethod
|
||||
def is_rocm_system() -> bool:
|
||||
"""Detect ROCm without relying on torch (for build environment)."""
|
||||
if os.getenv("ROCM_PATH"):
|
||||
return True
|
||||
if os.path.isdir("/opt/rocm"):
|
||||
return True
|
||||
if which("rocminfo") is not None:
|
||||
return True
|
||||
try:
|
||||
import torch
|
||||
|
||||
return torch.version.hip is not None
|
||||
except ImportError:
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def find_local_rocm_wheel() -> str | None:
|
||||
"""Search for a local vllm wheel in common locations."""
|
||||
import glob
|
||||
|
||||
for pattern in ["/vllm-workspace/dist/vllm-*.whl", "./dist/vllm-*.whl"]:
|
||||
wheels = glob.glob(pattern)
|
||||
if wheels:
|
||||
return sorted(wheels)[-1]
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def fetch_wheel_from_pypi_index(index_url: str, package: str = "vllm") -> str:
|
||||
"""Fetch the latest wheel URL from a PyPI-style simple index."""
|
||||
import platform
|
||||
from html.parser import HTMLParser
|
||||
from urllib.parse import urljoin
|
||||
from urllib.request import urlopen
|
||||
|
||||
arch = platform.machine()
|
||||
|
||||
class WheelLinkParser(HTMLParser):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.wheels = []
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag == "a":
|
||||
for name, value in attrs:
|
||||
if name == "href" and value.endswith(".whl"):
|
||||
self.wheels.append(value)
|
||||
|
||||
simple_url = f"{index_url.rstrip('/')}/{package}/"
|
||||
print(f"Fetching wheel list from {simple_url}")
|
||||
with urlopen(simple_url) as resp:
|
||||
html = resp.read().decode("utf-8")
|
||||
|
||||
parser = WheelLinkParser()
|
||||
parser.feed(html)
|
||||
|
||||
for wheel in reversed(parser.wheels):
|
||||
if arch in wheel:
|
||||
if wheel.startswith("http"):
|
||||
return wheel
|
||||
return urljoin(simple_url, wheel)
|
||||
|
||||
raise ValueError(f"No compatible wheel found for {arch} at {simple_url}")
|
||||
|
||||
@staticmethod
|
||||
def determine_wheel_url_rocm() -> tuple[str, str | None]:
|
||||
"""Determine the precompiled wheel for ROCm."""
|
||||
# Search for local wheel first
|
||||
local_wheel = precompiled_wheel_utils.find_local_rocm_wheel()
|
||||
if local_wheel is not None:
|
||||
print(f"Found local ROCm wheel: {local_wheel}")
|
||||
return local_wheel, None
|
||||
|
||||
# Fall back to AMD's PyPI index
|
||||
index_url = os.getenv(
|
||||
"VLLM_ROCM_WHEEL_INDEX", "https://pypi.amd.com/vllm-rocm/simple"
|
||||
)
|
||||
print(f"Fetching ROCm precompiled wheel from {index_url}")
|
||||
wheel_url = precompiled_wheel_utils.fetch_wheel_from_pypi_index(index_url)
|
||||
download_filename = wheel_url.split("/")[-1].split("#")[0]
|
||||
print(f"Using ROCm precompiled wheel: {wheel_url}")
|
||||
return wheel_url, download_filename
|
||||
|
||||
@staticmethod
|
||||
def determine_wheel_url() -> tuple[str, str | None]:
|
||||
"""
|
||||
@ -359,6 +448,11 @@ class precompiled_wheel_utils:
|
||||
print(f"Using user-specified precompiled wheel location: {wheel_location}")
|
||||
return wheel_location, None
|
||||
else:
|
||||
# ROCm: use local wheel or AMD's PyPI index
|
||||
# TODO: When we have ROCm nightly wheels, we can update this logic.
|
||||
if precompiled_wheel_utils.is_rocm_system():
|
||||
return precompiled_wheel_utils.determine_wheel_url_rocm()
|
||||
|
||||
import platform
|
||||
|
||||
arch = platform.machine()
|
||||
@ -465,6 +559,8 @@ class precompiled_wheel_utils:
|
||||
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
|
||||
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
|
||||
"vllm/cumem_allocator.abi3.so",
|
||||
# ROCm-specific libraries
|
||||
"vllm/_rocm_C.abi3.so",
|
||||
]
|
||||
|
||||
flash_attn_regex = re.compile(
|
||||
@ -601,6 +697,8 @@ def get_rocm_version():
|
||||
# Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
|
||||
# see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
|
||||
try:
|
||||
if ROCM_HOME is None:
|
||||
return None
|
||||
librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
|
||||
if not librocm_core_file.is_file():
|
||||
return None
|
||||
@ -745,7 +843,9 @@ if _is_hip():
|
||||
|
||||
if _is_cuda():
|
||||
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
|
||||
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
|
||||
if envs.VLLM_USE_PRECOMPILED or (
|
||||
CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
|
||||
):
|
||||
# FA3 requires CUDA 12.3 or later
|
||||
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
|
||||
# Optional since this doesn't get built (produce an .so file) when
|
||||
|
||||
@ -410,7 +410,7 @@ class HfRunner:
|
||||
|
||||
# don't put this import at the top level
|
||||
# it will call torch.cuda.device_count()
|
||||
from transformers import AutoProcessor # noqa: F401
|
||||
from transformers import AutoProcessor
|
||||
|
||||
self.processor = AutoProcessor.from_pretrained(
|
||||
model_name,
|
||||
|
||||
@ -511,6 +511,16 @@ def test_human_readable_model_len():
|
||||
args = parser.parse_args(["--max-model-len", "10.2123451234567t"])
|
||||
assert args.max_model_len == 10212345123456
|
||||
|
||||
# Special value -1 for auto-fit to GPU memory
|
||||
args = parser.parse_args(["--max-model-len", "-1"])
|
||||
assert args.max_model_len == -1
|
||||
|
||||
# 'auto' is an alias for -1
|
||||
args = parser.parse_args(["--max-model-len", "auto"])
|
||||
assert args.max_model_len == -1
|
||||
args = parser.parse_args(["--max-model-len", "AUTO"])
|
||||
assert args.max_model_len == -1
|
||||
|
||||
# Invalid (do not allow decimals with binary multipliers)
|
||||
for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]:
|
||||
with pytest.raises(ArgumentError):
|
||||
|
||||
@ -5,6 +5,30 @@ import pytest
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
|
||||
def add_attention_backend(server_args, attention_config):
|
||||
"""Append attention backend CLI arg if specified.
|
||||
|
||||
Args:
|
||||
server_args: List of server arguments to extend in-place.
|
||||
attention_config: Dict with 'backend' key, or None.
|
||||
"""
|
||||
if attention_config and "backend" in attention_config:
|
||||
server_args.extend(["--attention-backend", attention_config["backend"]])
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def rocm_aiter_fa_attention():
|
||||
"""Return attention config for transcription/translation tests on ROCm.
|
||||
|
||||
On ROCm, audio tests require ROCM_AITER_FA attention backend.
|
||||
"""
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
return {"backend": "ROCM_AITER_FA"}
|
||||
return None
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mary_had_lamb():
|
||||
path = AudioAsset("mary_had_lamb").get_local_path()
|
||||
|
||||
@ -15,7 +15,7 @@ MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server(): # noqa: F811
|
||||
def server():
|
||||
args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
|
||||
@ -8,7 +8,7 @@ import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.multimodal.utils import encode_audio_base64, fetch_audio
|
||||
from vllm.multimodal.utils import encode_audio_base64, encode_audio_url, fetch_audio
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
@ -53,6 +53,14 @@ def base64_encoded_audio() -> dict[str, str]:
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def url_encoded_audio() -> dict[str, str]:
|
||||
return {
|
||||
audio_url: encode_audio_url(*fetch_audio(audio_url))
|
||||
for audio_url in TEST_AUDIO_URLS
|
||||
}
|
||||
|
||||
|
||||
def dummy_messages_from_audio_url(
|
||||
audio_urls: str | list[str],
|
||||
content_text: str = "What's happening in this audio?",
|
||||
@ -149,11 +157,9 @@ async def test_single_chat_session_audio_base64encoded(
|
||||
client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
audio_url: str,
|
||||
base64_encoded_audio: dict[str, str],
|
||||
url_encoded_audio: dict[str, str],
|
||||
):
|
||||
messages = dummy_messages_from_audio_url(
|
||||
f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
|
||||
)
|
||||
messages = dummy_messages_from_audio_url(url_encoded_audio[audio_url])
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
|
||||
@ -28,7 +28,7 @@ def zephyr_lora_files():
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server(zephyr_lora_files): # noqa: F811
|
||||
def server(zephyr_lora_files):
|
||||
args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
@ -254,12 +254,11 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
|
||||
{"role": "system", "content": "you are a helpful assistant"},
|
||||
{"role": "user", "content": "what is 1+1?"},
|
||||
]
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
max_completion_tokens=5,
|
||||
logprobs=True,
|
||||
top_logprobs=5,
|
||||
)
|
||||
@ -267,13 +266,14 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
|
||||
assert len(chat_completion.choices) == 1
|
||||
|
||||
choice = chat_completion.choices[0]
|
||||
|
||||
assert choice.finish_reason == "length"
|
||||
assert chat_completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=10, prompt_tokens=37, total_tokens=47
|
||||
completion_tokens=5, prompt_tokens=37, total_tokens=42
|
||||
)
|
||||
|
||||
message = choice.message
|
||||
assert message.content is not None and len(message.content) >= 10
|
||||
assert message.content is not None and len(message.content) >= 5
|
||||
assert message.role == "assistant"
|
||||
messages.append({"role": "assistant", "content": message.content})
|
||||
|
||||
@ -282,7 +282,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
max_completion_tokens=5,
|
||||
)
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
@ -12,7 +12,7 @@ MODEL_NAME = "Qwen/QwQ-32B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server(): # noqa: F811
|
||||
def server():
|
||||
args = [
|
||||
"--max-model-len",
|
||||
"8192",
|
||||
|
||||
@ -125,7 +125,7 @@ messages = [
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server(): # noqa: F811
|
||||
def server():
|
||||
args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
@ -212,7 +212,7 @@ async def test_function_tool_use(
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def k2_server(): # noqa: F811
|
||||
def k2_server():
|
||||
args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
|
||||
@ -23,7 +23,7 @@ ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def multimodal_server(): # noqa: F811
|
||||
def multimodal_server():
|
||||
args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
|
||||
@ -8,7 +8,7 @@ from ...utils import RemoteOpenAIServer
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def chat_server_with_force_include_usage(request): # noqa: F811
|
||||
def chat_server_with_force_include_usage(request):
|
||||
args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
|
||||
@ -11,7 +11,7 @@ MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server(): # noqa: F811
|
||||
def server():
|
||||
args = [
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
|
||||
@ -39,6 +39,7 @@ def server(request: pytest.FixtureRequest):
|
||||
"2",
|
||||
*passed_params,
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -504,7 +504,11 @@ async def test_web_search(client: OpenAI, model_name: str):
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_code_interpreter(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
# Code interpreter may need more time for container init + code execution
|
||||
timeout_value = client.timeout * 3
|
||||
client_with_timeout = client.with_options(timeout=timeout_value)
|
||||
|
||||
response = await client_with_timeout.responses.create(
|
||||
model=model_name,
|
||||
# TODO: Ideally should be able to set max tool calls
|
||||
# to prevent multi-turn, but it is not currently supported
|
||||
@ -868,6 +872,7 @@ async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.flaky(reruns=3)
|
||||
async def test_function_call_with_previous_input_messages(
|
||||
client: OpenAI, model_name: str
|
||||
):
|
||||
|
||||
@ -37,7 +37,7 @@ def default_server_args(qwen3_lora_files):
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server_fixture(request, default_server_args): # noqa: F811
|
||||
def server_fixture(request, default_server_args):
|
||||
use_server_flag = request.param
|
||||
if use_server_flag:
|
||||
args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
|
||||
|
||||
@ -93,6 +93,7 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
|
||||
add_generation_prompt=True,
|
||||
enable_thinking=False, # default with Qwen3
|
||||
)
|
||||
|
||||
for ignore_eos in [True, False]:
|
||||
payload = {
|
||||
"model": MODEL_NAME,
|
||||
@ -108,9 +109,8 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
|
||||
}
|
||||
generate_resp = await client.post(GEN_ENDPOINT, json=payload)
|
||||
generate_data = generate_resp.json()
|
||||
generate_res = tokenizer.decode(
|
||||
generate_data["choices"][0]["token_ids"], skip_special_tokens=True
|
||||
)
|
||||
gen_token_ids = generate_data["choices"][0]["token_ids"]
|
||||
generate_res = tokenizer.decode(gen_token_ids, skip_special_tokens=True)
|
||||
|
||||
payload = {
|
||||
"model": MODEL_NAME,
|
||||
@ -119,12 +119,33 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
|
||||
"temperature": 0.0,
|
||||
"stream": False,
|
||||
"ignore_eos": ignore_eos,
|
||||
"chat_template_kwargs": dict(enable_thinking=False),
|
||||
"chat_template_kwargs": {"enable_thinking": False},
|
||||
}
|
||||
completions_resp = await client.post("/v1/chat/completions", json=payload)
|
||||
completions_data = completions_resp.json()
|
||||
completions_res = completions_data["choices"][0]["message"]["content"]
|
||||
|
||||
if ignore_eos:
|
||||
# When ignoring EOS, only compare up to the first EOS token
|
||||
# Post-EOS generation is undefined and may differ
|
||||
eos_tokens = {
|
||||
tokenizer.eos_token_id,
|
||||
*tokenizer.additional_special_tokens_ids,
|
||||
}
|
||||
# Find first EOS in generated tokens
|
||||
eos_pos = None
|
||||
for i, tid in enumerate(gen_token_ids):
|
||||
if tid in eos_tokens:
|
||||
eos_pos = i
|
||||
break
|
||||
if eos_pos is not None:
|
||||
gen_token_ids_truncated = gen_token_ids[:eos_pos]
|
||||
generate_res = tokenizer.decode(
|
||||
gen_token_ids_truncated, skip_special_tokens=True
|
||||
)
|
||||
# Truncate completions_res to same length for comparison
|
||||
completions_res = completions_res[: len(generate_res)]
|
||||
|
||||
assert generate_res == completions_res
|
||||
|
||||
|
||||
|
||||
@ -9,10 +9,16 @@ import time
|
||||
import openai
|
||||
import pytest
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
|
||||
|
||||
# GPU initialization might take take longer
|
||||
_IS_ROCM = current_platform.is_rocm()
|
||||
_SERVER_STARTUP_TIMEOUT = 120
|
||||
_PROCESS_EXIT_TIMEOUT = 15
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_shutdown_on_engine_failure():
|
||||
@ -45,9 +51,11 @@ async def test_shutdown_on_engine_failure():
|
||||
"2",
|
||||
"--disable-frontend-multiprocessing",
|
||||
],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
# ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when
|
||||
# stdout/stderr pipes are enabled during ROCm GPU initialization.
|
||||
stdout=None if _IS_ROCM else subprocess.PIPE,
|
||||
stderr=None if _IS_ROCM else subprocess.PIPE,
|
||||
text=None if _IS_ROCM else True,
|
||||
preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
|
||||
)
|
||||
|
||||
@ -61,7 +69,7 @@ async def test_shutdown_on_engine_failure():
|
||||
)
|
||||
|
||||
# Poll until server is ready
|
||||
while time.time() - start_time < 30:
|
||||
while time.time() - start_time < _SERVER_STARTUP_TIMEOUT:
|
||||
try:
|
||||
await client.completions.create(
|
||||
model=MODEL_NAME, prompt="Hello", max_tokens=1
|
||||
@ -70,14 +78,18 @@ async def test_shutdown_on_engine_failure():
|
||||
except Exception:
|
||||
time.sleep(0.5)
|
||||
if proc.poll() is not None:
|
||||
stdout, stderr = proc.communicate(timeout=1)
|
||||
pytest.fail(
|
||||
f"Server died during startup. stdout: {stdout}, stderr: {stderr}"
|
||||
)
|
||||
if _IS_ROCM:
|
||||
pytest.fail(f"Server died during startup: {proc.returncode}")
|
||||
else:
|
||||
stdout, stderr = proc.communicate(timeout=1)
|
||||
pytest.fail(
|
||||
f"Server died during startup. "
|
||||
f"stdout: {stdout}, stderr: {stderr}"
|
||||
)
|
||||
else:
|
||||
proc.terminate()
|
||||
proc.wait(timeout=5)
|
||||
pytest.fail("Server failed to start in 30 seconds")
|
||||
proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
|
||||
pytest.fail(f"Server failed to start in {_SERVER_STARTUP_TIMEOUT} seconds")
|
||||
|
||||
# Kill server to simulate crash
|
||||
proc.terminate()
|
||||
@ -89,5 +101,5 @@ async def test_shutdown_on_engine_failure():
|
||||
model=MODEL_NAME, prompt="This should fail", max_tokens=1
|
||||
)
|
||||
|
||||
return_code = proc.wait(timeout=5)
|
||||
return_code = proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
|
||||
assert return_code is not None
|
||||
|
||||
@ -7,6 +7,7 @@ import json
|
||||
import pytest
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from .conftest import add_attention_backend
|
||||
|
||||
MISTRAL_FORMAT_ARGS = [
|
||||
"--tokenizer_mode",
|
||||
@ -20,12 +21,14 @@ MISTRAL_FORMAT_ARGS = [
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
|
||||
async def test_basic_audio(mary_had_lamb, model_name):
|
||||
async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
|
||||
server_args = ["--enforce-eager"]
|
||||
|
||||
if model_name.startswith("mistralai"):
|
||||
server_args += MISTRAL_FORMAT_ARGS
|
||||
|
||||
add_attention_backend(server_args, rocm_aiter_fa_attention)
|
||||
|
||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
@ -44,8 +47,13 @@ async def test_basic_audio(mary_had_lamb, model_name):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_basic_audio_with_lora(mary_had_lamb):
|
||||
async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
|
||||
"""Ensure STT (transcribe) requests can pass LoRA through to generate."""
|
||||
# ROCm SPECIFIC CONFIGURATION:
|
||||
# To ensure the test passes on ROCm, we modify the max model length to 512.
|
||||
# We DO NOT apply this to other platforms to maintain strict upstream parity.
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
model_name = "ibm-granite/granite-speech-3.3-2b"
|
||||
lora_model_name = "speech"
|
||||
server_args = [
|
||||
@ -56,11 +64,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
|
||||
"--lora-modules",
|
||||
f"{lora_model_name}={model_name}",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"512" if current_platform.is_rocm() else "2048",
|
||||
"--max-num-seqs",
|
||||
"1",
|
||||
]
|
||||
|
||||
add_attention_backend(server_args, rocm_aiter_fa_attention)
|
||||
|
||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
@ -79,12 +89,14 @@ async def test_basic_audio_with_lora(mary_had_lamb):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_basic_audio_gemma(foscolo):
|
||||
async def test_basic_audio_gemma(foscolo, rocm_aiter_fa_attention):
|
||||
# Gemma accuracy on some of the audio samples we use is particularly bad,
|
||||
# hence we use a different one here. WER is evaluated separately.
|
||||
model_name = "google/gemma-3n-E2B-it"
|
||||
server_args = ["--enforce-eager"]
|
||||
|
||||
add_attention_backend(server_args, rocm_aiter_fa_attention)
|
||||
|
||||
with RemoteOpenAIServer(
|
||||
model_name, server_args, max_wait_seconds=480
|
||||
) as remote_server:
|
||||
|
||||
@ -14,16 +14,26 @@ import pytest_asyncio
|
||||
import soundfile as sf
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from .conftest import add_attention_backend
|
||||
|
||||
SERVER_ARGS = ["--enforce-eager"]
|
||||
|
||||
|
||||
def _get_server_args(attention_config):
|
||||
"""Get server args with attention backend if specified."""
|
||||
args = SERVER_ARGS.copy()
|
||||
add_attention_backend(args, attention_config)
|
||||
return args
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
|
||||
)
|
||||
def server(request):
|
||||
def server(request, rocm_aiter_fa_attention):
|
||||
# Parametrize over model name
|
||||
with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
|
||||
with RemoteOpenAIServer(
|
||||
request.param, _get_server_args(rocm_aiter_fa_attention)
|
||||
) as remote_server:
|
||||
yield remote_server, request.param
|
||||
|
||||
|
||||
@ -35,10 +45,12 @@ async def client_and_model(server):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_non_asr_model(foscolo):
|
||||
async def test_non_asr_model(foscolo, rocm_aiter_fa_attention):
|
||||
# text to text model
|
||||
model_name = "JackFram/llama-68m"
|
||||
with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
|
||||
with RemoteOpenAIServer(
|
||||
model_name, _get_server_args(rocm_aiter_fa_attention)
|
||||
) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
res = await client.audio.translations.create(
|
||||
model=model_name, file=foscolo, temperature=0.0
|
||||
@ -49,8 +61,13 @@ async def test_non_asr_model(foscolo):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_basic_audio_with_lora(mary_had_lamb):
|
||||
async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
|
||||
"""Ensure STT (translate) requests can pass LoRA through to generate."""
|
||||
# ROCm SPECIFIC CONFIGURATION:
|
||||
# To ensure the test passes on ROCm, we modify the max model length to 512.
|
||||
# We DO NOT apply this to other platforms to maintain strict upstream parity.
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# NOTE - careful to call this test before the module scoped server
|
||||
# fixture, otherwise it'll OOMkill the CI
|
||||
model_name = "ibm-granite/granite-speech-3.3-2b"
|
||||
@ -63,11 +80,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
|
||||
"--lora-modules",
|
||||
f"{lora_model_name}={model_name}",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"512" if current_platform.is_rocm() else "2048",
|
||||
"--max-num-seqs",
|
||||
"1",
|
||||
]
|
||||
|
||||
add_attention_backend(server_args, rocm_aiter_fa_attention)
|
||||
|
||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
|
||||
@ -7,7 +7,8 @@ import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from vllm.multimodal.utils import encode_video_base64, fetch_video
|
||||
from vllm.multimodal.utils import encode_video_url, fetch_video
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
@ -37,7 +38,16 @@ def server():
|
||||
json.dumps({"video": MAXIMUM_VIDEOS}),
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
# ROCm: Increase timeouts to handle potential network delays and slower
|
||||
# video processing when downloading multiple videos from external sources
|
||||
env_overrides = {}
|
||||
if current_platform.is_rocm():
|
||||
env_overrides = {
|
||||
"VLLM_VIDEO_FETCH_TIMEOUT": "120",
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
|
||||
}
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -48,9 +58,9 @@ async def client(server):
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def base64_encoded_video() -> dict[str, str]:
|
||||
def url_encoded_video() -> dict[str, str]:
|
||||
return {
|
||||
video_url: encode_video_base64(fetch_video(video_url)[0])
|
||||
video_url: encode_video_url(fetch_video(video_url)[0])
|
||||
for video_url in TEST_VIDEO_URLS
|
||||
}
|
||||
|
||||
@ -175,11 +185,9 @@ async def test_single_chat_session_video_base64encoded(
|
||||
client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
video_url: str,
|
||||
base64_encoded_video: dict[str, str],
|
||||
url_encoded_video: dict[str, str],
|
||||
):
|
||||
messages = dummy_messages_from_video_url(
|
||||
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
|
||||
)
|
||||
messages = dummy_messages_from_video_url(url_encoded_video[video_url])
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
@ -223,11 +231,9 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
|
||||
client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
video_url: str,
|
||||
base64_encoded_video: dict[str, str],
|
||||
url_encoded_video: dict[str, str],
|
||||
):
|
||||
messages = dummy_messages_from_video_url(
|
||||
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
|
||||
)
|
||||
messages = dummy_messages_from_video_url(url_encoded_video[video_url])
|
||||
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
@ -291,6 +297,11 @@ async def test_chat_streaming_video(
|
||||
@pytest.mark.parametrize(
|
||||
"video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]
|
||||
)
|
||||
@pytest.mark.flaky(
|
||||
reruns=2,
|
||||
reruns_delay=5,
|
||||
condition=current_platform.is_rocm(),
|
||||
)
|
||||
async def test_multi_video_input(
|
||||
client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
|
||||
):
|
||||
|
||||
@ -9,7 +9,8 @@ import pytest_asyncio
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from vllm.multimodal.base import MediaWithBytes
|
||||
from vllm.multimodal.utils import encode_image_base64, fetch_image
|
||||
from vllm.multimodal.utils import encode_image_url, fetch_image
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
@ -35,7 +36,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
|
||||
],
|
||||
[
|
||||
"The image shows a Venn diagram with three over",
|
||||
"The image shows a colorful Venn diagram with",
|
||||
"The image displays a Venn diagram with three over",
|
||||
],
|
||||
[
|
||||
"This image displays a gradient of colors ranging from",
|
||||
@ -43,6 +44,27 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
|
||||
],
|
||||
]
|
||||
|
||||
EXPECTED_MM_BEAM_SEARCH_RES_ROCM = [
|
||||
# MultiHeadAttention attn_backend: FLASH_ATTN
|
||||
# with Triton Attention backend
|
||||
[
|
||||
"The image shows a wooden boardwalk leading through a",
|
||||
"The image shows a wooden boardwalk extending into a",
|
||||
],
|
||||
[
|
||||
"The image shows two parrots perched on",
|
||||
"The image shows two birds perched on a cur",
|
||||
],
|
||||
[
|
||||
"The image shows a Venn diagram with three over",
|
||||
"The image contains a Venn diagram with three over",
|
||||
],
|
||||
[
|
||||
"This image displays a gradient of colors ranging from",
|
||||
"This image displays a gradient of colors transitioning from",
|
||||
],
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
@ -59,7 +81,16 @@ def server():
|
||||
json.dumps({"image": MAXIMUM_IMAGES}),
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
# ROCm: Increase timeouts to handle potential network delays and slower
|
||||
# video processing when downloading multiple videos from external sources
|
||||
env_overrides = {}
|
||||
if current_platform.is_rocm():
|
||||
env_overrides = {
|
||||
"VLLM_VIDEO_FETCH_TIMEOUT": "120",
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
|
||||
}
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -70,11 +101,9 @@ async def client(server):
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
||||
def url_encoded_image(local_asset_server) -> dict[str, str]:
|
||||
return {
|
||||
image_asset: encode_image_base64(
|
||||
local_asset_server.get_image_asset(image_asset)
|
||||
)
|
||||
image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset))
|
||||
for image_asset in TEST_IMAGE_ASSETS
|
||||
}
|
||||
|
||||
@ -234,11 +263,11 @@ async def test_single_chat_session_image_base64encoded(
|
||||
model_name: str,
|
||||
raw_image_url: str,
|
||||
image_url: str,
|
||||
base64_encoded_image: dict[str, str],
|
||||
url_encoded_image: dict[str, str],
|
||||
):
|
||||
content_text = "What's in this image?"
|
||||
messages = dummy_messages_from_image_url(
|
||||
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
|
||||
url_encoded_image[raw_image_url],
|
||||
content_text,
|
||||
)
|
||||
|
||||
@ -288,15 +317,20 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
|
||||
client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
image_idx: int,
|
||||
base64_encoded_image: dict[str, str],
|
||||
url_encoded_image: dict[str, str],
|
||||
):
|
||||
# ROCm: Switch expected results based on platform
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# NOTE: This test also validates that we pass MM data through beam search
|
||||
raw_image_url = TEST_IMAGE_ASSETS[image_idx]
|
||||
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
|
||||
|
||||
messages = dummy_messages_from_image_url(
|
||||
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
|
||||
)
|
||||
if current_platform.is_rocm():
|
||||
expected_res = EXPECTED_MM_BEAM_SEARCH_RES_ROCM[image_idx]
|
||||
else:
|
||||
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
|
||||
|
||||
messages = dummy_messages_from_image_url(url_encoded_image[raw_image_url])
|
||||
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
|
||||
@ -33,6 +33,7 @@ def _terratorch_dummy_messages():
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
|
||||
)
|
||||
|
||||
@ -9,11 +9,6 @@ from vllm import LLM, PoolingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
|
||||
PROMPTS = [
|
||||
@ -35,6 +30,12 @@ TOKEN_IDS = [
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm():
|
||||
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
|
||||
# that supports encoder-only models on ROCm.
|
||||
attention_config = None
|
||||
if current_platform.is_rocm():
|
||||
attention_config = {"backend": "FLEX_ATTENTION"}
|
||||
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(
|
||||
@ -44,6 +45,7 @@ def llm():
|
||||
gpu_memory_utilization=0.75,
|
||||
enforce_eager=True,
|
||||
seed=0,
|
||||
attention_config=attention_config,
|
||||
)
|
||||
|
||||
yield weakref.proxy(llm)
|
||||
|
||||
@ -9,11 +9,6 @@ import pytest_asyncio
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
|
||||
max_model_len = 128
|
||||
|
||||
@ -44,6 +39,10 @@ def server():
|
||||
str(max_model_len),
|
||||
]
|
||||
|
||||
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||
if current_platform.is_rocm():
|
||||
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
28
tests/entrypoints/pooling/embed/conftest.py
Normal file
28
tests/entrypoints/pooling/embed/conftest.py
Normal file
@ -0,0 +1,28 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Pytest configuration for vLLM pooling embed tests."""
|
||||
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(config, items):
|
||||
"""Configure ROCm-specific settings based on collected tests."""
|
||||
if not current_platform.is_rocm():
|
||||
return
|
||||
|
||||
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
|
||||
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
|
||||
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
|
||||
torch.backends.cuda.enable_flash_sdp(False)
|
||||
torch.backends.cuda.enable_mem_efficient_sdp(False)
|
||||
torch.backends.cuda.enable_math_sdp(True)
|
||||
warnings.warn(
|
||||
"ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
|
||||
"to avoid HuggingFace Transformers accuracy issues",
|
||||
UserWarning,
|
||||
stacklevel=1,
|
||||
)
|
||||
@ -4,7 +4,7 @@ import os
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling_mteb_test.mteb_utils import (
|
||||
from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
|
||||
MTEB_EMBED_TASKS,
|
||||
MTEB_EMBED_TOL,
|
||||
OpenAIClientMtebEncoder,
|
||||
@ -13,11 +13,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
|
||||
|
||||
MODEL_NAME = "intfloat/e5-small"
|
||||
@ -28,6 +23,10 @@ MAIN_SCORE = 0.7422994752439667
|
||||
def server():
|
||||
args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
|
||||
|
||||
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||
if current_platform.is_rocm():
|
||||
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -11,11 +11,6 @@ from vllm import LLM, PoolingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
|
||||
prompts = ["The chef prepared a delicious meal."]
|
||||
@ -23,6 +18,12 @@ prompts = ["The chef prepared a delicious meal."]
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm():
|
||||
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
|
||||
# that supports encoder-only models on ROCm.
|
||||
attention_config = None
|
||||
if current_platform.is_rocm():
|
||||
attention_config = {"backend": "FLEX_ATTENTION"}
|
||||
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(
|
||||
@ -32,6 +33,7 @@ def llm():
|
||||
gpu_memory_utilization=0.75,
|
||||
enforce_eager=True,
|
||||
seed=0,
|
||||
attention_config=attention_config,
|
||||
)
|
||||
|
||||
yield weakref.proxy(llm)
|
||||
|
||||
@ -28,16 +28,20 @@ from vllm.utils.serial_utils import (
|
||||
decode_pooling_output,
|
||||
)
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
|
||||
DTYPE = "bfloat16"
|
||||
|
||||
|
||||
if current_platform.is_rocm():
|
||||
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
|
||||
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
|
||||
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
|
||||
torch.backends.cuda.enable_flash_sdp(False)
|
||||
torch.backends.cuda.enable_mem_efficient_sdp(False)
|
||||
torch.backends.cuda.enable_math_sdp(True)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
@ -53,6 +57,10 @@ def server():
|
||||
DUMMY_CHAT_TEMPLATE,
|
||||
]
|
||||
|
||||
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||
if current_platform.is_rocm():
|
||||
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -14,11 +14,6 @@ from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODELS = [
|
||||
EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
|
||||
EmbedModelInfo(
|
||||
@ -62,6 +57,10 @@ def server(model_info, dtype: str):
|
||||
["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}']
|
||||
)
|
||||
|
||||
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||
if current_platform.is_rocm():
|
||||
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||
|
||||
with RemoteOpenAIServer(model_info.name, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -18,11 +18,6 @@ from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
|
||||
def _generate_random_text(word_count: int) -> str:
|
||||
"""Generate random text with approximately the specified word count."""
|
||||
@ -228,6 +223,10 @@ def server_with_chunked_processing():
|
||||
"0.8",
|
||||
]
|
||||
|
||||
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||
if current_platform.is_rocm():
|
||||
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -10,7 +10,7 @@ from transformers import AutoProcessor
|
||||
from tests.utils import VLLM_PATH, RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
||||
from vllm.multimodal.base import MediaWithBytes
|
||||
from vllm.multimodal.utils import encode_image_base64, fetch_image
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
|
||||
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
|
||||
MAXIMUM_IMAGES = 2
|
||||
@ -48,14 +48,6 @@ def server():
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
||||
return {
|
||||
image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
|
||||
for image_url in TEST_IMAGE_ASSETS
|
||||
}
|
||||
|
||||
|
||||
def get_hf_prompt_tokens(model_name, content, image_url):
|
||||
processor = AutoProcessor.from_pretrained(
|
||||
model_name, trust_remote_code=True, num_crops=4
|
||||
|
||||
@ -4,7 +4,7 @@ import os
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling_mteb_test.mteb_utils import (
|
||||
from tests.models.language.pooling_mteb_test.mteb_score_utils import (
|
||||
MTEB_RERANK_LANGS,
|
||||
MTEB_RERANK_TASKS,
|
||||
MTEB_RERANK_TOL,
|
||||
@ -15,11 +15,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
|
||||
|
||||
MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
||||
@ -30,6 +25,10 @@ st_main_score = 0.33457
|
||||
def server():
|
||||
args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
|
||||
|
||||
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||
if current_platform.is_rocm():
|
||||
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -11,16 +11,17 @@ from vllm import LLM, PoolingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm():
|
||||
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
|
||||
# that supports encoder-only models on ROCm.
|
||||
attention_config = None
|
||||
if current_platform.is_rocm():
|
||||
attention_config = {"backend": "FLEX_ATTENTION"}
|
||||
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(
|
||||
@ -30,6 +31,7 @@ def llm():
|
||||
gpu_memory_utilization=0.75,
|
||||
enforce_eager=True,
|
||||
seed=0,
|
||||
attention_config=attention_config,
|
||||
)
|
||||
|
||||
yield weakref.proxy(llm)
|
||||
|
||||
@ -11,11 +11,6 @@ from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
|
||||
from vllm.entrypoints.pooling.score.protocol import RerankResponse
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "BAAI/bge-reranker-base"
|
||||
DTYPE = "bfloat16"
|
||||
|
||||
@ -24,6 +19,10 @@ DTYPE = "bfloat16"
|
||||
def server():
|
||||
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
|
||||
|
||||
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||
if current_platform.is_rocm():
|
||||
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -12,11 +12,6 @@ from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.score.protocol import ScoreResponse
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODELS = [
|
||||
{"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},
|
||||
{"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False},
|
||||
@ -44,6 +39,10 @@ def model(request):
|
||||
def server(model: dict[str, Any]):
|
||||
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
|
||||
|
||||
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||
if current_platform.is_rocm():
|
||||
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||
|
||||
with RemoteOpenAIServer(model["name"], args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
351
tests/entrypoints/pooling/score/test_utils.py
Normal file
351
tests/entrypoints/pooling/score/test_utils.py
Normal file
@ -0,0 +1,351 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.chat_utils import ChatTemplateResolutionError
|
||||
from vllm.entrypoints.score_utils import get_score_prompt
|
||||
from vllm.inputs import TokensPrompt
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
# A cross-encoder model for testing
|
||||
CROSS_ENCODER_MODEL_ID = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
||||
|
||||
|
||||
def assert_prompt_tokenization_consistent(
|
||||
tokenizer, full_prompt, engine_prompt, add_special_tokens=True
|
||||
):
|
||||
"""Verify that engine_prompt token_ids match tokenizing full_prompt."""
|
||||
expected_ids = tokenizer(full_prompt, add_special_tokens=add_special_tokens)[
|
||||
"input_ids"
|
||||
]
|
||||
actual_ids = engine_prompt["prompt_token_ids"]
|
||||
assert actual_ids == expected_ids, (
|
||||
f"Token IDs don't match.\nExpected: {expected_ids}\nActual: {actual_ids}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def cross_encoder_model_config():
|
||||
return ModelConfig(
|
||||
CROSS_ENCODER_MODEL_ID,
|
||||
runner="pooling",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def cross_encoder_tokenizer(cross_encoder_model_config):
|
||||
return get_tokenizer(
|
||||
CROSS_ENCODER_MODEL_ID,
|
||||
trust_remote_code=cross_encoder_model_config.trust_remote_code,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm_reranker_model_config():
|
||||
"""Model config for LLM-as-reranker style (no pad token)."""
|
||||
config = ModelConfig(
|
||||
CROSS_ENCODER_MODEL_ID,
|
||||
runner="pooling",
|
||||
)
|
||||
# use_pad_token is a property that reads from hf_config,
|
||||
# so we set it there to override the default (True)
|
||||
config.hf_config.use_pad_token = False
|
||||
return config
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def tokenization_kwargs():
|
||||
"""Common tokenization kwargs used across tests."""
|
||||
return {"add_special_tokens": True, "return_tensors": None}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_model_with_score_template():
|
||||
"""Mock model class that supports score template and tracks post_process calls."""
|
||||
|
||||
class MockModelWithScoreTemplate:
|
||||
supports_score_template = True
|
||||
post_process_called: list[TokensPrompt] = []
|
||||
|
||||
@staticmethod
|
||||
def get_score_template(p1: str, p2: str) -> str:
|
||||
return f"[QUERY]{p1}[SEP][DOC]{p2}"
|
||||
|
||||
@staticmethod
|
||||
def post_process_tokens(prompt: TokensPrompt) -> None:
|
||||
MockModelWithScoreTemplate.post_process_called.append(prompt)
|
||||
|
||||
return MockModelWithScoreTemplate
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_model_no_score_template():
|
||||
"""Mock model class that does not support score template."""
|
||||
|
||||
class MockModelNoScoreTemplate:
|
||||
supports_score_template = False
|
||||
|
||||
return MockModelNoScoreTemplate
|
||||
|
||||
|
||||
class TestGetScorePrompt:
|
||||
"""Tests for the get_score_prompt function."""
|
||||
|
||||
def test_tokenization_kwargs_passed_through(
|
||||
self,
|
||||
llm_reranker_model_config,
|
||||
cross_encoder_tokenizer,
|
||||
):
|
||||
"""Test that tokenization kwargs are properly passed through."""
|
||||
data_1 = "Query text"
|
||||
data_2 = "Document text"
|
||||
|
||||
# Test with truncation - custom kwargs for this test
|
||||
custom_tokenization_kwargs = {
|
||||
"add_special_tokens": True,
|
||||
"return_tensors": None,
|
||||
"truncation": True,
|
||||
"max_length": 20,
|
||||
}
|
||||
|
||||
full_prompt, engine_prompt = get_score_prompt(
|
||||
llm_reranker_model_config,
|
||||
cross_encoder_tokenizer,
|
||||
custom_tokenization_kwargs,
|
||||
data_1,
|
||||
data_2,
|
||||
)
|
||||
|
||||
assert isinstance(full_prompt, str)
|
||||
assert "prompt_token_ids" in engine_prompt
|
||||
# With max_length=20 and truncation, should not exceed this
|
||||
assert len(engine_prompt["prompt_token_ids"]) <= 20
|
||||
# Since truncation was applied, token_ids should be a prefix of full encoding
|
||||
full_ids = cross_encoder_tokenizer(full_prompt, add_special_tokens=True)[
|
||||
"input_ids"
|
||||
]
|
||||
actual_ids = engine_prompt["prompt_token_ids"]
|
||||
assert full_ids[: len(actual_ids)] == actual_ids, (
|
||||
f"Token IDs are not a prefix of full encoding.\n"
|
||||
f"Full IDs: {full_ids}\n"
|
||||
f"Actual IDs: {actual_ids}"
|
||||
)
|
||||
|
||||
def test_model_supports_score_template(
|
||||
self,
|
||||
cross_encoder_model_config,
|
||||
cross_encoder_tokenizer,
|
||||
tokenization_kwargs,
|
||||
mock_model_with_score_template,
|
||||
):
|
||||
"""Test when model supports score template (no score_template arg)."""
|
||||
with patch(
|
||||
"vllm.model_executor.model_loader.get_model_cls",
|
||||
return_value=mock_model_with_score_template,
|
||||
):
|
||||
full_prompt, engine_prompt = get_score_prompt(
|
||||
cross_encoder_model_config,
|
||||
cross_encoder_tokenizer,
|
||||
tokenization_kwargs,
|
||||
"query text",
|
||||
"document text",
|
||||
)
|
||||
|
||||
assert full_prompt == "[QUERY]query text[SEP][DOC]document text"
|
||||
assert "prompt_token_ids" in engine_prompt
|
||||
assert len(engine_prompt["prompt_token_ids"]) > 0
|
||||
assert_prompt_tokenization_consistent(
|
||||
cross_encoder_tokenizer, full_prompt, engine_prompt
|
||||
)
|
||||
|
||||
def test_model_supports_score_template_but_custom_template_provided(
|
||||
self,
|
||||
cross_encoder_model_config,
|
||||
cross_encoder_tokenizer,
|
||||
tokenization_kwargs,
|
||||
mock_model_with_score_template,
|
||||
):
|
||||
"""Test when model supports score template but custom template is provided."""
|
||||
template = (
|
||||
'TEMPLATE_USED {{ messages[0]["content"] }} {{ messages[1]["content"] }}'
|
||||
)
|
||||
with (
|
||||
patch(
|
||||
"vllm.model_executor.model_loader.get_model_cls",
|
||||
return_value=mock_model_with_score_template,
|
||||
),
|
||||
):
|
||||
full_prompt, engine_prompt = get_score_prompt(
|
||||
cross_encoder_model_config,
|
||||
cross_encoder_tokenizer,
|
||||
tokenization_kwargs,
|
||||
"query",
|
||||
"doc",
|
||||
score_template=template, # Providing a template
|
||||
)
|
||||
|
||||
assert "prompt_token_ids" in engine_prompt
|
||||
assert full_prompt == "TEMPLATE_USED query doc"
|
||||
|
||||
assert_prompt_tokenization_consistent(
|
||||
cross_encoder_tokenizer, full_prompt, engine_prompt
|
||||
)
|
||||
|
||||
def test_not_using_default_template(
|
||||
self,
|
||||
llm_reranker_model_config,
|
||||
cross_encoder_tokenizer,
|
||||
tokenization_kwargs,
|
||||
mock_model_no_score_template,
|
||||
):
|
||||
# FIXME: For now, we only apply a template when one is explicitly provided.
|
||||
# We cannot rely on the tokenizer's chat template because many models
|
||||
# inherit junk templates from their base LLM, which breaks both the models
|
||||
# and the tests that use them.
|
||||
with (
|
||||
patch(
|
||||
"vllm.model_executor.model_loader.get_model_cls",
|
||||
return_value=mock_model_no_score_template,
|
||||
),
|
||||
patch(
|
||||
"vllm.entrypoints.score_utils.apply_hf_chat_template",
|
||||
return_value="test querytest doc",
|
||||
),
|
||||
):
|
||||
full_prompt, engine_prompt = get_score_prompt(
|
||||
llm_reranker_model_config,
|
||||
cross_encoder_tokenizer,
|
||||
tokenization_kwargs,
|
||||
"test query",
|
||||
"test doc",
|
||||
)
|
||||
|
||||
assert full_prompt == "test querytest doc"
|
||||
assert "prompt_token_ids" in engine_prompt
|
||||
assert_prompt_tokenization_consistent(
|
||||
cross_encoder_tokenizer, full_prompt, engine_prompt
|
||||
)
|
||||
|
||||
def test_fallback_with_pad_token(
|
||||
self,
|
||||
cross_encoder_model_config,
|
||||
cross_encoder_tokenizer,
|
||||
tokenization_kwargs,
|
||||
mock_model_no_score_template,
|
||||
):
|
||||
"""Test fallback path when ChatTemplateResolutionError
|
||||
and use_pad_token=True."""
|
||||
with (
|
||||
patch(
|
||||
"vllm.model_executor.model_loader.get_model_cls",
|
||||
return_value=mock_model_no_score_template,
|
||||
),
|
||||
patch(
|
||||
"vllm.entrypoints.score_utils.apply_hf_chat_template",
|
||||
side_effect=ChatTemplateResolutionError("No template"),
|
||||
),
|
||||
):
|
||||
full_prompt, engine_prompt = get_score_prompt(
|
||||
cross_encoder_model_config, # use_pad_token=True
|
||||
cross_encoder_tokenizer,
|
||||
tokenization_kwargs,
|
||||
"query",
|
||||
"document",
|
||||
)
|
||||
|
||||
assert "prompt_token_ids" in engine_prompt
|
||||
# Should have token_type_ids from text_pair encoding
|
||||
assert "token_type_ids" in engine_prompt
|
||||
assert "query" in full_prompt
|
||||
assert "document" in full_prompt
|
||||
assert full_prompt != "querydocument"
|
||||
assert (
|
||||
engine_prompt["prompt_token_ids"]
|
||||
== cross_encoder_tokenizer(
|
||||
"query", text_pair="document", add_special_tokens=True
|
||||
)["input_ids"]
|
||||
)
|
||||
|
||||
# FIXME(?): add_special_tokens=False is needed because in this case
|
||||
# full_prompt is obtained by decoding the tokenized prompt, which includes
|
||||
# special tokens and we would get duplicated special tokens otherwise.
|
||||
# This is inconsistent with other cases.
|
||||
assert_prompt_tokenization_consistent(
|
||||
cross_encoder_tokenizer,
|
||||
full_prompt,
|
||||
engine_prompt,
|
||||
add_special_tokens=False,
|
||||
)
|
||||
|
||||
def test_fallback_without_pad_token(
|
||||
self,
|
||||
llm_reranker_model_config,
|
||||
cross_encoder_tokenizer,
|
||||
tokenization_kwargs,
|
||||
mock_model_no_score_template,
|
||||
):
|
||||
"""Test fallback path when ChatTemplateResolutionError
|
||||
and use_pad_token=False."""
|
||||
with (
|
||||
patch(
|
||||
"vllm.model_executor.model_loader.get_model_cls",
|
||||
return_value=mock_model_no_score_template,
|
||||
),
|
||||
patch(
|
||||
"vllm.entrypoints.score_utils.apply_hf_chat_template",
|
||||
side_effect=ChatTemplateResolutionError("No template"),
|
||||
),
|
||||
):
|
||||
full_prompt, engine_prompt = get_score_prompt(
|
||||
llm_reranker_model_config, # use_pad_token=False
|
||||
cross_encoder_tokenizer,
|
||||
tokenization_kwargs,
|
||||
"query",
|
||||
"document",
|
||||
)
|
||||
|
||||
assert full_prompt == "querydocument"
|
||||
assert "prompt_token_ids" in engine_prompt
|
||||
assert_prompt_tokenization_consistent(
|
||||
cross_encoder_tokenizer, full_prompt, engine_prompt
|
||||
)
|
||||
|
||||
def test_post_process_tokens_called(
|
||||
self,
|
||||
cross_encoder_model_config,
|
||||
cross_encoder_tokenizer,
|
||||
tokenization_kwargs,
|
||||
mock_model_with_score_template,
|
||||
):
|
||||
"""Test that post_process_tokens is called on the engine prompt."""
|
||||
# Reset the call tracker
|
||||
mock_model_with_score_template.post_process_called.clear()
|
||||
|
||||
with (
|
||||
patch(
|
||||
"vllm.model_executor.model_loader.get_model_cls",
|
||||
return_value=mock_model_with_score_template,
|
||||
),
|
||||
patch(
|
||||
"vllm.entrypoints.score_utils.apply_hf_chat_template",
|
||||
side_effect=ChatTemplateResolutionError("No template"),
|
||||
),
|
||||
):
|
||||
full_prompt, engine_prompt = get_score_prompt(
|
||||
cross_encoder_model_config,
|
||||
cross_encoder_tokenizer,
|
||||
tokenization_kwargs,
|
||||
"query",
|
||||
"doc",
|
||||
)
|
||||
|
||||
# post_process_tokens should have been called once
|
||||
assert len(mock_model_with_score_template.post_process_called) == 1
|
||||
assert mock_model_with_score_template.post_process_called[0] is engine_prompt
|
||||
assert_prompt_tokenization_consistent(
|
||||
cross_encoder_tokenizer, full_prompt, engine_prompt
|
||||
)
|
||||
@ -25,9 +25,9 @@ from vllm.entrypoints.chat_utils import (
|
||||
)
|
||||
from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
|
||||
from vllm.multimodal.utils import (
|
||||
encode_audio_base64,
|
||||
encode_image_base64,
|
||||
encode_video_base64,
|
||||
encode_audio_url,
|
||||
encode_image_url,
|
||||
encode_video_url,
|
||||
)
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
@ -141,22 +141,19 @@ def mistral_model_config():
|
||||
@pytest.fixture(scope="module")
|
||||
def image_url():
|
||||
image = ImageAsset("cherry_blossom")
|
||||
base64 = encode_image_base64(image.pil_image)
|
||||
return f"data:image/jpeg;base64,{base64}"
|
||||
return encode_image_url(image.pil_image)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def video_url():
|
||||
video = VideoAsset("baby_reading", 1)
|
||||
base64 = encode_video_base64(video.np_ndarrays)
|
||||
return f"data:video/jpeg;base64,{base64}"
|
||||
return encode_video_url(video.np_ndarrays)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def audio_url():
|
||||
audio = AudioAsset("mary_had_lamb")
|
||||
base64 = encode_audio_base64(*audio.audio_and_sample_rate)
|
||||
return f"data:audio/ogg;base64,{base64}"
|
||||
return encode_audio_url(*audio.audio_and_sample_rate)
|
||||
|
||||
|
||||
def _assert_mm_data_is_image_input(
|
||||
|
||||
11
tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
Normal file
11
tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
Normal file
@ -0,0 +1,11 @@
|
||||
model_name: "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8"
|
||||
accuracy_threshold: 0.85
|
||||
num_questions: 1319
|
||||
num_fewshot: 5
|
||||
server_args: >-
|
||||
--max-model-len 4096
|
||||
--tensor-parallel-size 2
|
||||
--enable-expert-parallel
|
||||
--async-scheduling
|
||||
env:
|
||||
VLLM_USE_FLASHINFER_MOE_FP8: "1"
|
||||
@ -4,3 +4,4 @@ Qwen1.5-MoE-W4A16-CT.yaml
|
||||
DeepSeek-V2-Lite-Instruct-FP8.yaml
|
||||
Qwen3-30B-A3B-NVFP4.yaml
|
||||
Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
|
||||
Qwen3-Next-FP8-EP2.yaml
|
||||
|
||||
@ -71,6 +71,7 @@ def test_gsm8k_correctness(config_filename):
|
||||
print(f"Number of questions: {eval_config['num_questions']}")
|
||||
print(f"Number of few-shot examples: {eval_config['num_fewshot']}")
|
||||
print(f"Server args: {' '.join(server_args)}")
|
||||
print(f"Environment variables: {env_dict}")
|
||||
|
||||
# Launch server and run evaluation
|
||||
with RemoteOpenAIServer(
|
||||
|
||||
@ -40,93 +40,6 @@ KV_CACHE_DTYPE = ["auto", "fp8"]
|
||||
RESHAPE_FLASH_IMPLEMENTATIONS = ["cuda", "triton"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
|
||||
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
|
||||
@torch.inference_mode()
|
||||
def test_copy_blocks(
|
||||
kv_cache_factory,
|
||||
num_mappings: int,
|
||||
num_layers: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
block_size: int,
|
||||
num_blocks: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
kv_cache_dtype: str,
|
||||
device: str,
|
||||
) -> None:
|
||||
if kv_cache_dtype == "fp8" and head_size % 16:
|
||||
pytest.skip()
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
torch.cuda.set_device(device)
|
||||
# Generate random block mappings where each source block is mapped to two
|
||||
# destination blocks.
|
||||
assert 2 * num_mappings <= num_blocks
|
||||
src_blocks = random.sample(range(num_blocks), num_mappings)
|
||||
remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
|
||||
dst_blocks = random.sample(remaining_blocks, 2 * num_mappings)
|
||||
block_mapping: list[tuple[int, int]] = []
|
||||
for i in range(num_mappings):
|
||||
src = src_blocks[i]
|
||||
dst1 = dst_blocks[2 * i]
|
||||
dst2 = dst_blocks[2 * i + 1]
|
||||
block_mapping.append((src, dst1))
|
||||
block_mapping.append((src, dst2))
|
||||
|
||||
# Create the KV caches.
|
||||
key_caches, value_caches = kv_cache_factory(
|
||||
num_blocks,
|
||||
block_size,
|
||||
num_layers,
|
||||
num_heads,
|
||||
head_size,
|
||||
kv_cache_dtype,
|
||||
dtype,
|
||||
seed,
|
||||
device,
|
||||
)
|
||||
|
||||
# Clone the KV caches.
|
||||
cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
|
||||
cloned_value_caches = [value_cache.clone() for value_cache in value_caches]
|
||||
|
||||
# Call the copy blocks kernel.
|
||||
block_mapping_tensor = torch.tensor(
|
||||
block_mapping, dtype=torch.int64, device=device
|
||||
).view(-1, 2)
|
||||
|
||||
opcheck(
|
||||
torch.ops._C_cache_ops.copy_blocks,
|
||||
(key_caches, value_caches, block_mapping_tensor),
|
||||
test_utils=DEFAULT_OPCHECK_TEST_UTILS,
|
||||
cond=(head_size == HEAD_SIZES[0]),
|
||||
)
|
||||
ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
|
||||
|
||||
# Run the reference implementation.
|
||||
for src, dst in block_mapping:
|
||||
for cloned_key_cache in cloned_key_caches:
|
||||
cloned_key_cache[dst].copy_(cloned_key_cache[src])
|
||||
for cloned_value_cache in cloned_value_caches:
|
||||
cloned_value_cache[dst].copy_(cloned_value_cache[src])
|
||||
|
||||
# Compare the results.
|
||||
for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
|
||||
torch.testing.assert_close(key_cache, cloned_key_cache)
|
||||
for value_cache, cloned_value_cache in zip(value_caches, cloned_value_caches):
|
||||
torch.testing.assert_close(value_cache, cloned_value_cache)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@ -763,73 +676,6 @@ def test_concat_and_cache_ds_mla(
|
||||
torch.testing.assert_close(kv_rope, ref_rope, atol=0.001, rtol=0.1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
|
||||
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
|
||||
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
|
||||
@torch.inference_mode()
|
||||
def test_copy_blocks_mla(
|
||||
kv_lora_rank: int,
|
||||
qk_rope_head_dim: int,
|
||||
block_size: int,
|
||||
num_blocks: int,
|
||||
num_layers: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
kv_cache_dtype: str,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
torch.cuda.set_device(device)
|
||||
|
||||
entry_size = kv_lora_rank + qk_rope_head_dim
|
||||
|
||||
kv_caches = []
|
||||
for _ in range(num_layers):
|
||||
kv_cache = _create_mla_cache(
|
||||
num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device
|
||||
)
|
||||
_fill_mla_cache(kv_cache, kv_cache_dtype=kv_cache_dtype)
|
||||
kv_caches.append(kv_cache)
|
||||
|
||||
ref_caches = [kv_cache.clone() for kv_cache in kv_caches]
|
||||
|
||||
num_mappings = min(2, num_blocks // 2)
|
||||
src_blocks = random.sample(range(num_blocks), num_mappings)
|
||||
remaining = list(set(range(num_blocks)) - set(src_blocks))
|
||||
dst_blocks = random.sample(remaining, 2 * num_mappings)
|
||||
block_mapping = []
|
||||
for i in range(num_mappings):
|
||||
src = src_blocks[i]
|
||||
dst1 = dst_blocks[2 * i]
|
||||
dst2 = dst_blocks[2 * i + 1]
|
||||
block_mapping.append((src, dst1))
|
||||
block_mapping.append((src, dst2))
|
||||
block_mapping_tensor = torch.tensor(
|
||||
block_mapping, dtype=torch.int64, device=device
|
||||
).view(-1, 2)
|
||||
|
||||
for src, dst in block_mapping:
|
||||
for ref_cache in ref_caches:
|
||||
ref_cache[dst].copy_(ref_cache[src])
|
||||
|
||||
opcheck(
|
||||
torch.ops._C_cache_ops.copy_blocks_mla,
|
||||
(kv_caches, block_mapping_tensor),
|
||||
test_utils=DEFAULT_OPCHECK_TEST_UTILS,
|
||||
)
|
||||
ops.copy_blocks_mla(kv_caches, block_mapping_tensor)
|
||||
|
||||
for kv_cache, ref_cache in zip(kv_caches, ref_caches):
|
||||
torch.testing.assert_close(kv_cache, ref_cache)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
|
||||
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
|
||||
|
||||
@ -1,11 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import tempfile
|
||||
|
||||
import mteb
|
||||
import numpy as np
|
||||
import requests
|
||||
import torch
|
||||
from mteb.models import ModelMeta
|
||||
from mteb.types import Array
|
||||
@ -14,7 +11,6 @@ from torch.utils.data import DataLoader
|
||||
import tests.ci_envs as ci_envs
|
||||
from tests.models.utils import (
|
||||
EmbedModelInfo,
|
||||
RerankModelInfo,
|
||||
check_embeddings_close,
|
||||
get_vllm_extra_kwargs,
|
||||
)
|
||||
@ -27,10 +23,6 @@ from tests.models.utils import (
|
||||
MTEB_EMBED_TASKS = ["STS12"]
|
||||
MTEB_EMBED_TOL = 1e-4
|
||||
|
||||
# See #19344
|
||||
MTEB_RERANK_TASKS = ["NFCorpus"]
|
||||
MTEB_RERANK_LANGS = ["eng"]
|
||||
MTEB_RERANK_TOL = 2e-3
|
||||
|
||||
_empty_model_meta = ModelMeta(
|
||||
loader=None,
|
||||
@ -54,29 +46,9 @@ _empty_model_meta = ModelMeta(
|
||||
)
|
||||
|
||||
|
||||
class VllmMtebEncoder(mteb.EncoderProtocol):
|
||||
class MtebEmbedMixin(mteb.EncoderProtocol):
|
||||
mteb_model_meta = _empty_model_meta
|
||||
|
||||
def __init__(self, vllm_model):
|
||||
self.llm = vllm_model
|
||||
self.rng = np.random.default_rng(seed=42)
|
||||
|
||||
def encode(
|
||||
self,
|
||||
inputs: DataLoader[mteb.types.BatchedInput],
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
# Hoping to discover potential scheduling
|
||||
# issues by randomizing the order.
|
||||
sentences = [text for batch in inputs for text in batch["text"]]
|
||||
r = self.rng.permutation(len(sentences))
|
||||
sentences = [sentences[i] for i in r]
|
||||
outputs = self.llm.embed(sentences, use_tqdm=False)
|
||||
embeds = np.array(outputs)
|
||||
embeds = embeds[np.argsort(r)]
|
||||
return embeds
|
||||
|
||||
def similarity(
|
||||
self,
|
||||
embeddings1: np.ndarray,
|
||||
@ -102,31 +74,29 @@ class VllmMtebEncoder(mteb.EncoderProtocol):
|
||||
return sim
|
||||
|
||||
|
||||
class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
|
||||
mteb_model_meta = _empty_model_meta
|
||||
|
||||
class VllmMtebEncoder(MtebEmbedMixin):
|
||||
def __init__(self, vllm_model):
|
||||
self.llm = vllm_model
|
||||
self.rng = np.random.default_rng(seed=42)
|
||||
|
||||
def predict(
|
||||
def encode(
|
||||
self,
|
||||
inputs1: DataLoader[mteb.types.BatchedInput],
|
||||
inputs2: DataLoader[mteb.types.BatchedInput],
|
||||
inputs: DataLoader[mteb.types.BatchedInput],
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
queries = [text for batch in inputs1 for text in batch["text"]]
|
||||
corpus = [text for batch in inputs2 for text in batch["text"]]
|
||||
|
||||
outputs = self.llm.score(
|
||||
queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
|
||||
)
|
||||
scores = np.array(outputs)
|
||||
return scores
|
||||
# Hoping to discover potential scheduling
|
||||
# issues by randomizing the order.
|
||||
sentences = [text for batch in inputs for text in batch["text"]]
|
||||
r = self.rng.permutation(len(sentences))
|
||||
sentences = [sentences[i] for i in r]
|
||||
outputs = self.llm.embed(sentences, use_tqdm=False)
|
||||
embeds = np.array(outputs)
|
||||
embeds = embeds[np.argsort(r)]
|
||||
return embeds
|
||||
|
||||
|
||||
class OpenAIClientMtebEncoder(VllmMtebEncoder):
|
||||
class OpenAIClientMtebEncoder(MtebEmbedMixin):
|
||||
def __init__(self, model_name: str, client):
|
||||
self.model_name = model_name
|
||||
self.client = client
|
||||
@ -153,58 +123,6 @@ class OpenAIClientMtebEncoder(VllmMtebEncoder):
|
||||
return embeds
|
||||
|
||||
|
||||
class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
|
||||
mteb_model_meta = _empty_model_meta
|
||||
|
||||
def __init__(self, model_name: str, url):
|
||||
self.model_name = model_name
|
||||
self.url = url
|
||||
self.rng = np.random.default_rng(seed=42)
|
||||
|
||||
def predict(
|
||||
self,
|
||||
inputs1: DataLoader[mteb.types.BatchedInput],
|
||||
inputs2: DataLoader[mteb.types.BatchedInput],
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
queries = [text for batch in inputs1 for text in batch["text"]]
|
||||
full_corpus = [text for batch in inputs2 for text in batch["text"]]
|
||||
|
||||
outputs = []
|
||||
for query, corpus in zip(queries, full_corpus):
|
||||
outputs.append(self.get_score(query, corpus))
|
||||
|
||||
scores = np.array(outputs)
|
||||
return scores
|
||||
|
||||
def get_score(self, query, corpus):
|
||||
response = requests.post(
|
||||
self.url,
|
||||
json={
|
||||
"model": self.model_name,
|
||||
"text_1": query,
|
||||
"text_2": corpus,
|
||||
"truncate_prompt_tokens": -1,
|
||||
},
|
||||
).json()
|
||||
return response["data"][0]["score"]
|
||||
|
||||
|
||||
class RerankClientMtebEncoder(ScoreClientMtebEncoder):
|
||||
def get_score(self, query, corpus):
|
||||
response = requests.post(
|
||||
self.url,
|
||||
json={
|
||||
"model": self.model_name,
|
||||
"query": query,
|
||||
"documents": [corpus],
|
||||
"truncate_prompt_tokens": -1,
|
||||
},
|
||||
).json()
|
||||
return response["results"][0]["relevance_score"]
|
||||
|
||||
|
||||
def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
|
||||
tasks = mteb.get_tasks(tasks=tasks)
|
||||
results = mteb.evaluate(
|
||||
@ -243,12 +161,21 @@ def mteb_test_embed_models(
|
||||
if model_info.architecture:
|
||||
assert model_info.architecture in model_config.architectures
|
||||
|
||||
# Confirm whether vllm uses the correct default_pooling_type, which
|
||||
# relates to whether chunked prefill and prefix caching are enabled
|
||||
assert (
|
||||
model_config._model_info.default_pooling_type
|
||||
== model_info.default_pooling_type
|
||||
)
|
||||
# Confirm whether the important configs in model_config are correct.
|
||||
if model_info.pooling_type is not None:
|
||||
assert model_config.pooler_config.pooling_type == model_info.pooling_type
|
||||
if model_info.attn_type is not None:
|
||||
assert model_config.attn_type == model_info.attn_type
|
||||
if model_info.is_prefix_caching_supported is not None:
|
||||
assert (
|
||||
model_config.is_prefix_caching_supported
|
||||
== model_info.is_prefix_caching_supported
|
||||
)
|
||||
if model_info.is_chunked_prefill_supported is not None:
|
||||
assert (
|
||||
model_config.is_chunked_prefill_supported
|
||||
== model_info.is_chunked_prefill_supported
|
||||
)
|
||||
|
||||
vllm_main_score = run_mteb_embed_task(
|
||||
VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
|
||||
@ -299,117 +226,3 @@ def mteb_test_embed_models(
|
||||
# We are not concerned that the vllm mteb results are better
|
||||
# than SentenceTransformers, so we only perform one-sided testing.
|
||||
assert st_main_score - vllm_main_score < atol
|
||||
|
||||
|
||||
def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
|
||||
with tempfile.TemporaryDirectory() as prediction_folder:
|
||||
bm25s = mteb.get_model("bm25s")
|
||||
eval_splits = ["test"]
|
||||
|
||||
mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks(
|
||||
tasks=tasks, languages=languages, eval_splits=eval_splits
|
||||
)
|
||||
|
||||
mteb.evaluate(
|
||||
bm25s,
|
||||
mteb_tasks,
|
||||
prediction_folder=prediction_folder,
|
||||
show_progress_bar=False,
|
||||
# don't save results for test runs
|
||||
cache=None,
|
||||
overwrite_strategy="always",
|
||||
)
|
||||
|
||||
second_stage_tasks = []
|
||||
for task in mteb_tasks:
|
||||
second_stage_tasks.append(
|
||||
task.convert_to_reranking(
|
||||
prediction_folder,
|
||||
top_k=10,
|
||||
)
|
||||
)
|
||||
|
||||
results = mteb.evaluate(
|
||||
cross_encoder,
|
||||
second_stage_tasks,
|
||||
show_progress_bar=False,
|
||||
cache=None,
|
||||
)
|
||||
main_score = results[0].scores["test"][0]["main_score"]
|
||||
return main_score
|
||||
|
||||
|
||||
def mteb_test_rerank_models_hf(
|
||||
hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
|
||||
):
|
||||
with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model:
|
||||
if hf_model_callback is not None:
|
||||
hf_model_callback(hf_model)
|
||||
|
||||
st_main_score = run_mteb_rerank(
|
||||
hf_model, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS
|
||||
)
|
||||
st_dtype = next(hf_model.model.model.parameters()).dtype
|
||||
return st_main_score, st_dtype
|
||||
|
||||
|
||||
def mteb_test_rerank_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model_info: RerankModelInfo,
|
||||
vllm_extra_kwargs=None,
|
||||
hf_model_callback=None,
|
||||
vllm_mteb_encoder=VllmMtebCrossEncoder,
|
||||
atol=MTEB_RERANK_TOL,
|
||||
):
|
||||
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
|
||||
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
max_num_seqs=8,
|
||||
**vllm_extra_kwargs,
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
|
||||
# Confirm whether vllm is using the correct architecture
|
||||
if model_info.architecture:
|
||||
assert model_info.architecture in model_config.architectures
|
||||
|
||||
# Score API is only enabled for num_labels == 1
|
||||
assert model_config.hf_config.num_labels == 1
|
||||
|
||||
# Confirm whether vllm uses the correct default_pooling_type, which
|
||||
# relates to whether chunked prefill and prefix caching are enabled
|
||||
assert (
|
||||
model_config._model_info.default_pooling_type
|
||||
== model_info.default_pooling_type
|
||||
)
|
||||
|
||||
vllm_main_score = run_mteb_rerank(
|
||||
vllm_mteb_encoder(vllm_model),
|
||||
tasks=MTEB_RERANK_TASKS,
|
||||
languages=MTEB_RERANK_LANGS,
|
||||
)
|
||||
vllm_dtype = model_config.dtype
|
||||
head_dtype = model_config.head_dtype
|
||||
|
||||
# Accelerate mteb test by setting
|
||||
# SentenceTransformers mteb score to a constant
|
||||
if model_info.mteb_score is None:
|
||||
st_main_score, st_dtype = mteb_test_rerank_models_hf(
|
||||
hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback
|
||||
)
|
||||
else:
|
||||
st_main_score = model_info.mteb_score
|
||||
st_dtype = "Constant"
|
||||
|
||||
print("Model:", model_info.name)
|
||||
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
|
||||
print("SentenceTransformers:", st_dtype, st_main_score)
|
||||
print("Difference:", st_main_score - vllm_main_score)
|
||||
|
||||
# We are not concerned that the vllm mteb results are better
|
||||
# than SentenceTransformers, so we only perform one-sided testing.
|
||||
assert st_main_score - vllm_main_score < atol
|
||||
259
tests/models/language/pooling_mteb_test/mteb_score_utils.py
Normal file
259
tests/models/language/pooling_mteb_test/mteb_score_utils.py
Normal file
@ -0,0 +1,259 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import mteb
|
||||
import numpy as np
|
||||
import requests
|
||||
from mteb.models import ModelMeta
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from tests.models.utils import (
|
||||
RerankModelInfo,
|
||||
get_vllm_extra_kwargs,
|
||||
)
|
||||
|
||||
# See #19344
|
||||
MTEB_RERANK_TASKS = ["NFCorpus"]
|
||||
MTEB_RERANK_LANGS = ["eng"]
|
||||
MTEB_RERANK_TOL = 2e-3
|
||||
|
||||
template_home = (
|
||||
Path(__file__).parent.parent.parent.parent.parent
|
||||
/ "examples/pooling/score/template"
|
||||
)
|
||||
|
||||
_empty_model_meta = ModelMeta(
|
||||
loader=None,
|
||||
name="vllm/model",
|
||||
revision="1",
|
||||
release_date=None,
|
||||
languages=None,
|
||||
framework=[],
|
||||
similarity_fn_name=None,
|
||||
n_parameters=None,
|
||||
memory_usage_mb=None,
|
||||
max_tokens=None,
|
||||
embed_dim=None,
|
||||
license=None,
|
||||
open_weights=None,
|
||||
public_training_code=None,
|
||||
public_training_data=None,
|
||||
use_instructions=None,
|
||||
training_datasets=None,
|
||||
modalities=["text"], # 'image' can be added to evaluate multimodal models
|
||||
)
|
||||
|
||||
|
||||
class MtebCrossEncoderMixin(mteb.CrossEncoderProtocol):
|
||||
mteb_model_meta = _empty_model_meta
|
||||
|
||||
|
||||
class VllmMtebCrossEncoder(MtebCrossEncoderMixin):
|
||||
def __init__(self, vllm_model):
|
||||
self.llm = vllm_model
|
||||
self.rng = np.random.default_rng(seed=42)
|
||||
self.chat_template: str | None = getattr(vllm_model, "chat_template", None)
|
||||
|
||||
def predict(
|
||||
self,
|
||||
inputs1: DataLoader[mteb.types.BatchedInput],
|
||||
inputs2: DataLoader[mteb.types.BatchedInput],
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
queries = [text for batch in inputs1 for text in batch["text"]]
|
||||
corpus = [text for batch in inputs2 for text in batch["text"]]
|
||||
|
||||
outputs = self.llm.score(
|
||||
queries,
|
||||
corpus,
|
||||
truncate_prompt_tokens=-1,
|
||||
use_tqdm=False,
|
||||
chat_template=self.chat_template,
|
||||
)
|
||||
scores = np.array(outputs)
|
||||
return scores
|
||||
|
||||
|
||||
class ScoreClientMtebEncoder(MtebCrossEncoderMixin):
|
||||
mteb_model_meta = _empty_model_meta
|
||||
|
||||
def __init__(self, model_name: str, url):
|
||||
self.model_name = model_name
|
||||
self.url = url
|
||||
self.rng = np.random.default_rng(seed=42)
|
||||
|
||||
def predict(
|
||||
self,
|
||||
inputs1: DataLoader[mteb.types.BatchedInput],
|
||||
inputs2: DataLoader[mteb.types.BatchedInput],
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> np.ndarray:
|
||||
queries = [text for batch in inputs1 for text in batch["text"]]
|
||||
full_corpus = [text for batch in inputs2 for text in batch["text"]]
|
||||
|
||||
outputs = []
|
||||
for query, corpus in zip(queries, full_corpus):
|
||||
outputs.append(self.get_score(query, corpus))
|
||||
|
||||
scores = np.array(outputs)
|
||||
return scores
|
||||
|
||||
def get_score(self, query, corpus):
|
||||
response = requests.post(
|
||||
self.url,
|
||||
json={
|
||||
"model": self.model_name,
|
||||
"text_1": query,
|
||||
"text_2": corpus,
|
||||
"truncate_prompt_tokens": -1,
|
||||
},
|
||||
).json()
|
||||
return response["data"][0]["score"]
|
||||
|
||||
|
||||
class RerankClientMtebEncoder(ScoreClientMtebEncoder):
|
||||
def get_score(self, query, corpus):
|
||||
response = requests.post(
|
||||
self.url,
|
||||
json={
|
||||
"model": self.model_name,
|
||||
"query": query,
|
||||
"documents": [corpus],
|
||||
"truncate_prompt_tokens": -1,
|
||||
},
|
||||
).json()
|
||||
return response["results"][0]["relevance_score"]
|
||||
|
||||
|
||||
def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
|
||||
with tempfile.TemporaryDirectory() as prediction_folder:
|
||||
bm25s = mteb.get_model("bm25s")
|
||||
eval_splits = ["test"]
|
||||
|
||||
mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks(
|
||||
tasks=tasks, languages=languages, eval_splits=eval_splits
|
||||
)
|
||||
|
||||
mteb.evaluate(
|
||||
bm25s,
|
||||
mteb_tasks,
|
||||
prediction_folder=prediction_folder,
|
||||
show_progress_bar=False,
|
||||
# don't save results for test runs
|
||||
cache=None,
|
||||
overwrite_strategy="always",
|
||||
)
|
||||
|
||||
second_stage_tasks = []
|
||||
for task in mteb_tasks:
|
||||
second_stage_tasks.append(
|
||||
task.convert_to_reranking(
|
||||
prediction_folder,
|
||||
top_k=10,
|
||||
)
|
||||
)
|
||||
|
||||
results = mteb.evaluate(
|
||||
cross_encoder,
|
||||
second_stage_tasks,
|
||||
show_progress_bar=False,
|
||||
cache=None,
|
||||
)
|
||||
main_score = results[0].scores["test"][0]["main_score"]
|
||||
return main_score
|
||||
|
||||
|
||||
def mteb_test_rerank_models_hf(
|
||||
hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
|
||||
):
|
||||
with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model:
|
||||
if hf_model_callback is not None:
|
||||
hf_model_callback(hf_model)
|
||||
|
||||
st_main_score = run_mteb_rerank(
|
||||
hf_model, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS
|
||||
)
|
||||
st_dtype = next(hf_model.model.model.parameters()).dtype
|
||||
return st_main_score, st_dtype
|
||||
|
||||
|
||||
def mteb_test_rerank_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model_info: RerankModelInfo,
|
||||
vllm_extra_kwargs=None,
|
||||
hf_model_callback=None,
|
||||
vllm_mteb_encoder=VllmMtebCrossEncoder,
|
||||
atol=MTEB_RERANK_TOL,
|
||||
):
|
||||
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
|
||||
|
||||
with vllm_runner(
|
||||
model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
max_num_seqs=8,
|
||||
**vllm_extra_kwargs,
|
||||
) as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
|
||||
# Confirm whether vllm is using the correct architecture
|
||||
if model_info.architecture:
|
||||
assert model_info.architecture in model_config.architectures
|
||||
|
||||
# Score API is only enabled for num_labels == 1
|
||||
assert model_config.hf_config.num_labels == 1
|
||||
|
||||
# Maybe load chat_template.
|
||||
chat_template: str | None = None
|
||||
if model_info.chat_template_name is not None:
|
||||
chat_template = (template_home / model_info.chat_template_name).read_text()
|
||||
vllm_model.chat_template = chat_template
|
||||
|
||||
# Confirm whether the important configs in model_config are correct.
|
||||
if model_info.pooling_type is not None:
|
||||
assert model_config.pooler_config.pooling_type == model_info.pooling_type
|
||||
if model_info.attn_type is not None:
|
||||
assert model_config.attn_type == model_info.attn_type
|
||||
if model_info.is_prefix_caching_supported is not None:
|
||||
assert (
|
||||
model_config.is_prefix_caching_supported
|
||||
== model_info.is_prefix_caching_supported
|
||||
)
|
||||
if model_info.is_chunked_prefill_supported is not None:
|
||||
assert (
|
||||
model_config.is_chunked_prefill_supported
|
||||
== model_info.is_chunked_prefill_supported
|
||||
)
|
||||
|
||||
vllm_main_score = run_mteb_rerank(
|
||||
vllm_mteb_encoder(vllm_model),
|
||||
tasks=MTEB_RERANK_TASKS,
|
||||
languages=MTEB_RERANK_LANGS,
|
||||
)
|
||||
vllm_dtype = model_config.dtype
|
||||
head_dtype = model_config.head_dtype
|
||||
|
||||
# Accelerate mteb test by setting
|
||||
# SentenceTransformers mteb score to a constant
|
||||
if model_info.mteb_score is None:
|
||||
st_main_score, st_dtype = mteb_test_rerank_models_hf(
|
||||
hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback
|
||||
)
|
||||
else:
|
||||
st_main_score = model_info.mteb_score
|
||||
st_dtype = "Constant"
|
||||
|
||||
print("Model:", model_info.name)
|
||||
print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
|
||||
print("SentenceTransformers:", st_dtype, st_main_score)
|
||||
print("Difference:", st_main_score - vllm_main_score)
|
||||
|
||||
# We are not concerned that the vllm mteb results are better
|
||||
# than SentenceTransformers, so we only perform one-sided testing.
|
||||
assert st_main_score - vllm_main_score < atol
|
||||
@ -4,90 +4,94 @@ import pytest
|
||||
|
||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||
from tests.models.utils import (
|
||||
CLSPoolingEmbedModelInfo,
|
||||
CLSPoolingRerankModelInfo,
|
||||
EmbedModelInfo,
|
||||
LASTPoolingEmbedModelInfo,
|
||||
RerankModelInfo,
|
||||
)
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
|
||||
from .mteb_embed_utils import mteb_test_embed_models
|
||||
from .mteb_score_utils import mteb_test_rerank_models
|
||||
|
||||
MODELS = [
|
||||
########## BertModel
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"BAAI/bge-base-en",
|
||||
architecture="BertModel",
|
||||
mteb_score=0.779336792,
|
||||
pooling_type="CLS",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-base-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-small-en", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-small-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-large-en", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"BAAI/bge-large-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo("BAAI/bge-base-zh", architecture="BertModel", enable_test=False),
|
||||
EmbedModelInfo("BAAI/bge-small-en", architecture="BertModel", enable_test=False),
|
||||
EmbedModelInfo("BAAI/bge-small-zh", architecture="BertModel", enable_test=False),
|
||||
EmbedModelInfo("BAAI/bge-large-en", architecture="BertModel", enable_test=False),
|
||||
EmbedModelInfo("BAAI/bge-large-zh", architecture="BertModel", enable_test=False),
|
||||
EmbedModelInfo(
|
||||
"BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
|
||||
),
|
||||
########## XLMRobertaModel
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"BAAI/bge-m3",
|
||||
architecture="XLMRobertaModel",
|
||||
mteb_score=0.787343078,
|
||||
pooling_type="CLS",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
),
|
||||
########## Qwen2Model
|
||||
LASTPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"BAAI/bge-code-v1",
|
||||
architecture="Qwen2Model",
|
||||
mteb_score=0.75724465,
|
||||
dtype="float32",
|
||||
pooling_type="LAST",
|
||||
attn_type="decoder",
|
||||
is_prefix_caching_supported=True,
|
||||
is_chunked_prefill_supported=True,
|
||||
enable_test=True,
|
||||
),
|
||||
]
|
||||
|
||||
RERANK_MODELS = [
|
||||
########## XLMRobertaForSequenceClassification
|
||||
CLSPoolingRerankModelInfo(
|
||||
RerankModelInfo(
|
||||
"BAAI/bge-reranker-base",
|
||||
architecture="XLMRobertaForSequenceClassification",
|
||||
mteb_score=0.32398,
|
||||
pooling_type="CLS",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingRerankModelInfo(
|
||||
RerankModelInfo(
|
||||
"BAAI/bge-reranker-large",
|
||||
architecture="XLMRobertaForSequenceClassification",
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingRerankModelInfo(
|
||||
RerankModelInfo(
|
||||
"BAAI/bge-reranker-v2-m3",
|
||||
architecture="XLMRobertaForSequenceClassification",
|
||||
enable_test=False,
|
||||
|
||||
@ -9,14 +9,12 @@ import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from tests.conftest import HfRunner
|
||||
from tests.models.language.pooling_mteb_test.mteb_utils import (
|
||||
VllmMtebCrossEncoder,
|
||||
mteb_test_rerank_models,
|
||||
)
|
||||
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
|
||||
from tests.models.utils import RerankModelInfo
|
||||
|
||||
from .mteb_score_utils import VllmMtebCrossEncoder, mteb_test_rerank_models
|
||||
|
||||
RERANK_MODELS = [
|
||||
LASTPoolingRerankModelInfo(
|
||||
RerankModelInfo(
|
||||
"BAAI/bge-reranker-v2-gemma",
|
||||
architecture="GemmaForSequenceClassification",
|
||||
mteb_score=0.33757,
|
||||
@ -25,6 +23,10 @@ RERANK_MODELS = [
|
||||
"classifier_from_token": ["Yes"],
|
||||
"method": "no_post_processing",
|
||||
},
|
||||
pooling_type="LAST",
|
||||
attn_type="decoder",
|
||||
is_prefix_caching_supported=True,
|
||||
is_chunked_prefill_supported=True,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@ -3,23 +3,29 @@
|
||||
import pytest
|
||||
|
||||
from tests.models.utils import (
|
||||
CLSPoolingRerankModelInfo,
|
||||
LASTPoolingRerankModelInfo,
|
||||
RerankModelInfo,
|
||||
)
|
||||
|
||||
from .mteb_utils import mteb_test_rerank_models
|
||||
from .mteb_score_utils import mteb_test_rerank_models
|
||||
|
||||
RERANK_MODELS = [
|
||||
CLSPoolingRerankModelInfo(
|
||||
RerankModelInfo(
|
||||
"cross-encoder/ms-marco-TinyBERT-L-2-v2",
|
||||
mteb_score=0.32898,
|
||||
architecture="BertForSequenceClassification",
|
||||
pooling_type="CLS",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
),
|
||||
LASTPoolingRerankModelInfo(
|
||||
RerankModelInfo(
|
||||
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
|
||||
mteb_score=0.25736,
|
||||
architecture="Qwen3ForSequenceClassification",
|
||||
pooling_type="LAST",
|
||||
attn_type="decoder",
|
||||
is_prefix_caching_supported=True,
|
||||
is_chunked_prefill_supported=True,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@ -5,36 +5,32 @@ import pytest
|
||||
|
||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||
from tests.models.utils import (
|
||||
CLSPoolingEmbedModelInfo,
|
||||
CLSPoolingRerankModelInfo,
|
||||
EmbedModelInfo,
|
||||
LASTPoolingEmbedModelInfo,
|
||||
RerankModelInfo,
|
||||
)
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
|
||||
from .mteb_embed_utils import mteb_test_embed_models
|
||||
from .mteb_score_utils import mteb_test_rerank_models
|
||||
|
||||
MODELS = [
|
||||
########## BertModel
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"thenlper/gte-large",
|
||||
mteb_score=0.76807651,
|
||||
architecture="BertModel",
|
||||
pooling_type="MEAN",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"thenlper/gte-base", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"thenlper/gte-small", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo("thenlper/gte-base", architecture="BertModel", enable_test=False),
|
||||
EmbedModelInfo("thenlper/gte-small", architecture="BertModel", enable_test=False),
|
||||
EmbedModelInfo(
|
||||
"thenlper/gte-large-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"thenlper/gte-base-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo("thenlper/gte-base-zh", architecture="BertModel", enable_test=False),
|
||||
EmbedModelInfo(
|
||||
"thenlper/gte-small-zh", architecture="BertModel", enable_test=False
|
||||
),
|
||||
########### NewModel
|
||||
@ -43,48 +39,64 @@ MODELS = [
|
||||
# - whether to use token_type_embeddings
|
||||
# - whether to use context expansion
|
||||
# So only test one (the most widely used) model
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"Alibaba-NLP/gte-multilingual-base",
|
||||
architecture="GteNewModel",
|
||||
mteb_score=0.775074696,
|
||||
hf_overrides={"architectures": ["GteNewModel"]},
|
||||
pooling_type="CLS",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"Alibaba-NLP/gte-base-en-v1.5",
|
||||
architecture="GteNewModel",
|
||||
hf_overrides={"architectures": ["GteNewModel"]},
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"Alibaba-NLP/gte-large-en-v1.5",
|
||||
architecture="GteNewModel",
|
||||
hf_overrides={"architectures": ["GteNewModel"]},
|
||||
enable_test=False,
|
||||
),
|
||||
########### Qwen2ForCausalLM
|
||||
LASTPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
||||
mteb_score=0.758473459018872,
|
||||
architecture="Qwen2ForCausalLM",
|
||||
pooling_type="LAST",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
),
|
||||
########## ModernBertModel
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"Alibaba-NLP/gte-modernbert-base",
|
||||
mteb_score=0.748193353,
|
||||
architecture="ModernBertModel",
|
||||
pooling_type="CLS",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
),
|
||||
########## Qwen3ForCausalLM
|
||||
LASTPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"Qwen/Qwen3-Embedding-0.6B",
|
||||
mteb_score=0.771163695,
|
||||
architecture="Qwen3ForCausalLM",
|
||||
dtype="float32",
|
||||
pooling_type="LAST",
|
||||
attn_type="decoder",
|
||||
is_prefix_caching_supported=True,
|
||||
is_chunked_prefill_supported=True,
|
||||
enable_test=True,
|
||||
),
|
||||
LASTPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"Qwen/Qwen3-Embedding-4B",
|
||||
architecture="Qwen3ForCausalLM",
|
||||
dtype="float32",
|
||||
@ -93,18 +105,26 @@ MODELS = [
|
||||
]
|
||||
|
||||
RERANK_MODELS = [
|
||||
CLSPoolingRerankModelInfo(
|
||||
RerankModelInfo(
|
||||
# classifier_pooling: mean
|
||||
"Alibaba-NLP/gte-reranker-modernbert-base",
|
||||
mteb_score=0.33386,
|
||||
architecture="ModernBertForSequenceClassification",
|
||||
pooling_type="CLS",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingRerankModelInfo(
|
||||
RerankModelInfo(
|
||||
"Alibaba-NLP/gte-multilingual-reranker-base",
|
||||
mteb_score=0.33062,
|
||||
architecture="GteNewForSequenceClassification",
|
||||
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
|
||||
pooling_type="CLS",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
),
|
||||
]
|
||||
|
||||
@ -3,40 +3,44 @@
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
|
||||
from tests.models.utils import EmbedModelInfo
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
from .mteb_embed_utils import mteb_test_embed_models
|
||||
|
||||
MODELS = [
|
||||
########## BertModel
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"intfloat/e5-small",
|
||||
architecture="BertModel",
|
||||
mteb_score=0.742285423,
|
||||
pooling_type="MEAN",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"intfloat/e5-base", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
"intfloat/e5-large", architecture="BertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo("intfloat/e5-base", architecture="BertModel", enable_test=False),
|
||||
EmbedModelInfo("intfloat/e5-large", architecture="BertModel", enable_test=False),
|
||||
EmbedModelInfo(
|
||||
"intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
|
||||
),
|
||||
########## XLMRobertaModel
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"intfloat/multilingual-e5-base",
|
||||
architecture="XLMRobertaModel",
|
||||
mteb_score=0.779325955,
|
||||
pooling_type="MEAN",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"intfloat/multilingual-e5-large",
|
||||
architecture="XLMRobertaModel",
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"intfloat/multilingual-e5-large-instruct",
|
||||
architecture="XLMRobertaModel",
|
||||
enable_test=False,
|
||||
|
||||
@ -10,30 +10,37 @@ from tests.models.language.pooling.embed_utils import (
|
||||
matryoshka_fy,
|
||||
)
|
||||
from tests.models.utils import (
|
||||
CLSPoolingEmbedModelInfo,
|
||||
CLSPoolingRerankModelInfo,
|
||||
EmbedModelInfo,
|
||||
RerankModelInfo,
|
||||
)
|
||||
from vllm import PoolingParams
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
|
||||
from .mteb_embed_utils import mteb_test_embed_models
|
||||
from .mteb_score_utils import mteb_test_rerank_models
|
||||
|
||||
EMBEDDING_MODELS = [
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"jinaai/jina-embeddings-v3",
|
||||
mteb_score=0.824413164,
|
||||
architecture="XLMRobertaModel",
|
||||
is_matryoshka=True,
|
||||
pooling_type="MEAN",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
dtype="float32",
|
||||
)
|
||||
]
|
||||
|
||||
RERANK_MODELS = [
|
||||
CLSPoolingRerankModelInfo(
|
||||
RerankModelInfo(
|
||||
"jinaai/jina-reranker-v2-base-multilingual",
|
||||
mteb_score=0.33643,
|
||||
architecture="XLMRobertaForSequenceClassification",
|
||||
pooling_type="CLS",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
@ -6,9 +6,9 @@ import pytest
|
||||
import torch
|
||||
|
||||
from tests.conftest import HfRunner
|
||||
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
|
||||
from tests.models.utils import RerankModelInfo
|
||||
|
||||
from .mteb_utils import mteb_test_rerank_models
|
||||
from .mteb_score_utils import mteb_test_rerank_models
|
||||
|
||||
mxbai_rerank_hf_overrides = {
|
||||
"architectures": ["Qwen2ForSequenceClassification"],
|
||||
@ -17,14 +17,18 @@ mxbai_rerank_hf_overrides = {
|
||||
}
|
||||
|
||||
RERANK_MODELS = [
|
||||
LASTPoolingRerankModelInfo(
|
||||
RerankModelInfo(
|
||||
"mixedbread-ai/mxbai-rerank-base-v2",
|
||||
architecture="Qwen2ForSequenceClassification",
|
||||
hf_overrides=mxbai_rerank_hf_overrides,
|
||||
mteb_score=0.273,
|
||||
pooling_type="LAST",
|
||||
attn_type="decoder",
|
||||
is_prefix_caching_supported=True,
|
||||
is_chunked_prefill_supported=True,
|
||||
enable_test=True,
|
||||
),
|
||||
LASTPoolingRerankModelInfo(
|
||||
RerankModelInfo(
|
||||
"mixedbread-ai/mxbai-rerank-large-v2",
|
||||
architecture="Qwen2ForSequenceClassification",
|
||||
hf_overrides=mxbai_rerank_hf_overrides,
|
||||
|
||||
52
tests/models/language/pooling_mteb_test/test_nemotron.py
Normal file
52
tests/models/language/pooling_mteb_test/test_nemotron.py
Normal file
@ -0,0 +1,52 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
|
||||
mteb_test_embed_models,
|
||||
)
|
||||
from tests.models.language.pooling_mteb_test.mteb_score_utils import (
|
||||
mteb_test_rerank_models,
|
||||
)
|
||||
from tests.models.utils import (
|
||||
EmbedModelInfo,
|
||||
RerankModelInfo,
|
||||
)
|
||||
|
||||
EMBEDDING_MODELS = [
|
||||
EmbedModelInfo(
|
||||
"nvidia/llama-nemotron-embed-1b-v2",
|
||||
architecture="LlamaBidirectionalModel",
|
||||
mteb_score=0.689164662128673,
|
||||
pooling_type="MEAN",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
)
|
||||
]
|
||||
|
||||
RERANK_MODELS = [
|
||||
RerankModelInfo(
|
||||
"nvidia/llama-nemotron-rerank-1b-v2",
|
||||
architecture="LlamaBidirectionalForSequenceClassification",
|
||||
chat_template_name="nemotron-rerank.jinja",
|
||||
mteb_score=0.33994,
|
||||
pooling_type="MEAN",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(
|
||||
hf_runner, vllm_runner, model_info: RerankModelInfo
|
||||
) -> None:
|
||||
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
|
||||
@ -4,30 +4,38 @@
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
|
||||
from tests.models.utils import EmbedModelInfo
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
from .mteb_embed_utils import mteb_test_embed_models
|
||||
|
||||
MODELS = [
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"nomic-ai/nomic-embed-text-v1",
|
||||
architecture="NomicBertModel",
|
||||
mteb_score=0.737568559,
|
||||
enable_test=True,
|
||||
pooling_type="MEAN",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"nomic-ai/nomic-embed-text-v1.5",
|
||||
architecture="NomicBertModel",
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"nomic-ai/nomic-embed-text-v2-moe",
|
||||
architecture="NomicBertModel",
|
||||
mteb_score=0.715488912,
|
||||
enable_test=True,
|
||||
pooling_type="MEAN",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@ -6,10 +6,10 @@ import pytest
|
||||
import torch
|
||||
|
||||
from tests.conftest import HfRunner
|
||||
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
|
||||
from tests.models.utils import RerankModelInfo
|
||||
from tests.utils import multi_gpu_test
|
||||
|
||||
from .mteb_utils import mteb_test_rerank_models
|
||||
from .mteb_score_utils import mteb_test_rerank_models
|
||||
|
||||
qwen3_reranker_hf_overrides = {
|
||||
"architectures": ["Qwen3ForSequenceClassification"],
|
||||
@ -18,14 +18,18 @@ qwen3_reranker_hf_overrides = {
|
||||
}
|
||||
|
||||
RERANK_MODELS = [
|
||||
LASTPoolingRerankModelInfo(
|
||||
RerankModelInfo(
|
||||
"Qwen/Qwen3-Reranker-0.6B",
|
||||
architecture="Qwen3ForSequenceClassification",
|
||||
mteb_score=0.25736,
|
||||
hf_overrides=qwen3_reranker_hf_overrides,
|
||||
pooling_type="LAST",
|
||||
attn_type="decoder",
|
||||
is_prefix_caching_supported=True,
|
||||
is_chunked_prefill_supported=True,
|
||||
enable_test=True,
|
||||
),
|
||||
LASTPoolingRerankModelInfo(
|
||||
RerankModelInfo(
|
||||
"Qwen/Qwen3-Reranker-4B",
|
||||
architecture="Qwen3ForSequenceClassification",
|
||||
hf_overrides=qwen3_reranker_hf_overrides,
|
||||
|
||||
@ -4,62 +4,82 @@
|
||||
import pytest
|
||||
|
||||
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
|
||||
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
|
||||
from tests.models.utils import EmbedModelInfo
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
from .mteb_embed_utils import mteb_test_embed_models
|
||||
|
||||
MODELS = [
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-xs",
|
||||
is_matryoshka=False,
|
||||
architecture="BertModel",
|
||||
mteb_score=0.714927797,
|
||||
pooling_type="CLS",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-s",
|
||||
is_matryoshka=False,
|
||||
architecture="BertModel",
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-m",
|
||||
is_matryoshka=False,
|
||||
architecture="BertModel",
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-m-long",
|
||||
is_matryoshka=False,
|
||||
architecture="NomicBertModel",
|
||||
mteb_score=0.681146831,
|
||||
pooling_type="CLS",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-l",
|
||||
is_matryoshka=False,
|
||||
architecture="BertModel",
|
||||
enable_test=False,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-m-v1.5",
|
||||
is_matryoshka=True,
|
||||
architecture="BertModel",
|
||||
mteb_score=0.649088363,
|
||||
pooling_type="CLS",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-l-v2.0",
|
||||
is_matryoshka=True,
|
||||
architecture="XLMRobertaModel",
|
||||
mteb_score=0.712258299,
|
||||
pooling_type="CLS",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
),
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"Snowflake/snowflake-arctic-embed-m-v2.0",
|
||||
is_matryoshka=True,
|
||||
architecture="GteModel",
|
||||
mteb_score=0.706622444,
|
||||
pooling_type="CLS",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
),
|
||||
]
|
||||
|
||||
@ -3,25 +3,31 @@
|
||||
import pytest
|
||||
|
||||
from tests.models.utils import (
|
||||
CLSPoolingEmbedModelInfo,
|
||||
EmbedModelInfo,
|
||||
LASTPoolingEmbedModelInfo,
|
||||
)
|
||||
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
from .mteb_embed_utils import mteb_test_embed_models
|
||||
|
||||
# ST models with projector (Dense) layers
|
||||
ST_PROJECTOR_MODELS = [
|
||||
CLSPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"TencentBAC/Conan-embedding-v1",
|
||||
architecture="BertModel",
|
||||
mteb_score=0.688611955,
|
||||
pooling_type="MEAN",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
),
|
||||
LASTPoolingEmbedModelInfo(
|
||||
EmbedModelInfo(
|
||||
"google/embeddinggemma-300m",
|
||||
architecture="Gemma3TextModel",
|
||||
mteb_score=0.7473819294684156,
|
||||
pooling_type="MEAN",
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
dtype="float32",
|
||||
),
|
||||
|
||||
@ -19,7 +19,7 @@ def pytest_collection_modifyitems(config, items):
|
||||
return
|
||||
|
||||
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
|
||||
# accuracy issues
|
||||
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
|
||||
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
|
||||
torch.backends.cuda.enable_flash_sdp(False)
|
||||
torch.backends.cuda.enable_mem_efficient_sdp(False)
|
||||
|
||||
@ -513,6 +513,7 @@ VLM_TEST_SETTINGS = {
|
||||
max_model_len=8192,
|
||||
use_tokenizer_eos=True,
|
||||
patch_hf_runner=model_utils.internvl_patch_hf_runner,
|
||||
num_logprobs=10 if current_platform.is_rocm() else 5,
|
||||
),
|
||||
"intern_vl-hf": VLMTestInfo(
|
||||
models=["OpenGVLab/InternVL3-1B-hf"],
|
||||
|
||||
@ -8,7 +8,7 @@ from PIL.Image import Image
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.multimodal.utils import encode_image_base64
|
||||
from vllm.multimodal.utils import encode_image_url
|
||||
|
||||
MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"
|
||||
|
||||
@ -31,10 +31,7 @@ def test_keye_vl(
|
||||
question: str,
|
||||
):
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
image_urls = [
|
||||
f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
|
||||
]
|
||||
image_urls = [encode_image_url(image) for image in images]
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=MODEL_NAME,
|
||||
|
||||
@ -267,7 +267,7 @@ def run_embedding_input_test(
|
||||
"""Inference result should be the same between
|
||||
original image/video input and image/video embeddings input.
|
||||
"""
|
||||
from transformers import AutoProcessor # noqa: F401
|
||||
from transformers import AutoProcessor
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model)
|
||||
|
||||
|
||||
@ -15,7 +15,7 @@ from transformers import AutoProcessor
|
||||
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.attention.backends.registry import AttentionBackendEnum
|
||||
from vllm.multimodal.utils import encode_image_base64
|
||||
from vllm.multimodal.utils import encode_image_url
|
||||
from vllm.multimodal.video import sample_frames_from_video
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
@ -178,8 +178,7 @@ def build_dots_ocr_prompt(images, config):
|
||||
"""Build Dots.OCR specific prompt with OCR instructions."""
|
||||
# Use only stop_sign image for Dots.OCR
|
||||
image = images[0] # Already filtered to stop_sign
|
||||
|
||||
image_url = f"data:image/jpeg;base64,{encode_image_base64(image)}"
|
||||
image_url = encode_image_url(image)
|
||||
|
||||
placeholders = [{"type": "image_url", "image_url": {"url": image_url}}]
|
||||
messages = [
|
||||
@ -204,9 +203,7 @@ def build_processor_prompt(images, config):
|
||||
config["model_name"], trust_remote_code=True
|
||||
)
|
||||
|
||||
image_urls = [
|
||||
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
|
||||
]
|
||||
image_urls = [encode_image_url(img) for img in images]
|
||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||
messages = [
|
||||
{
|
||||
@ -225,9 +222,7 @@ def build_processor_prompt(images, config):
|
||||
|
||||
def build_ovis_prompt(images, config):
|
||||
"""Build Ovis2.5 specific prompt with custom format."""
|
||||
image_urls = [
|
||||
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
|
||||
]
|
||||
image_urls = [encode_image_url(img) for img in images]
|
||||
|
||||
placeholders = "\n".join(
|
||||
f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
|
||||
|
||||
@ -111,4 +111,5 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
|
||||
|
||||
assert len(chat_completion.choices) == 1
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.message.content == "In the first audio clip, you hear a brief"
|
||||
assert choice.finish_reason == "length"
|
||||
|
||||
@ -488,6 +488,9 @@ _EMBEDDING_EXAMPLE_MODELS = {
|
||||
),
|
||||
"JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"),
|
||||
"LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
|
||||
"LlamaBidirectionalModel": _HfExamplesInfo(
|
||||
"nvidia/llama-nemotron-embed-1b-v2", trust_remote_code=True
|
||||
),
|
||||
"MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
|
||||
"ModernBertModel": _HfExamplesInfo(
|
||||
"Alibaba-NLP/gte-modernbert-base", trust_remote_code=True
|
||||
@ -554,6 +557,9 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True,
|
||||
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
|
||||
),
|
||||
"LlamaBidirectionalForSequenceClassification": _HfExamplesInfo(
|
||||
"nvidia/llama-nemotron-rerank-1b-v2", trust_remote_code=True
|
||||
),
|
||||
"ModernBertForSequenceClassification": _HfExamplesInfo(
|
||||
"Alibaba-NLP/gte-reranker-modernbert-base"
|
||||
),
|
||||
@ -854,6 +860,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
# disable this temporarily until we support HF format
|
||||
is_available_online=False,
|
||||
),
|
||||
"VoxtralStreamingGeneration": _HfExamplesInfo(
|
||||
"<place-holder>",
|
||||
# disable this temporarily until we support HF format
|
||||
is_available_online=False,
|
||||
),
|
||||
# [Encoder-decoder]
|
||||
"WhisperForConditionalGeneration": _HfExamplesInfo(
|
||||
"openai/whisper-large-v3-turbo",
|
||||
|
||||
@ -38,7 +38,7 @@ def test_inference(
|
||||
max_num_seqs=32,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
vllm_output = vllm_model.llm.encode(prompt)
|
||||
vllm_output = vllm_model.llm.encode(prompt, pooling_task="plugin")
|
||||
assert torch.equal(
|
||||
torch.isnan(vllm_output[0].outputs.data).any(), torch.tensor(False)
|
||||
)
|
||||
|
||||
@ -10,7 +10,7 @@ import torch
|
||||
import torch.nn.functional as F
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.config.model import ModelConfig, ModelDType, RunnerOption
|
||||
from vllm.config.model import AttnTypeStr, ModelConfig, ModelDType, RunnerOption
|
||||
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
|
||||
from vllm.multimodal.processing import InputProcessingContext
|
||||
from vllm.tokenizers import cached_tokenizer_from_config
|
||||
@ -375,7 +375,10 @@ class ModelInfo:
|
||||
max_model_len: int | None = None
|
||||
hf_dtype: str = "float32"
|
||||
hf_overrides: dict[str, Any] | None = None
|
||||
default_pooling_type: str = ""
|
||||
pooling_type: str | None = None
|
||||
attn_type: AttnTypeStr | None = None
|
||||
is_prefix_caching_supported: bool | None = None
|
||||
is_chunked_prefill_supported: bool | None = None
|
||||
enable_test: bool = True
|
||||
|
||||
|
||||
@ -386,29 +389,10 @@ class EmbedModelInfo(ModelInfo):
|
||||
matryoshka_dimensions: list[int] | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class CLSPoolingEmbedModelInfo(EmbedModelInfo):
|
||||
default_pooling_type: str = "CLS"
|
||||
|
||||
|
||||
@dataclass
|
||||
class LASTPoolingEmbedModelInfo(EmbedModelInfo):
|
||||
default_pooling_type: str = "LAST"
|
||||
|
||||
|
||||
@dataclass
|
||||
class RerankModelInfo(ModelInfo):
|
||||
mteb_score: float | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class CLSPoolingRerankModelInfo(RerankModelInfo):
|
||||
default_pooling_type: str = "CLS"
|
||||
|
||||
|
||||
@dataclass
|
||||
class LASTPoolingRerankModelInfo(RerankModelInfo):
|
||||
default_pooling_type: str = "LAST"
|
||||
chat_template_name: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@ -4,6 +4,11 @@
|
||||
set -e
|
||||
set -x
|
||||
|
||||
if command -v rocminfo >/dev/null 2>&1; then
|
||||
echo "Skipping test for ROCm platform"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
cd /vllm-workspace/
|
||||
|
||||
rm -rf .venv
|
||||
@ -36,7 +41,7 @@ if diff before.txt after.txt; then
|
||||
echo "torch version not overridden."
|
||||
else
|
||||
echo "torch version overridden by nightly_torch_test.txt, \
|
||||
if the dependency is not triggered by the pytroch nightly test,\
|
||||
if the dependency is not triggered by the pytorch nightly test,\
|
||||
please add the dependency to the list 'white_list' in tools/pre_commit/generate_nightly_torch_test.py"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@ -281,6 +281,8 @@ def test_extract_tool_calls_pre_v11_tokenizer(
|
||||
"single_tool_add",
|
||||
"single_tool_weather",
|
||||
"multiple_tool_calls",
|
||||
"complex",
|
||||
"wrong_json",
|
||||
],
|
||||
argnames=["model_output", "expected_tool_calls", "expected_content"],
|
||||
argvalues=[
|
||||
@ -326,6 +328,36 @@ def test_extract_tool_calls_pre_v11_tokenizer(
|
||||
],
|
||||
None,
|
||||
),
|
||||
(
|
||||
# Complex
|
||||
"""hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')""", # noqa: E501
|
||||
[
|
||||
ToolCall(
|
||||
function=FunctionCall(
|
||||
name="bash",
|
||||
arguments=json.dumps(
|
||||
{"command": "print(\"hello world!\")\nre.compile(r'{}')"}
|
||||
)[:-2],
|
||||
)
|
||||
)
|
||||
],
|
||||
"hi{hi",
|
||||
),
|
||||
(
|
||||
# Wrong json
|
||||
"""hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501
|
||||
[
|
||||
ToolCall(
|
||||
function=FunctionCall(
|
||||
name="bash",
|
||||
arguments=json.dumps(
|
||||
{"command": "print(\"hello world!\")\nre.compile(r'{}')"}
|
||||
),
|
||||
)
|
||||
)
|
||||
],
|
||||
"hi{hi",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_extract_tool_calls(
|
||||
@ -673,7 +705,7 @@ def test_extract_tool_calls_streaming(
|
||||
),
|
||||
(
|
||||
# Complex
|
||||
"""[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501
|
||||
"""hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501
|
||||
[
|
||||
ToolCall(
|
||||
function=FunctionCall(
|
||||
@ -684,7 +716,7 @@ def test_extract_tool_calls_streaming(
|
||||
)
|
||||
)
|
||||
],
|
||||
"",
|
||||
"hi{hi",
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
@ -106,6 +106,7 @@ class RemoteOpenAIServer:
|
||||
env.update(env_dict)
|
||||
serve_cmd = ["vllm", "serve", model, *vllm_serve_args]
|
||||
print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}")
|
||||
print(f"Environment variables: {env}")
|
||||
self.proc: subprocess.Popen = subprocess.Popen(
|
||||
serve_cmd,
|
||||
env=env,
|
||||
|
||||
@ -1798,3 +1798,60 @@ def test_request_with_prompt_embeds_and_mm_inputs(hash_fn: Callable[[Any], bytes
|
||||
)
|
||||
)
|
||||
assert block_hashes[1] == expected_hash2
|
||||
|
||||
|
||||
def test_auto_fit_max_model_len():
|
||||
"""Test that max_model_len=-1 auto-fits to available GPU memory."""
|
||||
# Create config with original_max_model_len=-1 to trigger auto-fit
|
||||
model_config = ModelConfig(max_model_len=1024)
|
||||
# Simulate the user passing -1 by setting original_max_model_len
|
||||
model_config.original_max_model_len = -1
|
||||
vllm_config = VllmConfig(model_config=model_config)
|
||||
|
||||
mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2 # 16KB per block per layer
|
||||
kv_cache_specs = {
|
||||
"layer_1": new_kv_cache_spec(),
|
||||
"layer_2": new_kv_cache_spec(),
|
||||
}
|
||||
|
||||
# With enough memory, max_model_len stays at the derived max
|
||||
large_available_memory = mem_per_block_per_layer * 2 * 1024 # plenty of memory
|
||||
_kv_cache_configs = get_kv_cache_configs(
|
||||
vllm_config, [kv_cache_specs], [large_available_memory]
|
||||
)
|
||||
assert vllm_config.model_config.max_model_len == 1024
|
||||
|
||||
# Reset for next test
|
||||
model_config = ModelConfig(max_model_len=1024)
|
||||
model_config.original_max_model_len = -1
|
||||
vllm_config = VllmConfig(model_config=model_config)
|
||||
|
||||
# With limited memory, max_model_len should be reduced
|
||||
# Need memory for at least max_model_len tokens
|
||||
# 32 blocks worth of memory for 2 layers = can fit 32*16=512 tokens
|
||||
limited_memory = mem_per_block_per_layer * 2 * 32
|
||||
_kv_cache_configs = get_kv_cache_configs(
|
||||
vllm_config, [kv_cache_specs], [limited_memory]
|
||||
)
|
||||
# Should be reduced to fit in memory
|
||||
assert vllm_config.model_config.max_model_len < 1024
|
||||
assert vllm_config.model_config.max_model_len > 0
|
||||
|
||||
|
||||
def test_auto_fit_max_model_len_not_triggered():
|
||||
"""Test that auto-fit is not triggered when original_max_model_len is not -1."""
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
# original_max_model_len should be None by default, not -1
|
||||
vllm_config = VllmConfig(model_config=model_config)
|
||||
|
||||
mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
|
||||
kv_cache_specs = {
|
||||
"layer_1": new_kv_cache_spec(),
|
||||
"layer_2": new_kv_cache_spec(),
|
||||
}
|
||||
|
||||
# This should work normally without auto-fit
|
||||
_kv_cache_configs = get_kv_cache_configs(
|
||||
vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32]
|
||||
)
|
||||
assert vllm_config.model_config.max_model_len == 16
|
||||
|
||||
@ -1356,6 +1356,69 @@ def test_kv_cache_events(blocks_to_cache: int):
|
||||
assert len(manager.block_pool.cached_block_hash_to_block) == 0
|
||||
|
||||
|
||||
def test_null_parent_block_hash():
|
||||
block_size = 1
|
||||
num_cached_blocks = 2
|
||||
num_full_blocks = 4
|
||||
|
||||
pool = BlockPool(
|
||||
num_gpu_blocks=8,
|
||||
enable_caching=True,
|
||||
hash_block_size=block_size,
|
||||
enable_kv_cache_events=True,
|
||||
)
|
||||
|
||||
req = make_request(
|
||||
"req_null_parent",
|
||||
prompt_token_ids=[10, 11, 12, 13],
|
||||
block_size=block_size,
|
||||
hash_fn=sha256,
|
||||
)
|
||||
assert len(req.block_hashes) == num_full_blocks
|
||||
|
||||
# Physical parent is `null_block` (no hash), while the logical parent hash
|
||||
# still exists in `request.block_hashes[num_cached_blocks - 1]`.
|
||||
assert pool.null_block.block_hash is None
|
||||
new_blocks = pool.get_new_blocks(num_full_blocks - 1)
|
||||
blocks = [
|
||||
new_blocks[: num_cached_blocks - 1],
|
||||
pool.null_block, # physical parent
|
||||
*new_blocks[num_cached_blocks - 1 :],
|
||||
]
|
||||
|
||||
pool.cache_full_blocks(
|
||||
request=req,
|
||||
blocks=blocks,
|
||||
num_cached_blocks=num_cached_blocks,
|
||||
num_full_blocks=num_full_blocks,
|
||||
block_size=block_size,
|
||||
kv_cache_group_id=0,
|
||||
)
|
||||
|
||||
events = pool.take_events()
|
||||
assert len(events) == 1
|
||||
event = events[0]
|
||||
assert isinstance(event, BlockStored)
|
||||
|
||||
expected_parent = kv_cache_utils.maybe_convert_block_hash(
|
||||
req.block_hashes[num_cached_blocks - 1]
|
||||
)
|
||||
assert event.parent_block_hash == expected_parent
|
||||
assert event.parent_block_hash is not None
|
||||
|
||||
expected_new_hashes = [
|
||||
kv_cache_utils.maybe_convert_block_hash(h)
|
||||
for h in req.block_hashes[num_cached_blocks:num_full_blocks]
|
||||
]
|
||||
assert event.block_hashes == expected_new_hashes
|
||||
|
||||
# Ensure we didn't accidentally assign a hash to the null block.
|
||||
assert pool.null_block.block_hash is None
|
||||
# Sanity check: newly cached physical blocks should have hashes assigned.
|
||||
assert blocks[num_cached_blocks].block_hash is not None
|
||||
assert blocks[num_full_blocks - 1].block_hash is not None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("blocks_to_cache", [2, 3, 10])
|
||||
def test_kv_cache_events_with_lora(blocks_to_cache: int):
|
||||
"""Test BlockStored events contain correct lora_id when using LoRA requests."""
|
||||
|
||||
@ -31,7 +31,7 @@ import openai
|
||||
import requests
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.multimodal.utils import encode_image_base64
|
||||
from vllm.multimodal.utils import encode_image_url
|
||||
|
||||
MAX_OUTPUT_LEN = 256
|
||||
|
||||
@ -49,9 +49,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image;base64,{encode_image_base64(image_1)}"
|
||||
},
|
||||
"image_url": {"url": encode_image_url(image_1)},
|
||||
},
|
||||
{"type": "text", "text": "What's in this image?"},
|
||||
],
|
||||
@ -66,9 +64,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image;base64,{encode_image_base64(image_2)}"
|
||||
},
|
||||
"image_url": {"url": encode_image_url(image_2)},
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
|
||||
@ -260,7 +260,7 @@ async def test_multi_abort(output_kind: RequestOutputKind):
|
||||
|
||||
# Use multi-abort to abort multiple requests at once
|
||||
abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT]
|
||||
await engine.abort(abort_request_ids)
|
||||
await engine.abort(abort_request_ids, internal=False)
|
||||
|
||||
# Wait for all tasks to complete
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
@ -609,7 +609,7 @@ async def test_abort_final_output(output_kind: RequestOutputKind):
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# Abort the request
|
||||
await engine.abort(request_id)
|
||||
await engine.abort(request_id, internal=False)
|
||||
|
||||
# Wait for generation to complete and return final output
|
||||
final_output = await generated
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user