mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-08 09:33:32 +08:00
Merge branch 'main' into upstream_mori_
This commit is contained in:
commit
8b5e2e69fb
@ -162,7 +162,10 @@ steps:
|
|||||||
- tests/entrypoints/test_chat_utils
|
- tests/entrypoints/test_chat_utils
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
|
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/test_vision_embeds.py
|
||||||
|
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
|
||||||
|
# TODO: Remove after next torch update
|
||||||
|
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s entrypoints/openai/test_vision_embeds.py
|
||||||
- pytest -v -s entrypoints/test_chat_utils.py
|
- pytest -v -s entrypoints/test_chat_utils.py
|
||||||
|
|
||||||
- label: Entrypoints Integration Test (API Server 2)
|
- label: Entrypoints Integration Test (API Server 2)
|
||||||
@ -219,6 +222,9 @@ steps:
|
|||||||
- tests/v1/engine/test_engine_core_client.py
|
- tests/v1/engine/test_engine_core_client.py
|
||||||
- tests/distributed/test_symm_mem_allreduce.py
|
- tests/distributed/test_symm_mem_allreduce.py
|
||||||
commands:
|
commands:
|
||||||
|
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
|
||||||
|
# TODO: Remove when the bug is fixed in a future ROCm release
|
||||||
|
- export TORCH_NCCL_BLOCKING_WAIT=1
|
||||||
# test with torchrun tp=2 and external_dp=2
|
# test with torchrun tp=2 and external_dp=2
|
||||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||||
# test with torchrun tp=2 and pp=2
|
# test with torchrun tp=2 and pp=2
|
||||||
@ -267,9 +273,10 @@ steps:
|
|||||||
- vllm/v1/executor/uniproc_executor.py
|
- vllm/v1/executor/uniproc_executor.py
|
||||||
- vllm/v1/worker/gpu_worker.py
|
- vllm/v1/worker/gpu_worker.py
|
||||||
commands:
|
commands:
|
||||||
# https://github.com/NVIDIA/nccl/issues/1838
|
|
||||||
#- export NCCL_CUMEM_HOST_ENABLE=0
|
|
||||||
# test with torchrun tp=2 and dp=4 with ep
|
# test with torchrun tp=2 and dp=4 with ep
|
||||||
|
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
|
||||||
|
# TODO: Remove when the bug is fixed in a future ROCm release
|
||||||
|
- export TORCH_NCCL_BLOCKING_WAIT=1
|
||||||
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
||||||
|
|
||||||
- label: EPLB Algorithm Test # 5min
|
- label: EPLB Algorithm Test # 5min
|
||||||
@ -979,7 +986,10 @@ steps:
|
|||||||
- export MIOPEN_DEBUG_CONV_GEMM=0
|
- export MIOPEN_DEBUG_CONV_GEMM=0
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pip freeze | grep -E 'torch'
|
- pip freeze | grep -E 'torch'
|
||||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py
|
||||||
|
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
|
||||||
|
# TODO: Remove after next torch update
|
||||||
|
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model
|
||||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||||
|
|
||||||
- label: Multi-Modal Accuracy Eval (Small Models) # 5min
|
- label: Multi-Modal Accuracy Eval (Small Models) # 5min
|
||||||
@ -1288,6 +1298,9 @@ steps:
|
|||||||
- tests/v1/shutdown
|
- tests/v1/shutdown
|
||||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||||
commands:
|
commands:
|
||||||
|
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
|
||||||
|
# TODO: Remove when the bug is fixed in a future ROCm release
|
||||||
|
- export TORCH_NCCL_BLOCKING_WAIT=1
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||||
@ -1341,7 +1354,9 @@ steps:
|
|||||||
# end platform plugin tests
|
# end platform plugin tests
|
||||||
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
|
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
|
||||||
- pip install -e ./plugins/prithvi_io_processor_plugin
|
- pip install -e ./plugins/prithvi_io_processor_plugin
|
||||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
|
||||||
|
# TODO: Remove after next torch update
|
||||||
|
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||||
- pip uninstall prithvi_io_processor_plugin -y
|
- pip uninstall prithvi_io_processor_plugin -y
|
||||||
# end io_processor plugins test
|
# end io_processor plugins test
|
||||||
# begin stat_logger plugins test
|
# begin stat_logger plugins test
|
||||||
@ -1510,7 +1525,7 @@ steps:
|
|||||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
- HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
|
||||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|
||||||
##### B200 test #####
|
##### B200 test #####
|
||||||
|
|||||||
10
csrc/cache.h
10
csrc/cache.h
@ -9,16 +9,6 @@
|
|||||||
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||||
const torch::Tensor& block_mapping);
|
const torch::Tensor& block_mapping);
|
||||||
|
|
||||||
// Note: the key_caches and value_caches vectors are constant but
|
|
||||||
// not the Tensors they contain. The vectors need to be const refs
|
|
||||||
// in order to satisfy pytorch's C++ operator registration code.
|
|
||||||
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
|
|
||||||
std::vector<torch::Tensor> const& value_caches,
|
|
||||||
const torch::Tensor& block_mapping);
|
|
||||||
|
|
||||||
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
|
|
||||||
const torch::Tensor& block_mapping);
|
|
||||||
|
|
||||||
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
||||||
torch::Tensor& key_cache, torch::Tensor& value_cache,
|
torch::Tensor& key_cache, torch::Tensor& value_cache,
|
||||||
torch::Tensor& slot_mapping,
|
torch::Tensor& slot_mapping,
|
||||||
|
|||||||
@ -119,94 +119,6 @@ __global__ void copy_blocks_mla_kernel(
|
|||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|
||||||
// Note: the key_caches and value_caches vectors are constant but
|
|
||||||
// not the Tensors they contain. The vectors need to be const refs
|
|
||||||
// in order to satisfy pytorch's C++ operator registration code.
|
|
||||||
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
|
|
||||||
std::vector<torch::Tensor> const& value_caches,
|
|
||||||
const torch::Tensor& block_mapping) {
|
|
||||||
int num_layers = key_caches.size();
|
|
||||||
TORCH_CHECK(num_layers == value_caches.size());
|
|
||||||
if (num_layers == 0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
torch::Device cache_device = key_caches[0].device();
|
|
||||||
TORCH_CHECK(cache_device.is_cuda());
|
|
||||||
|
|
||||||
// Create data structures for the kernel.
|
|
||||||
// Create an array of pointers to the key and value caches.
|
|
||||||
int64_t key_cache_ptrs[num_layers];
|
|
||||||
int64_t value_cache_ptrs[num_layers];
|
|
||||||
for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
|
|
||||||
key_cache_ptrs[layer_idx] =
|
|
||||||
reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
|
|
||||||
value_cache_ptrs[layer_idx] =
|
|
||||||
reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
|
|
||||||
}
|
|
||||||
|
|
||||||
// block_mapping is a 2D tensor with shape (num_pairs, 2).
|
|
||||||
int num_pairs = block_mapping.size(0);
|
|
||||||
|
|
||||||
// Move the data structures to the GPU.
|
|
||||||
// NOTE: This synchronizes the CPU and GPU.
|
|
||||||
torch::Tensor key_cache_ptrs_tensor =
|
|
||||||
torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64)
|
|
||||||
.to(cache_device);
|
|
||||||
torch::Tensor value_cache_ptrs_tensor =
|
|
||||||
torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64)
|
|
||||||
.to(cache_device);
|
|
||||||
|
|
||||||
// Launch the kernel.
|
|
||||||
const int numel_per_block = key_caches[0][0].numel();
|
|
||||||
dim3 grid(num_layers, num_pairs);
|
|
||||||
dim3 block(std::min(1024, numel_per_block));
|
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(cache_device);
|
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
|
||||||
VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
|
|
||||||
key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
|
|
||||||
vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
|
||||||
key_cache_ptrs_tensor.data_ptr<int64_t>(),
|
|
||||||
value_cache_ptrs_tensor.data_ptr<int64_t>(),
|
|
||||||
block_mapping.data_ptr<int64_t>(), numel_per_block);
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
|
|
||||||
// copy blocks kernel for MLA (assumes a joint KV-cache)
|
|
||||||
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
|
|
||||||
const torch::Tensor& block_mapping) {
|
|
||||||
int num_layers = kv_caches.size();
|
|
||||||
if (num_layers == 0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
torch::Device cache_device = kv_caches[0].device();
|
|
||||||
TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA");
|
|
||||||
|
|
||||||
std::vector<int64_t> cache_ptrs(num_layers);
|
|
||||||
for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
|
|
||||||
cache_ptrs[layer_idx] =
|
|
||||||
reinterpret_cast<int64_t>(kv_caches[layer_idx].data_ptr());
|
|
||||||
}
|
|
||||||
torch::Tensor cache_ptrs_tensor =
|
|
||||||
torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64)
|
|
||||||
.to(cache_device);
|
|
||||||
|
|
||||||
int num_pairs = block_mapping.size(0);
|
|
||||||
// We use the stride instead of numel in case the cache is padded for memory
|
|
||||||
// alignment reasons, we assume the blocks data (inclusive of any padding)
|
|
||||||
// is contiguous in memory
|
|
||||||
int mem_footprint_per_block = kv_caches[0].stride(0);
|
|
||||||
dim3 grid(num_layers, num_pairs);
|
|
||||||
dim3 block(std::min(1024, mem_footprint_per_block));
|
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(cache_device);
|
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
|
||||||
VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
|
|
||||||
kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] {
|
|
||||||
vllm::copy_blocks_mla_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
|
||||||
cache_ptrs_tensor.data_ptr<int64_t>(),
|
|
||||||
block_mapping.data_ptr<int64_t>(), mem_footprint_per_block);
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// Used to copy/convert one element
|
// Used to copy/convert one element
|
||||||
@ -539,9 +451,6 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
|||||||
for (int i = 0; i < VEC_SIZE; i++) {
|
for (int i = 0; i < VEC_SIZE; i++) {
|
||||||
amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
|
amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
|
||||||
}
|
}
|
||||||
#ifndef USE_ROCM
|
|
||||||
__syncwarp();
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Reduced amax
|
// Reduced amax
|
||||||
for (int mask = 16; mask > 0; mask /= 2) {
|
for (int mask = 16; mask > 0; mask /= 2) {
|
||||||
@ -551,9 +460,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
|||||||
amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
|
amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
#ifndef USE_ROCM
|
|
||||||
__syncwarp();
|
|
||||||
#endif
|
|
||||||
#if defined(__gfx942__)
|
#if defined(__gfx942__)
|
||||||
float scale = fmaxf(amax, 1e-4) / 224.0f;
|
float scale = fmaxf(amax, 1e-4) / 224.0f;
|
||||||
#else
|
#else
|
||||||
|
|||||||
@ -35,7 +35,7 @@ template <typename Int>
|
|||||||
__host__ __device__ inline Int round_up(Int x, Int y) {
|
__host__ __device__ inline Int round_up(Int x, Int y) {
|
||||||
static_assert(std::is_integral_v<Int>,
|
static_assert(std::is_integral_v<Int>,
|
||||||
"round_up argument must be integral type");
|
"round_up argument must be integral type");
|
||||||
return (x + y - 1) / y * y;
|
return ((x + y - 1) / y) * y;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute effective rows for grid configuration with swizzled SF layouts.
|
// Compute effective rows for grid configuration with swizzled SF layouts.
|
||||||
@ -61,37 +61,47 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
|||||||
int sf_m = round_up<int>(numRows, 128);
|
int sf_m = round_up<int>(numRows, 128);
|
||||||
int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
|
int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
|
||||||
int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
|
int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
|
||||||
for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) {
|
int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE;
|
||||||
// Each thread writes 4 uint32_t elements.
|
|
||||||
for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int;
|
|
||||||
col += blockDim.x * 4) {
|
|
||||||
SFout[row * sf_n_int + col] = 0x00;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get the global scaling factor, which will be applied to the SF.
|
// Get the global scaling factor, which will be applied to the SF.
|
||||||
// Note SFScale is the same as next GEMM's alpha, which is
|
// Note SFScale is the same as next GEMM's alpha, which is
|
||||||
// (448.f / (Alpha_A / 6.f)).
|
// (448.f / (Alpha_A / 6.f)).
|
||||||
float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
|
float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
|
||||||
|
|
||||||
// Input tensor row/col loops.
|
// Iterate over all rows and cols including padded ones -
|
||||||
for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
|
// ensures we visit every single scale factor address to initialize it.
|
||||||
for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
|
for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) {
|
||||||
|
for (int colIdx = threadIdx.x;
|
||||||
|
colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD;
|
||||||
colIdx += blockDim.x) {
|
colIdx += blockDim.x) {
|
||||||
|
int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
|
||||||
|
|
||||||
|
PackedVec in_vec;
|
||||||
int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
|
int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
|
||||||
PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
|
|
||||||
// Get the output tensor offset.
|
// If we are outside valid rows OR outside valid columns -> Use Zeros
|
||||||
// Same as inOffset because 8 elements are packed into one uint32_t.
|
if (rowIdx >= numRows || elem_idx >= numCols) {
|
||||||
int64_t outOffset = inOffset;
|
memset(&in_vec, 0, sizeof(PackedVec));
|
||||||
auto& out_pos = out[outOffset];
|
|
||||||
|
} else {
|
||||||
|
// Valid Region: Load actual data
|
||||||
|
in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
|
||||||
|
}
|
||||||
|
|
||||||
auto sf_out =
|
auto sf_out =
|
||||||
cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
|
cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
|
||||||
CVT_FP4_NUM_THREADS_PER_SF>(
|
CVT_FP4_NUM_THREADS_PER_SF>(
|
||||||
rowIdx, colIdx, numKTiles, SFout);
|
rowIdx, colIdx, numKTiles, SFout);
|
||||||
|
|
||||||
out_pos =
|
auto out_val =
|
||||||
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
|
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
|
||||||
|
|
||||||
|
// We do NOT write output for padding because the 'out' tensor is not
|
||||||
|
// padded.
|
||||||
|
if (rowIdx < numRows && elem_idx < numCols) {
|
||||||
|
// Same as inOffset because 8 elements are packed into one uint32_t.
|
||||||
|
out[inOffset] = out_val;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -134,4 +144,4 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
|
|||||||
m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr),
|
m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr),
|
||||||
reinterpret_cast<uint32_t*>(sf_out));
|
reinterpret_cast<uint32_t*>(sf_out));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -685,16 +685,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
|||||||
"swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
|
"swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
|
||||||
cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks);
|
cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks);
|
||||||
|
|
||||||
// Copy the cache blocks from src to dst.
|
|
||||||
cache_ops.def(
|
|
||||||
"copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
|
|
||||||
"Tensor block_mapping) -> ()");
|
|
||||||
cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
|
|
||||||
|
|
||||||
cache_ops.def(
|
|
||||||
"copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()");
|
|
||||||
cache_ops.impl("copy_blocks_mla", torch::kCUDA, &copy_blocks_mla);
|
|
||||||
|
|
||||||
// Reshape the key and value tensors and cache them.
|
// Reshape the key and value tensors and cache them.
|
||||||
cache_ops.def(
|
cache_ops.def(
|
||||||
"reshape_and_cache(Tensor key, Tensor value,"
|
"reshape_and_cache(Tensor key, Tensor value,"
|
||||||
|
|||||||
@ -183,7 +183,7 @@ ARG nvcc_threads=8
|
|||||||
ENV NVCC_THREADS=$nvcc_threads
|
ENV NVCC_THREADS=$nvcc_threads
|
||||||
|
|
||||||
ARG USE_SCCACHE
|
ARG USE_SCCACHE
|
||||||
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
|
ARG SCCACHE_DOWNLOAD_URL
|
||||||
ARG SCCACHE_ENDPOINT
|
ARG SCCACHE_ENDPOINT
|
||||||
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
|
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
|
||||||
ARG SCCACHE_REGION_NAME=us-west-2
|
ARG SCCACHE_REGION_NAME=us-west-2
|
||||||
@ -201,10 +201,16 @@ ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
|
|||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
if [ "$USE_SCCACHE" = "1" ]; then \
|
if [ "$USE_SCCACHE" = "1" ]; then \
|
||||||
echo "Installing sccache..." \
|
echo "Installing sccache..." \
|
||||||
|
&& case "${TARGETPLATFORM}" in \
|
||||||
|
linux/arm64) SCCACHE_ARCH="aarch64" ;; \
|
||||||
|
linux/amd64) SCCACHE_ARCH="x86_64" ;; \
|
||||||
|
*) echo "Unsupported TARGETPLATFORM for sccache: ${TARGETPLATFORM}" >&2; exit 1 ;; \
|
||||||
|
esac \
|
||||||
|
&& export SCCACHE_DOWNLOAD_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
|
||||||
&& curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
|
&& curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
|
||||||
&& tar -xzf sccache.tar.gz \
|
&& tar -xzf sccache.tar.gz \
|
||||||
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
|
&& sudo mv sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
|
||||||
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
|
&& rm -rf sccache.tar.gz sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl \
|
||||||
&& if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
|
&& if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
|
||||||
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
|
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
|
||||||
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
|
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
|
||||||
|
|||||||
@ -2,4 +2,4 @@
|
|||||||
|
|
||||||
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
|
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
|
||||||
|
|
||||||
Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe.
|
You can use vLLM with KServe's [Hugging Face serving runtime](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) or via [`LLMInferenceService` that uses llm-d](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).
|
||||||
|
|||||||
5
docs/deployment/integrations/llm-d.md
Normal file
5
docs/deployment/integrations/llm-d.md
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
# llm-d
|
||||||
|
|
||||||
|
vLLM can be deployed with [llm-d](https://github.com/llm-d/llm-d), a Kubernetes-native distributed inference serving stack providing well-lit paths for anyone to serve large generative AI models at scale. It helps achieve the fastest "time to state-of-the-art (SOTA) performance" for key OSS models across most hardware accelerators and infrastructure providers.
|
||||||
|
|
||||||
|
You can use vLLM with llm-d directly by following [this guide](https://llm-d.ai/docs/guide) or via [KServe's LLMInferenceService](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).
|
||||||
@ -12,6 +12,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
|||||||
|
|
||||||
- [Helm](frameworks/helm.md)
|
- [Helm](frameworks/helm.md)
|
||||||
- [InftyAI/llmaz](integrations/llmaz.md)
|
- [InftyAI/llmaz](integrations/llmaz.md)
|
||||||
|
- [llm-d](integrations/llm-d.md)
|
||||||
- [KAITO](integrations/kaito.md)
|
- [KAITO](integrations/kaito.md)
|
||||||
- [KServe](integrations/kserve.md)
|
- [KServe](integrations/kserve.md)
|
||||||
- [Kthena](integrations/kthena.md)
|
- [Kthena](integrations/kthena.md)
|
||||||
|
|||||||
152
setup.py
152
setup.py
@ -50,15 +50,15 @@ elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin"))
|
|||||||
sys.platform,
|
sys.platform,
|
||||||
)
|
)
|
||||||
VLLM_TARGET_DEVICE = "empty"
|
VLLM_TARGET_DEVICE = "empty"
|
||||||
elif (
|
elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is None:
|
||||||
sys.platform.startswith("linux")
|
if torch.version.hip is not None:
|
||||||
and torch.version.cuda is None
|
VLLM_TARGET_DEVICE = "rocm"
|
||||||
and os.getenv("VLLM_TARGET_DEVICE") is None
|
logger.info("Auto-detected ROCm")
|
||||||
and torch.version.hip is None
|
elif torch.version.cuda is not None:
|
||||||
):
|
VLLM_TARGET_DEVICE = "cuda"
|
||||||
# if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
|
logger.info("Auto-detected CUDA")
|
||||||
# fallback to cpu
|
else:
|
||||||
VLLM_TARGET_DEVICE = "cpu"
|
VLLM_TARGET_DEVICE = "cpu"
|
||||||
|
|
||||||
|
|
||||||
def is_sccache_available() -> bool:
|
def is_sccache_available() -> bool:
|
||||||
@ -108,20 +108,26 @@ class cmake_build_ext(build_ext):
|
|||||||
num_jobs = os.cpu_count()
|
num_jobs = os.cpu_count()
|
||||||
|
|
||||||
nvcc_threads = None
|
nvcc_threads = None
|
||||||
if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"):
|
if _is_cuda() and CUDA_HOME is not None:
|
||||||
# `nvcc_threads` is either the value of the NVCC_THREADS
|
try:
|
||||||
# environment variable (if defined) or 1.
|
nvcc_version = get_nvcc_cuda_version()
|
||||||
# when it is set, we reduce `num_jobs` to avoid
|
if nvcc_version >= Version("11.2"):
|
||||||
# overloading the system.
|
# `nvcc_threads` is either the value of the NVCC_THREADS
|
||||||
nvcc_threads = envs.NVCC_THREADS
|
# environment variable (if defined) or 1.
|
||||||
if nvcc_threads is not None:
|
# when it is set, we reduce `num_jobs` to avoid
|
||||||
nvcc_threads = int(nvcc_threads)
|
# overloading the system.
|
||||||
logger.info(
|
nvcc_threads = envs.NVCC_THREADS
|
||||||
"Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads
|
if nvcc_threads is not None:
|
||||||
)
|
nvcc_threads = int(nvcc_threads)
|
||||||
else:
|
logger.info(
|
||||||
nvcc_threads = 1
|
"Using NVCC_THREADS=%d as the number of nvcc threads.",
|
||||||
num_jobs = max(1, num_jobs // nvcc_threads)
|
nvcc_threads,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
nvcc_threads = 1
|
||||||
|
num_jobs = max(1, num_jobs // nvcc_threads)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Failed to get NVCC version: %s", e)
|
||||||
|
|
||||||
return num_jobs, nvcc_threads
|
return num_jobs, nvcc_threads
|
||||||
|
|
||||||
@ -199,9 +205,9 @@ class cmake_build_ext(build_ext):
|
|||||||
# Default build tool to whatever cmake picks.
|
# Default build tool to whatever cmake picks.
|
||||||
build_tool = []
|
build_tool = []
|
||||||
# Make sure we use the nvcc from CUDA_HOME
|
# Make sure we use the nvcc from CUDA_HOME
|
||||||
if _is_cuda():
|
if _is_cuda() and CUDA_HOME is not None:
|
||||||
cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
|
cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
|
||||||
elif _is_hip():
|
elif _is_hip() and ROCM_HOME is not None:
|
||||||
cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]
|
cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]
|
||||||
|
|
||||||
other_cmake_args = os.environ.get("CMAKE_ARGS")
|
other_cmake_args = os.environ.get("CMAKE_ARGS")
|
||||||
@ -339,6 +345,89 @@ class precompiled_wheel_utils:
|
|||||||
wheels = json.loads(resp.read().decode("utf-8"))
|
wheels = json.loads(resp.read().decode("utf-8"))
|
||||||
return wheels, repo_url
|
return wheels, repo_url
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def is_rocm_system() -> bool:
|
||||||
|
"""Detect ROCm without relying on torch (for build environment)."""
|
||||||
|
if os.getenv("ROCM_PATH"):
|
||||||
|
return True
|
||||||
|
if os.path.isdir("/opt/rocm"):
|
||||||
|
return True
|
||||||
|
if which("rocminfo") is not None:
|
||||||
|
return True
|
||||||
|
try:
|
||||||
|
import torch
|
||||||
|
|
||||||
|
return torch.version.hip is not None
|
||||||
|
except ImportError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def find_local_rocm_wheel() -> str | None:
|
||||||
|
"""Search for a local vllm wheel in common locations."""
|
||||||
|
import glob
|
||||||
|
|
||||||
|
for pattern in ["/vllm-workspace/dist/vllm-*.whl", "./dist/vllm-*.whl"]:
|
||||||
|
wheels = glob.glob(pattern)
|
||||||
|
if wheels:
|
||||||
|
return sorted(wheels)[-1]
|
||||||
|
return None
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def fetch_wheel_from_pypi_index(index_url: str, package: str = "vllm") -> str:
|
||||||
|
"""Fetch the latest wheel URL from a PyPI-style simple index."""
|
||||||
|
import platform
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
from urllib.request import urlopen
|
||||||
|
|
||||||
|
arch = platform.machine()
|
||||||
|
|
||||||
|
class WheelLinkParser(HTMLParser):
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
self.wheels = []
|
||||||
|
|
||||||
|
def handle_starttag(self, tag, attrs):
|
||||||
|
if tag == "a":
|
||||||
|
for name, value in attrs:
|
||||||
|
if name == "href" and value.endswith(".whl"):
|
||||||
|
self.wheels.append(value)
|
||||||
|
|
||||||
|
simple_url = f"{index_url.rstrip('/')}/{package}/"
|
||||||
|
print(f"Fetching wheel list from {simple_url}")
|
||||||
|
with urlopen(simple_url) as resp:
|
||||||
|
html = resp.read().decode("utf-8")
|
||||||
|
|
||||||
|
parser = WheelLinkParser()
|
||||||
|
parser.feed(html)
|
||||||
|
|
||||||
|
for wheel in reversed(parser.wheels):
|
||||||
|
if arch in wheel:
|
||||||
|
if wheel.startswith("http"):
|
||||||
|
return wheel
|
||||||
|
return urljoin(simple_url, wheel)
|
||||||
|
|
||||||
|
raise ValueError(f"No compatible wheel found for {arch} at {simple_url}")
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def determine_wheel_url_rocm() -> tuple[str, str | None]:
|
||||||
|
"""Determine the precompiled wheel for ROCm."""
|
||||||
|
# Search for local wheel first
|
||||||
|
local_wheel = precompiled_wheel_utils.find_local_rocm_wheel()
|
||||||
|
if local_wheel is not None:
|
||||||
|
print(f"Found local ROCm wheel: {local_wheel}")
|
||||||
|
return local_wheel, None
|
||||||
|
|
||||||
|
# Fall back to AMD's PyPI index
|
||||||
|
index_url = os.getenv(
|
||||||
|
"VLLM_ROCM_WHEEL_INDEX", "https://pypi.amd.com/vllm-rocm/simple"
|
||||||
|
)
|
||||||
|
print(f"Fetching ROCm precompiled wheel from {index_url}")
|
||||||
|
wheel_url = precompiled_wheel_utils.fetch_wheel_from_pypi_index(index_url)
|
||||||
|
download_filename = wheel_url.split("/")[-1].split("#")[0]
|
||||||
|
print(f"Using ROCm precompiled wheel: {wheel_url}")
|
||||||
|
return wheel_url, download_filename
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def determine_wheel_url() -> tuple[str, str | None]:
|
def determine_wheel_url() -> tuple[str, str | None]:
|
||||||
"""
|
"""
|
||||||
@ -359,6 +448,11 @@ class precompiled_wheel_utils:
|
|||||||
print(f"Using user-specified precompiled wheel location: {wheel_location}")
|
print(f"Using user-specified precompiled wheel location: {wheel_location}")
|
||||||
return wheel_location, None
|
return wheel_location, None
|
||||||
else:
|
else:
|
||||||
|
# ROCm: use local wheel or AMD's PyPI index
|
||||||
|
# TODO: When we have ROCm nightly wheels, we can update this logic.
|
||||||
|
if precompiled_wheel_utils.is_rocm_system():
|
||||||
|
return precompiled_wheel_utils.determine_wheel_url_rocm()
|
||||||
|
|
||||||
import platform
|
import platform
|
||||||
|
|
||||||
arch = platform.machine()
|
arch = platform.machine()
|
||||||
@ -465,6 +559,8 @@ class precompiled_wheel_utils:
|
|||||||
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
|
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
|
||||||
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
|
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
|
||||||
"vllm/cumem_allocator.abi3.so",
|
"vllm/cumem_allocator.abi3.so",
|
||||||
|
# ROCm-specific libraries
|
||||||
|
"vllm/_rocm_C.abi3.so",
|
||||||
]
|
]
|
||||||
|
|
||||||
flash_attn_regex = re.compile(
|
flash_attn_regex = re.compile(
|
||||||
@ -601,6 +697,8 @@ def get_rocm_version():
|
|||||||
# Get the ROCm version from the ROCM_HOME/lib/librocm-core.so
|
# Get the ROCm version from the ROCM_HOME/lib/librocm-core.so
|
||||||
# see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
|
# see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
|
||||||
try:
|
try:
|
||||||
|
if ROCM_HOME is None:
|
||||||
|
return None
|
||||||
librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
|
librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
|
||||||
if not librocm_core_file.is_file():
|
if not librocm_core_file.is_file():
|
||||||
return None
|
return None
|
||||||
@ -745,7 +843,9 @@ if _is_hip():
|
|||||||
|
|
||||||
if _is_cuda():
|
if _is_cuda():
|
||||||
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
|
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
|
||||||
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
|
if envs.VLLM_USE_PRECOMPILED or (
|
||||||
|
CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
|
||||||
|
):
|
||||||
# FA3 requires CUDA 12.3 or later
|
# FA3 requires CUDA 12.3 or later
|
||||||
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
|
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
|
||||||
# Optional since this doesn't get built (produce an .so file) when
|
# Optional since this doesn't get built (produce an .so file) when
|
||||||
|
|||||||
@ -511,6 +511,16 @@ def test_human_readable_model_len():
|
|||||||
args = parser.parse_args(["--max-model-len", "10.2123451234567t"])
|
args = parser.parse_args(["--max-model-len", "10.2123451234567t"])
|
||||||
assert args.max_model_len == 10212345123456
|
assert args.max_model_len == 10212345123456
|
||||||
|
|
||||||
|
# Special value -1 for auto-fit to GPU memory
|
||||||
|
args = parser.parse_args(["--max-model-len", "-1"])
|
||||||
|
assert args.max_model_len == -1
|
||||||
|
|
||||||
|
# 'auto' is an alias for -1
|
||||||
|
args = parser.parse_args(["--max-model-len", "auto"])
|
||||||
|
assert args.max_model_len == -1
|
||||||
|
args = parser.parse_args(["--max-model-len", "AUTO"])
|
||||||
|
assert args.max_model_len == -1
|
||||||
|
|
||||||
# Invalid (do not allow decimals with binary multipliers)
|
# Invalid (do not allow decimals with binary multipliers)
|
||||||
for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]:
|
for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]:
|
||||||
with pytest.raises(ArgumentError):
|
with pytest.raises(ArgumentError):
|
||||||
|
|||||||
@ -5,6 +5,30 @@ import pytest
|
|||||||
from vllm.assets.audio import AudioAsset
|
from vllm.assets.audio import AudioAsset
|
||||||
|
|
||||||
|
|
||||||
|
def add_attention_backend(server_args, attention_config):
|
||||||
|
"""Append attention backend CLI arg if specified.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
server_args: List of server arguments to extend in-place.
|
||||||
|
attention_config: Dict with 'backend' key, or None.
|
||||||
|
"""
|
||||||
|
if attention_config and "backend" in attention_config:
|
||||||
|
server_args.extend(["--attention-backend", attention_config["backend"]])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
|
def rocm_aiter_fa_attention():
|
||||||
|
"""Return attention config for transcription/translation tests on ROCm.
|
||||||
|
|
||||||
|
On ROCm, audio tests require ROCM_AITER_FA attention backend.
|
||||||
|
"""
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
return {"backend": "ROCM_AITER_FA"}
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def mary_had_lamb():
|
def mary_had_lamb():
|
||||||
path = AudioAsset("mary_had_lamb").get_local_path()
|
path = AudioAsset("mary_had_lamb").get_local_path()
|
||||||
|
|||||||
@ -8,7 +8,7 @@ import pytest
|
|||||||
import pytest_asyncio
|
import pytest_asyncio
|
||||||
|
|
||||||
from vllm.assets.audio import AudioAsset
|
from vllm.assets.audio import AudioAsset
|
||||||
from vllm.multimodal.utils import encode_audio_base64, fetch_audio
|
from vllm.multimodal.utils import encode_audio_base64, encode_audio_url, fetch_audio
|
||||||
|
|
||||||
from ...utils import RemoteOpenAIServer
|
from ...utils import RemoteOpenAIServer
|
||||||
|
|
||||||
@ -53,6 +53,14 @@ def base64_encoded_audio() -> dict[str, str]:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def url_encoded_audio() -> dict[str, str]:
|
||||||
|
return {
|
||||||
|
audio_url: encode_audio_url(*fetch_audio(audio_url))
|
||||||
|
for audio_url in TEST_AUDIO_URLS
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def dummy_messages_from_audio_url(
|
def dummy_messages_from_audio_url(
|
||||||
audio_urls: str | list[str],
|
audio_urls: str | list[str],
|
||||||
content_text: str = "What's happening in this audio?",
|
content_text: str = "What's happening in this audio?",
|
||||||
@ -149,11 +157,9 @@ async def test_single_chat_session_audio_base64encoded(
|
|||||||
client: openai.AsyncOpenAI,
|
client: openai.AsyncOpenAI,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
audio_url: str,
|
audio_url: str,
|
||||||
base64_encoded_audio: dict[str, str],
|
url_encoded_audio: dict[str, str],
|
||||||
):
|
):
|
||||||
messages = dummy_messages_from_audio_url(
|
messages = dummy_messages_from_audio_url(url_encoded_audio[audio_url])
|
||||||
f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# test single completion
|
# test single completion
|
||||||
chat_completion = await client.chat.completions.create(
|
chat_completion = await client.chat.completions.create(
|
||||||
|
|||||||
@ -254,12 +254,11 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
|
|||||||
{"role": "system", "content": "you are a helpful assistant"},
|
{"role": "system", "content": "you are a helpful assistant"},
|
||||||
{"role": "user", "content": "what is 1+1?"},
|
{"role": "user", "content": "what is 1+1?"},
|
||||||
]
|
]
|
||||||
|
|
||||||
# test single completion
|
# test single completion
|
||||||
chat_completion = await client.chat.completions.create(
|
chat_completion = await client.chat.completions.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
messages=messages,
|
messages=messages,
|
||||||
max_completion_tokens=10,
|
max_completion_tokens=5,
|
||||||
logprobs=True,
|
logprobs=True,
|
||||||
top_logprobs=5,
|
top_logprobs=5,
|
||||||
)
|
)
|
||||||
@ -267,13 +266,14 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
|
|||||||
assert len(chat_completion.choices) == 1
|
assert len(chat_completion.choices) == 1
|
||||||
|
|
||||||
choice = chat_completion.choices[0]
|
choice = chat_completion.choices[0]
|
||||||
|
|
||||||
assert choice.finish_reason == "length"
|
assert choice.finish_reason == "length"
|
||||||
assert chat_completion.usage == openai.types.CompletionUsage(
|
assert chat_completion.usage == openai.types.CompletionUsage(
|
||||||
completion_tokens=10, prompt_tokens=37, total_tokens=47
|
completion_tokens=5, prompt_tokens=37, total_tokens=42
|
||||||
)
|
)
|
||||||
|
|
||||||
message = choice.message
|
message = choice.message
|
||||||
assert message.content is not None and len(message.content) >= 10
|
assert message.content is not None and len(message.content) >= 5
|
||||||
assert message.role == "assistant"
|
assert message.role == "assistant"
|
||||||
messages.append({"role": "assistant", "content": message.content})
|
messages.append({"role": "assistant", "content": message.content})
|
||||||
|
|
||||||
@ -282,7 +282,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
|
|||||||
chat_completion = await client.chat.completions.create(
|
chat_completion = await client.chat.completions.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
messages=messages,
|
messages=messages,
|
||||||
max_completion_tokens=10,
|
max_completion_tokens=5,
|
||||||
)
|
)
|
||||||
message = chat_completion.choices[0].message
|
message = chat_completion.choices[0].message
|
||||||
assert message.content is not None and len(message.content) >= 0
|
assert message.content is not None and len(message.content) >= 0
|
||||||
|
|||||||
@ -39,6 +39,7 @@ def server(request: pytest.FixtureRequest):
|
|||||||
"2",
|
"2",
|
||||||
*passed_params,
|
*passed_params,
|
||||||
]
|
]
|
||||||
|
|
||||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||||
yield remote_server
|
yield remote_server
|
||||||
|
|
||||||
|
|||||||
@ -504,7 +504,11 @@ async def test_web_search(client: OpenAI, model_name: str):
|
|||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||||
async def test_code_interpreter(client: OpenAI, model_name: str):
|
async def test_code_interpreter(client: OpenAI, model_name: str):
|
||||||
response = await client.responses.create(
|
# Code interpreter may need more time for container init + code execution
|
||||||
|
timeout_value = client.timeout * 3
|
||||||
|
client_with_timeout = client.with_options(timeout=timeout_value)
|
||||||
|
|
||||||
|
response = await client_with_timeout.responses.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
# TODO: Ideally should be able to set max tool calls
|
# TODO: Ideally should be able to set max tool calls
|
||||||
# to prevent multi-turn, but it is not currently supported
|
# to prevent multi-turn, but it is not currently supported
|
||||||
@ -868,6 +872,7 @@ async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
|
|||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||||
|
@pytest.mark.flaky(reruns=3)
|
||||||
async def test_function_call_with_previous_input_messages(
|
async def test_function_call_with_previous_input_messages(
|
||||||
client: OpenAI, model_name: str
|
client: OpenAI, model_name: str
|
||||||
):
|
):
|
||||||
|
|||||||
@ -93,6 +93,7 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
|
|||||||
add_generation_prompt=True,
|
add_generation_prompt=True,
|
||||||
enable_thinking=False, # default with Qwen3
|
enable_thinking=False, # default with Qwen3
|
||||||
)
|
)
|
||||||
|
|
||||||
for ignore_eos in [True, False]:
|
for ignore_eos in [True, False]:
|
||||||
payload = {
|
payload = {
|
||||||
"model": MODEL_NAME,
|
"model": MODEL_NAME,
|
||||||
@ -108,9 +109,8 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
|
|||||||
}
|
}
|
||||||
generate_resp = await client.post(GEN_ENDPOINT, json=payload)
|
generate_resp = await client.post(GEN_ENDPOINT, json=payload)
|
||||||
generate_data = generate_resp.json()
|
generate_data = generate_resp.json()
|
||||||
generate_res = tokenizer.decode(
|
gen_token_ids = generate_data["choices"][0]["token_ids"]
|
||||||
generate_data["choices"][0]["token_ids"], skip_special_tokens=True
|
generate_res = tokenizer.decode(gen_token_ids, skip_special_tokens=True)
|
||||||
)
|
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": MODEL_NAME,
|
"model": MODEL_NAME,
|
||||||
@ -119,12 +119,33 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
|
|||||||
"temperature": 0.0,
|
"temperature": 0.0,
|
||||||
"stream": False,
|
"stream": False,
|
||||||
"ignore_eos": ignore_eos,
|
"ignore_eos": ignore_eos,
|
||||||
"chat_template_kwargs": dict(enable_thinking=False),
|
"chat_template_kwargs": {"enable_thinking": False},
|
||||||
}
|
}
|
||||||
completions_resp = await client.post("/v1/chat/completions", json=payload)
|
completions_resp = await client.post("/v1/chat/completions", json=payload)
|
||||||
completions_data = completions_resp.json()
|
completions_data = completions_resp.json()
|
||||||
completions_res = completions_data["choices"][0]["message"]["content"]
|
completions_res = completions_data["choices"][0]["message"]["content"]
|
||||||
|
|
||||||
|
if ignore_eos:
|
||||||
|
# When ignoring EOS, only compare up to the first EOS token
|
||||||
|
# Post-EOS generation is undefined and may differ
|
||||||
|
eos_tokens = {
|
||||||
|
tokenizer.eos_token_id,
|
||||||
|
*tokenizer.additional_special_tokens_ids,
|
||||||
|
}
|
||||||
|
# Find first EOS in generated tokens
|
||||||
|
eos_pos = None
|
||||||
|
for i, tid in enumerate(gen_token_ids):
|
||||||
|
if tid in eos_tokens:
|
||||||
|
eos_pos = i
|
||||||
|
break
|
||||||
|
if eos_pos is not None:
|
||||||
|
gen_token_ids_truncated = gen_token_ids[:eos_pos]
|
||||||
|
generate_res = tokenizer.decode(
|
||||||
|
gen_token_ids_truncated, skip_special_tokens=True
|
||||||
|
)
|
||||||
|
# Truncate completions_res to same length for comparison
|
||||||
|
completions_res = completions_res[: len(generate_res)]
|
||||||
|
|
||||||
assert generate_res == completions_res
|
assert generate_res == completions_res
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -9,10 +9,16 @@ import time
|
|||||||
import openai
|
import openai
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils.network_utils import get_open_port
|
from vllm.utils.network_utils import get_open_port
|
||||||
|
|
||||||
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
|
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
|
||||||
|
|
||||||
|
# GPU initialization might take longer
|
||||||
|
_IS_ROCM = current_platform.is_rocm()
|
||||||
|
_SERVER_STARTUP_TIMEOUT = 120
|
||||||
|
_PROCESS_EXIT_TIMEOUT = 15
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_shutdown_on_engine_failure():
|
async def test_shutdown_on_engine_failure():
|
||||||
@ -45,9 +51,11 @@ async def test_shutdown_on_engine_failure():
|
|||||||
"2",
|
"2",
|
||||||
"--disable-frontend-multiprocessing",
|
"--disable-frontend-multiprocessing",
|
||||||
],
|
],
|
||||||
stdout=subprocess.PIPE,
|
# ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when
|
||||||
stderr=subprocess.PIPE,
|
# stdout/stderr pipes are enabled during ROCm GPU initialization.
|
||||||
text=True,
|
stdout=None if _IS_ROCM else subprocess.PIPE,
|
||||||
|
stderr=None if _IS_ROCM else subprocess.PIPE,
|
||||||
|
text=None if _IS_ROCM else True,
|
||||||
preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
|
preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -61,7 +69,7 @@ async def test_shutdown_on_engine_failure():
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Poll until server is ready
|
# Poll until server is ready
|
||||||
while time.time() - start_time < 30:
|
while time.time() - start_time < _SERVER_STARTUP_TIMEOUT:
|
||||||
try:
|
try:
|
||||||
await client.completions.create(
|
await client.completions.create(
|
||||||
model=MODEL_NAME, prompt="Hello", max_tokens=1
|
model=MODEL_NAME, prompt="Hello", max_tokens=1
|
||||||
@ -70,14 +78,18 @@ async def test_shutdown_on_engine_failure():
|
|||||||
except Exception:
|
except Exception:
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
if proc.poll() is not None:
|
if proc.poll() is not None:
|
||||||
stdout, stderr = proc.communicate(timeout=1)
|
if _IS_ROCM:
|
||||||
pytest.fail(
|
pytest.fail(f"Server died during startup: {proc.returncode}")
|
||||||
f"Server died during startup. stdout: {stdout}, stderr: {stderr}"
|
else:
|
||||||
)
|
stdout, stderr = proc.communicate(timeout=1)
|
||||||
|
pytest.fail(
|
||||||
|
f"Server died during startup. "
|
||||||
|
f"stdout: {stdout}, stderr: {stderr}"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
proc.terminate()
|
proc.terminate()
|
||||||
proc.wait(timeout=5)
|
proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
|
||||||
pytest.fail("Server failed to start in 30 seconds")
|
pytest.fail(f"Server failed to start in {_SERVER_STARTUP_TIMEOUT} seconds")
|
||||||
|
|
||||||
# Kill server to simulate crash
|
# Kill server to simulate crash
|
||||||
proc.terminate()
|
proc.terminate()
|
||||||
@ -89,5 +101,5 @@ async def test_shutdown_on_engine_failure():
|
|||||||
model=MODEL_NAME, prompt="This should fail", max_tokens=1
|
model=MODEL_NAME, prompt="This should fail", max_tokens=1
|
||||||
)
|
)
|
||||||
|
|
||||||
return_code = proc.wait(timeout=5)
|
return_code = proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
|
||||||
assert return_code is not None
|
assert return_code is not None
|
||||||
|
|||||||
@ -7,6 +7,7 @@ import json
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from ...utils import RemoteOpenAIServer
|
from ...utils import RemoteOpenAIServer
|
||||||
|
from .conftest import add_attention_backend
|
||||||
|
|
||||||
MISTRAL_FORMAT_ARGS = [
|
MISTRAL_FORMAT_ARGS = [
|
||||||
"--tokenizer_mode",
|
"--tokenizer_mode",
|
||||||
@ -20,12 +21,14 @@ MISTRAL_FORMAT_ARGS = [
|
|||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
|
@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
|
||||||
async def test_basic_audio(mary_had_lamb, model_name):
|
async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
|
||||||
server_args = ["--enforce-eager"]
|
server_args = ["--enforce-eager"]
|
||||||
|
|
||||||
if model_name.startswith("mistralai"):
|
if model_name.startswith("mistralai"):
|
||||||
server_args += MISTRAL_FORMAT_ARGS
|
server_args += MISTRAL_FORMAT_ARGS
|
||||||
|
|
||||||
|
add_attention_backend(server_args, rocm_aiter_fa_attention)
|
||||||
|
|
||||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||||
client = remote_server.get_async_client()
|
client = remote_server.get_async_client()
|
||||||
@ -44,8 +47,13 @@ async def test_basic_audio(mary_had_lamb, model_name):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_basic_audio_with_lora(mary_had_lamb):
|
async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
|
||||||
"""Ensure STT (transcribe) requests can pass LoRA through to generate."""
|
"""Ensure STT (transcribe) requests can pass LoRA through to generate."""
|
||||||
|
# ROCm SPECIFIC CONFIGURATION:
|
||||||
|
# To ensure the test passes on ROCm, we modify the max model length to 512.
|
||||||
|
# We DO NOT apply this to other platforms to maintain strict upstream parity.
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
model_name = "ibm-granite/granite-speech-3.3-2b"
|
model_name = "ibm-granite/granite-speech-3.3-2b"
|
||||||
lora_model_name = "speech"
|
lora_model_name = "speech"
|
||||||
server_args = [
|
server_args = [
|
||||||
@ -56,11 +64,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
|
|||||||
"--lora-modules",
|
"--lora-modules",
|
||||||
f"{lora_model_name}={model_name}",
|
f"{lora_model_name}={model_name}",
|
||||||
"--max-model-len",
|
"--max-model-len",
|
||||||
"2048",
|
"512" if current_platform.is_rocm() else "2048",
|
||||||
"--max-num-seqs",
|
"--max-num-seqs",
|
||||||
"1",
|
"1",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
add_attention_backend(server_args, rocm_aiter_fa_attention)
|
||||||
|
|
||||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||||
client = remote_server.get_async_client()
|
client = remote_server.get_async_client()
|
||||||
@ -79,12 +89,14 @@ async def test_basic_audio_with_lora(mary_had_lamb):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_basic_audio_gemma(foscolo):
|
async def test_basic_audio_gemma(foscolo, rocm_aiter_fa_attention):
|
||||||
# Gemma accuracy on some of the audio samples we use is particularly bad,
|
# Gemma accuracy on some of the audio samples we use is particularly bad,
|
||||||
# hence we use a different one here. WER is evaluated separately.
|
# hence we use a different one here. WER is evaluated separately.
|
||||||
model_name = "google/gemma-3n-E2B-it"
|
model_name = "google/gemma-3n-E2B-it"
|
||||||
server_args = ["--enforce-eager"]
|
server_args = ["--enforce-eager"]
|
||||||
|
|
||||||
|
add_attention_backend(server_args, rocm_aiter_fa_attention)
|
||||||
|
|
||||||
with RemoteOpenAIServer(
|
with RemoteOpenAIServer(
|
||||||
model_name, server_args, max_wait_seconds=480
|
model_name, server_args, max_wait_seconds=480
|
||||||
) as remote_server:
|
) as remote_server:
|
||||||
|
|||||||
@ -14,16 +14,26 @@ import pytest_asyncio
|
|||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
|
|
||||||
from ...utils import RemoteOpenAIServer
|
from ...utils import RemoteOpenAIServer
|
||||||
|
from .conftest import add_attention_backend
|
||||||
|
|
||||||
SERVER_ARGS = ["--enforce-eager"]
|
SERVER_ARGS = ["--enforce-eager"]
|
||||||
|
|
||||||
|
|
||||||
|
def _get_server_args(attention_config):
|
||||||
|
"""Get server args with attention backend if specified."""
|
||||||
|
args = SERVER_ARGS.copy()
|
||||||
|
add_attention_backend(args, attention_config)
|
||||||
|
return args
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(
|
@pytest.fixture(
|
||||||
scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
|
scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
|
||||||
)
|
)
|
||||||
def server(request):
|
def server(request, rocm_aiter_fa_attention):
|
||||||
# Parametrize over model name
|
# Parametrize over model name
|
||||||
with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
|
with RemoteOpenAIServer(
|
||||||
|
request.param, _get_server_args(rocm_aiter_fa_attention)
|
||||||
|
) as remote_server:
|
||||||
yield remote_server, request.param
|
yield remote_server, request.param
|
||||||
|
|
||||||
|
|
||||||
@ -35,10 +45,12 @@ async def client_and_model(server):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_non_asr_model(foscolo):
|
async def test_non_asr_model(foscolo, rocm_aiter_fa_attention):
|
||||||
# text to text model
|
# text to text model
|
||||||
model_name = "JackFram/llama-68m"
|
model_name = "JackFram/llama-68m"
|
||||||
with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
|
with RemoteOpenAIServer(
|
||||||
|
model_name, _get_server_args(rocm_aiter_fa_attention)
|
||||||
|
) as remote_server:
|
||||||
client = remote_server.get_async_client()
|
client = remote_server.get_async_client()
|
||||||
res = await client.audio.translations.create(
|
res = await client.audio.translations.create(
|
||||||
model=model_name, file=foscolo, temperature=0.0
|
model=model_name, file=foscolo, temperature=0.0
|
||||||
@ -49,8 +61,13 @@ async def test_non_asr_model(foscolo):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_basic_audio_with_lora(mary_had_lamb):
|
async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
|
||||||
"""Ensure STT (translate) requests can pass LoRA through to generate."""
|
"""Ensure STT (translate) requests can pass LoRA through to generate."""
|
||||||
|
# ROCm SPECIFIC CONFIGURATION:
|
||||||
|
# To ensure the test passes on ROCm, we modify the max model length to 512.
|
||||||
|
# We DO NOT apply this to other platforms to maintain strict upstream parity.
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
# NOTE - careful to call this test before the module scoped server
|
# NOTE - careful to call this test before the module scoped server
|
||||||
# fixture, otherwise it'll OOMkill the CI
|
# fixture, otherwise it'll OOMkill the CI
|
||||||
model_name = "ibm-granite/granite-speech-3.3-2b"
|
model_name = "ibm-granite/granite-speech-3.3-2b"
|
||||||
@ -63,11 +80,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
|
|||||||
"--lora-modules",
|
"--lora-modules",
|
||||||
f"{lora_model_name}={model_name}",
|
f"{lora_model_name}={model_name}",
|
||||||
"--max-model-len",
|
"--max-model-len",
|
||||||
"2048",
|
"512" if current_platform.is_rocm() else "2048",
|
||||||
"--max-num-seqs",
|
"--max-num-seqs",
|
||||||
"1",
|
"1",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
add_attention_backend(server_args, rocm_aiter_fa_attention)
|
||||||
|
|
||||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||||
client = remote_server.get_async_client()
|
client = remote_server.get_async_client()
|
||||||
|
|||||||
@ -7,7 +7,8 @@ import openai
|
|||||||
import pytest
|
import pytest
|
||||||
import pytest_asyncio
|
import pytest_asyncio
|
||||||
|
|
||||||
from vllm.multimodal.utils import encode_video_base64, fetch_video
|
from vllm.multimodal.utils import encode_video_url, fetch_video
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
from ...utils import RemoteOpenAIServer
|
from ...utils import RemoteOpenAIServer
|
||||||
|
|
||||||
@ -37,7 +38,16 @@ def server():
|
|||||||
json.dumps({"video": MAXIMUM_VIDEOS}),
|
json.dumps({"video": MAXIMUM_VIDEOS}),
|
||||||
]
|
]
|
||||||
|
|
||||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
# ROCm: Increase timeouts to handle potential network delays and slower
|
||||||
|
# video processing when downloading multiple videos from external sources
|
||||||
|
env_overrides = {}
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
env_overrides = {
|
||||||
|
"VLLM_VIDEO_FETCH_TIMEOUT": "120",
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
|
||||||
|
}
|
||||||
|
|
||||||
|
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
|
||||||
yield remote_server
|
yield remote_server
|
||||||
|
|
||||||
|
|
||||||
@ -48,9 +58,9 @@ async def client(server):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def base64_encoded_video() -> dict[str, str]:
|
def url_encoded_video() -> dict[str, str]:
|
||||||
return {
|
return {
|
||||||
video_url: encode_video_base64(fetch_video(video_url)[0])
|
video_url: encode_video_url(fetch_video(video_url)[0])
|
||||||
for video_url in TEST_VIDEO_URLS
|
for video_url in TEST_VIDEO_URLS
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -175,11 +185,9 @@ async def test_single_chat_session_video_base64encoded(
|
|||||||
client: openai.AsyncOpenAI,
|
client: openai.AsyncOpenAI,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
video_url: str,
|
video_url: str,
|
||||||
base64_encoded_video: dict[str, str],
|
url_encoded_video: dict[str, str],
|
||||||
):
|
):
|
||||||
messages = dummy_messages_from_video_url(
|
messages = dummy_messages_from_video_url(url_encoded_video[video_url])
|
||||||
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# test single completion
|
# test single completion
|
||||||
chat_completion = await client.chat.completions.create(
|
chat_completion = await client.chat.completions.create(
|
||||||
@ -223,11 +231,9 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
|
|||||||
client: openai.AsyncOpenAI,
|
client: openai.AsyncOpenAI,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
video_url: str,
|
video_url: str,
|
||||||
base64_encoded_video: dict[str, str],
|
url_encoded_video: dict[str, str],
|
||||||
):
|
):
|
||||||
messages = dummy_messages_from_video_url(
|
messages = dummy_messages_from_video_url(url_encoded_video[video_url])
|
||||||
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
|
|
||||||
)
|
|
||||||
|
|
||||||
chat_completion = await client.chat.completions.create(
|
chat_completion = await client.chat.completions.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
@ -291,6 +297,11 @@ async def test_chat_streaming_video(
|
|||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]
|
"video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]
|
||||||
)
|
)
|
||||||
|
@pytest.mark.flaky(
|
||||||
|
reruns=2,
|
||||||
|
reruns_delay=5,
|
||||||
|
condition=current_platform.is_rocm(),
|
||||||
|
)
|
||||||
async def test_multi_video_input(
|
async def test_multi_video_input(
|
||||||
client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
|
client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
|
||||||
):
|
):
|
||||||
|
|||||||
@ -9,7 +9,8 @@ import pytest_asyncio
|
|||||||
from transformers import AutoProcessor
|
from transformers import AutoProcessor
|
||||||
|
|
||||||
from vllm.multimodal.base import MediaWithBytes
|
from vllm.multimodal.base import MediaWithBytes
|
||||||
from vllm.multimodal.utils import encode_image_base64, fetch_image
|
from vllm.multimodal.utils import encode_image_url, fetch_image
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
from ...utils import RemoteOpenAIServer
|
from ...utils import RemoteOpenAIServer
|
||||||
|
|
||||||
@ -35,7 +36,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
|
|||||||
],
|
],
|
||||||
[
|
[
|
||||||
"The image shows a Venn diagram with three over",
|
"The image shows a Venn diagram with three over",
|
||||||
"The image shows a colorful Venn diagram with",
|
"The image displays a Venn diagram with three over",
|
||||||
],
|
],
|
||||||
[
|
[
|
||||||
"This image displays a gradient of colors ranging from",
|
"This image displays a gradient of colors ranging from",
|
||||||
@ -43,6 +44,27 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
|
|||||||
],
|
],
|
||||||
]
|
]
|
||||||
|
|
||||||
|
EXPECTED_MM_BEAM_SEARCH_RES_ROCM = [
|
||||||
|
# MultiHeadAttention attn_backend: FLASH_ATTN
|
||||||
|
# with Triton Attention backend
|
||||||
|
[
|
||||||
|
"The image shows a wooden boardwalk leading through a",
|
||||||
|
"The image shows a wooden boardwalk extending into a",
|
||||||
|
],
|
||||||
|
[
|
||||||
|
"The image shows two parrots perched on",
|
||||||
|
"The image shows two birds perched on a cur",
|
||||||
|
],
|
||||||
|
[
|
||||||
|
"The image shows a Venn diagram with three over",
|
||||||
|
"The image contains a Venn diagram with three over",
|
||||||
|
],
|
||||||
|
[
|
||||||
|
"This image displays a gradient of colors ranging from",
|
||||||
|
"This image displays a gradient of colors transitioning from",
|
||||||
|
],
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def server():
|
def server():
|
||||||
@ -59,7 +81,16 @@ def server():
|
|||||||
json.dumps({"image": MAXIMUM_IMAGES}),
|
json.dumps({"image": MAXIMUM_IMAGES}),
|
||||||
]
|
]
|
||||||
|
|
||||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
# ROCm: Increase timeouts to handle potential network delays and slower
|
||||||
|
# video processing when downloading multiple videos from external sources
|
||||||
|
env_overrides = {}
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
env_overrides = {
|
||||||
|
"VLLM_VIDEO_FETCH_TIMEOUT": "120",
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
|
||||||
|
}
|
||||||
|
|
||||||
|
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
|
||||||
yield remote_server
|
yield remote_server
|
||||||
|
|
||||||
|
|
||||||
@ -70,11 +101,9 @@ async def client(server):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
def url_encoded_image(local_asset_server) -> dict[str, str]:
|
||||||
return {
|
return {
|
||||||
image_asset: encode_image_base64(
|
image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset))
|
||||||
local_asset_server.get_image_asset(image_asset)
|
|
||||||
)
|
|
||||||
for image_asset in TEST_IMAGE_ASSETS
|
for image_asset in TEST_IMAGE_ASSETS
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -234,11 +263,11 @@ async def test_single_chat_session_image_base64encoded(
|
|||||||
model_name: str,
|
model_name: str,
|
||||||
raw_image_url: str,
|
raw_image_url: str,
|
||||||
image_url: str,
|
image_url: str,
|
||||||
base64_encoded_image: dict[str, str],
|
url_encoded_image: dict[str, str],
|
||||||
):
|
):
|
||||||
content_text = "What's in this image?"
|
content_text = "What's in this image?"
|
||||||
messages = dummy_messages_from_image_url(
|
messages = dummy_messages_from_image_url(
|
||||||
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
|
url_encoded_image[raw_image_url],
|
||||||
content_text,
|
content_text,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -288,15 +317,20 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
|
|||||||
client: openai.AsyncOpenAI,
|
client: openai.AsyncOpenAI,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
image_idx: int,
|
image_idx: int,
|
||||||
base64_encoded_image: dict[str, str],
|
url_encoded_image: dict[str, str],
|
||||||
):
|
):
|
||||||
|
# ROCm: Switch expected results based on platform
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
# NOTE: This test also validates that we pass MM data through beam search
|
# NOTE: This test also validates that we pass MM data through beam search
|
||||||
raw_image_url = TEST_IMAGE_ASSETS[image_idx]
|
raw_image_url = TEST_IMAGE_ASSETS[image_idx]
|
||||||
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
|
|
||||||
|
|
||||||
messages = dummy_messages_from_image_url(
|
if current_platform.is_rocm():
|
||||||
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
|
expected_res = EXPECTED_MM_BEAM_SEARCH_RES_ROCM[image_idx]
|
||||||
)
|
else:
|
||||||
|
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
|
||||||
|
|
||||||
|
messages = dummy_messages_from_image_url(url_encoded_image[raw_image_url])
|
||||||
|
|
||||||
chat_completion = await client.chat.completions.create(
|
chat_completion = await client.chat.completions.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
|
|||||||
@ -33,6 +33,7 @@ def _terratorch_dummy_messages():
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
|
"model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
|
||||||
)
|
)
|
||||||
|
|||||||
@ -9,11 +9,6 @@ from vllm import LLM, PoolingParams
|
|||||||
from vllm.distributed import cleanup_dist_env_and_memory
|
from vllm.distributed import cleanup_dist_env_and_memory
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
if current_platform.is_rocm():
|
|
||||||
pytest.skip(
|
|
||||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
|
||||||
)
|
|
||||||
|
|
||||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||||
|
|
||||||
PROMPTS = [
|
PROMPTS = [
|
||||||
@ -35,6 +30,12 @@ TOKEN_IDS = [
|
|||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def llm():
|
def llm():
|
||||||
|
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
|
||||||
|
# that supports encoder-only models on ROCm.
|
||||||
|
attention_config = None
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
attention_config = {"backend": "FLEX_ATTENTION"}
|
||||||
|
|
||||||
# pytest caches the fixture so we use weakref.proxy to
|
# pytest caches the fixture so we use weakref.proxy to
|
||||||
# enable garbage collection
|
# enable garbage collection
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
@ -44,6 +45,7 @@ def llm():
|
|||||||
gpu_memory_utilization=0.75,
|
gpu_memory_utilization=0.75,
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
seed=0,
|
seed=0,
|
||||||
|
attention_config=attention_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
yield weakref.proxy(llm)
|
yield weakref.proxy(llm)
|
||||||
|
|||||||
@ -9,11 +9,6 @@ import pytest_asyncio
|
|||||||
from tests.utils import RemoteOpenAIServer
|
from tests.utils import RemoteOpenAIServer
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
if current_platform.is_rocm():
|
|
||||||
pytest.skip(
|
|
||||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
|
||||||
)
|
|
||||||
|
|
||||||
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
|
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
|
||||||
max_model_len = 128
|
max_model_len = 128
|
||||||
|
|
||||||
@ -44,6 +39,10 @@ def server():
|
|||||||
str(max_model_len),
|
str(max_model_len),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||||
|
|
||||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||||
yield remote_server
|
yield remote_server
|
||||||
|
|
||||||
|
|||||||
28
tests/entrypoints/pooling/embed/conftest.py
Normal file
28
tests/entrypoints/pooling/embed/conftest.py
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
"""Pytest configuration for vLLM pooling embed tests."""
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
|
||||||
|
def pytest_collection_modifyitems(config, items):
|
||||||
|
"""Configure ROCm-specific settings based on collected tests."""
|
||||||
|
if not current_platform.is_rocm():
|
||||||
|
return
|
||||||
|
|
||||||
|
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
|
||||||
|
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
|
||||||
|
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
|
||||||
|
torch.backends.cuda.enable_flash_sdp(False)
|
||||||
|
torch.backends.cuda.enable_mem_efficient_sdp(False)
|
||||||
|
torch.backends.cuda.enable_math_sdp(True)
|
||||||
|
warnings.warn(
|
||||||
|
"ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
|
||||||
|
"to avoid HuggingFace Transformers accuracy issues",
|
||||||
|
UserWarning,
|
||||||
|
stacklevel=1,
|
||||||
|
)
|
||||||
@ -13,11 +13,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
|
|||||||
from tests.utils import RemoteOpenAIServer
|
from tests.utils import RemoteOpenAIServer
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
if current_platform.is_rocm():
|
|
||||||
pytest.skip(
|
|
||||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
|
||||||
)
|
|
||||||
|
|
||||||
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
|
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
|
||||||
|
|
||||||
MODEL_NAME = "intfloat/e5-small"
|
MODEL_NAME = "intfloat/e5-small"
|
||||||
@ -28,6 +23,10 @@ MAIN_SCORE = 0.7422994752439667
|
|||||||
def server():
|
def server():
|
||||||
args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
|
args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
|
||||||
|
|
||||||
|
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||||
|
|
||||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||||
yield remote_server
|
yield remote_server
|
||||||
|
|
||||||
|
|||||||
@ -11,11 +11,6 @@ from vllm import LLM, PoolingParams
|
|||||||
from vllm.distributed import cleanup_dist_env_and_memory
|
from vllm.distributed import cleanup_dist_env_and_memory
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
if current_platform.is_rocm():
|
|
||||||
pytest.skip(
|
|
||||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
|
||||||
)
|
|
||||||
|
|
||||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||||
|
|
||||||
prompts = ["The chef prepared a delicious meal."]
|
prompts = ["The chef prepared a delicious meal."]
|
||||||
@ -23,6 +18,12 @@ prompts = ["The chef prepared a delicious meal."]
|
|||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def llm():
|
def llm():
|
||||||
|
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
|
||||||
|
# that supports encoder-only models on ROCm.
|
||||||
|
attention_config = None
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
attention_config = {"backend": "FLEX_ATTENTION"}
|
||||||
|
|
||||||
# pytest caches the fixture so we use weakref.proxy to
|
# pytest caches the fixture so we use weakref.proxy to
|
||||||
# enable garbage collection
|
# enable garbage collection
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
@ -32,6 +33,7 @@ def llm():
|
|||||||
gpu_memory_utilization=0.75,
|
gpu_memory_utilization=0.75,
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
seed=0,
|
seed=0,
|
||||||
|
attention_config=attention_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
yield weakref.proxy(llm)
|
yield weakref.proxy(llm)
|
||||||
|
|||||||
@ -28,16 +28,20 @@ from vllm.utils.serial_utils import (
|
|||||||
decode_pooling_output,
|
decode_pooling_output,
|
||||||
)
|
)
|
||||||
|
|
||||||
if current_platform.is_rocm():
|
|
||||||
pytest.skip(
|
|
||||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
|
||||||
)
|
|
||||||
|
|
||||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||||
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
|
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
|
||||||
DTYPE = "bfloat16"
|
DTYPE = "bfloat16"
|
||||||
|
|
||||||
|
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
|
||||||
|
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
|
||||||
|
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
|
||||||
|
torch.backends.cuda.enable_flash_sdp(False)
|
||||||
|
torch.backends.cuda.enable_mem_efficient_sdp(False)
|
||||||
|
torch.backends.cuda.enable_math_sdp(True)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def server():
|
def server():
|
||||||
args = [
|
args = [
|
||||||
@ -53,6 +57,10 @@ def server():
|
|||||||
DUMMY_CHAT_TEMPLATE,
|
DUMMY_CHAT_TEMPLATE,
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||||
|
|
||||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||||
yield remote_server
|
yield remote_server
|
||||||
|
|
||||||
|
|||||||
@ -14,11 +14,6 @@ from tests.utils import RemoteOpenAIServer
|
|||||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
if current_platform.is_rocm():
|
|
||||||
pytest.skip(
|
|
||||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
|
||||||
)
|
|
||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
|
EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
|
||||||
EmbedModelInfo(
|
EmbedModelInfo(
|
||||||
@ -62,6 +57,10 @@ def server(model_info, dtype: str):
|
|||||||
["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}']
|
["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}']
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||||
|
|
||||||
with RemoteOpenAIServer(model_info.name, args) as remote_server:
|
with RemoteOpenAIServer(model_info.name, args) as remote_server:
|
||||||
yield remote_server
|
yield remote_server
|
||||||
|
|
||||||
|
|||||||
@ -18,11 +18,6 @@ from tests.utils import RemoteOpenAIServer
|
|||||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
if current_platform.is_rocm():
|
|
||||||
pytest.skip(
|
|
||||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _generate_random_text(word_count: int) -> str:
|
def _generate_random_text(word_count: int) -> str:
|
||||||
"""Generate random text with approximately the specified word count."""
|
"""Generate random text with approximately the specified word count."""
|
||||||
@ -228,6 +223,10 @@ def server_with_chunked_processing():
|
|||||||
"0.8",
|
"0.8",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||||
|
|
||||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||||
yield remote_server
|
yield remote_server
|
||||||
|
|
||||||
|
|||||||
@ -10,7 +10,7 @@ from transformers import AutoProcessor
|
|||||||
from tests.utils import VLLM_PATH, RemoteOpenAIServer
|
from tests.utils import VLLM_PATH, RemoteOpenAIServer
|
||||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
||||||
from vllm.multimodal.base import MediaWithBytes
|
from vllm.multimodal.base import MediaWithBytes
|
||||||
from vllm.multimodal.utils import encode_image_base64, fetch_image
|
from vllm.multimodal.utils import fetch_image
|
||||||
|
|
||||||
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
|
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
|
||||||
MAXIMUM_IMAGES = 2
|
MAXIMUM_IMAGES = 2
|
||||||
@ -48,14 +48,6 @@ def server():
|
|||||||
yield remote_server
|
yield remote_server
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
|
||||||
return {
|
|
||||||
image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
|
|
||||||
for image_url in TEST_IMAGE_ASSETS
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def get_hf_prompt_tokens(model_name, content, image_url):
|
def get_hf_prompt_tokens(model_name, content, image_url):
|
||||||
processor = AutoProcessor.from_pretrained(
|
processor = AutoProcessor.from_pretrained(
|
||||||
model_name, trust_remote_code=True, num_crops=4
|
model_name, trust_remote_code=True, num_crops=4
|
||||||
|
|||||||
@ -15,11 +15,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
|
|||||||
from tests.utils import RemoteOpenAIServer
|
from tests.utils import RemoteOpenAIServer
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
if current_platform.is_rocm():
|
|
||||||
pytest.skip(
|
|
||||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
|
||||||
)
|
|
||||||
|
|
||||||
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
|
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
|
||||||
|
|
||||||
MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
||||||
@ -30,6 +25,10 @@ st_main_score = 0.33457
|
|||||||
def server():
|
def server():
|
||||||
args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
|
args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
|
||||||
|
|
||||||
|
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||||
|
|
||||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||||
yield remote_server
|
yield remote_server
|
||||||
|
|
||||||
|
|||||||
@ -11,16 +11,17 @@ from vllm import LLM, PoolingParams
|
|||||||
from vllm.distributed import cleanup_dist_env_and_memory
|
from vllm.distributed import cleanup_dist_env_and_memory
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
if current_platform.is_rocm():
|
|
||||||
pytest.skip(
|
|
||||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
|
||||||
)
|
|
||||||
|
|
||||||
MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
|
MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def llm():
|
def llm():
|
||||||
|
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
|
||||||
|
# that supports encoder-only models on ROCm.
|
||||||
|
attention_config = None
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
attention_config = {"backend": "FLEX_ATTENTION"}
|
||||||
|
|
||||||
# pytest caches the fixture so we use weakref.proxy to
|
# pytest caches the fixture so we use weakref.proxy to
|
||||||
# enable garbage collection
|
# enable garbage collection
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
@ -30,6 +31,7 @@ def llm():
|
|||||||
gpu_memory_utilization=0.75,
|
gpu_memory_utilization=0.75,
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
seed=0,
|
seed=0,
|
||||||
|
attention_config=attention_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
yield weakref.proxy(llm)
|
yield weakref.proxy(llm)
|
||||||
|
|||||||
@ -11,11 +11,6 @@ from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
|
|||||||
from vllm.entrypoints.pooling.score.protocol import RerankResponse
|
from vllm.entrypoints.pooling.score.protocol import RerankResponse
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
if current_platform.is_rocm():
|
|
||||||
pytest.skip(
|
|
||||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
|
||||||
)
|
|
||||||
|
|
||||||
MODEL_NAME = "BAAI/bge-reranker-base"
|
MODEL_NAME = "BAAI/bge-reranker-base"
|
||||||
DTYPE = "bfloat16"
|
DTYPE = "bfloat16"
|
||||||
|
|
||||||
@ -24,6 +19,10 @@ DTYPE = "bfloat16"
|
|||||||
def server():
|
def server():
|
||||||
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
|
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
|
||||||
|
|
||||||
|
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||||
|
|
||||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||||
yield remote_server
|
yield remote_server
|
||||||
|
|
||||||
|
|||||||
@ -12,11 +12,6 @@ from tests.utils import RemoteOpenAIServer
|
|||||||
from vllm.entrypoints.pooling.score.protocol import ScoreResponse
|
from vllm.entrypoints.pooling.score.protocol import ScoreResponse
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
if current_platform.is_rocm():
|
|
||||||
pytest.skip(
|
|
||||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
|
||||||
)
|
|
||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
{"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},
|
{"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},
|
||||||
{"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False},
|
{"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False},
|
||||||
@ -44,6 +39,10 @@ def model(request):
|
|||||||
def server(model: dict[str, Any]):
|
def server(model: dict[str, Any]):
|
||||||
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
|
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
|
||||||
|
|
||||||
|
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||||
|
|
||||||
with RemoteOpenAIServer(model["name"], args) as remote_server:
|
with RemoteOpenAIServer(model["name"], args) as remote_server:
|
||||||
yield remote_server
|
yield remote_server
|
||||||
|
|
||||||
|
|||||||
@ -25,9 +25,9 @@ from vllm.entrypoints.chat_utils import (
|
|||||||
)
|
)
|
||||||
from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
|
from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
|
||||||
from vllm.multimodal.utils import (
|
from vllm.multimodal.utils import (
|
||||||
encode_audio_base64,
|
encode_audio_url,
|
||||||
encode_image_base64,
|
encode_image_url,
|
||||||
encode_video_base64,
|
encode_video_url,
|
||||||
)
|
)
|
||||||
from vllm.tokenizers import get_tokenizer
|
from vllm.tokenizers import get_tokenizer
|
||||||
from vllm.tokenizers.mistral import MistralTokenizer
|
from vllm.tokenizers.mistral import MistralTokenizer
|
||||||
@ -141,22 +141,19 @@ def mistral_model_config():
|
|||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def image_url():
|
def image_url():
|
||||||
image = ImageAsset("cherry_blossom")
|
image = ImageAsset("cherry_blossom")
|
||||||
base64 = encode_image_base64(image.pil_image)
|
return encode_image_url(image.pil_image)
|
||||||
return f"data:image/jpeg;base64,{base64}"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def video_url():
|
def video_url():
|
||||||
video = VideoAsset("baby_reading", 1)
|
video = VideoAsset("baby_reading", 1)
|
||||||
base64 = encode_video_base64(video.np_ndarrays)
|
return encode_video_url(video.np_ndarrays)
|
||||||
return f"data:video/jpeg;base64,{base64}"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def audio_url():
|
def audio_url():
|
||||||
audio = AudioAsset("mary_had_lamb")
|
audio = AudioAsset("mary_had_lamb")
|
||||||
base64 = encode_audio_base64(*audio.audio_and_sample_rate)
|
return encode_audio_url(*audio.audio_and_sample_rate)
|
||||||
return f"data:audio/ogg;base64,{base64}"
|
|
||||||
|
|
||||||
|
|
||||||
def _assert_mm_data_is_image_input(
|
def _assert_mm_data_is_image_input(
|
||||||
|
|||||||
11
tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
Normal file
11
tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
model_name: "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8"
|
||||||
|
accuracy_threshold: 0.85
|
||||||
|
num_questions: 1319
|
||||||
|
num_fewshot: 5
|
||||||
|
server_args: >-
|
||||||
|
--max-model-len 4096
|
||||||
|
--tensor-parallel-size 2
|
||||||
|
--enable-expert-parallel
|
||||||
|
--async-scheduling
|
||||||
|
env:
|
||||||
|
VLLM_USE_FLASHINFER_MOE_FP8: "1"
|
||||||
@ -4,3 +4,4 @@ Qwen1.5-MoE-W4A16-CT.yaml
|
|||||||
DeepSeek-V2-Lite-Instruct-FP8.yaml
|
DeepSeek-V2-Lite-Instruct-FP8.yaml
|
||||||
Qwen3-30B-A3B-NVFP4.yaml
|
Qwen3-30B-A3B-NVFP4.yaml
|
||||||
Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
|
Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
|
||||||
|
Qwen3-Next-FP8-EP2.yaml
|
||||||
|
|||||||
@ -71,6 +71,7 @@ def test_gsm8k_correctness(config_filename):
|
|||||||
print(f"Number of questions: {eval_config['num_questions']}")
|
print(f"Number of questions: {eval_config['num_questions']}")
|
||||||
print(f"Number of few-shot examples: {eval_config['num_fewshot']}")
|
print(f"Number of few-shot examples: {eval_config['num_fewshot']}")
|
||||||
print(f"Server args: {' '.join(server_args)}")
|
print(f"Server args: {' '.join(server_args)}")
|
||||||
|
print(f"Environment variables: {env_dict}")
|
||||||
|
|
||||||
# Launch server and run evaluation
|
# Launch server and run evaluation
|
||||||
with RemoteOpenAIServer(
|
with RemoteOpenAIServer(
|
||||||
|
|||||||
@ -40,93 +40,6 @@ KV_CACHE_DTYPE = ["auto", "fp8"]
|
|||||||
RESHAPE_FLASH_IMPLEMENTATIONS = ["cuda", "triton"]
|
RESHAPE_FLASH_IMPLEMENTATIONS = ["cuda", "triton"]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
|
|
||||||
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
|
|
||||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
|
||||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
|
||||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
|
||||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
|
||||||
@pytest.mark.parametrize("dtype", DTYPES)
|
|
||||||
@pytest.mark.parametrize("seed", SEEDS)
|
|
||||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
|
||||||
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
|
|
||||||
@torch.inference_mode()
|
|
||||||
def test_copy_blocks(
|
|
||||||
kv_cache_factory,
|
|
||||||
num_mappings: int,
|
|
||||||
num_layers: int,
|
|
||||||
num_heads: int,
|
|
||||||
head_size: int,
|
|
||||||
block_size: int,
|
|
||||||
num_blocks: int,
|
|
||||||
dtype: torch.dtype,
|
|
||||||
seed: int,
|
|
||||||
kv_cache_dtype: str,
|
|
||||||
device: str,
|
|
||||||
) -> None:
|
|
||||||
if kv_cache_dtype == "fp8" and head_size % 16:
|
|
||||||
pytest.skip()
|
|
||||||
current_platform.seed_everything(seed)
|
|
||||||
torch.set_default_device(device)
|
|
||||||
torch.cuda.set_device(device)
|
|
||||||
# Generate random block mappings where each source block is mapped to two
|
|
||||||
# destination blocks.
|
|
||||||
assert 2 * num_mappings <= num_blocks
|
|
||||||
src_blocks = random.sample(range(num_blocks), num_mappings)
|
|
||||||
remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
|
|
||||||
dst_blocks = random.sample(remaining_blocks, 2 * num_mappings)
|
|
||||||
block_mapping: list[tuple[int, int]] = []
|
|
||||||
for i in range(num_mappings):
|
|
||||||
src = src_blocks[i]
|
|
||||||
dst1 = dst_blocks[2 * i]
|
|
||||||
dst2 = dst_blocks[2 * i + 1]
|
|
||||||
block_mapping.append((src, dst1))
|
|
||||||
block_mapping.append((src, dst2))
|
|
||||||
|
|
||||||
# Create the KV caches.
|
|
||||||
key_caches, value_caches = kv_cache_factory(
|
|
||||||
num_blocks,
|
|
||||||
block_size,
|
|
||||||
num_layers,
|
|
||||||
num_heads,
|
|
||||||
head_size,
|
|
||||||
kv_cache_dtype,
|
|
||||||
dtype,
|
|
||||||
seed,
|
|
||||||
device,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Clone the KV caches.
|
|
||||||
cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
|
|
||||||
cloned_value_caches = [value_cache.clone() for value_cache in value_caches]
|
|
||||||
|
|
||||||
# Call the copy blocks kernel.
|
|
||||||
block_mapping_tensor = torch.tensor(
|
|
||||||
block_mapping, dtype=torch.int64, device=device
|
|
||||||
).view(-1, 2)
|
|
||||||
|
|
||||||
opcheck(
|
|
||||||
torch.ops._C_cache_ops.copy_blocks,
|
|
||||||
(key_caches, value_caches, block_mapping_tensor),
|
|
||||||
test_utils=DEFAULT_OPCHECK_TEST_UTILS,
|
|
||||||
cond=(head_size == HEAD_SIZES[0]),
|
|
||||||
)
|
|
||||||
ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
|
|
||||||
|
|
||||||
# Run the reference implementation.
|
|
||||||
for src, dst in block_mapping:
|
|
||||||
for cloned_key_cache in cloned_key_caches:
|
|
||||||
cloned_key_cache[dst].copy_(cloned_key_cache[src])
|
|
||||||
for cloned_value_cache in cloned_value_caches:
|
|
||||||
cloned_value_cache[dst].copy_(cloned_value_cache[src])
|
|
||||||
|
|
||||||
# Compare the results.
|
|
||||||
for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
|
|
||||||
torch.testing.assert_close(key_cache, cloned_key_cache)
|
|
||||||
for value_cache, cloned_value_cache in zip(value_caches, cloned_value_caches):
|
|
||||||
torch.testing.assert_close(value_cache, cloned_value_cache)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||||
@ -763,73 +676,6 @@ def test_concat_and_cache_ds_mla(
|
|||||||
torch.testing.assert_close(kv_rope, ref_rope, atol=0.001, rtol=0.1)
|
torch.testing.assert_close(kv_rope, ref_rope, atol=0.001, rtol=0.1)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
|
|
||||||
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
|
|
||||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
|
|
||||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
|
|
||||||
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
|
|
||||||
@pytest.mark.parametrize("dtype", DTYPES)
|
|
||||||
@pytest.mark.parametrize("seed", SEEDS)
|
|
||||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
|
||||||
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
|
|
||||||
@torch.inference_mode()
|
|
||||||
def test_copy_blocks_mla(
|
|
||||||
kv_lora_rank: int,
|
|
||||||
qk_rope_head_dim: int,
|
|
||||||
block_size: int,
|
|
||||||
num_blocks: int,
|
|
||||||
num_layers: int,
|
|
||||||
dtype: torch.dtype,
|
|
||||||
seed: int,
|
|
||||||
device: str,
|
|
||||||
kv_cache_dtype: str,
|
|
||||||
) -> None:
|
|
||||||
current_platform.seed_everything(seed)
|
|
||||||
torch.set_default_device(device)
|
|
||||||
torch.cuda.set_device(device)
|
|
||||||
|
|
||||||
entry_size = kv_lora_rank + qk_rope_head_dim
|
|
||||||
|
|
||||||
kv_caches = []
|
|
||||||
for _ in range(num_layers):
|
|
||||||
kv_cache = _create_mla_cache(
|
|
||||||
num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device
|
|
||||||
)
|
|
||||||
_fill_mla_cache(kv_cache, kv_cache_dtype=kv_cache_dtype)
|
|
||||||
kv_caches.append(kv_cache)
|
|
||||||
|
|
||||||
ref_caches = [kv_cache.clone() for kv_cache in kv_caches]
|
|
||||||
|
|
||||||
num_mappings = min(2, num_blocks // 2)
|
|
||||||
src_blocks = random.sample(range(num_blocks), num_mappings)
|
|
||||||
remaining = list(set(range(num_blocks)) - set(src_blocks))
|
|
||||||
dst_blocks = random.sample(remaining, 2 * num_mappings)
|
|
||||||
block_mapping = []
|
|
||||||
for i in range(num_mappings):
|
|
||||||
src = src_blocks[i]
|
|
||||||
dst1 = dst_blocks[2 * i]
|
|
||||||
dst2 = dst_blocks[2 * i + 1]
|
|
||||||
block_mapping.append((src, dst1))
|
|
||||||
block_mapping.append((src, dst2))
|
|
||||||
block_mapping_tensor = torch.tensor(
|
|
||||||
block_mapping, dtype=torch.int64, device=device
|
|
||||||
).view(-1, 2)
|
|
||||||
|
|
||||||
for src, dst in block_mapping:
|
|
||||||
for ref_cache in ref_caches:
|
|
||||||
ref_cache[dst].copy_(ref_cache[src])
|
|
||||||
|
|
||||||
opcheck(
|
|
||||||
torch.ops._C_cache_ops.copy_blocks_mla,
|
|
||||||
(kv_caches, block_mapping_tensor),
|
|
||||||
test_utils=DEFAULT_OPCHECK_TEST_UTILS,
|
|
||||||
)
|
|
||||||
ops.copy_blocks_mla(kv_caches, block_mapping_tensor)
|
|
||||||
|
|
||||||
for kv_cache, ref_cache in zip(kv_caches, ref_caches):
|
|
||||||
torch.testing.assert_close(kv_cache, ref_cache)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
|
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
|
||||||
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
|
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
|
||||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
|
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
|
||||||
|
|||||||
@ -19,7 +19,7 @@ def pytest_collection_modifyitems(config, items):
|
|||||||
return
|
return
|
||||||
|
|
||||||
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
|
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
|
||||||
# accuracy issues
|
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
|
||||||
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
|
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
|
||||||
torch.backends.cuda.enable_flash_sdp(False)
|
torch.backends.cuda.enable_flash_sdp(False)
|
||||||
torch.backends.cuda.enable_mem_efficient_sdp(False)
|
torch.backends.cuda.enable_mem_efficient_sdp(False)
|
||||||
|
|||||||
@ -513,6 +513,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
use_tokenizer_eos=True,
|
use_tokenizer_eos=True,
|
||||||
patch_hf_runner=model_utils.internvl_patch_hf_runner,
|
patch_hf_runner=model_utils.internvl_patch_hf_runner,
|
||||||
|
num_logprobs=10 if current_platform.is_rocm() else 5,
|
||||||
),
|
),
|
||||||
"intern_vl-hf": VLMTestInfo(
|
"intern_vl-hf": VLMTestInfo(
|
||||||
models=["OpenGVLab/InternVL3-1B-hf"],
|
models=["OpenGVLab/InternVL3-1B-hf"],
|
||||||
|
|||||||
@ -8,7 +8,7 @@ from PIL.Image import Image
|
|||||||
from transformers import AutoProcessor
|
from transformers import AutoProcessor
|
||||||
|
|
||||||
from vllm import LLM, EngineArgs, SamplingParams
|
from vllm import LLM, EngineArgs, SamplingParams
|
||||||
from vllm.multimodal.utils import encode_image_base64
|
from vllm.multimodal.utils import encode_image_url
|
||||||
|
|
||||||
MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"
|
MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"
|
||||||
|
|
||||||
@ -31,10 +31,7 @@ def test_keye_vl(
|
|||||||
question: str,
|
question: str,
|
||||||
):
|
):
|
||||||
images = [asset.pil_image for asset in image_assets]
|
images = [asset.pil_image for asset in image_assets]
|
||||||
|
image_urls = [encode_image_url(image) for image in images]
|
||||||
image_urls = [
|
|
||||||
f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
|
|
||||||
]
|
|
||||||
|
|
||||||
engine_args = EngineArgs(
|
engine_args = EngineArgs(
|
||||||
model=MODEL_NAME,
|
model=MODEL_NAME,
|
||||||
|
|||||||
@ -15,7 +15,7 @@ from transformers import AutoProcessor
|
|||||||
|
|
||||||
from vllm import LLM, EngineArgs, SamplingParams
|
from vllm import LLM, EngineArgs, SamplingParams
|
||||||
from vllm.attention.backends.registry import AttentionBackendEnum
|
from vllm.attention.backends.registry import AttentionBackendEnum
|
||||||
from vllm.multimodal.utils import encode_image_base64
|
from vllm.multimodal.utils import encode_image_url
|
||||||
from vllm.multimodal.video import sample_frames_from_video
|
from vllm.multimodal.video import sample_frames_from_video
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
@ -178,8 +178,7 @@ def build_dots_ocr_prompt(images, config):
|
|||||||
"""Build Dots.OCR specific prompt with OCR instructions."""
|
"""Build Dots.OCR specific prompt with OCR instructions."""
|
||||||
# Use only stop_sign image for Dots.OCR
|
# Use only stop_sign image for Dots.OCR
|
||||||
image = images[0] # Already filtered to stop_sign
|
image = images[0] # Already filtered to stop_sign
|
||||||
|
image_url = encode_image_url(image)
|
||||||
image_url = f"data:image/jpeg;base64,{encode_image_base64(image)}"
|
|
||||||
|
|
||||||
placeholders = [{"type": "image_url", "image_url": {"url": image_url}}]
|
placeholders = [{"type": "image_url", "image_url": {"url": image_url}}]
|
||||||
messages = [
|
messages = [
|
||||||
@ -204,9 +203,7 @@ def build_processor_prompt(images, config):
|
|||||||
config["model_name"], trust_remote_code=True
|
config["model_name"], trust_remote_code=True
|
||||||
)
|
)
|
||||||
|
|
||||||
image_urls = [
|
image_urls = [encode_image_url(img) for img in images]
|
||||||
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
|
|
||||||
]
|
|
||||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||||
messages = [
|
messages = [
|
||||||
{
|
{
|
||||||
@ -225,9 +222,7 @@ def build_processor_prompt(images, config):
|
|||||||
|
|
||||||
def build_ovis_prompt(images, config):
|
def build_ovis_prompt(images, config):
|
||||||
"""Build Ovis2.5 specific prompt with custom format."""
|
"""Build Ovis2.5 specific prompt with custom format."""
|
||||||
image_urls = [
|
image_urls = [encode_image_url(img) for img in images]
|
||||||
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
|
|
||||||
]
|
|
||||||
|
|
||||||
placeholders = "\n".join(
|
placeholders = "\n".join(
|
||||||
f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
|
f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
|
||||||
|
|||||||
@ -111,4 +111,5 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
|
|||||||
|
|
||||||
assert len(chat_completion.choices) == 1
|
assert len(chat_completion.choices) == 1
|
||||||
choice = chat_completion.choices[0]
|
choice = chat_completion.choices[0]
|
||||||
|
assert choice.message.content == "In the first audio clip, you hear a brief"
|
||||||
assert choice.finish_reason == "length"
|
assert choice.finish_reason == "length"
|
||||||
|
|||||||
@ -860,6 +860,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
|||||||
# disable this temporarily until we support HF format
|
# disable this temporarily until we support HF format
|
||||||
is_available_online=False,
|
is_available_online=False,
|
||||||
),
|
),
|
||||||
|
"VoxtralStreamingGeneration": _HfExamplesInfo(
|
||||||
|
"<place-holder>",
|
||||||
|
# disable this temporarily until we support HF format
|
||||||
|
is_available_online=False,
|
||||||
|
),
|
||||||
# [Encoder-decoder]
|
# [Encoder-decoder]
|
||||||
"WhisperForConditionalGeneration": _HfExamplesInfo(
|
"WhisperForConditionalGeneration": _HfExamplesInfo(
|
||||||
"openai/whisper-large-v3-turbo",
|
"openai/whisper-large-v3-turbo",
|
||||||
|
|||||||
@ -38,7 +38,7 @@ def test_inference(
|
|||||||
max_num_seqs=32,
|
max_num_seqs=32,
|
||||||
default_torch_num_threads=1,
|
default_torch_num_threads=1,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
vllm_output = vllm_model.llm.encode(prompt)
|
vllm_output = vllm_model.llm.encode(prompt, pooling_task="plugin")
|
||||||
assert torch.equal(
|
assert torch.equal(
|
||||||
torch.isnan(vllm_output[0].outputs.data).any(), torch.tensor(False)
|
torch.isnan(vllm_output[0].outputs.data).any(), torch.tensor(False)
|
||||||
)
|
)
|
||||||
|
|||||||
@ -4,6 +4,11 @@
|
|||||||
set -e
|
set -e
|
||||||
set -x
|
set -x
|
||||||
|
|
||||||
|
if command -v rocminfo >/dev/null 2>&1; then
|
||||||
|
echo "Skipping test for ROCm platform"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
cd /vllm-workspace/
|
cd /vllm-workspace/
|
||||||
|
|
||||||
rm -rf .venv
|
rm -rf .venv
|
||||||
@ -36,7 +41,7 @@ if diff before.txt after.txt; then
|
|||||||
echo "torch version not overridden."
|
echo "torch version not overridden."
|
||||||
else
|
else
|
||||||
echo "torch version overridden by nightly_torch_test.txt, \
|
echo "torch version overridden by nightly_torch_test.txt, \
|
||||||
if the dependency is not triggered by the pytroch nightly test,\
|
if the dependency is not triggered by the pytorch nightly test,\
|
||||||
please add the dependency to the list 'white_list' in tools/pre_commit/generate_nightly_torch_test.py"
|
please add the dependency to the list 'white_list' in tools/pre_commit/generate_nightly_torch_test.py"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|||||||
@ -281,6 +281,8 @@ def test_extract_tool_calls_pre_v11_tokenizer(
|
|||||||
"single_tool_add",
|
"single_tool_add",
|
||||||
"single_tool_weather",
|
"single_tool_weather",
|
||||||
"multiple_tool_calls",
|
"multiple_tool_calls",
|
||||||
|
"complex",
|
||||||
|
"wrong_json",
|
||||||
],
|
],
|
||||||
argnames=["model_output", "expected_tool_calls", "expected_content"],
|
argnames=["model_output", "expected_tool_calls", "expected_content"],
|
||||||
argvalues=[
|
argvalues=[
|
||||||
@ -326,6 +328,36 @@ def test_extract_tool_calls_pre_v11_tokenizer(
|
|||||||
],
|
],
|
||||||
None,
|
None,
|
||||||
),
|
),
|
||||||
|
(
|
||||||
|
# Complex
|
||||||
|
"""hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')""", # noqa: E501
|
||||||
|
[
|
||||||
|
ToolCall(
|
||||||
|
function=FunctionCall(
|
||||||
|
name="bash",
|
||||||
|
arguments=json.dumps(
|
||||||
|
{"command": "print(\"hello world!\")\nre.compile(r'{}')"}
|
||||||
|
)[:-2],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
],
|
||||||
|
"hi{hi",
|
||||||
|
),
|
||||||
|
(
|
||||||
|
# Wrong json
|
||||||
|
"""hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501
|
||||||
|
[
|
||||||
|
ToolCall(
|
||||||
|
function=FunctionCall(
|
||||||
|
name="bash",
|
||||||
|
arguments=json.dumps(
|
||||||
|
{"command": "print(\"hello world!\")\nre.compile(r'{}')"}
|
||||||
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
],
|
||||||
|
"hi{hi",
|
||||||
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_extract_tool_calls(
|
def test_extract_tool_calls(
|
||||||
@ -673,7 +705,7 @@ def test_extract_tool_calls_streaming(
|
|||||||
),
|
),
|
||||||
(
|
(
|
||||||
# Complex
|
# Complex
|
||||||
"""[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501
|
"""hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501
|
||||||
[
|
[
|
||||||
ToolCall(
|
ToolCall(
|
||||||
function=FunctionCall(
|
function=FunctionCall(
|
||||||
@ -684,7 +716,7 @@ def test_extract_tool_calls_streaming(
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
],
|
],
|
||||||
"",
|
"hi{hi",
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|||||||
@ -106,6 +106,7 @@ class RemoteOpenAIServer:
|
|||||||
env.update(env_dict)
|
env.update(env_dict)
|
||||||
serve_cmd = ["vllm", "serve", model, *vllm_serve_args]
|
serve_cmd = ["vllm", "serve", model, *vllm_serve_args]
|
||||||
print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}")
|
print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}")
|
||||||
|
print(f"Environment variables: {env}")
|
||||||
self.proc: subprocess.Popen = subprocess.Popen(
|
self.proc: subprocess.Popen = subprocess.Popen(
|
||||||
serve_cmd,
|
serve_cmd,
|
||||||
env=env,
|
env=env,
|
||||||
|
|||||||
@ -1798,3 +1798,60 @@ def test_request_with_prompt_embeds_and_mm_inputs(hash_fn: Callable[[Any], bytes
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
assert block_hashes[1] == expected_hash2
|
assert block_hashes[1] == expected_hash2
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_fit_max_model_len():
|
||||||
|
"""Test that max_model_len=-1 auto-fits to available GPU memory."""
|
||||||
|
# Create config with original_max_model_len=-1 to trigger auto-fit
|
||||||
|
model_config = ModelConfig(max_model_len=1024)
|
||||||
|
# Simulate the user passing -1 by setting original_max_model_len
|
||||||
|
model_config.original_max_model_len = -1
|
||||||
|
vllm_config = VllmConfig(model_config=model_config)
|
||||||
|
|
||||||
|
mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2 # 16KB per block per layer
|
||||||
|
kv_cache_specs = {
|
||||||
|
"layer_1": new_kv_cache_spec(),
|
||||||
|
"layer_2": new_kv_cache_spec(),
|
||||||
|
}
|
||||||
|
|
||||||
|
# With enough memory, max_model_len stays at the derived max
|
||||||
|
large_available_memory = mem_per_block_per_layer * 2 * 1024 # plenty of memory
|
||||||
|
_kv_cache_configs = get_kv_cache_configs(
|
||||||
|
vllm_config, [kv_cache_specs], [large_available_memory]
|
||||||
|
)
|
||||||
|
assert vllm_config.model_config.max_model_len == 1024
|
||||||
|
|
||||||
|
# Reset for next test
|
||||||
|
model_config = ModelConfig(max_model_len=1024)
|
||||||
|
model_config.original_max_model_len = -1
|
||||||
|
vllm_config = VllmConfig(model_config=model_config)
|
||||||
|
|
||||||
|
# With limited memory, max_model_len should be reduced
|
||||||
|
# Need memory for at least max_model_len tokens
|
||||||
|
# 32 blocks worth of memory for 2 layers = can fit 32*16=512 tokens
|
||||||
|
limited_memory = mem_per_block_per_layer * 2 * 32
|
||||||
|
_kv_cache_configs = get_kv_cache_configs(
|
||||||
|
vllm_config, [kv_cache_specs], [limited_memory]
|
||||||
|
)
|
||||||
|
# Should be reduced to fit in memory
|
||||||
|
assert vllm_config.model_config.max_model_len < 1024
|
||||||
|
assert vllm_config.model_config.max_model_len > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_fit_max_model_len_not_triggered():
|
||||||
|
"""Test that auto-fit is not triggered when original_max_model_len is not -1."""
|
||||||
|
model_config = ModelConfig(max_model_len=16)
|
||||||
|
# original_max_model_len should be None by default, not -1
|
||||||
|
vllm_config = VllmConfig(model_config=model_config)
|
||||||
|
|
||||||
|
mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
|
||||||
|
kv_cache_specs = {
|
||||||
|
"layer_1": new_kv_cache_spec(),
|
||||||
|
"layer_2": new_kv_cache_spec(),
|
||||||
|
}
|
||||||
|
|
||||||
|
# This should work normally without auto-fit
|
||||||
|
_kv_cache_configs = get_kv_cache_configs(
|
||||||
|
vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32]
|
||||||
|
)
|
||||||
|
assert vllm_config.model_config.max_model_len == 16
|
||||||
|
|||||||
@ -1356,6 +1356,69 @@ def test_kv_cache_events(blocks_to_cache: int):
|
|||||||
assert len(manager.block_pool.cached_block_hash_to_block) == 0
|
assert len(manager.block_pool.cached_block_hash_to_block) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_null_parent_block_hash():
|
||||||
|
block_size = 1
|
||||||
|
num_cached_blocks = 2
|
||||||
|
num_full_blocks = 4
|
||||||
|
|
||||||
|
pool = BlockPool(
|
||||||
|
num_gpu_blocks=8,
|
||||||
|
enable_caching=True,
|
||||||
|
hash_block_size=block_size,
|
||||||
|
enable_kv_cache_events=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
req = make_request(
|
||||||
|
"req_null_parent",
|
||||||
|
prompt_token_ids=[10, 11, 12, 13],
|
||||||
|
block_size=block_size,
|
||||||
|
hash_fn=sha256,
|
||||||
|
)
|
||||||
|
assert len(req.block_hashes) == num_full_blocks
|
||||||
|
|
||||||
|
# Physical parent is `null_block` (no hash), while the logical parent hash
|
||||||
|
# still exists in `request.block_hashes[num_cached_blocks - 1]`.
|
||||||
|
assert pool.null_block.block_hash is None
|
||||||
|
new_blocks = pool.get_new_blocks(num_full_blocks - 1)
|
||||||
|
blocks = [
|
||||||
|
new_blocks[: num_cached_blocks - 1],
|
||||||
|
pool.null_block, # physical parent
|
||||||
|
*new_blocks[num_cached_blocks - 1 :],
|
||||||
|
]
|
||||||
|
|
||||||
|
pool.cache_full_blocks(
|
||||||
|
request=req,
|
||||||
|
blocks=blocks,
|
||||||
|
num_cached_blocks=num_cached_blocks,
|
||||||
|
num_full_blocks=num_full_blocks,
|
||||||
|
block_size=block_size,
|
||||||
|
kv_cache_group_id=0,
|
||||||
|
)
|
||||||
|
|
||||||
|
events = pool.take_events()
|
||||||
|
assert len(events) == 1
|
||||||
|
event = events[0]
|
||||||
|
assert isinstance(event, BlockStored)
|
||||||
|
|
||||||
|
expected_parent = kv_cache_utils.maybe_convert_block_hash(
|
||||||
|
req.block_hashes[num_cached_blocks - 1]
|
||||||
|
)
|
||||||
|
assert event.parent_block_hash == expected_parent
|
||||||
|
assert event.parent_block_hash is not None
|
||||||
|
|
||||||
|
expected_new_hashes = [
|
||||||
|
kv_cache_utils.maybe_convert_block_hash(h)
|
||||||
|
for h in req.block_hashes[num_cached_blocks:num_full_blocks]
|
||||||
|
]
|
||||||
|
assert event.block_hashes == expected_new_hashes
|
||||||
|
|
||||||
|
# Ensure we didn't accidentally assign a hash to the null block.
|
||||||
|
assert pool.null_block.block_hash is None
|
||||||
|
# Sanity check: newly cached physical blocks should have hashes assigned.
|
||||||
|
assert blocks[num_cached_blocks].block_hash is not None
|
||||||
|
assert blocks[num_full_blocks - 1].block_hash is not None
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("blocks_to_cache", [2, 3, 10])
|
@pytest.mark.parametrize("blocks_to_cache", [2, 3, 10])
|
||||||
def test_kv_cache_events_with_lora(blocks_to_cache: int):
|
def test_kv_cache_events_with_lora(blocks_to_cache: int):
|
||||||
"""Test BlockStored events contain correct lora_id when using LoRA requests."""
|
"""Test BlockStored events contain correct lora_id when using LoRA requests."""
|
||||||
|
|||||||
@ -31,7 +31,7 @@ import openai
|
|||||||
import requests
|
import requests
|
||||||
|
|
||||||
from vllm.assets.image import ImageAsset
|
from vllm.assets.image import ImageAsset
|
||||||
from vllm.multimodal.utils import encode_image_base64
|
from vllm.multimodal.utils import encode_image_url
|
||||||
|
|
||||||
MAX_OUTPUT_LEN = 256
|
MAX_OUTPUT_LEN = 256
|
||||||
|
|
||||||
@ -49,9 +49,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
|
|||||||
"content": [
|
"content": [
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "image_url",
|
||||||
"image_url": {
|
"image_url": {"url": encode_image_url(image_1)},
|
||||||
"url": f"data:image;base64,{encode_image_base64(image_1)}"
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
{"type": "text", "text": "What's in this image?"},
|
{"type": "text", "text": "What's in this image?"},
|
||||||
],
|
],
|
||||||
@ -66,9 +64,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
|
|||||||
"content": [
|
"content": [
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "image_url",
|
||||||
"image_url": {
|
"image_url": {"url": encode_image_url(image_2)},
|
||||||
"url": f"data:image;base64,{encode_image_base64(image_2)}"
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "image_url",
|
||||||
|
|||||||
@ -260,7 +260,7 @@ async def test_multi_abort(output_kind: RequestOutputKind):
|
|||||||
|
|
||||||
# Use multi-abort to abort multiple requests at once
|
# Use multi-abort to abort multiple requests at once
|
||||||
abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT]
|
abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT]
|
||||||
await engine.abort(abort_request_ids)
|
await engine.abort(abort_request_ids, internal=False)
|
||||||
|
|
||||||
# Wait for all tasks to complete
|
# Wait for all tasks to complete
|
||||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||||
@ -609,7 +609,7 @@ async def test_abort_final_output(output_kind: RequestOutputKind):
|
|||||||
await asyncio.sleep(0.5)
|
await asyncio.sleep(0.5)
|
||||||
|
|
||||||
# Abort the request
|
# Abort the request
|
||||||
await engine.abort(request_id)
|
await engine.abort(request_id, internal=False)
|
||||||
|
|
||||||
# Wait for generation to complete and return final output
|
# Wait for generation to complete and return final output
|
||||||
final_output = await generated
|
final_output = await generated
|
||||||
|
|||||||
@ -40,10 +40,16 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
|
|||||||
PROMPT = "I am Gyoubu Masataka Oniwa"
|
PROMPT = "I am Gyoubu Masataka Oniwa"
|
||||||
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
|
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
|
||||||
|
|
||||||
|
_REQUEST_COUNTER = 0
|
||||||
|
|
||||||
|
|
||||||
def make_request() -> EngineCoreRequest:
|
def make_request() -> EngineCoreRequest:
|
||||||
|
global _REQUEST_COUNTER
|
||||||
|
_REQUEST_COUNTER += 1
|
||||||
|
request_id = f"request-{_REQUEST_COUNTER}"
|
||||||
return EngineCoreRequest(
|
return EngineCoreRequest(
|
||||||
request_id=str(uuid.uuid4()),
|
request_id=request_id,
|
||||||
|
external_req_id=f"{request_id}-{uuid.uuid4()}",
|
||||||
prompt_token_ids=PROMPT_TOKENS,
|
prompt_token_ids=PROMPT_TOKENS,
|
||||||
mm_features=None,
|
mm_features=None,
|
||||||
sampling_params=SamplingParams(),
|
sampling_params=SamplingParams(),
|
||||||
|
|||||||
@ -45,6 +45,8 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
|
|||||||
PROMPT = "Hello my name is Robert and I love quantization kernels"
|
PROMPT = "Hello my name is Robert and I love quantization kernels"
|
||||||
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
|
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
|
||||||
|
|
||||||
|
_REQUEST_COUNTER = 0
|
||||||
|
|
||||||
|
|
||||||
def make_request(
|
def make_request(
|
||||||
params: SamplingParams, prompt_tokens_ids: list[int] | None = None
|
params: SamplingParams, prompt_tokens_ids: list[int] | None = None
|
||||||
@ -52,8 +54,12 @@ def make_request(
|
|||||||
if not prompt_tokens_ids:
|
if not prompt_tokens_ids:
|
||||||
prompt_tokens_ids = PROMPT_TOKENS
|
prompt_tokens_ids = PROMPT_TOKENS
|
||||||
|
|
||||||
|
global _REQUEST_COUNTER
|
||||||
|
_REQUEST_COUNTER += 1
|
||||||
|
request_id = f"request-{_REQUEST_COUNTER}"
|
||||||
return EngineCoreRequest(
|
return EngineCoreRequest(
|
||||||
request_id=str(uuid.uuid4()),
|
request_id=request_id,
|
||||||
|
external_req_id=f"{request_id}-{uuid.uuid4()}",
|
||||||
prompt_token_ids=prompt_tokens_ids,
|
prompt_token_ids=prompt_tokens_ids,
|
||||||
mm_features=None,
|
mm_features=None,
|
||||||
sampling_params=params,
|
sampling_params=params,
|
||||||
|
|||||||
@ -27,6 +27,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
|
|||||||
params = SamplingParams(skip_special_tokens=True)
|
params = SamplingParams(skip_special_tokens=True)
|
||||||
request = EngineCoreRequest(
|
request = EngineCoreRequest(
|
||||||
request_id="test",
|
request_id="test",
|
||||||
|
external_req_id="test-ext",
|
||||||
prompt_token_ids=prompt_token_ids,
|
prompt_token_ids=prompt_token_ids,
|
||||||
mm_features=None,
|
mm_features=None,
|
||||||
sampling_params=params,
|
sampling_params=params,
|
||||||
|
|||||||
@ -58,12 +58,12 @@ def test_incremental_detokenization(
|
|||||||
output_processor = OutputProcessor(
|
output_processor = OutputProcessor(
|
||||||
dummy_test_vectors.tokenizer, log_stats=False, stream_interval=stream_interval
|
dummy_test_vectors.tokenizer, log_stats=False, stream_interval=stream_interval
|
||||||
)
|
)
|
||||||
engine_core = MockEngineCore(tokens_list=dummy_test_vectors.generation_tokens)
|
|
||||||
|
|
||||||
# Make N requests.
|
# Make N requests.
|
||||||
requests = [
|
requests = [
|
||||||
EngineCoreRequest(
|
EngineCoreRequest(
|
||||||
request_id=f"request-{idx}",
|
request_id=f"request-{idx}-int",
|
||||||
|
external_req_id=f"request-{idx}",
|
||||||
prompt_token_ids=prompt_tokens,
|
prompt_token_ids=prompt_tokens,
|
||||||
mm_features=None,
|
mm_features=None,
|
||||||
eos_token_id=None,
|
eos_token_id=None,
|
||||||
@ -83,6 +83,11 @@ def test_incremental_detokenization(
|
|||||||
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
|
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
engine_core = MockEngineCore(
|
||||||
|
tokens_list=dummy_test_vectors.generation_tokens,
|
||||||
|
request_ids=[req.request_id for req in requests],
|
||||||
|
)
|
||||||
|
|
||||||
# Add requests to the detokenizer.
|
# Add requests to the detokenizer.
|
||||||
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
|
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
|
||||||
output_processor.add_request(request, prompt)
|
output_processor.add_request(request, prompt)
|
||||||
@ -438,15 +443,6 @@ def test_logprobs_processor(
|
|||||||
dummy_test_vectors,
|
dummy_test_vectors,
|
||||||
):
|
):
|
||||||
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
|
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
|
||||||
engine_core = MockEngineCore(
|
|
||||||
tokens_list=dummy_test_vectors.generation_tokens,
|
|
||||||
generated_logprobs_raw=None
|
|
||||||
if num_sample_logprobs is None
|
|
||||||
else dummy_test_vectors.generation_logprobs,
|
|
||||||
prompt_logprobs_raw=None
|
|
||||||
if num_prompt_logprobs is None
|
|
||||||
else dummy_test_vectors.prompt_logprobs,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Make N requests.
|
# Make N requests.
|
||||||
request_id_list = [
|
request_id_list = [
|
||||||
@ -454,7 +450,8 @@ def test_logprobs_processor(
|
|||||||
]
|
]
|
||||||
requests = [
|
requests = [
|
||||||
EngineCoreRequest(
|
EngineCoreRequest(
|
||||||
request_id=request_id_list[idx],
|
request_id=request_id_list[idx] + "-int",
|
||||||
|
external_req_id=request_id_list[idx],
|
||||||
prompt_token_ids=prompt_tokens,
|
prompt_token_ids=prompt_tokens,
|
||||||
mm_features=None,
|
mm_features=None,
|
||||||
eos_token_id=None,
|
eos_token_id=None,
|
||||||
@ -476,6 +473,17 @@ def test_logprobs_processor(
|
|||||||
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
|
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
engine_core = MockEngineCore(
|
||||||
|
tokens_list=dummy_test_vectors.generation_tokens,
|
||||||
|
generated_logprobs_raw=None
|
||||||
|
if num_sample_logprobs is None
|
||||||
|
else dummy_test_vectors.generation_logprobs,
|
||||||
|
prompt_logprobs_raw=None
|
||||||
|
if num_prompt_logprobs is None
|
||||||
|
else dummy_test_vectors.prompt_logprobs,
|
||||||
|
request_ids=[req.request_id for req in requests],
|
||||||
|
)
|
||||||
|
|
||||||
# Add requests to the detokenizer.
|
# Add requests to the detokenizer.
|
||||||
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
|
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
|
||||||
output_processor.add_request(request, prompt)
|
output_processor.add_request(request, prompt)
|
||||||
@ -621,19 +629,12 @@ def test_stop_token(
|
|||||||
]
|
]
|
||||||
prompt_string = dummy_test_vectors.prompt_strings[0]
|
prompt_string = dummy_test_vectors.prompt_strings[0]
|
||||||
prompt_tokens = dummy_test_vectors.prompt_tokens[0]
|
prompt_tokens = dummy_test_vectors.prompt_tokens[0]
|
||||||
engine_core = MockEngineCore(
|
|
||||||
tokens_list=[generation_tokens],
|
|
||||||
generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
|
|
||||||
prompt_logprobs_raw=None,
|
|
||||||
eos_token_id=eos_token_id,
|
|
||||||
stop_token_ids=stop_token_ids,
|
|
||||||
ignore_eos=ignore_eos,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Make request.
|
# Make request.
|
||||||
request_id = "request-0"
|
request_id = "request-0"
|
||||||
request = EngineCoreRequest(
|
request = EngineCoreRequest(
|
||||||
request_id=request_id,
|
request_id=request_id,
|
||||||
|
external_req_id=request_id + "-ext",
|
||||||
prompt_token_ids=prompt_tokens,
|
prompt_token_ids=prompt_tokens,
|
||||||
mm_features=None,
|
mm_features=None,
|
||||||
eos_token_id=eos_token_id,
|
eos_token_id=eos_token_id,
|
||||||
@ -655,6 +656,16 @@ def test_stop_token(
|
|||||||
pooling_params=None,
|
pooling_params=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
engine_core = MockEngineCore(
|
||||||
|
tokens_list=[generation_tokens],
|
||||||
|
generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
|
||||||
|
prompt_logprobs_raw=None,
|
||||||
|
eos_token_id=eos_token_id,
|
||||||
|
stop_token_ids=stop_token_ids,
|
||||||
|
ignore_eos=ignore_eos,
|
||||||
|
request_ids=[request.request_id],
|
||||||
|
)
|
||||||
|
|
||||||
# Add request to the detokenizer.
|
# Add request to the detokenizer.
|
||||||
output_processor.add_request(request, prompt_string)
|
output_processor.add_request(request, prompt_string)
|
||||||
|
|
||||||
@ -720,13 +731,6 @@ def test_stop_string(
|
|||||||
dummy_test_vectors,
|
dummy_test_vectors,
|
||||||
):
|
):
|
||||||
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
|
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
|
||||||
engine_core = MockEngineCore(
|
|
||||||
tokens_list=dummy_test_vectors.generation_tokens,
|
|
||||||
generated_logprobs_raw=dummy_test_vectors.generation_logprobs
|
|
||||||
if num_sample_logprobs
|
|
||||||
else None,
|
|
||||||
prompt_logprobs_raw=None,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Make N requests.
|
# Make N requests.
|
||||||
request_id_list = [
|
request_id_list = [
|
||||||
@ -734,7 +738,8 @@ def test_stop_string(
|
|||||||
]
|
]
|
||||||
requests = [
|
requests = [
|
||||||
EngineCoreRequest(
|
EngineCoreRequest(
|
||||||
request_id=request_id_list[idx],
|
request_id=request_id_list[idx] + "-int",
|
||||||
|
external_req_id=request_id_list[idx],
|
||||||
prompt_token_ids=prompt_tokens,
|
prompt_token_ids=prompt_tokens,
|
||||||
mm_features=None,
|
mm_features=None,
|
||||||
eos_token_id=None,
|
eos_token_id=None,
|
||||||
@ -756,6 +761,15 @@ def test_stop_string(
|
|||||||
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
|
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
engine_core = MockEngineCore(
|
||||||
|
tokens_list=dummy_test_vectors.generation_tokens,
|
||||||
|
generated_logprobs_raw=dummy_test_vectors.generation_logprobs
|
||||||
|
if num_sample_logprobs
|
||||||
|
else None,
|
||||||
|
prompt_logprobs_raw=None,
|
||||||
|
request_ids=[req.request_id for req in requests],
|
||||||
|
)
|
||||||
|
|
||||||
# Add requests to the detokenizer.
|
# Add requests to the detokenizer.
|
||||||
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
|
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
|
||||||
output_processor.add_request(request, prompt)
|
output_processor.add_request(request, prompt)
|
||||||
@ -813,9 +827,12 @@ def test_stop_string(
|
|||||||
for idx, (ref_gen_str, stop_str) in enumerate(
|
for idx, (ref_gen_str, stop_str) in enumerate(
|
||||||
zip(dummy_test_vectors.generation_strings, STOP_STRINGS)
|
zip(dummy_test_vectors.generation_strings, STOP_STRINGS)
|
||||||
):
|
):
|
||||||
# Request should be aborted.
|
# Request should be aborted (check internal ID in abort list).
|
||||||
|
internal_request_id = f"request-{idx}-int"
|
||||||
|
assert internal_request_id in aborted
|
||||||
|
|
||||||
|
# Use external ID for collecting outputs
|
||||||
request_id = f"request-{idx}"
|
request_id = f"request-{idx}"
|
||||||
assert request_id in aborted
|
|
||||||
|
|
||||||
# Collected values that were generated.
|
# Collected values that were generated.
|
||||||
gen_str = gen_strings[request_id]
|
gen_str = gen_strings[request_id]
|
||||||
@ -848,13 +865,13 @@ def test_stop_string(
|
|||||||
|
|
||||||
def test_iteration_stats(dummy_test_vectors):
|
def test_iteration_stats(dummy_test_vectors):
|
||||||
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
|
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
|
||||||
engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
|
|
||||||
engine_core_timestamp = time.monotonic()
|
engine_core_timestamp = time.monotonic()
|
||||||
|
|
||||||
# Make N requests.
|
# Make N requests.
|
||||||
requests = [
|
requests = [
|
||||||
EngineCoreRequest(
|
EngineCoreRequest(
|
||||||
request_id=f"request-{idx}",
|
request_id=f"request-{idx}",
|
||||||
|
external_req_id=f"request-{idx}-ext",
|
||||||
prompt_token_ids=prompt_tokens,
|
prompt_token_ids=prompt_tokens,
|
||||||
mm_features=None,
|
mm_features=None,
|
||||||
eos_token_id=None,
|
eos_token_id=None,
|
||||||
@ -868,6 +885,11 @@ def test_iteration_stats(dummy_test_vectors):
|
|||||||
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
|
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
engine_core = MockEngineCore(
|
||||||
|
dummy_test_vectors.generation_tokens,
|
||||||
|
request_ids=[req.request_id for req in requests],
|
||||||
|
)
|
||||||
|
|
||||||
# Add all requests except one to the OutputProcessor.
|
# Add all requests except one to the OutputProcessor.
|
||||||
num_active = len(dummy_test_vectors.generation_tokens) - 1
|
num_active = len(dummy_test_vectors.generation_tokens) - 1
|
||||||
for request in requests[:num_active]:
|
for request in requests[:num_active]:
|
||||||
@ -922,7 +944,6 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
|
|||||||
output_processor = OutputProcessor(
|
output_processor = OutputProcessor(
|
||||||
dummy_test_vectors.tokenizer, log_stats=log_stats
|
dummy_test_vectors.tokenizer, log_stats=log_stats
|
||||||
)
|
)
|
||||||
engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
|
|
||||||
engine_core_timestamp = time.monotonic()
|
engine_core_timestamp = time.monotonic()
|
||||||
|
|
||||||
# Create LoRA requests
|
# Create LoRA requests
|
||||||
@ -936,7 +957,8 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
|
|||||||
lora_assignments = [lora1, lora2, None]
|
lora_assignments = [lora1, lora2, None]
|
||||||
requests = [
|
requests = [
|
||||||
EngineCoreRequest(
|
EngineCoreRequest(
|
||||||
request_id=f"request-{idx}",
|
request_id=f"request-{idx}-int",
|
||||||
|
external_req_id=f"request-{idx}",
|
||||||
prompt_token_ids=prompt_tokens,
|
prompt_token_ids=prompt_tokens,
|
||||||
mm_features=None,
|
mm_features=None,
|
||||||
eos_token_id=None,
|
eos_token_id=None,
|
||||||
@ -950,6 +972,11 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
|
|||||||
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
|
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
engine_core = MockEngineCore(
|
||||||
|
dummy_test_vectors.generation_tokens,
|
||||||
|
request_ids=[req.request_id for req in requests],
|
||||||
|
)
|
||||||
|
|
||||||
# Add all requests to the OutputProcessor
|
# Add all requests to the OutputProcessor
|
||||||
for request in requests:
|
for request in requests:
|
||||||
output_processor.add_request(request, None)
|
output_processor.add_request(request, None)
|
||||||
@ -1015,9 +1042,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
|
|||||||
outputs = EngineCoreOutputs(
|
outputs = EngineCoreOutputs(
|
||||||
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
|
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
|
||||||
)
|
)
|
||||||
# Find and mark request-0 as finished (it uses lora-1)
|
# Find and mark request-0-int as finished (it uses lora-1)
|
||||||
for output in outputs.outputs:
|
for output in outputs.outputs:
|
||||||
if output.request_id == "request-0":
|
if output.request_id == "request-0-int":
|
||||||
output.finish_reason = FinishReason.LENGTH
|
output.finish_reason = FinishReason.LENGTH
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -1040,9 +1067,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
|
|||||||
outputs = EngineCoreOutputs(
|
outputs = EngineCoreOutputs(
|
||||||
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
|
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
|
||||||
)
|
)
|
||||||
# Find and mark request-1 as finished (it uses lora-2)
|
# Find and mark request-1-int as finished (it uses lora-2)
|
||||||
for output in outputs.outputs:
|
for output in outputs.outputs:
|
||||||
if output.request_id == "request-1":
|
if output.request_id == "request-1-int":
|
||||||
output.finish_reason = FinishReason.LENGTH
|
output.finish_reason = FinishReason.LENGTH
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -1064,9 +1091,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
|
|||||||
outputs = EngineCoreOutputs(
|
outputs = EngineCoreOutputs(
|
||||||
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
|
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
|
||||||
)
|
)
|
||||||
# Find and mark request-2 as finished (it has no LoRA)
|
# Find and mark request-2-int as finished (it has no LoRA)
|
||||||
for output in outputs.outputs:
|
for output in outputs.outputs:
|
||||||
if output.request_id == "request-2":
|
if output.request_id == "request-2-int":
|
||||||
output.finish_reason = FinishReason.LENGTH
|
output.finish_reason = FinishReason.LENGTH
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -1107,7 +1134,9 @@ async def test_request_output_collector():
|
|||||||
for idx in range(NUM_REQS)
|
for idx in range(NUM_REQS)
|
||||||
]
|
]
|
||||||
|
|
||||||
collector = RequestOutputCollector(RequestOutputKind.DELTA)
|
collector = RequestOutputCollector(
|
||||||
|
RequestOutputKind.DELTA, request_id="my-request-id-int"
|
||||||
|
)
|
||||||
|
|
||||||
# CASE 1: Put then get.
|
# CASE 1: Put then get.
|
||||||
outputs = make_outputs()
|
outputs = make_outputs()
|
||||||
@ -1163,7 +1192,9 @@ async def test_request_output_collector():
|
|||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_cumulative_output_collector_n():
|
async def test_cumulative_output_collector_n():
|
||||||
"""Test collector correctly handles multiple outputs by index."""
|
"""Test collector correctly handles multiple outputs by index."""
|
||||||
collector = RequestOutputCollector(RequestOutputKind.CUMULATIVE)
|
collector = RequestOutputCollector(
|
||||||
|
RequestOutputKind.CUMULATIVE, request_id="my-request-id-int"
|
||||||
|
)
|
||||||
outputs = [
|
outputs = [
|
||||||
RequestOutput(
|
RequestOutput(
|
||||||
request_id="my-request-id",
|
request_id="my-request-id",
|
||||||
@ -1242,11 +1273,13 @@ async def test_cumulative_output_collector_n():
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("runner", ["generate", "pooling"])
|
@pytest.mark.parametrize("runner", ["generate", "pooling"])
|
||||||
def test_abort_requests(runner: str, dummy_test_vectors):
|
@pytest.mark.parametrize("abort_by", ["internal", "external"])
|
||||||
|
def test_abort_requests(runner: str, abort_by: str, dummy_test_vectors):
|
||||||
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
|
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
|
||||||
requests = [
|
requests = [
|
||||||
EngineCoreRequest(
|
EngineCoreRequest(
|
||||||
request_id=f"request-{idx}",
|
request_id=f"request-{idx}",
|
||||||
|
external_req_id=f"external-{idx}",
|
||||||
prompt_token_ids=prompt_tokens,
|
prompt_token_ids=prompt_tokens,
|
||||||
mm_features=None,
|
mm_features=None,
|
||||||
eos_token_id=None,
|
eos_token_id=None,
|
||||||
@ -1265,8 +1298,13 @@ def test_abort_requests(runner: str, dummy_test_vectors):
|
|||||||
output_kind = request.sampling_params.output_kind
|
output_kind = request.sampling_params.output_kind
|
||||||
else:
|
else:
|
||||||
output_kind = request.pooling_params.output_kind
|
output_kind = request.pooling_params.output_kind
|
||||||
queue = RequestOutputCollector(output_kind=output_kind)
|
queue = RequestOutputCollector(
|
||||||
|
output_kind=output_kind, request_id=request.request_id
|
||||||
|
)
|
||||||
output_processor.add_request(request, None, queue=queue)
|
output_processor.add_request(request, None, queue=queue)
|
||||||
|
|
||||||
for request in requests:
|
for request in requests:
|
||||||
output_processor.abort_requests([request.request_id])
|
if abort_by == "internal":
|
||||||
|
output_processor.abort_requests([request.request_id], internal=True)
|
||||||
|
else:
|
||||||
|
output_processor.abort_requests([request.external_req_id], internal=False)
|
||||||
|
|||||||
@ -4,11 +4,12 @@
|
|||||||
from vllm import SamplingParams
|
from vllm import SamplingParams
|
||||||
from vllm.outputs import CompletionOutput
|
from vllm.outputs import CompletionOutput
|
||||||
from vllm.sampling_params import RequestOutputKind
|
from vllm.sampling_params import RequestOutputKind
|
||||||
|
from vllm.v1.engine import EngineCoreRequest
|
||||||
from vllm.v1.engine.parallel_sampling import ParentRequest
|
from vllm.v1.engine.parallel_sampling import ParentRequest
|
||||||
|
|
||||||
|
|
||||||
def test_parent_request_to_output_stream() -> None:
|
def test_parent_request_to_output_stream() -> None:
|
||||||
parent_request = ParentRequest("parent_id", SamplingParams(n=2))
|
parent_request = ParentRequest(make_request(SamplingParams(n=2)))
|
||||||
parent_request.child_requests = {"child_id_0", "child_id_1"}
|
parent_request.child_requests = {"child_id_0", "child_id_1"}
|
||||||
output_0 = CompletionOutput(
|
output_0 = CompletionOutput(
|
||||||
index=0, text="child 0", token_ids=[], cumulative_logprob=None, logprobs=None
|
index=0, text="child 0", token_ids=[], cumulative_logprob=None, logprobs=None
|
||||||
@ -17,51 +18,31 @@ def test_parent_request_to_output_stream() -> None:
|
|||||||
index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
|
index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
|
||||||
)
|
)
|
||||||
# Request not finished
|
# Request not finished
|
||||||
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
|
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
|
||||||
"child_id_0", output_0
|
assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
|
||||||
)
|
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
|
||||||
assert ("parent_id", [output_1], False) == parent_request.get_outputs(
|
assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
|
||||||
"child_id_1", output_1
|
|
||||||
)
|
|
||||||
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
|
|
||||||
"child_id_0", output_0
|
|
||||||
)
|
|
||||||
assert ("parent_id", [output_1], False) == parent_request.get_outputs(
|
|
||||||
"child_id_1", output_1
|
|
||||||
)
|
|
||||||
|
|
||||||
# output_1 finished
|
# output_1 finished
|
||||||
output_1.finish_reason = "ended"
|
output_1.finish_reason = "ended"
|
||||||
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
|
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
|
||||||
"child_id_0", output_0
|
assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
|
||||||
)
|
|
||||||
assert ("parent_id", [output_1], False) == parent_request.get_outputs(
|
|
||||||
"child_id_1", output_1
|
|
||||||
)
|
|
||||||
# Finished output_1 had already returned, DO NOT returned again
|
# Finished output_1 had already returned, DO NOT returned again
|
||||||
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
|
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
|
||||||
"child_id_0", output_0
|
assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
|
||||||
)
|
|
||||||
assert parent_request.get_outputs("child_id_1", output_1) == (
|
|
||||||
"parent_id",
|
|
||||||
[],
|
|
||||||
False,
|
|
||||||
)
|
|
||||||
|
|
||||||
# output_0 finished
|
# output_0 finished
|
||||||
output_0.finish_reason = "ended"
|
output_0.finish_reason = "ended"
|
||||||
assert ("parent_id", [output_0], True) == parent_request.get_outputs(
|
assert ([output_0], True) == parent_request.get_outputs("child_id_0", output_0)
|
||||||
"child_id_0", output_0
|
assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
|
||||||
)
|
|
||||||
assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
|
|
||||||
# Finished output_0 had already returned, DO NOT returned again
|
# Finished output_0 had already returned, DO NOT returned again
|
||||||
assert parent_request.get_outputs("child_id_0", output_0) == ("parent_id", [], True)
|
assert parent_request.get_outputs("child_id_0", output_0) == ([], True)
|
||||||
assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
|
assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
|
||||||
|
|
||||||
|
|
||||||
def test_parent_request_to_output_final_only() -> None:
|
def test_parent_request_to_output_final_only() -> None:
|
||||||
parent_request = ParentRequest(
|
parent_request = ParentRequest(
|
||||||
"parent_id", SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY)
|
make_request(SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY))
|
||||||
)
|
)
|
||||||
parent_request.child_requests = {"child_id_0", "child_id_1"}
|
parent_request.child_requests = {"child_id_0", "child_id_1"}
|
||||||
output_0 = CompletionOutput(
|
output_0 = CompletionOutput(
|
||||||
@ -71,33 +52,33 @@ def test_parent_request_to_output_final_only() -> None:
|
|||||||
index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
|
index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
|
||||||
)
|
)
|
||||||
# Request not finished, return nothing
|
# Request not finished, return nothing
|
||||||
assert parent_request.get_outputs("child_id_0", output_0) == (
|
assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
|
||||||
"parent_id",
|
assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
|
||||||
[],
|
|
||||||
False,
|
|
||||||
)
|
|
||||||
assert parent_request.get_outputs("child_id_1", output_1) == (
|
|
||||||
"parent_id",
|
|
||||||
[],
|
|
||||||
False,
|
|
||||||
)
|
|
||||||
# output_1 finished, but outputs won't be returned until all child requests finished
|
# output_1 finished, but outputs won't be returned until all child requests finished
|
||||||
output_1.finish_reason = "ended"
|
output_1.finish_reason = "ended"
|
||||||
assert parent_request.get_outputs("child_id_0", output_0) == (
|
assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
|
||||||
"parent_id",
|
assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
|
||||||
[],
|
|
||||||
False,
|
|
||||||
)
|
|
||||||
assert parent_request.get_outputs("child_id_1", output_1) == (
|
|
||||||
"parent_id",
|
|
||||||
[],
|
|
||||||
False,
|
|
||||||
)
|
|
||||||
# output_0 finished, as all child requests finished, the output would be returned
|
# output_0 finished, as all child requests finished, the output would be returned
|
||||||
output_0.finish_reason = "ended"
|
output_0.finish_reason = "ended"
|
||||||
assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
|
assert ([output_0, output_1], True) == parent_request.get_outputs(
|
||||||
"child_id_0", output_0
|
"child_id_0", output_0
|
||||||
)
|
)
|
||||||
assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
|
assert ([output_0, output_1], True) == parent_request.get_outputs(
|
||||||
"child_id_1", output_1
|
"child_id_1", output_1
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def make_request(sampling_params: SamplingParams) -> EngineCoreRequest:
|
||||||
|
return EngineCoreRequest(
|
||||||
|
request_id="parent_id",
|
||||||
|
external_req_id="ext_parent_id",
|
||||||
|
prompt_token_ids=None,
|
||||||
|
mm_features=None,
|
||||||
|
sampling_params=sampling_params,
|
||||||
|
pooling_params=None,
|
||||||
|
eos_token_id=None,
|
||||||
|
arrival_time=0.0,
|
||||||
|
lora_request=None,
|
||||||
|
cache_salt=None,
|
||||||
|
data_parallel_rank=None,
|
||||||
|
)
|
||||||
|
|||||||
@ -6,6 +6,7 @@ import pytest
|
|||||||
from vllm.assets.image import ImageAsset
|
from vllm.assets.image import ImageAsset
|
||||||
from vllm.assets.video import VideoAsset
|
from vllm.assets.video import VideoAsset
|
||||||
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
|
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
|
||||||
|
from vllm.multimodal import MultiModalUUIDDict
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
from vllm.v1.engine import input_processor as input_processor_mod
|
from vllm.v1.engine import input_processor as input_processor_mod
|
||||||
from vllm.v1.engine.input_processor import InputProcessor
|
from vllm.v1.engine.input_processor import InputProcessor
|
||||||
@ -166,7 +167,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
|
|||||||
monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False
|
monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False
|
||||||
)
|
)
|
||||||
|
|
||||||
captured: dict[str, object] = {}
|
captured: dict[str, MultiModalUUIDDict] = {}
|
||||||
|
|
||||||
def fake_preprocess(
|
def fake_preprocess(
|
||||||
prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None
|
prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None
|
||||||
@ -196,7 +197,16 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Expect request-id-based overrides are passed through
|
# Expect request-id-based overrides are passed through
|
||||||
assert captured["mm_uuids"] == {
|
mm_uuids = captured["mm_uuids"]
|
||||||
"image": [f"{request_id}-image-0", f"{request_id}-image-1"],
|
assert set(mm_uuids.keys()) == {"image", "video"}
|
||||||
"video": [f"{request_id}-video-0"],
|
assert len(mm_uuids["image"]) == 2
|
||||||
}
|
assert len(mm_uuids["video"]) == 1
|
||||||
|
assert mm_uuids["image"][0].startswith(f"{request_id}-image-") and mm_uuids[
|
||||||
|
"image"
|
||||||
|
][0].endswith("-0")
|
||||||
|
assert mm_uuids["image"][1].startswith(f"{request_id}-image-") and mm_uuids[
|
||||||
|
"image"
|
||||||
|
][1].endswith("-1")
|
||||||
|
assert mm_uuids["video"][0].startswith(f"{request_id}-video-") and mm_uuids[
|
||||||
|
"video"
|
||||||
|
][0].endswith("-0")
|
||||||
|
|||||||
@ -343,6 +343,7 @@ class MockEngineCore:
|
|||||||
eos_token_id: int | None = None,
|
eos_token_id: int | None = None,
|
||||||
stop_token_ids: list[int] | None = None,
|
stop_token_ids: list[int] | None = None,
|
||||||
ignore_eos: bool = False,
|
ignore_eos: bool = False,
|
||||||
|
request_ids: list[str] | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
self.num_requests = len(tokens_list)
|
self.num_requests = len(tokens_list)
|
||||||
self.tokens_list = tokens_list
|
self.tokens_list = tokens_list
|
||||||
@ -355,6 +356,11 @@ class MockEngineCore:
|
|||||||
self.eos_token_id = eos_token_id
|
self.eos_token_id = eos_token_id
|
||||||
self.stop_token_ids = stop_token_ids
|
self.stop_token_ids = stop_token_ids
|
||||||
self.ignore_eos = ignore_eos
|
self.ignore_eos = ignore_eos
|
||||||
|
self.request_ids = (
|
||||||
|
request_ids
|
||||||
|
if request_ids is not None
|
||||||
|
else [f"request-{i}" for i in range(self.num_requests)]
|
||||||
|
)
|
||||||
|
|
||||||
def get_outputs(self) -> list[EngineCoreOutput]:
|
def get_outputs(self) -> list[EngineCoreOutput]:
|
||||||
do_logprobs = self.do_logprobs
|
do_logprobs = self.do_logprobs
|
||||||
@ -386,7 +392,7 @@ class MockEngineCore:
|
|||||||
prompt_logprobs = None
|
prompt_logprobs = None
|
||||||
new_token_id = token_ids[token_idx]
|
new_token_id = token_ids[token_idx]
|
||||||
output = EngineCoreOutput(
|
output = EngineCoreOutput(
|
||||||
request_id=f"request-{req_idx}",
|
request_id=self.request_ids[req_idx],
|
||||||
new_token_ids=[new_token_id],
|
new_token_ids=[new_token_id],
|
||||||
new_logprobs=logprobs,
|
new_logprobs=logprobs,
|
||||||
new_prompt_logprobs_tensors=prompt_logprobs,
|
new_prompt_logprobs_tensors=prompt_logprobs,
|
||||||
|
|||||||
@ -8,7 +8,7 @@ import pytest
|
|||||||
import pytest_asyncio
|
import pytest_asyncio
|
||||||
|
|
||||||
from tests.utils import RemoteOpenAIServer
|
from tests.utils import RemoteOpenAIServer
|
||||||
from vllm.multimodal.utils import encode_image_base64
|
from vllm.multimodal.utils import encode_image_url
|
||||||
|
|
||||||
# Use a small vision model for testing
|
# Use a small vision model for testing
|
||||||
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
|
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||||
@ -52,9 +52,9 @@ async def client(image_server):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
def url_encoded_image(local_asset_server) -> dict[str, str]:
|
||||||
return {
|
return {
|
||||||
image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
|
image_url: encode_image_url(local_asset_server.get_image_asset(image_url))
|
||||||
for image_url in TEST_IMAGE_ASSETS
|
for image_url in TEST_IMAGE_ASSETS
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -95,7 +95,7 @@ async def test_single_chat_session_image_base64encoded(
|
|||||||
client: openai.AsyncOpenAI,
|
client: openai.AsyncOpenAI,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
raw_image_url: str,
|
raw_image_url: str,
|
||||||
base64_encoded_image: dict[str, str],
|
url_encoded_image: dict[str, str],
|
||||||
):
|
):
|
||||||
content_text = "What's in this image?"
|
content_text = "What's in this image?"
|
||||||
messages = [
|
messages = [
|
||||||
@ -104,7 +104,7 @@ async def test_single_chat_session_image_base64encoded(
|
|||||||
"content": [
|
"content": [
|
||||||
{
|
{
|
||||||
"type": "input_image",
|
"type": "input_image",
|
||||||
"image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", # noqa: E501
|
"image_url": url_encoded_image[raw_image_url],
|
||||||
"detail": "auto",
|
"detail": "auto",
|
||||||
},
|
},
|
||||||
{"type": "input_text", "text": content_text},
|
{"type": "input_text", "text": content_text},
|
||||||
|
|||||||
@ -9,7 +9,7 @@ from PIL import Image
|
|||||||
from vllm import LLM, EngineArgs, SamplingParams
|
from vllm import LLM, EngineArgs, SamplingParams
|
||||||
from vllm.assets.image import ImageAsset
|
from vllm.assets.image import ImageAsset
|
||||||
from vllm.config import KVTransferConfig
|
from vllm.config import KVTransferConfig
|
||||||
from vllm.multimodal.utils import encode_image_base64
|
from vllm.multimodal.utils import encode_image_url
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
MODEL_NAME = "RedHatAI/Qwen2.5-VL-3B-Instruct-quantized.w8a8"
|
MODEL_NAME = "RedHatAI/Qwen2.5-VL-3B-Instruct-quantized.w8a8"
|
||||||
@ -74,7 +74,7 @@ def process_prompt(processor, llm: LLM, question: str, image_urls: list[Image]):
|
|||||||
placeholders = [
|
placeholders = [
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "image_url",
|
||||||
"image_url": {"url": f"data:image;base64,{encode_image_base64(image_pil)}"},
|
"image_url": {"url": encode_image_url(image_pil)},
|
||||||
}
|
}
|
||||||
for image_pil in image_urls
|
for image_pil in image_urls
|
||||||
]
|
]
|
||||||
|
|||||||
@ -41,10 +41,13 @@ from vllm.distributed.kv_transfer.kv_transfer_state import (
|
|||||||
has_kv_transfer_group,
|
has_kv_transfer_group,
|
||||||
)
|
)
|
||||||
from vllm.forward_context import ForwardContext
|
from vllm.forward_context import ForwardContext
|
||||||
|
from vllm.outputs import RequestOutput
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.platforms.interface import Platform
|
from vllm.platforms.interface import Platform
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
|
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
|
||||||
|
from vllm.v1.engine import EngineCoreRequest
|
||||||
|
from vllm.v1.engine.output_processor import OutputProcessor
|
||||||
from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
|
from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
|
||||||
from vllm.v1.request import RequestStatus
|
from vllm.v1.request import RequestStatus
|
||||||
|
|
||||||
@ -1265,6 +1268,22 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
|
|||||||
run_test_and_cleanup()
|
run_test_and_cleanup()
|
||||||
|
|
||||||
|
|
||||||
|
class RequestIdMapper:
|
||||||
|
"""Helper class to map external request IDs to internal request IDs."""
|
||||||
|
|
||||||
|
def __init__(self, output_processor: OutputProcessor):
|
||||||
|
self.req_id_mapping: dict[str, str] = {}
|
||||||
|
self.original_add_request = output_processor.add_request
|
||||||
|
output_processor.add_request = self._add_request
|
||||||
|
|
||||||
|
def _add_request(self, request: EngineCoreRequest, *args, **kwargs):
|
||||||
|
self.req_id_mapping[request.external_req_id] = request.request_id
|
||||||
|
return self.original_add_request(request, *args, **kwargs)
|
||||||
|
|
||||||
|
def __call__(self, external_req_id: str) -> str:
|
||||||
|
return self.req_id_mapping[external_req_id]
|
||||||
|
|
||||||
|
|
||||||
def _run_abort_timeout_test(llm: LLM, timeout: int):
|
def _run_abort_timeout_test(llm: LLM, timeout: int):
|
||||||
"""Helper function to run the abort timeout test logic."""
|
"""Helper function to run the abort timeout test logic."""
|
||||||
remote_prefill_opts = {
|
remote_prefill_opts = {
|
||||||
@ -1286,24 +1305,34 @@ def _run_abort_timeout_test(llm: LLM, timeout: int):
|
|||||||
0
|
0
|
||||||
].req_to_blocks
|
].req_to_blocks
|
||||||
|
|
||||||
|
id_mapper = RequestIdMapper(llm.llm_engine.output_processor)
|
||||||
|
|
||||||
|
def req_id(outputs: list[RequestOutput]) -> str:
|
||||||
|
assert len(outputs) == 1
|
||||||
|
return id_mapper(outputs[0].request_id)
|
||||||
|
|
||||||
padding = "Just making this request a little longer so that we're sure "
|
padding = "Just making this request a little longer so that we're sure "
|
||||||
"we're not hitting the small-request lower bound beneath which we don't "
|
"we're not hitting the small-request lower bound beneath which we don't "
|
||||||
"actually trigger the whole kv transfer, but rather just recompute the "
|
"actually trigger the whole kv transfer, but rather just recompute the "
|
||||||
"blocks on D."
|
"blocks on D."
|
||||||
_ = llm.generate([f"What is the capital of Japan? {padding}"], sampling_params)
|
req0_id = req_id(
|
||||||
|
llm.generate([f"What is the capital of Japan? {padding}"], sampling_params)
|
||||||
|
)
|
||||||
|
|
||||||
# Request finished but not freed
|
# Request finished but not freed
|
||||||
assert "0" in scheduler.finished_req_ids and "0" in req_to_blocks
|
assert req0_id in scheduler.finished_req_ids and req0_id in req_to_blocks
|
||||||
# Some other request, 0 still not freed
|
# Some other request, 0 still not freed
|
||||||
_ = llm.generate([f"What is the capital of Italy? {padding}"], sampling_params)
|
req1_id = req_id(
|
||||||
assert "0" in req_to_blocks
|
llm.generate([f"What is the capital of Italy? {padding}"], sampling_params)
|
||||||
assert "1" in scheduler.finished_req_ids and "1" in req_to_blocks
|
)
|
||||||
|
assert req0_id in req_to_blocks
|
||||||
|
assert req1_id in scheduler.finished_req_ids and req1_id in req_to_blocks
|
||||||
|
|
||||||
# Wait for timeout and trigger another scheduler loop
|
# Wait for timeout and trigger another scheduler loop
|
||||||
time.sleep(timeout)
|
time.sleep(timeout)
|
||||||
_ = llm.generate([f"What is the capital of France? {padding}"], sampling_params)
|
_ = llm.generate([f"What is the capital of France? {padding}"], sampling_params)
|
||||||
# Request-0 times out and is cleared!
|
# Request-0 times out and is cleared!
|
||||||
assert "0" not in req_to_blocks
|
assert req0_id not in req_to_blocks
|
||||||
# Need to shutdown the background thread to release NIXL side channel port
|
# Need to shutdown the background thread to release NIXL side channel port
|
||||||
llm.llm_engine.engine_core.shutdown()
|
llm.llm_engine.engine_core.shutdown()
|
||||||
|
|
||||||
|
|||||||
@ -4,7 +4,7 @@
|
|||||||
import openai
|
import openai
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.multimodal.utils import encode_image_base64
|
from vllm.multimodal.utils import encode_image_url
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
from ...entrypoints.openai.test_vision import TEST_IMAGE_ASSETS
|
from ...entrypoints.openai.test_vision import TEST_IMAGE_ASSETS
|
||||||
@ -12,11 +12,9 @@ from ...utils import RemoteOpenAIServer
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
def url_encoded_image(local_asset_server) -> dict[str, str]:
|
||||||
return {
|
return {
|
||||||
image_asset: encode_image_base64(
|
image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset))
|
||||||
local_asset_server.get_image_asset(image_asset)
|
|
||||||
)
|
|
||||||
for image_asset in TEST_IMAGE_ASSETS
|
for image_asset in TEST_IMAGE_ASSETS
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -24,19 +22,16 @@ def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
|||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.skipif(not current_platform.is_tpu(), reason="This test needs a TPU")
|
@pytest.mark.skipif(not current_platform.is_tpu(), reason="This test needs a TPU")
|
||||||
@pytest.mark.parametrize("model_name", ["llava-hf/llava-1.5-7b-hf"])
|
@pytest.mark.parametrize("model_name", ["llava-hf/llava-1.5-7b-hf"])
|
||||||
async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, str]):
|
async def test_basic_vision(model_name: str, url_encoded_image: dict[str, str]):
|
||||||
pytest.skip("Skip this test until it's fixed.")
|
pytest.skip("Skip this test until it's fixed.")
|
||||||
|
|
||||||
def whats_in_this_image_msg(b64):
|
def whats_in_this_image_msg(url):
|
||||||
return [
|
return [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": [
|
"content": [
|
||||||
{"type": "text", "text": "What's in this image?"},
|
{"type": "text", "text": "What's in this image?"},
|
||||||
{
|
{"type": "image_url", "image_url": {"url": url}},
|
||||||
"type": "image_url",
|
|
||||||
"image_url": {"url": f"data:image/jpeg;base64,{b64}"},
|
|
||||||
},
|
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@ -63,14 +58,14 @@ async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, str
|
|||||||
|
|
||||||
# Other requests now should be much faster
|
# Other requests now should be much faster
|
||||||
for image_url in TEST_IMAGE_ASSETS:
|
for image_url in TEST_IMAGE_ASSETS:
|
||||||
image_base64 = base64_encoded_image[image_url]
|
image_url = url_encoded_image[image_url]
|
||||||
chat_completion_from_base64 = await client.chat.completions.create(
|
chat_completion_from_url = await client.chat.completions.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
messages=whats_in_this_image_msg(image_base64),
|
messages=whats_in_this_image_msg(image_url),
|
||||||
max_completion_tokens=24,
|
max_completion_tokens=24,
|
||||||
temperature=0.0,
|
temperature=0.0,
|
||||||
)
|
)
|
||||||
result = chat_completion_from_base64
|
result = chat_completion_from_url
|
||||||
assert result
|
assert result
|
||||||
choice = result.choices[0]
|
choice = result.choices[0]
|
||||||
assert choice.finish_reason == "length"
|
assert choice.finish_reason == "length"
|
||||||
|
|||||||
@ -380,6 +380,31 @@ def _rocm_aiter_gemm_a8w8_fake(
|
|||||||
return Y
|
return Y
|
||||||
|
|
||||||
|
|
||||||
|
def _rocm_aiter_triton_gemm_a8w8_blockscale_impl(
|
||||||
|
A: torch.Tensor,
|
||||||
|
B: torch.Tensor,
|
||||||
|
As: torch.Tensor,
|
||||||
|
Bs: torch.Tensor,
|
||||||
|
output_dtype: torch.dtype = torch.float16,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
from aiter.ops.triton.gemm_a8w8_blockscale import gemm_a8w8_blockscale
|
||||||
|
|
||||||
|
return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def _rocm_aiter_triton_gemm_a8w8_blockscale_fake(
|
||||||
|
A: torch.Tensor,
|
||||||
|
B: torch.Tensor,
|
||||||
|
As: torch.Tensor,
|
||||||
|
Bs: torch.Tensor,
|
||||||
|
output_dtype: torch.dtype = torch.float16,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
m = A.shape[0]
|
||||||
|
n = B.shape[0]
|
||||||
|
Y = torch.empty(m, n, dtype=output_dtype, device=A.device)
|
||||||
|
return Y
|
||||||
|
|
||||||
|
|
||||||
def _rocm_aiter_gemm_a8w8_blockscale_impl(
|
def _rocm_aiter_gemm_a8w8_blockscale_impl(
|
||||||
A: torch.Tensor,
|
A: torch.Tensor,
|
||||||
B: torch.Tensor,
|
B: torch.Tensor,
|
||||||
@ -964,6 +989,12 @@ class rocm_aiter_ops:
|
|||||||
dispatch_key=current_platform.dispatch_key,
|
dispatch_key=current_platform.dispatch_key,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
direct_register_custom_op(
|
||||||
|
op_name="rocm_aiter_triton_gemm_a8w8_blockscale",
|
||||||
|
op_func=_rocm_aiter_triton_gemm_a8w8_blockscale_impl,
|
||||||
|
fake_impl=_rocm_aiter_triton_gemm_a8w8_blockscale_fake,
|
||||||
|
)
|
||||||
|
|
||||||
direct_register_custom_op(
|
direct_register_custom_op(
|
||||||
op_name="rocm_aiter_gemm_a8w8_blockscale",
|
op_name="rocm_aiter_gemm_a8w8_blockscale",
|
||||||
op_func=_rocm_aiter_gemm_a8w8_blockscale_impl,
|
op_func=_rocm_aiter_gemm_a8w8_blockscale_impl,
|
||||||
@ -1102,6 +1133,19 @@ class rocm_aiter_ops:
|
|||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
return torch.ops.vllm.rocm_aiter_gemm_a8w8(A, B, As, Bs, bias, output_dtype)
|
return torch.ops.vllm.rocm_aiter_gemm_a8w8(A, B, As, Bs, bias, output_dtype)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def triton_gemm_a8w8_blockscale(
|
||||||
|
A: torch.Tensor,
|
||||||
|
B: torch.Tensor,
|
||||||
|
As: torch.Tensor,
|
||||||
|
Bs: torch.Tensor,
|
||||||
|
block_size: list[int],
|
||||||
|
output_dtype: torch.dtype = torch.float16,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
return torch.ops.vllm.rocm_aiter_triton_gemm_a8w8_blockscale(
|
||||||
|
A, B, As, Bs, output_dtype
|
||||||
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def gemm_a8w8_blockscale(
|
def gemm_a8w8_blockscale(
|
||||||
A: torch.Tensor,
|
A: torch.Tensor,
|
||||||
@ -1373,19 +1417,6 @@ class rocm_aiter_ops:
|
|||||||
config=config,
|
config=config,
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def triton_gemm_a8w8_blockscale(
|
|
||||||
A: torch.Tensor,
|
|
||||||
B: torch.Tensor,
|
|
||||||
As: torch.Tensor,
|
|
||||||
Bs: torch.Tensor,
|
|
||||||
block_size: list[int],
|
|
||||||
output_dtype: torch.dtype = torch.float16,
|
|
||||||
) -> torch.Tensor:
|
|
||||||
from aiter.ops.triton.gemm_a8w8_blockscale import gemm_a8w8_blockscale
|
|
||||||
|
|
||||||
return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def group_fp8_quant(
|
def group_fp8_quant(
|
||||||
input_2d: torch.Tensor,
|
input_2d: torch.Tensor,
|
||||||
|
|||||||
@ -2328,18 +2328,6 @@ def concat_and_cache_mla(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def copy_blocks(
|
|
||||||
key_caches: list[torch.Tensor],
|
|
||||||
value_caches: list[torch.Tensor],
|
|
||||||
block_mapping: torch.Tensor,
|
|
||||||
) -> None:
|
|
||||||
torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)
|
|
||||||
|
|
||||||
|
|
||||||
def copy_blocks_mla(kv_caches: list[torch.Tensor], block_mapping: torch.Tensor) -> None:
|
|
||||||
torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping)
|
|
||||||
|
|
||||||
|
|
||||||
def swap_blocks(
|
def swap_blocks(
|
||||||
src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor
|
src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|||||||
@ -383,18 +383,6 @@ class ipex_ops:
|
|||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def copy_blocks(
|
|
||||||
key_caches: list[torch.Tensor],
|
|
||||||
value_caches: list[torch.Tensor],
|
|
||||||
block_mapping: torch.Tensor,
|
|
||||||
) -> None:
|
|
||||||
torch.xpu.copy_blocks( # type: ignore
|
|
||||||
key_caches,
|
|
||||||
value_caches,
|
|
||||||
block_mapping,
|
|
||||||
)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def swap_blocks(
|
def swap_blocks(
|
||||||
src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor
|
src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor
|
||||||
|
|||||||
@ -136,7 +136,7 @@ class MMEncoderAttention(CustomOp):
|
|||||||
cu_seqlens=cu_seqlens,
|
cu_seqlens=cu_seqlens,
|
||||||
)
|
)
|
||||||
if is_reshaped:
|
if is_reshaped:
|
||||||
output = output.view(bsz, q_len, -1)
|
output = output.reshape(bsz, q_len, -1)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
def _forward_fa(
|
def _forward_fa(
|
||||||
@ -174,7 +174,7 @@ class MMEncoderAttention(CustomOp):
|
|||||||
fa_version=self._fa_version,
|
fa_version=self._fa_version,
|
||||||
)
|
)
|
||||||
if is_reshaped:
|
if is_reshaped:
|
||||||
output = output.view(bsz, q_len, -1)
|
output = output.reshape(bsz, q_len, -1)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
def forward_native(
|
def forward_native(
|
||||||
|
|||||||
@ -1847,7 +1847,6 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
|
|||||||
random_seed=args.seed,
|
random_seed=args.seed,
|
||||||
dataset_path=args.dataset_path,
|
dataset_path=args.dataset_path,
|
||||||
disable_shuffle=args.disable_shuffle,
|
disable_shuffle=args.disable_shuffle,
|
||||||
prefix_len=args.common_prefix_len,
|
|
||||||
).sample(
|
).sample(
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
num_requests=args.num_prompts,
|
num_requests=args.num_prompts,
|
||||||
|
|||||||
@ -1281,12 +1281,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
|||||||
help="Repetition penalty sampling parameter. Only has effect on "
|
help="Repetition penalty sampling parameter. Only has effect on "
|
||||||
"openai-compatible backends.",
|
"openai-compatible backends.",
|
||||||
)
|
)
|
||||||
sampling_group.add_argument(
|
|
||||||
"--common-prefix-len",
|
|
||||||
type=int,
|
|
||||||
default=None,
|
|
||||||
help="Common prefix length shared by all prompts (used by random dataset)",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--served-model-name",
|
"--served-model-name",
|
||||||
|
|||||||
@ -172,7 +172,10 @@ class ModelConfig:
|
|||||||
format. Examples:\n
|
format. Examples:\n
|
||||||
- 1k -> 1000\n
|
- 1k -> 1000\n
|
||||||
- 1K -> 1024\n
|
- 1K -> 1024\n
|
||||||
- 25.6k -> 25,600"""
|
- 25.6k -> 25,600\n
|
||||||
|
- -1 or 'auto' -> Automatically choose the maximum model length that fits in
|
||||||
|
GPU memory. This will use the model's maximum context length if it fits,
|
||||||
|
otherwise it will find the largest length that can be accommodated."""
|
||||||
spec_target_max_model_len: int | None = None
|
spec_target_max_model_len: int | None = None
|
||||||
"""Specify the maximum length for spec decoding draft models."""
|
"""Specify the maximum length for spec decoding draft models."""
|
||||||
quantization: QuantizationMethods | str | None = None
|
quantization: QuantizationMethods | str | None = None
|
||||||
@ -1094,11 +1097,10 @@ class ModelConfig:
|
|||||||
# The size of inputs_embeds is usually identical to the size
|
# The size of inputs_embeds is usually identical to the size
|
||||||
# of the hidden states, however there are exceptions, such as
|
# of the hidden states, however there are exceptions, such as
|
||||||
# embedding models like CLIP and SigLIP
|
# embedding models like CLIP and SigLIP
|
||||||
for target_attr in ("projection_dim", "projection_size"):
|
names = ("projection_dim", "projection_size")
|
||||||
if hasattr(self.hf_text_config, target_attr):
|
return getattr_iter(
|
||||||
return getattr(self.hf_text_config, target_attr)
|
self.hf_text_config, names, default_factory=self.get_hidden_size
|
||||||
|
)
|
||||||
return self.get_hidden_size()
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_deepseek_mla(self) -> bool:
|
def is_deepseek_mla(self) -> bool:
|
||||||
@ -1231,14 +1233,12 @@ class ModelConfig:
|
|||||||
# For ChatGLM:
|
# For ChatGLM:
|
||||||
"multi_query_group_num",
|
"multi_query_group_num",
|
||||||
]
|
]
|
||||||
for attr in attributes:
|
|
||||||
num_kv_heads = getattr(self.hf_text_config, attr, None)
|
|
||||||
if num_kv_heads is not None:
|
|
||||||
return num_kv_heads
|
|
||||||
|
|
||||||
# For non-grouped-query attention models, the number of KV heads is
|
# For non-grouped-query attention models, the number of KV heads is
|
||||||
# equal to the number of attention heads.
|
# equal to the number of attention heads.
|
||||||
return self.hf_text_config.num_attention_heads
|
default_factory = lambda: self.hf_text_config.num_attention_heads
|
||||||
|
return getattr_iter(
|
||||||
|
self.hf_text_config, attributes, default_factory=default_factory
|
||||||
|
)
|
||||||
|
|
||||||
def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int:
|
def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int:
|
||||||
"""Returns the number of KV heads per GPU."""
|
"""Returns the number of KV heads per GPU."""
|
||||||
@ -1542,6 +1542,10 @@ class ModelConfig:
|
|||||||
def is_multimodal_raw_input_only_model(self) -> bool:
|
def is_multimodal_raw_input_only_model(self) -> bool:
|
||||||
return self._model_info.supports_multimodal_raw_input_only
|
return self._model_info.supports_multimodal_raw_input_only
|
||||||
|
|
||||||
|
@property
|
||||||
|
def requires_raw_input_tokens(self) -> bool:
|
||||||
|
return self._model_info.requires_raw_input_tokens
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_cross_encoder(self) -> bool:
|
def is_cross_encoder(self) -> bool:
|
||||||
return (
|
return (
|
||||||
@ -2150,9 +2154,10 @@ def _get_and_verify_max_len(
|
|||||||
if encoder_config and "max_seq_length" in encoder_config:
|
if encoder_config and "max_seq_length" in encoder_config:
|
||||||
derived_max_model_len = encoder_config["max_seq_length"]
|
derived_max_model_len = encoder_config["max_seq_length"]
|
||||||
|
|
||||||
# If the user didn't specify `max_model_len`, then use that derived from
|
# If the user didn't specify `max_model_len` or specified -1 (auto-fit),
|
||||||
# the model config as a default value.
|
# then use that derived from the model config as a default value.
|
||||||
if max_model_len is None:
|
# When -1 is specified, the engine will later auto-fit to available memory.
|
||||||
|
if max_model_len is None or max_model_len == -1:
|
||||||
# For LongRoPE, default to original_max_position_embeddings to avoid
|
# For LongRoPE, default to original_max_position_embeddings to avoid
|
||||||
# performance degradation for shorter sequences
|
# performance degradation for shorter sequences
|
||||||
if rope_parameters is not None and any(
|
if rope_parameters is not None and any(
|
||||||
|
|||||||
@ -465,6 +465,7 @@ class ParallelConfig:
|
|||||||
# Derived/runtime topology, networking, or launch details
|
# Derived/runtime topology, networking, or launch details
|
||||||
"data_parallel_rank",
|
"data_parallel_rank",
|
||||||
"data_parallel_rank_local",
|
"data_parallel_rank_local",
|
||||||
|
"data_parallel_size_local",
|
||||||
"data_parallel_backend",
|
"data_parallel_backend",
|
||||||
"data_parallel_external_lb",
|
"data_parallel_external_lb",
|
||||||
"data_parallel_hybrid_lb",
|
"data_parallel_hybrid_lb",
|
||||||
|
|||||||
@ -9,7 +9,7 @@ import inspect
|
|||||||
import json
|
import json
|
||||||
import pathlib
|
import pathlib
|
||||||
import textwrap
|
import textwrap
|
||||||
from collections.abc import Iterable, Mapping, Sequence, Set
|
from collections.abc import Callable, Iterable, Mapping, Sequence, Set
|
||||||
from dataclasses import MISSING, Field, dataclass, field, fields, is_dataclass, replace
|
from dataclasses import MISSING, Field, dataclass, field, fields, is_dataclass, replace
|
||||||
from itertools import pairwise
|
from itertools import pairwise
|
||||||
from typing import TYPE_CHECKING, Any, Protocol, TypeVar
|
from typing import TYPE_CHECKING, Any, Protocol, TypeVar
|
||||||
@ -74,7 +74,11 @@ def get_field(cls: ConfigType, name: str) -> Field:
|
|||||||
|
|
||||||
|
|
||||||
def getattr_iter(
|
def getattr_iter(
|
||||||
object: object, names: Iterable[str], default: Any, warn: bool = False
|
object: object,
|
||||||
|
names: Iterable[str],
|
||||||
|
default: Any | None = None,
|
||||||
|
default_factory: Callable[[], Any] | None = None,
|
||||||
|
warn: bool = False,
|
||||||
) -> Any:
|
) -> Any:
|
||||||
"""
|
"""
|
||||||
A helper function that retrieves an attribute from an object which may
|
A helper function that retrieves an attribute from an object which may
|
||||||
@ -96,7 +100,7 @@ def getattr_iter(
|
|||||||
names[0],
|
names[0],
|
||||||
)
|
)
|
||||||
return getattr(object, name)
|
return getattr(object, name)
|
||||||
return default
|
return default_factory() if default_factory is not None else default
|
||||||
|
|
||||||
|
|
||||||
def contains_object_print(text: str) -> bool:
|
def contains_object_print(text: str) -> bool:
|
||||||
|
|||||||
@ -408,7 +408,13 @@ class MooncakeConnectorWorker:
|
|||||||
|
|
||||||
self.engine = TransferEngine()
|
self.engine = TransferEngine()
|
||||||
self.hostname = get_ip()
|
self.hostname = get_ip()
|
||||||
ret_value = self.engine.initialize(self.hostname, "P2PHANDSHAKE", "rdma", "")
|
protocol = self.vllm_config.kv_transfer_config.kv_connector_extra_config.get( # type: ignore[union-attr]
|
||||||
|
"mooncake_protocol", "rdma"
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
"The Mooncake Transfer Engine is using %s as its protocol.", protocol
|
||||||
|
)
|
||||||
|
ret_value = self.engine.initialize(self.hostname, "P2PHANDSHAKE", protocol, "")
|
||||||
if ret_value != 0:
|
if ret_value != 0:
|
||||||
raise RuntimeError("Mooncake Transfer Engine initialization failed.")
|
raise RuntimeError("Mooncake Transfer Engine initialization failed.")
|
||||||
|
|
||||||
|
|||||||
@ -2045,13 +2045,20 @@ def _raise_unsupported_error(feature_name: str):
|
|||||||
def human_readable_int(value):
|
def human_readable_int(value):
|
||||||
"""Parse human-readable integers like '1k', '2M', etc.
|
"""Parse human-readable integers like '1k', '2M', etc.
|
||||||
Including decimal values with decimal multipliers.
|
Including decimal values with decimal multipliers.
|
||||||
|
Also accepts -1 or 'auto' as a special value for auto-detection.
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
- '1k' -> 1,000
|
- '1k' -> 1,000
|
||||||
- '1K' -> 1,024
|
- '1K' -> 1,024
|
||||||
- '25.6k' -> 25,600
|
- '25.6k' -> 25,600
|
||||||
|
- '-1' or 'auto' -> -1 (special value for auto-detection)
|
||||||
"""
|
"""
|
||||||
value = value.strip()
|
value = value.strip()
|
||||||
|
|
||||||
|
# Handle -1 or 'auto' as a special value for auto-detection
|
||||||
|
if value == "-1" or value.lower() == "auto":
|
||||||
|
return -1
|
||||||
|
|
||||||
match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
|
match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
|
||||||
if match:
|
if match:
|
||||||
decimal_multiplier = {
|
decimal_multiplier = {
|
||||||
|
|||||||
@ -1621,7 +1621,7 @@ class LLM:
|
|||||||
added_request_ids.append(request_id)
|
added_request_ids.append(request_id)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if added_request_ids:
|
if added_request_ids:
|
||||||
self.llm_engine.abort_request(added_request_ids)
|
self.llm_engine.abort_request(added_request_ids, internal=True)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
def _validate_mm_data_and_uuids(
|
def _validate_mm_data_and_uuids(
|
||||||
@ -1731,7 +1731,7 @@ class LLM:
|
|||||||
priority=priority,
|
priority=priority,
|
||||||
prompt_text=prompt_text,
|
prompt_text=prompt_text,
|
||||||
)
|
)
|
||||||
return request_id
|
return engine_request.request_id
|
||||||
|
|
||||||
def _run_engine(
|
def _run_engine(
|
||||||
self, *, use_tqdm: bool | Callable[..., tqdm] = True
|
self, *, use_tqdm: bool | Callable[..., tqdm] = True
|
||||||
|
|||||||
28
vllm/entrypoints/pooling/embed/conftest.py
Normal file
28
vllm/entrypoints/pooling/embed/conftest.py
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
"""Pytest configuration for vLLM pooling embed tests."""
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
|
||||||
|
def pytest_collection_modifyitems(config, items):
|
||||||
|
"""Configure ROCm-specific settings based on collected tests."""
|
||||||
|
if not current_platform.is_rocm():
|
||||||
|
return
|
||||||
|
|
||||||
|
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
|
||||||
|
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
|
||||||
|
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
|
||||||
|
torch.backends.cuda.enable_flash_sdp(False)
|
||||||
|
torch.backends.cuda.enable_mem_efficient_sdp(False)
|
||||||
|
torch.backends.cuda.enable_math_sdp(True)
|
||||||
|
warnings.warn(
|
||||||
|
"ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
|
||||||
|
"to avoid HuggingFace Transformers accuracy issues",
|
||||||
|
UserWarning,
|
||||||
|
stacklevel=1,
|
||||||
|
)
|
||||||
@ -118,6 +118,7 @@ class ShortConv(MambaBase, CustomOp):
|
|||||||
conv_state = self_kv_cache[0].transpose(-1, -2)
|
conv_state = self_kv_cache[0].transpose(-1, -2)
|
||||||
state_indices_tensor = attn_metadata.state_indices_tensor
|
state_indices_tensor = attn_metadata.state_indices_tensor
|
||||||
has_initial_states_p = attn_metadata.has_initial_states_p
|
has_initial_states_p = attn_metadata.has_initial_states_p
|
||||||
|
query_start_loc_p = attn_metadata.query_start_loc_p
|
||||||
|
|
||||||
BCx, _ = self.in_proj(hidden_states)
|
BCx, _ = self.in_proj(hidden_states)
|
||||||
|
|
||||||
@ -165,11 +166,6 @@ class ShortConv(MambaBase, CustomOp):
|
|||||||
[num_decodes, num_prefills],
|
[num_decodes, num_prefills],
|
||||||
dim=0,
|
dim=0,
|
||||||
)
|
)
|
||||||
query_start_loc_p = (
|
|
||||||
attn_metadata.query_start_loc[-num_prefills - 1 :] - num_decodes
|
|
||||||
if has_prefill
|
|
||||||
else None
|
|
||||||
)
|
|
||||||
|
|
||||||
conv_output_list = []
|
conv_output_list = []
|
||||||
|
|
||||||
|
|||||||
@ -111,7 +111,7 @@ class AudioFlamingo3EmbeddingInputs(TensorSchema):
|
|||||||
|
|
||||||
audio_embeds: Annotated[
|
audio_embeds: Annotated[
|
||||||
list[torch.Tensor],
|
list[torch.Tensor],
|
||||||
TensorShape("bn", "naf", "hs"),
|
TensorShape("bn", "naf", "hs", dynamic_dims={"naf"}),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -878,11 +878,14 @@ class Indexer(nn.Module):
|
|||||||
)
|
)
|
||||||
|
|
||||||
q_pe, k_pe = rotary_emb(positions, q_pe, k_pe.unsqueeze(1))
|
q_pe, k_pe = rotary_emb(positions, q_pe, k_pe.unsqueeze(1))
|
||||||
# `rotary_emb` is shape-preserving; `q_pe` is already
|
# Note: RoPE (NeoX) can introduce extra leading dimensions during compilation
|
||||||
# [num_tokens, n_head, rope_dim].
|
# so we need to reshape back to token-flattened shapes
|
||||||
|
q_pe = q_pe.reshape(-1, self.n_head, self.rope_dim)
|
||||||
|
k_pe = k_pe.reshape(-1, 1, self.rope_dim)
|
||||||
|
|
||||||
q = torch.cat([q_pe, q_nope], dim=-1)
|
q = torch.cat([q_pe, q_nope], dim=-1)
|
||||||
# `k_pe` is [num_tokens, 1, rope_dim] (MQA).
|
# `k_pe` is [num_tokens, 1, rope_dim] (MQA).
|
||||||
k = torch.cat([k_pe.squeeze(1), k_nope], dim=-1)
|
k = torch.cat([k_pe.squeeze(-2), k_nope], dim=-1)
|
||||||
|
|
||||||
# we only quant q here since k quant is fused with cache insertion
|
# we only quant q here since k quant is fused with cache insertion
|
||||||
q = q.view(-1, self.head_dim)
|
q = q.view(-1, self.head_dim)
|
||||||
@ -1595,7 +1598,11 @@ class DeepseekV2ForCausalLM(
|
|||||||
# Determine split axis based on op type
|
# Determine split axis based on op type
|
||||||
# gate/up: ColumnParallel → split along dim 0
|
# gate/up: ColumnParallel → split along dim 0
|
||||||
# down: RowParallel → split along dim 1
|
# down: RowParallel → split along dim 1
|
||||||
split_dim = 1 if "down_proj.weight" in name else 0
|
split_dim = (
|
||||||
|
1
|
||||||
|
if ("down_proj.weight" in name and loaded_weight.ndim > 1)
|
||||||
|
else 0
|
||||||
|
)
|
||||||
total = loaded_weight.shape[split_dim]
|
total = loaded_weight.shape[split_dim]
|
||||||
assert total % num_chunks == 0, (
|
assert total % num_chunks == 0, (
|
||||||
f"Shared expert weight dim {total} "
|
f"Shared expert weight dim {total} "
|
||||||
@ -1608,14 +1615,13 @@ class DeepseekV2ForCausalLM(
|
|||||||
weight_to_load = loaded_weight
|
weight_to_load = loaded_weight
|
||||||
|
|
||||||
if is_fusion_moe_shared_experts_layer:
|
if is_fusion_moe_shared_experts_layer:
|
||||||
if split_dim == 0:
|
chunk_slice = slice(j * chunk_size, (j + 1) * chunk_size)
|
||||||
weight_to_load = loaded_weight[
|
if loaded_weight.ndim == 1:
|
||||||
j * chunk_size : (j + 1) * chunk_size, :
|
weight_to_load = loaded_weight[chunk_slice]
|
||||||
]
|
elif split_dim == 0:
|
||||||
|
weight_to_load = loaded_weight[chunk_slice, :]
|
||||||
else:
|
else:
|
||||||
weight_to_load = loaded_weight[
|
weight_to_load = loaded_weight[:, chunk_slice]
|
||||||
:, j * chunk_size : (j + 1) * chunk_size
|
|
||||||
]
|
|
||||||
# Synthesize an expert-style name so expert mapping
|
# Synthesize an expert-style name so expert mapping
|
||||||
# can route it
|
# can route it
|
||||||
chunk_name = name.replace(
|
chunk_name = name.replace(
|
||||||
|
|||||||
@ -94,6 +94,12 @@ class SupportsMultiModal(Protocol):
|
|||||||
`multimodal_config.mm_encoder_tp_mode="data"`.
|
`multimodal_config.mm_encoder_tp_mode="data"`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
requires_raw_input_tokens: ClassVar[bool] = False
|
||||||
|
"""
|
||||||
|
A flag that indicates this model processes input id tokens
|
||||||
|
in their raw form and not input embeddings.
|
||||||
|
"""
|
||||||
|
|
||||||
merge_by_field_config: ClassVar[bool | None] = None
|
merge_by_field_config: ClassVar[bool | None] = None
|
||||||
"""
|
"""
|
||||||
[DEPRECATED] A flag that indicates which implementation of
|
[DEPRECATED] A flag that indicates which implementation of
|
||||||
@ -306,6 +312,10 @@ def supports_multimodal_raw_input_only(model: type[object] | object) -> bool:
|
|||||||
return getattr(model, "supports_multimodal_raw_input_only", False)
|
return getattr(model, "supports_multimodal_raw_input_only", False)
|
||||||
|
|
||||||
|
|
||||||
|
def requires_raw_input_tokens(model: type[object] | object) -> bool:
|
||||||
|
return getattr(model, "requires_raw_input_tokens", False)
|
||||||
|
|
||||||
|
|
||||||
def supports_multimodal_encoder_tp_data(model: type[object] | object) -> bool:
|
def supports_multimodal_encoder_tp_data(model: type[object] | object) -> bool:
|
||||||
return getattr(model, "supports_encoder_tp_data", False)
|
return getattr(model, "supports_encoder_tp_data", False)
|
||||||
|
|
||||||
|
|||||||
@ -139,7 +139,7 @@ class MiniCPMVImageEmbeddingInputs(TensorSchema):
|
|||||||
type: Literal["image_embeds"]
|
type: Literal["image_embeds"]
|
||||||
image_embeds: Annotated[
|
image_embeds: Annotated[
|
||||||
torch.Tensor | list[torch.Tensor],
|
torch.Tensor | list[torch.Tensor],
|
||||||
TensorShape("bn", "ns", "hs"),
|
TensorShape("bn", "ns", "hs", dynamic_dims={"ns"}),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -101,7 +101,7 @@ class Qwen2AudioEmbeddingInputs(TensorSchema):
|
|||||||
|
|
||||||
audio_embeds: Annotated[
|
audio_embeds: Annotated[
|
||||||
list[torch.Tensor],
|
list[torch.Tensor],
|
||||||
TensorShape("bn", "naf", "hs"),
|
TensorShape("bn", "naf", "hs", dynamic_dims={"naf"}),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -118,7 +118,7 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
|
|||||||
output_lengths = (
|
output_lengths = (
|
||||||
((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13
|
((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13
|
||||||
)
|
)
|
||||||
return feat_lengths, output_lengths
|
return output_lengths
|
||||||
|
|
||||||
|
|
||||||
class Qwen3_VisionPatchEmbed(nn.Module):
|
class Qwen3_VisionPatchEmbed(nn.Module):
|
||||||
@ -921,13 +921,11 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
|
|||||||
if audio_feature_lengths is None and feature_attention_mask is None:
|
if audio_feature_lengths is None and feature_attention_mask is None:
|
||||||
audio_output_lengths = []
|
audio_output_lengths = []
|
||||||
elif audio_feature_lengths is not None:
|
elif audio_feature_lengths is not None:
|
||||||
_, audio_output_lens = _get_feat_extract_output_lengths(
|
audio_output_lens = _get_feat_extract_output_lengths(audio_feature_lengths)
|
||||||
audio_feature_lengths
|
|
||||||
)
|
|
||||||
audio_output_lengths = audio_output_lens.tolist()
|
audio_output_lengths = audio_output_lens.tolist()
|
||||||
elif feature_attention_mask is not None:
|
elif feature_attention_mask is not None:
|
||||||
assert isinstance(feature_attention_mask, torch.Tensor)
|
assert isinstance(feature_attention_mask, torch.Tensor)
|
||||||
_, audio_output_lens = _get_feat_extract_output_lengths(
|
audio_output_lens = _get_feat_extract_output_lengths(
|
||||||
feature_attention_mask.sum(-1)
|
feature_attention_mask.sum(-1)
|
||||||
)
|
)
|
||||||
audio_output_lengths = audio_output_lens.tolist()
|
audio_output_lengths = audio_output_lens.tolist()
|
||||||
@ -1111,18 +1109,16 @@ class Qwen3OmniMoeConditionalGenerationMixin(Qwen2_5OmniConditionalGenerationMix
|
|||||||
audio_input: Qwen2_5OmniAudioFeatureInputs,
|
audio_input: Qwen2_5OmniAudioFeatureInputs,
|
||||||
audio_hashes: list[str] | None = None,
|
audio_hashes: list[str] | None = None,
|
||||||
cached_audio_features: torch.Tensor | None = None,
|
cached_audio_features: torch.Tensor | None = None,
|
||||||
) -> torch.Tensor:
|
) -> tuple[torch.Tensor, ...]:
|
||||||
input_features = audio_input["input_features"]
|
input_features = audio_input["input_features"]
|
||||||
audio_feature_lengths = audio_input["audio_feature_lengths"]
|
audio_feature_lengths = audio_input["audio_feature_lengths"]
|
||||||
|
|
||||||
audio_feat_lengths, audio_output_lengths = _get_feat_extract_output_lengths(
|
audio_output_lengths = _get_feat_extract_output_lengths(audio_feature_lengths)
|
||||||
audio_feature_lengths
|
|
||||||
)
|
|
||||||
|
|
||||||
audio_outputs = self.audio_tower(
|
audio_outputs = self.audio_tower(
|
||||||
input_features.to(self.audio_tower.dtype),
|
input_features.to(self.audio_tower.dtype),
|
||||||
feature_lens=audio_feature_lengths,
|
feature_lens=audio_feature_lengths,
|
||||||
aftercnn_lens=audio_feat_lengths,
|
aftercnn_lens=audio_output_lengths,
|
||||||
)
|
)
|
||||||
audio_features = audio_outputs.last_hidden_state
|
audio_features = audio_outputs.last_hidden_state
|
||||||
return audio_features.split(audio_output_lengths.tolist())
|
return audio_features.split(audio_output_lengths.tolist())
|
||||||
@ -1579,7 +1575,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
|
|||||||
+ st_idx
|
+ st_idx
|
||||||
)
|
)
|
||||||
st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
|
st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
|
||||||
_, audio_len = _get_feat_extract_output_lengths(
|
audio_len = _get_feat_extract_output_lengths(
|
||||||
audio_feature_lengths[audio_idx]
|
audio_feature_lengths[audio_idx]
|
||||||
)
|
)
|
||||||
llm_pos_ids = (
|
llm_pos_ids = (
|
||||||
@ -1700,7 +1696,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
|
|||||||
llm_pos_ids_list.append(bos_block)
|
llm_pos_ids_list.append(bos_block)
|
||||||
llm_pos_ids_list.append(bos_block)
|
llm_pos_ids_list.append(bos_block)
|
||||||
st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
|
st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
|
||||||
_, audio_len = _get_feat_extract_output_lengths(
|
audio_len = _get_feat_extract_output_lengths(
|
||||||
audio_feature_lengths[audio_idx]
|
audio_feature_lengths[audio_idx]
|
||||||
)
|
)
|
||||||
audio_llm_pos_ids = (
|
audio_llm_pos_ids = (
|
||||||
|
|||||||
@ -46,6 +46,7 @@ from .interfaces import (
|
|||||||
has_noops,
|
has_noops,
|
||||||
is_attention_free,
|
is_attention_free,
|
||||||
is_hybrid,
|
is_hybrid,
|
||||||
|
requires_raw_input_tokens,
|
||||||
supports_cross_encoding,
|
supports_cross_encoding,
|
||||||
supports_mamba_prefix_caching,
|
supports_mamba_prefix_caching,
|
||||||
supports_multimodal,
|
supports_multimodal,
|
||||||
@ -422,6 +423,7 @@ _MULTIMODAL_MODELS = {
|
|||||||
),
|
),
|
||||||
"UltravoxModel": ("ultravox", "UltravoxModel"),
|
"UltravoxModel": ("ultravox", "UltravoxModel"),
|
||||||
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501
|
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501
|
||||||
|
"VoxtralStreamingGeneration": ("voxtral_streaming", "VoxtralStreamingGeneration"), # noqa: E501
|
||||||
# [Encoder-decoder]
|
# [Encoder-decoder]
|
||||||
"WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501
|
"WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501
|
||||||
}
|
}
|
||||||
@ -539,6 +541,7 @@ class _ModelInfo:
|
|||||||
supports_cross_encoding: bool
|
supports_cross_encoding: bool
|
||||||
supports_multimodal: bool
|
supports_multimodal: bool
|
||||||
supports_multimodal_raw_input_only: bool
|
supports_multimodal_raw_input_only: bool
|
||||||
|
requires_raw_input_tokens: bool
|
||||||
supports_multimodal_encoder_tp_data: bool
|
supports_multimodal_encoder_tp_data: bool
|
||||||
supports_pp: bool
|
supports_pp: bool
|
||||||
has_inner_state: bool
|
has_inner_state: bool
|
||||||
@ -562,6 +565,7 @@ class _ModelInfo:
|
|||||||
supports_multimodal_raw_input_only=supports_multimodal_raw_input_only(
|
supports_multimodal_raw_input_only=supports_multimodal_raw_input_only(
|
||||||
model
|
model
|
||||||
),
|
),
|
||||||
|
requires_raw_input_tokens=requires_raw_input_tokens(model),
|
||||||
supports_multimodal_encoder_tp_data=supports_multimodal_encoder_tp_data(
|
supports_multimodal_encoder_tp_data=supports_multimodal_encoder_tp_data(
|
||||||
model
|
model
|
||||||
),
|
),
|
||||||
|
|||||||
@ -163,8 +163,10 @@ def apply_rotary_pos_emb(
|
|||||||
enable_fp32_compute=True,
|
enable_fp32_compute=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
if is_flash_attn_backend and not current_platform.is_cuda():
|
if is_flash_attn_backend and current_platform.is_cuda():
|
||||||
apply_rotary_emb_func = apply_rotary_emb.forward_cuda
|
apply_rotary_emb_func = apply_rotary_emb.forward_cuda
|
||||||
|
elif is_flash_attn_backend and current_platform.is_rocm():
|
||||||
|
apply_rotary_emb_func = apply_rotary_emb.forward_hip
|
||||||
else:
|
else:
|
||||||
apply_rotary_emb_func = apply_rotary_emb.forward_native
|
apply_rotary_emb_func = apply_rotary_emb.forward_native
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import inspect
|
||||||
import math
|
import math
|
||||||
from collections.abc import Iterable, Mapping, Sequence
|
from collections.abc import Iterable, Mapping, Sequence
|
||||||
from functools import cached_property
|
from functools import cached_property
|
||||||
@ -116,10 +117,7 @@ class VoxtralProcessorAdapter:
|
|||||||
self,
|
self,
|
||||||
audio_length: int,
|
audio_length: int,
|
||||||
) -> int:
|
) -> int:
|
||||||
pad_audio_length = self._audio_processor.next_multiple_of_chunk_frames(
|
return ceil(audio_length / (self.sampling_rate // self.frame_rate))
|
||||||
audio_length, self.sampling_rate
|
|
||||||
)
|
|
||||||
return ceil(pad_audio_length / (self.sampling_rate // self.frame_rate))
|
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
@ -158,7 +156,14 @@ class VoxtralProcessorAdapter:
|
|||||||
assert audio.ndim == 1
|
assert audio.ndim == 1
|
||||||
|
|
||||||
# pad if necessary
|
# pad if necessary
|
||||||
audio = self._audio_processor.pad(audio, self.sampling_rate)
|
# TODO(Patrick) - remove once mistral-common is bumped
|
||||||
|
sig = inspect.signature(self._audio_processor.pad)
|
||||||
|
if "is_online_streaming" in sig.parameters:
|
||||||
|
audio = self._audio_processor.pad(
|
||||||
|
audio, self.sampling_rate, is_online_streaming=False
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
audio = self._audio_processor.pad(audio, self.sampling_rate)
|
||||||
|
|
||||||
audio_tokens = [self.begin_audio_token_id] + [
|
audio_tokens = [self.begin_audio_token_id] + [
|
||||||
self.audio_token_id
|
self.audio_token_id
|
||||||
@ -510,6 +515,7 @@ class VoxtralForConditionalGeneration(
|
|||||||
|
|
||||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
|
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
|
||||||
remapping_rules = [
|
remapping_rules = [
|
||||||
|
(r"mm_streams_embeddings.embedding_module\.(.*)", r"\1"),
|
||||||
(r"mm_whisper_embeddings\.(.*)", r"\1"),
|
(r"mm_whisper_embeddings\.(.*)", r"\1"),
|
||||||
(r"audio_language_projection\.(.*)", r"audio_language_adapter.\1"),
|
(r"audio_language_projection\.(.*)", r"audio_language_adapter.\1"),
|
||||||
(
|
(
|
||||||
@ -535,13 +541,16 @@ class VoxtralForConditionalGeneration(
|
|||||||
def llm_weights_generator():
|
def llm_weights_generator():
|
||||||
nonlocal loaded_weights
|
nonlocal loaded_weights
|
||||||
for name, w in weights:
|
for name, w in weights:
|
||||||
is_encoder = (
|
is_encoder = False
|
||||||
name.startswith("mm_whisper_embeddings")
|
for k in [
|
||||||
and not name.startswith("mm_whisper_embeddings.tok_embeddings")
|
"mm_whisper_embeddings",
|
||||||
and not name.startswith(
|
"mm_streams_embeddings.embedding_module",
|
||||||
"mm_whisper_embeddings.audio_language_projection"
|
]:
|
||||||
|
is_encoder |= (
|
||||||
|
name.startswith(k)
|
||||||
|
and not name.startswith(f"{k}.tok_embeddings")
|
||||||
|
and not name.startswith(f"{k}.audio_language_projection")
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
|
||||||
for pattern, repl in remapping_rules:
|
for pattern, repl in remapping_rules:
|
||||||
if re.fullmatch(pattern, name):
|
if re.fullmatch(pattern, name):
|
||||||
@ -676,6 +685,7 @@ class VoxtralEncoderModel(nn.Module):
|
|||||||
packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
|
packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
|
||||||
|
|
||||||
mistral_remapping = [
|
mistral_remapping = [
|
||||||
|
(r"mm_streams_embeddings.embedding_module\.(.*)", r"\1"),
|
||||||
(
|
(
|
||||||
r"whisper_encoder\.conv_layers\.0\.(weight|bias)",
|
r"whisper_encoder\.conv_layers\.0\.(weight|bias)",
|
||||||
r"whisper_encoder.conv1.\1",
|
r"whisper_encoder.conv1.\1",
|
||||||
@ -684,6 +694,14 @@ class VoxtralEncoderModel(nn.Module):
|
|||||||
r"whisper_encoder\.conv_layers\.1\.(weight|bias)",
|
r"whisper_encoder\.conv_layers\.1\.(weight|bias)",
|
||||||
r"whisper_encoder.conv2.\1",
|
r"whisper_encoder.conv2.\1",
|
||||||
),
|
),
|
||||||
|
(
|
||||||
|
r"whisper_encoder\.conv_layers\.0\.conv\.(weight|bias)",
|
||||||
|
r"whisper_encoder.conv1.\1",
|
||||||
|
), # noqa: E501
|
||||||
|
(
|
||||||
|
r"whisper_encoder\.conv_layers\.1\.conv\.(weight|bias)",
|
||||||
|
r"whisper_encoder.conv2.\1",
|
||||||
|
), # noqa: E501
|
||||||
(
|
(
|
||||||
r"whisper_encoder\.transformer\.layers\.(\d+)\.attention\.w([qkv])\.(weight|bias)", # noqa: E501
|
r"whisper_encoder\.transformer\.layers\.(\d+)\.attention\.w([qkv])\.(weight|bias)", # noqa: E501
|
||||||
r"whisper_encoder.layers.\1.self_attn.\2_proj.\3",
|
r"whisper_encoder.layers.\1.self_attn.\2_proj.\3",
|
||||||
|
|||||||
243
vllm/model_executor/models/voxtral_streaming.py
Normal file
243
vllm/model_executor/models/voxtral_streaming.py
Normal file
@ -0,0 +1,243 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import math
|
||||||
|
from collections.abc import Mapping
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm.config.vllm import VllmConfig
|
||||||
|
from vllm.logger import init_logger
|
||||||
|
from vllm.model_executor.models.interfaces import MultiModalEmbeddings
|
||||||
|
from vllm.model_executor.models.voxtral import (
|
||||||
|
VoxtralDummyInputsBuilder,
|
||||||
|
VoxtralForConditionalGeneration,
|
||||||
|
VoxtralMultiModalProcessor,
|
||||||
|
VoxtralProcessingInfo,
|
||||||
|
)
|
||||||
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
|
from vllm.multimodal.cache import _I, BaseMultiModalProcessorCache
|
||||||
|
from vllm.multimodal.inputs import (
|
||||||
|
MultiModalKwargsOptionalItems,
|
||||||
|
)
|
||||||
|
from vllm.multimodal.parse import MultiModalDataItems
|
||||||
|
from vllm.multimodal.processing import (
|
||||||
|
MultiModalPromptUpdates,
|
||||||
|
PlaceholderFeaturesInfo,
|
||||||
|
)
|
||||||
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
|
from .utils import (
|
||||||
|
_flatten_embeddings,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor):
    """Multimodal processor for streaming Voxtral requests.

    Compared to the parent processor, this class always disables the
    processor cache and builds the audio placeholder information manually
    instead of locating placeholder tokens in the prompt.
    """

    def __init__(
        self,
        info: _I,
        dummy_inputs: BaseDummyInputsBuilder[_I],
        *,
        cache: BaseMultiModalProcessorCache | None = None,
    ) -> None:
        """Initialise the parent processor with caching disabled.

        Args:
            info: Processing info, forwarded unchanged to the parent.
            dummy_inputs: Dummy inputs builder, forwarded unchanged.
            cache: Accepted for signature compatibility only; it is ignored
                and ``None`` is always passed to the parent.
        """
        # streaming can't make use of a cache yet
        super().__init__(info, dummy_inputs, cache=None)

    def _maybe_apply_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        prompt_ids: list[int],
        mm_kwargs: MultiModalKwargsOptionalItems,
        mm_prompt_updates: MultiModalPromptUpdates,
        is_update_applied: bool,
    ) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]:
        """Return the prompt unchanged plus a hand-built audio placeholder.

        ``mm_items``, ``mm_prompt_updates`` and ``is_update_applied`` are not
        used by this override.

        Returns:
            ``prompt_ids`` as received, and a mapping with a single
            ``"audio"`` entry holding one ``PlaceholderFeaturesInfo`` that
            starts at index 0 and spans the number of audio tokens reported
            by the tokenizer's audio config.

        Raises:
            AssertionError: If ``mm_kwargs`` does not contain exactly one
                audio item.
        """
        # there are no placeholder audio tokens for streaming
        # so we need to build the placeholder positions manually

        # in streaming there is always only one audio input
        audios = mm_kwargs.get("audio", [])
        assert len(audios) == 1, (
            f"Expected only one audio input for streaming, got {mm_kwargs=}"
        )
        tokenizer = self.info.get_tokenizer()
        audio_config = tokenizer.instruct.audio_encoder.audio_config

        # The placeholder length is derived from the first dimension of the
        # raw audio array (presumably the sample count - verify against the
        # parent processor's field layout).
        num_audio_samples = audios[0]["audio_arrays"].data.shape[0]
        length = audio_config.num_audio_tokens(num_audio_samples)

        features_info = PlaceholderFeaturesInfo(
            modality="audio",
            item_idx=0,
            start_idx=0,
            tokens=length
            * [0],  # only used for length computation, so we can take dummy inputs
            is_embed=None,
        )
        return prompt_ids, {"audio": [features_info]}
|
||||||
|
|
||||||
|
|
||||||
|
class TimeEmbedding(torch.nn.Module):
    """Sinusoidal Embedding for encoding time"""

    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        """Precompute the inverse frequencies used by ``forward``.

        Args:
            dim: Size of the last dimension of the produced embedding. Must
                be a positive even number because the output concatenates
                ``dim // 2`` cosine and ``dim // 2`` sine components.
            theta: Base of the geometric frequency progression.

        Raises:
            ValueError: If ``dim`` is not a positive even integer; otherwise
                the embedding would silently have ``dim - 1`` (or zero)
                features instead of ``dim``.
        """
        if dim <= 0 or dim % 2 != 0:
            raise ValueError(
                f"TimeEmbedding requires a positive even dim, got {dim}"
            )
        super().__init__()
        self.dim = dim
        self.theta = theta
        inv_freq = torch.exp(
            -math.log(self.theta)
            * torch.arange(self.dim // 2).float()
            / (self.dim // 2)
        )
        # Non-persistent: the buffer is recomputed on construction and is
        # therefore excluded from the state dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    def forward(self, t: torch.Tensor) -> torch.Tensor:
        """Embed the time values ``t``.

        Args:
            t: Tensor of shape ``(B,)`` or ``(B, T)``.

        Returns:
            Tensor of shape ``(B, D)`` or ``(B, T, D)`` where the first
            ``D / 2`` features are cosines and the remaining ones sines.
        """
        t = t[..., None]  # (B,) -> (B, 1) or (B, T) -> (B, T, 1)
        # Frequencies follow the device and dtype of the input.
        inv_freq = self.inv_freq.to(device=t.device, dtype=t.dtype)
        emb = (
            t * inv_freq
        )  # (B, 1) x (D/2,) -> (B, D/2) or (B, T, 1) x (D/2,) -> (B, T, D/2)
        return torch.cat((emb.cos(), emb.sin()), dim=-1)  # (B, D) or (B, T, D)
|
||||||
|
|
||||||
|
|
||||||
|
@MULTIMODAL_REGISTRY.register_processor(
    VoxtralStreamingMultiModalProcessor,
    info=VoxtralProcessingInfo,
    dummy_inputs=VoxtralDummyInputsBuilder,
)
class VoxtralStreamingGeneration(VoxtralForConditionalGeneration):
    """Streaming variant of the Voxtral model.

    ``embed_multimodal`` produces post-convolution whisper embeddings,
    ``embed_input_ids`` passes them through as the "input embeddings", and
    ``forward`` runs the remaining whisper layers, projects the result with
    the audio-language adapter and sums it with the text token embeddings
    and a time embedding before calling the language model.
    """

    # forward() consumes the raw input_ids in addition to inputs_embeds.
    requires_raw_input_tokens = True

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the base model plus the time embedding and delay setting.

        Raises:
            AssertionError: If ``frame_rate * transcription_delay_ms / 1000``
                from the audio config is not a whole number.
        """
        super().__init__(vllm_config=vllm_config, prefix=prefix)
        self.time_embedding: TimeEmbedding = TimeEmbedding(
            dim=self.config.text_config.hidden_size
        )

        audio_config = self.audio_config
        # True division yields a float, which is what is_integer() needs.
        _n_delay_tokens = (
            audio_config.frame_rate * audio_config.transcription_delay_ms / 1000
        )
        assert _n_delay_tokens.is_integer(), (
            f"n_delay_tokens must be integer, got {_n_delay_tokens}"
        )

        self.n_delay_tokens = int(_n_delay_tokens)

    @property
    def audio_config(self):
        """Audio config of the tokenizer's audio encoder."""
        return self.tokenizer.instruct.audio_encoder.audio_config

    def embed_input_ids(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: MultiModalEmbeddings | None = None,
        *,
        is_multimodal: torch.Tensor | None = None,
        # Multi-modal token ID may exceed vocab size
        handle_oov_mm_token: bool = True,
    ) -> torch.Tensor:
        """Pass post-conv embeddings directly as input

        ``input_ids``, ``is_multimodal`` and ``handle_oov_mm_token`` are not
        used here; the token ids are embedded later in ``forward``.

        Raises:
            AssertionError: If ``multimodal_embeddings`` is ``None`` or empty.
        """
        # for streaming we simply flatten the multimodal embeddings
        # to be in tensor format, we treat the input ids later
        assert multimodal_embeddings is not None
        assert len(multimodal_embeddings) > 0, (
            "For streaming you must provide a multimodal_embedding at every step."
        )
        mm_embeds_flat = _flatten_embeddings(multimodal_embeddings)
        return mm_embeds_flat

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs: object,
    ) -> torch.Tensor | IntermediateTensors:
        """Fuse audio and text embeddings and run the language model.

        Raises:
            AssertionError: If ``inputs_embeds`` or ``input_ids`` is ``None``,
                or if the number of encoder output rows is not divisible by
                ``self.downsample_factor``.
        """
        assert inputs_embeds is not None
        assert input_ids is not None

        # Undo the packing done in embed_multimodal: each row holds
        # ``pool_size`` consecutive frames concatenated along the feature dim.
        pool_size = self.config.audio_config.block_pool_size
        inputs_embeds = inputs_embeds.view(
            inputs_embeds.shape[0] * pool_size, inputs_embeds.shape[1] // pool_size
        )

        audio_hidden_states = self.whisper_encoder.whisper_encoder.forward_layers(
            inputs_embeds
        )

        # Group ``downsample_factor`` consecutive rows into one wider row
        # before projecting with the audio-language adapter.
        num_tokens, audio_hidden_size = audio_hidden_states.shape
        assert num_tokens % self.downsample_factor == 0
        audio_hidden_states = audio_hidden_states.reshape(
            num_tokens // self.downsample_factor,
            audio_hidden_size * self.downsample_factor,
        )
        audio_text_embeds = self.audio_language_adapter(audio_hidden_states)

        text_embeds = self.language_model.embed_input_ids(input_ids)

        # sum pool text and audio embeddings
        inputs_embeds = audio_text_embeds + text_embeds

        # A single time value (the delay in tokens) is embedded and added to
        # every row via broadcasting.
        time_tensor = torch.tensor(
            [self.n_delay_tokens],
            device=inputs_embeds.device,
            dtype=inputs_embeds.dtype,
        )
        inputs_embeds = inputs_embeds + self.time_embedding(time_tensor)

        hidden_states = self.language_model.model(
            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
        )

        return hidden_states

    def embed_multimodal(
        self, **kwargs
    ) -> list[torch.Tensor] | torch.Tensor | tuple[torch.Tensor, ...] | None:
        """Transform audio waveforms -> initial whisper post-conv embeddings

        Returns:
            One tensor per input audio, with ``block_pool_size`` consecutive
            post-conv frames concatenated along the feature dimension.

        Raises:
            AssertionError: If no audio input is provided, if a waveform
                length is not a multiple of ``raw_audio_length_per_tok``, or
                if a sample's post-conv length is not a multiple of
                ``block_pool_size``.
        """
        audio_inputs = self._parse_and_validate_audio_arrays(**kwargs)

        assert audio_inputs is not None, (
            "For streaming you must provide an audio input at every step."
        )

        multiple_of = self.audio_config.raw_audio_length_per_tok
        # The walrus target inside the generator is bound in this method's
        # scope; since all() stops at the first failure, ``this_audio`` holds
        # the offending length when the message is formatted.
        assert all(
            (this_audio := audio.shape[0]) % multiple_of == 0 for audio in audio_inputs
        ), (
            f"Every input audio waveform has to be a multiple of {multiple_of}, but"
            f" one is {this_audio} with {(this_audio / multiple_of)=}."
        )

        mel_features = [
            self.whisper_encoder.compute_whisper_melspec(audio).to(
                self.whisper_encoder.dtype
            )
            for audio in audio_inputs
        ]
        seq_lens = [mel.shape[1] for mel in mel_features]
        # [total_num_20ms_frames, hidden_size]
        audio_embeddings = self.whisper_encoder.whisper_encoder.forward_conv(
            mel_features
        )[0]
        conv_stride = self.whisper_encoder.whisper_encoder.total_stride
        audio_embeddings_per_sample = audio_embeddings.split(
            [s // conv_stride for s in seq_lens], dim=0
        )

        # audio_embeddings per sample need to be divisible by block_pool_size
        pool_size = self.config.audio_config.block_pool_size
        assert all(
            (this_shape := sample.shape[0]) % pool_size == 0
            for sample in audio_embeddings_per_sample
        ), (
            f"Every audio embedding has to be a multiple of {pool_size}, "
            f"but one is {this_shape}."
        )

        # Pack ``pool_size`` consecutive frames into one row; forward()
        # reverses this with the inverse view.
        audio_embeddings_per_sample = [
            e.view(e.shape[0] // pool_size, e.shape[1] * pool_size)
            for e in audio_embeddings_per_sample
        ]
        return audio_embeddings_per_sample
|
||||||
@ -1,9 +1,11 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import enum
|
||||||
import math
|
import math
|
||||||
from collections.abc import Iterable, Mapping, Sequence
|
from collections.abc import Iterable, Mapping, Sequence
|
||||||
from contextlib import nullcontext
|
from contextlib import nullcontext
|
||||||
|
from functools import partial
|
||||||
from typing import Annotated, Literal, cast
|
from typing import Annotated, Literal, cast
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@ -16,7 +18,10 @@ from transformers import (
|
|||||||
)
|
)
|
||||||
from transformers.models.whisper.modeling_whisper import sinusoids
|
from transformers.models.whisper.modeling_whisper import sinusoids
|
||||||
|
|
||||||
from vllm.attention.layer import Attention, AttentionType
|
from vllm.attention.backends.abstract import (
|
||||||
|
AttentionType,
|
||||||
|
)
|
||||||
|
from vllm.attention.layer import Attention
|
||||||
from vllm.attention.layers.cross_attention import CrossAttention
|
from vllm.attention.layers.cross_attention import CrossAttention
|
||||||
from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
|
from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
|
||||||
from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
|
from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
|
||||||
@ -34,6 +39,11 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
|||||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||||
|
from vllm.model_executor.models.whisper_utils import (
|
||||||
|
ISO639_1_SUPPORTED_LANGS,
|
||||||
|
WhisperAttentionWithBlockPooling,
|
||||||
|
WhisperCausalConv1d,
|
||||||
|
)
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import (
|
from vllm.multimodal.inputs import (
|
||||||
MultiModalDataDict,
|
MultiModalDataDict,
|
||||||
@ -64,67 +74,11 @@ from .utils import (
|
|||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages
|
|
||||||
|
|
||||||
ISO639_1_SUPPORTED_LANGS = {
|
class WhisperPosEmbedType(enum.Enum):
|
||||||
"af": "Afrikaans",
|
SINUSOIDAL = "sinusoidal"
|
||||||
"ar": "Arabic",
|
NOPE = "nope"
|
||||||
"hy": "Armenian",
|
LEARNED = "learned"
|
||||||
"az": "Azerbaijani",
|
|
||||||
"be": "Belarusian",
|
|
||||||
"bs": "Bosnian",
|
|
||||||
"bg": "Bulgarian",
|
|
||||||
"ca": "Catalan",
|
|
||||||
"zh": "Chinese",
|
|
||||||
"hr": "Croatian",
|
|
||||||
"cs": "Czech",
|
|
||||||
"da": "Danish",
|
|
||||||
"nl": "Dutch",
|
|
||||||
"en": "English",
|
|
||||||
"et": "Estonian",
|
|
||||||
"fi": "Finnish",
|
|
||||||
"fr": "French",
|
|
||||||
"gl": "Galician",
|
|
||||||
"de": "German",
|
|
||||||
"el": "Greek",
|
|
||||||
"he": "Hebrew",
|
|
||||||
"hi": "Hindi",
|
|
||||||
"hu": "Hungarian",
|
|
||||||
"is": "Icelandic",
|
|
||||||
"id": "Indonesian",
|
|
||||||
"it": "Italian",
|
|
||||||
"ja": "Japanese",
|
|
||||||
"kn": "Kannada",
|
|
||||||
"kk": "Kazakh",
|
|
||||||
"ko": "Korean",
|
|
||||||
"lv": "Latvian",
|
|
||||||
"lt": "Lithuanian",
|
|
||||||
"mk": "Macedonian",
|
|
||||||
"ms": "Malay",
|
|
||||||
"mr": "Marathi",
|
|
||||||
"mi": "Maori",
|
|
||||||
"ne": "Nepali",
|
|
||||||
"no": "Norwegian",
|
|
||||||
"fa": "Persian",
|
|
||||||
"pl": "Polish",
|
|
||||||
"pt": "Portuguese",
|
|
||||||
"ro": "Romanian",
|
|
||||||
"ru": "Russian",
|
|
||||||
"sr": "Serbian",
|
|
||||||
"sk": "Slovak",
|
|
||||||
"sl": "Slovenian",
|
|
||||||
"es": "Spanish",
|
|
||||||
"sw": "Swahili",
|
|
||||||
"sv": "Swedish",
|
|
||||||
"tl": "Tagalog",
|
|
||||||
"ta": "Tamil",
|
|
||||||
"th": "Thai",
|
|
||||||
"tr": "Turkish",
|
|
||||||
"uk": "Ukrainian",
|
|
||||||
"ur": "Urdu",
|
|
||||||
"vi": "Vietnamese",
|
|
||||||
"cy": "Welsh",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class WhisperAudioInputs(TensorSchema):
|
class WhisperAudioInputs(TensorSchema):
|
||||||
@ -184,6 +138,8 @@ class WhisperAttention(nn.Module):
|
|||||||
num_heads: int,
|
num_heads: int,
|
||||||
bias: bool = True,
|
bias: bool = True,
|
||||||
attn_type: AttentionType = AttentionType.DECODER,
|
attn_type: AttentionType = AttentionType.DECODER,
|
||||||
|
per_layer_sliding_window: int | None = None,
|
||||||
|
block_pool_size: int = 1,
|
||||||
cache_config: CacheConfig | None = None,
|
cache_config: CacheConfig | None = None,
|
||||||
quant_config: QuantizationConfig | None = None,
|
quant_config: QuantizationConfig | None = None,
|
||||||
prefix: str = "",
|
prefix: str = "",
|
||||||
@ -242,7 +198,14 @@ class WhisperAttention(nn.Module):
|
|||||||
attn_type=self.attn_type,
|
attn_type=self.attn_type,
|
||||||
)
|
)
|
||||||
else: # AttentionType.DECODER (regular decoder self-attention)
|
else: # AttentionType.DECODER (regular decoder self-attention)
|
||||||
self.attn = Attention(
|
if block_pool_size > 1:
|
||||||
|
attn_cls = partial(
|
||||||
|
WhisperAttentionWithBlockPooling, block_pool_size=block_pool_size
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
attn_cls = Attention
|
||||||
|
|
||||||
|
self.attn = attn_cls(
|
||||||
self.num_heads,
|
self.num_heads,
|
||||||
self.head_dim,
|
self.head_dim,
|
||||||
self.scaling,
|
self.scaling,
|
||||||
@ -251,6 +214,7 @@ class WhisperAttention(nn.Module):
|
|||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
prefix=f"{prefix}.attn",
|
prefix=f"{prefix}.attn",
|
||||||
attn_type=self.attn_type,
|
attn_type=self.attn_type,
|
||||||
|
per_layer_sliding_window=per_layer_sliding_window,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _init_qkv(
|
def _init_qkv(
|
||||||
@ -386,6 +350,9 @@ class WhisperEncoderLayer(nn.Module):
|
|||||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
config = vllm_config.model_config.hf_config
|
config = vllm_config.model_config.hf_config
|
||||||
|
is_causal = getattr(config, "is_causal", False)
|
||||||
|
sliding_window = getattr(config, "sliding_window", None)
|
||||||
|
block_pool_size = getattr(config, "block_pool_size", 1)
|
||||||
cache_config = vllm_config.cache_config
|
cache_config = vllm_config.cache_config
|
||||||
quant_config = vllm_config.quant_config
|
quant_config = vllm_config.quant_config
|
||||||
|
|
||||||
@ -393,7 +360,9 @@ class WhisperEncoderLayer(nn.Module):
|
|||||||
self.self_attn = WhisperAttention(
|
self.self_attn = WhisperAttention(
|
||||||
embed_dim=self.embed_dim,
|
embed_dim=self.embed_dim,
|
||||||
num_heads=config.encoder_attention_heads,
|
num_heads=config.encoder_attention_heads,
|
||||||
attn_type=AttentionType.ENCODER,
|
attn_type=AttentionType.DECODER if is_causal else AttentionType.ENCODER,
|
||||||
|
block_pool_size=block_pool_size,
|
||||||
|
per_layer_sliding_window=sliding_window,
|
||||||
cache_config=cache_config,
|
cache_config=cache_config,
|
||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
prefix=f"{prefix}.self_attn",
|
prefix=f"{prefix}.self_attn",
|
||||||
@ -492,12 +461,21 @@ class WhisperEncoder(nn.Module):
|
|||||||
super().__init__()
|
super().__init__()
|
||||||
config = vllm_config.model_config.hf_config
|
config = vllm_config.model_config.hf_config
|
||||||
embed_dim = config.d_model
|
embed_dim = config.d_model
|
||||||
|
|
||||||
|
self.pos_embed_type = WhisperPosEmbedType(
|
||||||
|
getattr(config, "pos_embed", "sinusoidal")
|
||||||
|
)
|
||||||
self.num_mel_bins = config.num_mel_bins
|
self.num_mel_bins = config.num_mel_bins
|
||||||
self.max_source_positions = config.max_source_positions
|
self.max_source_positions = config.max_source_positions
|
||||||
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
|
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
|
||||||
|
|
||||||
self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
|
is_causal = getattr(config, "is_causal", False)
|
||||||
self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)
|
Conv1d = WhisperCausalConv1d if is_causal else partial(nn.Conv1d, padding=1)
|
||||||
|
|
||||||
|
self.conv1 = Conv1d(self.num_mel_bins, embed_dim, kernel_size=3)
|
||||||
|
self.conv2 = Conv1d(embed_dim, embed_dim, stride=2, kernel_size=3)
|
||||||
|
|
||||||
|
self.total_stride = self.conv1.stride[0] * self.conv2.stride[0]
|
||||||
self.start_layer, self.end_layer, self.layers = make_layers(
|
self.start_layer, self.end_layer, self.layers = make_layers(
|
||||||
config.encoder_layers,
|
config.encoder_layers,
|
||||||
lambda prefix: WhisperEncoderLayer(
|
lambda prefix: WhisperEncoderLayer(
|
||||||
@ -507,29 +485,54 @@ class WhisperEncoder(nn.Module):
|
|||||||
)
|
)
|
||||||
self.layer_norm = nn.LayerNorm(config.d_model)
|
self.layer_norm = nn.LayerNorm(config.d_model)
|
||||||
|
|
||||||
maybe_fp32_init_ctx = (
|
if is_causal and self.pos_embed_type != WhisperPosEmbedType.NOPE:
|
||||||
set_default_torch_dtype(torch.float32) if init_in_fp32 else nullcontext()
|
raise ValueError(
|
||||||
)
|
"Only NOPE position embeddings are supported "
|
||||||
|
f"for causal models, but got {self.pos_embed_type}"
|
||||||
with (
|
)
|
||||||
torch.no_grad(),
|
elif self.pos_embed_type in (
|
||||||
maybe_fp32_init_ctx,
|
WhisperPosEmbedType.SINUSOIDAL,
|
||||||
|
WhisperPosEmbedType.LEARNED,
|
||||||
):
|
):
|
||||||
self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim)
|
maybe_fp32_init_ctx = (
|
||||||
self.embed_positions.weight.copy_(
|
set_default_torch_dtype(torch.float32)
|
||||||
sinusoids(*self.embed_positions.weight.shape)
|
if init_in_fp32
|
||||||
|
else nullcontext()
|
||||||
)
|
)
|
||||||
|
|
||||||
def forward(self, input_features: torch.Tensor | list[torch.Tensor]):
|
with (
|
||||||
|
torch.no_grad(),
|
||||||
|
maybe_fp32_init_ctx,
|
||||||
|
):
|
||||||
|
self.embed_positions = nn.Embedding(
|
||||||
|
self.max_source_positions, embed_dim
|
||||||
|
)
|
||||||
|
self.embed_positions.weight.copy_(
|
||||||
|
sinusoids(*self.embed_positions.weight.shape)
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward_conv(
|
||||||
|
self, input_features: torch.Tensor | list[torch.Tensor]
|
||||||
|
) -> torch.Tensor:
|
||||||
hidden_states = []
|
hidden_states = []
|
||||||
input_is_batched = False
|
input_is_batched = False
|
||||||
for features in input_features:
|
for features in input_features:
|
||||||
embeds = nn.functional.gelu(self.conv1(features))
|
embeds = nn.functional.gelu(self.conv1(features))
|
||||||
embeds = nn.functional.gelu(self.conv2(embeds))
|
embeds = nn.functional.gelu(self.conv2(embeds))
|
||||||
embeds = embeds.transpose(-1, -2)
|
|
||||||
embeds = (embeds + self.embed_positions.weight[: embeds.size(-2), :]).to(
|
if self.pos_embed_type in (
|
||||||
embeds.dtype
|
WhisperPosEmbedType.SINUSOIDAL,
|
||||||
)
|
WhisperPosEmbedType.LEARNED,
|
||||||
|
):
|
||||||
|
embeds = embeds.transpose(-1, -2)
|
||||||
|
embeds = (
|
||||||
|
embeds + self.embed_positions.weight[: embeds.size(-2), :]
|
||||||
|
).to(embeds.dtype)
|
||||||
|
elif self.pos_embed_type == WhisperPosEmbedType.NOPE:
|
||||||
|
embeds = embeds.transpose(-1, -2).to(embeds.dtype)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown pos_embed_type: {self.pos_embed_type}")
|
||||||
|
|
||||||
hidden_states.append(embeds)
|
hidden_states.append(embeds)
|
||||||
input_is_batched = embeds.ndim > 2
|
input_is_batched = embeds.ndim > 2
|
||||||
# Input to MHA must be B x T x D
|
# Input to MHA must be B x T x D
|
||||||
@ -539,12 +542,19 @@ class WhisperEncoder(nn.Module):
|
|||||||
else:
|
else:
|
||||||
hidden_states = torch.stack(hidden_states, dim=0)
|
hidden_states = torch.stack(hidden_states, dim=0)
|
||||||
|
|
||||||
|
return hidden_states
|
||||||
|
|
||||||
|
def forward_layers(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||||
for encoder_layer in self.layers:
|
for encoder_layer in self.layers:
|
||||||
hidden_states = encoder_layer(hidden_states)
|
hidden_states = encoder_layer(hidden_states)
|
||||||
|
|
||||||
hidden_states = self.layer_norm(hidden_states)
|
hidden_states = self.layer_norm(hidden_states)
|
||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
def forward(self, input_features: torch.Tensor | list[torch.Tensor]):
|
||||||
|
hidden_states = self.forward_conv(input_features)
|
||||||
|
return self.forward_layers(hidden_states)
|
||||||
|
|
||||||
|
|
||||||
class WhisperDecoder(nn.Module):
|
class WhisperDecoder(nn.Module):
|
||||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||||
|
|||||||
299
vllm/model_executor/models/whisper_utils.py
Normal file
299
vllm/model_executor/models/whisper_utils.py
Normal file
@ -0,0 +1,299 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import copy
|
||||||
|
import functools
|
||||||
|
import math
|
||||||
|
from dataclasses import replace
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from torch import nn
|
||||||
|
|
||||||
|
from vllm.attention.backends.abstract import (
|
||||||
|
AttentionBackend,
|
||||||
|
AttentionMetadata,
|
||||||
|
AttentionType,
|
||||||
|
)
|
||||||
|
from vllm.attention.layer import Attention
|
||||||
|
from vllm.attention.selector import get_attn_backend
|
||||||
|
from vllm.config import CacheConfig, VllmConfig
|
||||||
|
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||||
|
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
|
||||||
|
from vllm.v1.attention.backends.utils import (
|
||||||
|
CommonAttentionMetadata,
|
||||||
|
subclass_attention_backend_with_overrides,
|
||||||
|
)
|
||||||
|
from vllm.v1.kv_cache_interface import AttentionSpec
|
||||||
|
|
||||||
|
# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages
|
||||||
|
ISO639_1_SUPPORTED_LANGS = {
|
||||||
|
"af": "Afrikaans",
|
||||||
|
"ar": "Arabic",
|
||||||
|
"hy": "Armenian",
|
||||||
|
"az": "Azerbaijani",
|
||||||
|
"be": "Belarusian",
|
||||||
|
"bs": "Bosnian",
|
||||||
|
"bg": "Bulgarian",
|
||||||
|
"ca": "Catalan",
|
||||||
|
"zh": "Chinese",
|
||||||
|
"hr": "Croatian",
|
||||||
|
"cs": "Czech",
|
||||||
|
"da": "Danish",
|
||||||
|
"nl": "Dutch",
|
||||||
|
"en": "English",
|
||||||
|
"et": "Estonian",
|
||||||
|
"fi": "Finnish",
|
||||||
|
"fr": "French",
|
||||||
|
"gl": "Galician",
|
||||||
|
"de": "German",
|
||||||
|
"el": "Greek",
|
||||||
|
"he": "Hebrew",
|
||||||
|
"hi": "Hindi",
|
||||||
|
"hu": "Hungarian",
|
||||||
|
"is": "Icelandic",
|
||||||
|
"id": "Indonesian",
|
||||||
|
"it": "Italian",
|
||||||
|
"ja": "Japanese",
|
||||||
|
"kn": "Kannada",
|
||||||
|
"kk": "Kazakh",
|
||||||
|
"ko": "Korean",
|
||||||
|
"lv": "Latvian",
|
||||||
|
"lt": "Lithuanian",
|
||||||
|
"mk": "Macedonian",
|
||||||
|
"ms": "Malay",
|
||||||
|
"mr": "Marathi",
|
||||||
|
"mi": "Maori",
|
||||||
|
"ne": "Nepali",
|
||||||
|
"no": "Norwegian",
|
||||||
|
"fa": "Persian",
|
||||||
|
"pl": "Polish",
|
||||||
|
"pt": "Portuguese",
|
||||||
|
"ro": "Romanian",
|
||||||
|
"ru": "Russian",
|
||||||
|
"sr": "Serbian",
|
||||||
|
"sk": "Slovak",
|
||||||
|
"sl": "Slovenian",
|
||||||
|
"es": "Spanish",
|
||||||
|
"sw": "Swahili",
|
||||||
|
"sv": "Swedish",
|
||||||
|
"tl": "Tagalog",
|
||||||
|
"ta": "Tamil",
|
||||||
|
"th": "Thai",
|
||||||
|
"tr": "Turkish",
|
||||||
|
"uk": "Ukrainian",
|
||||||
|
"ur": "Urdu",
|
||||||
|
"vi": "Vietnamese",
|
||||||
|
"cy": "Welsh",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _pad1d(
|
||||||
|
x: torch.Tensor,
|
||||||
|
paddings: tuple[int, int],
|
||||||
|
mode: str = "constant",
|
||||||
|
value: float = 0.0,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
"""Tiny wrapper around F.pad, just to allow for
|
||||||
|
reflect padding on small input.
|
||||||
|
If this is the case, we insert extra 0 padding
|
||||||
|
to the right before the reflection happen.
|
||||||
|
"""
|
||||||
|
length = x.shape[-1]
|
||||||
|
padding_left, padding_right = paddings
|
||||||
|
assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
|
||||||
|
if mode == "reflect":
|
||||||
|
max_pad = max(padding_left, padding_right)
|
||||||
|
extra_pad = 0
|
||||||
|
if length <= max_pad:
|
||||||
|
extra_pad = max_pad - length + 1
|
||||||
|
x = F.pad(x, (0, extra_pad))
|
||||||
|
padded = F.pad(x, paddings, mode, value)
|
||||||
|
end = padded.shape[-1] - extra_pad
|
||||||
|
return padded[..., :end]
|
||||||
|
else:
|
||||||
|
return F.pad(x, paddings, mode, value)
|
||||||
|
|
||||||
|
|
||||||
|
class WhisperCausalConv1d(nn.Conv1d):
    """1-D convolution that pads its input on the left before convolving.

    The input is zero-padded on the left by
    ``effective_kernel_size - stride`` samples and on the right by just
    enough samples for the last stride window to be complete, then passed
    to the regular ``nn.Conv1d`` forward.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        bias: bool = True,
    ) -> None:
        """Create the convolution and cache the padding bookkeeping.

        All arguments are forwarded to ``nn.Conv1d``; ``padding`` is applied
        by the parent convolution in addition to the padding added in
        ``forward``.
        """
        super().__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            bias=bias,
        )
        dilation = self.dilation[0]
        self._stride = self.stride[0]
        # Span of input samples covered by one (possibly dilated) kernel.
        self._effective_kernel_size = (kernel_size - 1) * dilation + 1
        # Amount of left padding applied in forward().
        self._padding_total = self._effective_kernel_size - self._stride

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pad ``x`` along its last dimension and apply the convolution."""
        seq_len = x.shape[-1]
        # Samples of real input consumed by the first output frame.
        unpadded_span = self._effective_kernel_size - self._padding_total
        n_frames = (seq_len - unpadded_span) / self._stride + 1
        # Round the frame count up and work out the input length that many
        # frames require; the difference is added as right padding.
        target_length = (math.ceil(n_frames) - 1) * self._stride + unpadded_span
        right_pad = target_length - seq_len
        x = _pad1d(x, (self._padding_total, right_pad), mode="constant")
        return super().forward(x)
|
||||||
|
|
||||||
|
|
||||||
|
@functools.lru_cache
def create_whisper_attention_backend_with_block_pooling(
    underlying_attn_backend: type[AttentionBackend], block_pool_size: int
) -> type[AttentionBackend]:
    """Derive an attention backend class that pools KV-cache blocks.

    The returned class is a subclass of ``underlying_attn_backend`` created
    via ``subclass_attention_backend_with_overrides``. It reports a KV-cache
    shape whose block dimension is multiplied by ``block_pool_size`` and
    whose KV-head dimension is divided by it, and its metadata builder
    scales the lengths, offsets and slot mapping of the incoming metadata by
    the same factor before delegating to the underlying builder.

    The function is memoized with ``functools.lru_cache``, so repeated calls
    with the same arguments return the same class object.

    Args:
        underlying_attn_backend: Backend class to derive from. Only
            subclasses of ``FlashAttentionBackend`` are accepted.
        block_pool_size: Pooling factor. The builder asserts that the number
            of KV heads of the spec it receives is divisible by it.

    Returns:
        The derived backend class.

    Raises:
        NotImplementedError: If ``underlying_attn_backend`` is not a
            ``FlashAttentionBackend`` subclass.
    """
    # Reject unsupported backends before doing any other work.
    if not issubclass(underlying_attn_backend, FlashAttentionBackend):
        raise NotImplementedError(
            f"{underlying_attn_backend} is not yet supported. "
            "Contributions to support more backends are much "
            "appreciated."
        )

    prefix = "WhisperAttentionWithBlockPooling_"
    underlying_builder = underlying_attn_backend.get_builder_cls()

    class WhisperAttentionWithBlockPoolingBuilder(underlying_builder):  # type: ignore
        def __init__(
            self,
            kv_cache_spec: AttentionSpec,
            layer_names: list[str],
            vllm_config: VllmConfig,
            device: torch.device,
        ):
            assert kv_cache_spec.num_kv_heads % block_pool_size == 0
            # Hand the underlying builder a spec with stretched blocks and
            # correspondingly fewer KV heads.
            kv_cache_spec = replace(
                kv_cache_spec,
                block_size=kv_cache_spec.block_size * block_pool_size,
                num_kv_heads=kv_cache_spec.num_kv_heads // block_pool_size,
            )
            super().__init__(kv_cache_spec, layer_names, vllm_config, device)

        def build(
            self,
            common_prefix_len: int,
            common_attn_metadata: CommonAttentionMetadata,
            fast_build: bool = False,
        ) -> AttentionMetadata:
            # Scale a deep copy so the caller's metadata object is not
            # modified by the in-place multiplications below.
            new_common_attn_metadata = copy.deepcopy(common_attn_metadata)
            new_common_attn_metadata.query_start_loc *= block_pool_size
            new_common_attn_metadata.query_start_loc_cpu *= block_pool_size
            new_common_attn_metadata.seq_lens *= block_pool_size
            # NOTE(review): assumes the private CPU mirrors are populated;
            # confirm they can never be None at this point.
            new_common_attn_metadata._seq_lens_cpu *= block_pool_size
            new_common_attn_metadata._num_computed_tokens_cpu *= block_pool_size
            new_common_attn_metadata.num_actual_tokens *= block_pool_size
            new_common_attn_metadata.max_query_len *= block_pool_size
            new_common_attn_metadata.max_seq_len *= block_pool_size
            original_slot_mapping = common_attn_metadata.slot_mapping
            common_prefix_len *= block_pool_size
            # Expand every slot into `block_pool_size` consecutive slots.
            # Negative results are clamped to -1.
            new_common_attn_metadata.slot_mapping = (
                (
                    original_slot_mapping.unsqueeze(1) * block_pool_size
                    + torch.arange(block_pool_size, device=original_slot_mapping.device)
                )
                .flatten()
                .clamp(min=-1)
            )
            return super().build(
                common_prefix_len, new_common_attn_metadata, fast_build
            )

    attn_backend = subclass_attention_backend_with_overrides(
        name_prefix=prefix,
        attention_backend_cls=underlying_attn_backend,
        overrides={
            "get_builder_cls": lambda: WhisperAttentionWithBlockPoolingBuilder,
            "get_kv_cache_shape": lambda num_blocks,
            block_size,
            num_kv_heads,
            head_size,
            cache_dtype_str: (
                2,
                num_blocks,
                # we stretch each block by `block_pool_size`
                block_size * block_pool_size,
                num_kv_heads // block_pool_size,
                head_size,
            ),  # TODO: generalize to other backends
        },
    )

    return attn_backend
|
||||||
|
|
||||||
|
|
||||||
|
class WhisperAttentionWithBlockPooling(Attention):
    """Attention layer with block pooling.

    The layer resolves the regular attention backend for its configuration,
    wraps it with ``create_whisper_attention_backend_with_block_pooling`` and
    hands the wrapped backend to ``Attention``.
    """

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int | None = None,
        alibi_slopes: list[float] | None = None,
        cache_config: CacheConfig | None = None,
        quant_config: QuantizationConfig | None = None,
        logits_soft_cap: float | None = None,
        per_layer_sliding_window: int | None = None,
        prefix: str = "",
        attn_type: str = AttentionType.DECODER,
        kv_sharing_target_layer_name: str | None = None,
        block_pool_size: int = 1,
        attn_backend: type[AttentionBackend] | None = None,
        **extra_impl_args,
    ) -> None:
        """Select the pooled backend and initialise the parent layer.

        ``block_pool_size`` is the pooling factor. The ``attn_backend``
        argument is not consulted: the backend passed to the parent is
        always the pooled wrapper derived here. Every other argument is
        forwarded to ``Attention.__init__`` unchanged.
        """
        self.block_pool_size = block_pool_size
        dtype = torch.get_default_dtype()

        # Fall back to "auto" dtype and 16-token blocks without a cache config.
        if cache_config is None:
            kv_cache_dtype, block_size = "auto", 16
        else:
            kv_cache_dtype = cache_config.cache_dtype
            block_size = cache_config.block_size

        base_backend = get_attn_backend(
            head_size,
            dtype,
            kv_cache_dtype,
            block_size,
            attn_type=attn_type,
        )
        pooled_backend = create_whisper_attention_backend_with_block_pooling(
            base_backend, block_pool_size
        )

        super().__init__(
            num_heads=num_heads,
            head_size=head_size,
            scale=scale,
            num_kv_heads=num_kv_heads,
            alibi_slopes=alibi_slopes,
            cache_config=cache_config,
            quant_config=quant_config,
            logits_soft_cap=logits_soft_cap,
            per_layer_sliding_window=per_layer_sliding_window,
            prefix=prefix,
            attn_type=attn_type,
            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
            attn_backend=pooled_backend,
            **extra_impl_args,
        )

    def get_kv_cache_spec(self, vllm_config: VllmConfig):
        """Return the parent's KV-cache spec with the KV-head count scaled up.

        The spec produced by ``Attention.get_kv_cache_spec`` must be an
        ``AttentionSpec``; a copy is returned whose ``num_kv_heads`` is
        multiplied by ``self.block_pool_size``.
        """
        base_spec = super().get_kv_cache_spec(vllm_config)
        assert isinstance(base_spec, AttentionSpec)
        return replace(
            base_spec,
            num_kv_heads=self.block_pool_size * base_spec.num_kv_heads,
        )
|
||||||
@ -111,11 +111,16 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
|
|||||||
def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
|
def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
|
||||||
return librosa.load(filepath, sr=None)
|
return librosa.load(filepath, sr=None)
|
||||||
|
|
||||||
def encode_base64(
    self,
    media: tuple[npt.NDArray, int],
    *,
    audio_format: str = "WAV",
) -> str:
    """Serialise audio with ``soundfile`` and return it base64-encoded.

    Args:
        media: ``(audio, sampling_rate)`` pair.
        audio_format: Container format name passed to ``soundfile.write``.

    Returns:
        The encoded file contents as a UTF-8 base64 string.
    """
    samples, sample_rate = media

    with BytesIO() as buffer:
        soundfile.write(buffer, samples, sample_rate, format=audio_format)
        raw = buffer.getvalue()

    encoded = base64.b64encode(raw)
    return encoded.decode("utf-8")
|
||||||
|
|||||||
@ -8,8 +8,12 @@ import pybase64
|
|||||||
import torch
|
import torch
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
from .base import MediaIO, MediaWithBytes
|
from .base import MediaIO, MediaWithBytes
|
||||||
|
|
||||||
|
# Name the logger after the dotted module path (not the file path) so it sits
# inside the package's logger hierarchy.
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def rescale_image_size(
|
def rescale_image_size(
|
||||||
image: Image.Image, size_factor: float, transpose: int = -1
|
image: Image.Image, size_factor: float, transpose: int = -1
|
||||||
@ -104,8 +108,17 @@ class ImageMediaIO(MediaIO[Image.Image]):
|
|||||||
self,
|
self,
|
||||||
media: Image.Image,
|
media: Image.Image,
|
||||||
*,
|
*,
|
||||||
image_format: str = "JPEG",
|
image_format: str | None = None,
|
||||||
) -> str:
|
) -> str:
|
||||||
|
if image_format is None:
|
||||||
|
logger.warning_once(
|
||||||
|
"The default format of `ImageMediaIO.encode_base64` will be changed "
|
||||||
|
'from "JPEG" to "PNG" in v0.15 to avoid lossy compression. '
|
||||||
|
"To continue using the old default, "
|
||||||
|
'pass `format="JPEG"` explicitly to silence this warning.'
|
||||||
|
)
|
||||||
|
image_format = "JPEG"
|
||||||
|
|
||||||
image = media
|
image = media
|
||||||
|
|
||||||
with BytesIO() as buffer:
|
with BytesIO() as buffer:
|
||||||
|
|||||||
@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import atexit
|
import atexit
|
||||||
|
import mimetypes
|
||||||
from collections.abc import Generator, Set
|
from collections.abc import Generator, Set
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
@ -357,17 +358,31 @@ class MediaConnector:
|
|||||||
def encode_audio_base64(
    audio: np.ndarray,
    sampling_rate: int,
    *,
    format: str = "WAV",
) -> str:
    """Encode audio as base64."""
    return AudioMediaIO().encode_base64((audio, sampling_rate), audio_format=format)
|
||||||
|
|
||||||
|
|
||||||
|
def encode_audio_url(
    audio: np.ndarray,
    sampling_rate: int,
    *,
    format: str = "WAV",
) -> str:
    """Encode audio as a data URL."""
    # Look the MIME type up by file extension; fall back to "audio" when the
    # extension is not in the table.
    extension = "." + format.lower()
    mimetype = mimetypes.types_map.get(extension, "audio")
    payload = encode_audio_base64(audio, sampling_rate, format=format)
    return f"data:{mimetype};base64,{payload}"
|
||||||
|
|
||||||
|
|
||||||
def encode_image_base64(
|
def encode_image_base64(
|
||||||
image: Image.Image,
|
image: Image.Image,
|
||||||
*,
|
*,
|
||||||
image_mode: str = "RGB",
|
image_mode: str = "RGB",
|
||||||
format: str = "JPEG",
|
format: str | None = None,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
Encode a pillow image to base64 format.
|
Encode a pillow image to base64 format.
|
||||||
@ -378,10 +393,45 @@ def encode_image_base64(
|
|||||||
return image_io.encode_base64(image, image_format=format)
|
return image_io.encode_base64(image, image_format=format)
|
||||||
|
|
||||||
|
|
||||||
def encode_video_base64(frames: npt.NDArray) -> str:
|
def encode_image_url(
    image: Image.Image,
    *,
    image_mode: str = "RGB",
    format: str = "PNG",
) -> str:
    """
    Encode a pillow image as a data URL.

    By default, the image is converted into RGB format before being encoded.
    """
    # Look the MIME type up by file extension; fall back to "image" when the
    # extension is not in the table.
    extension = "." + format.lower()
    mimetype = mimetypes.types_map.get(extension, "image")
    payload = encode_image_base64(image, image_mode=image_mode, format=format)
    return f"data:{mimetype};base64,{payload}"
|
||||||
|
|
||||||
|
|
||||||
|
def encode_video_base64(
    frames: npt.NDArray,
    *,
    format: str = "JPEG",
) -> str:
    """Encode video frames as base64 using ``VideoMediaIO``.

    ``format`` is passed to ``VideoMediaIO.encode_base64`` as ``video_format``.
    """
    video_io = VideoMediaIO(ImageMediaIO())
    return video_io.encode_base64(frames, video_format=format)
|
||||||
|
|
||||||
|
|
||||||
|
def encode_video_url(
    frames: npt.NDArray,
    *,
    format: str = "JPEG",
) -> str:
    """Encode video frames as a data URL.

    The MIME type is ``video/jpeg`` for the JPEG format; for any other
    format it is looked up by file extension, falling back to ``video``.
    """
    payload = encode_video_base64(frames, format=format)

    format_key = format.lower()
    mimetype = (
        "video/jpeg"
        if format_key == "jpeg"
        else mimetypes.types_map.get("." + format_key, "video")
    )

    return f"data:{mimetype};base64,{payload}"
|
||||||
|
|
||||||
|
|
||||||
def argsort_mm_positions(
|
def argsort_mm_positions(
|
||||||
|
|||||||
@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Optional
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
import vllm.envs as envs
|
|
||||||
from vllm.attention.backends.registry import AttentionBackendEnum
|
from vllm.attention.backends.registry import AttentionBackendEnum
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
@ -168,32 +167,6 @@ class XPUPlatform(Platform):
|
|||||||
if vllm_config.kv_transfer_config is not None:
|
if vllm_config.kv_transfer_config is not None:
|
||||||
vllm_config.kv_transfer_config.enable_permute_local_kv = True
|
vllm_config.kv_transfer_config.enable_permute_local_kv = True
|
||||||
|
|
||||||
if parallel_config.distributed_executor_backend is None:
|
|
||||||
if parallel_config.world_size > 1:
|
|
||||||
parallel_config.distributed_executor_backend = "ray"
|
|
||||||
else:
|
|
||||||
parallel_config.distributed_executor_backend = "uni"
|
|
||||||
elif parallel_config.distributed_executor_backend == "mp":
|
|
||||||
# FIXME(kunshang):
|
|
||||||
# spawn needs calling `if __name__ == '__main__':`
|
|
||||||
# fork is not supported for xpu start new process.
|
|
||||||
if envs.VLLM_WORKER_MULTIPROC_METHOD != "spawn":
|
|
||||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
|
||||||
logger.warning(
|
|
||||||
"Please use spawn as start method if you want to use mp."
|
|
||||||
)
|
|
||||||
elif (
|
|
||||||
parallel_config.distributed_executor_backend != "ray"
|
|
||||||
and parallel_config.distributed_executor_backend != "uni"
|
|
||||||
and parallel_config.distributed_executor_backend != "external_launcher"
|
|
||||||
):
|
|
||||||
logger.warning(
|
|
||||||
"%s is not supported on XPU, fallback to ray distributed"
|
|
||||||
" executor backend.",
|
|
||||||
parallel_config.distributed_executor_backend,
|
|
||||||
)
|
|
||||||
parallel_config.distributed_executor_backend = "ray"
|
|
||||||
|
|
||||||
if model_config and model_config.use_mla:
|
if model_config and model_config.use_mla:
|
||||||
logger.info(
|
logger.info(
|
||||||
"MLA is enabled on a non-GPU platform; forcing chunked "
|
"MLA is enabled on a non-GPU platform; forcing chunked "
|
||||||
|
|||||||
@ -131,78 +131,105 @@ class MistralToolParser(ToolParser):
|
|||||||
request: ChatCompletionRequest,
|
request: ChatCompletionRequest,
|
||||||
) -> ExtractedToolCallInformation:
|
) -> ExtractedToolCallInformation:
|
||||||
"""
|
"""
|
||||||
Extract the tool calls from a complete model response. Requires
|
Extract the tool calls from a complete model response.
|
||||||
find-and-replacing single quotes with double quotes for JSON parsing,
|
|
||||||
make sure your tool call arguments don't ever include quotes!
|
Content and tool calls formatting depends on the Mistral's tokenizer version
|
||||||
|
used to train the model:
|
||||||
|
|
||||||
|
- < v11: `content[BOT] [{tool_call1},{tool_call2}]`
|
||||||
|
- >= v11: `content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2}`
|
||||||
|
|
||||||
|
with [BOT] the tool call token.
|
||||||
|
|
||||||
|
Note:
|
||||||
|
For tokenizer versions >= v11, tool calls with arguments wrongly formatted
|
||||||
|
are still returned as tool calls. This is to allow the model to know it
|
||||||
|
tried to make a tool call. It reduces chance of another failure and
|
||||||
|
prevents that the context is filled with tool calls wrongly placed in
|
||||||
|
assistant message contents.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# case -- if a tool call token is not present, return a text response
|
# If the tool call token is not present, return a text response
|
||||||
if self.bot_token not in model_output:
|
if self.bot_token not in model_output:
|
||||||
return ExtractedToolCallInformation(
|
return ExtractedToolCallInformation(
|
||||||
tools_called=False, tool_calls=[], content=model_output
|
tools_called=False, tool_calls=[], content=model_output
|
||||||
)
|
)
|
||||||
|
|
||||||
# first remove the BOT token
|
content_and_raw_tool_calls = model_output.split(self.bot_token)
|
||||||
tool_content = model_output.replace(self.bot_token, "").strip()
|
content = content_and_raw_tool_calls[0]
|
||||||
|
raw_tool_calls = content_and_raw_tool_calls[1:]
|
||||||
|
|
||||||
try:
|
# >= v11: content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2}
|
||||||
|
if not self._is_pre_v11:
|
||||||
|
tool_calls = []
|
||||||
|
for raw_tool_call in raw_tool_calls:
|
||||||
|
if "{" not in raw_tool_call:
|
||||||
|
continue
|
||||||
|
|
||||||
|
end_name = raw_tool_call.find("{")
|
||||||
|
tool_name, args = (
|
||||||
|
raw_tool_call[:end_name],
|
||||||
|
raw_tool_call[end_name:],
|
||||||
|
)
|
||||||
|
|
||||||
|
tool_calls.append({"name": tool_name, "arguments": args})
|
||||||
|
|
||||||
|
# < v11: content[BOT] [{tool_call1},{tool_call2}]
|
||||||
|
else:
|
||||||
|
if len(raw_tool_calls) != 1:
|
||||||
|
raise ValueError(
|
||||||
|
"Only one BOT token should have been outputted, "
|
||||||
|
f"but got {model_output}."
|
||||||
|
)
|
||||||
|
stringified_tool_calls = raw_tool_calls[0].strip()
|
||||||
try:
|
try:
|
||||||
if not self._is_pre_v11:
|
tool_calls = json.loads(stringified_tool_calls)
|
||||||
function_call_arr = []
|
|
||||||
for single_tool_content in model_output.split(self.bot_token):
|
|
||||||
if "{" not in single_tool_content:
|
|
||||||
continue
|
|
||||||
|
|
||||||
end_name = single_tool_content.find("{")
|
|
||||||
fn_name, args = (
|
|
||||||
single_tool_content[:end_name],
|
|
||||||
single_tool_content[end_name:],
|
|
||||||
)
|
|
||||||
|
|
||||||
# fn_name is encoded outside serialized json dump
|
|
||||||
# only arguments are serialized
|
|
||||||
function_call_arr.append(
|
|
||||||
{"name": fn_name, "arguments": json.loads(args)}
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
function_call_arr = json.loads(tool_content)
|
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
# use a regex to find the part corresponding to the tool call.
|
# use a regex to find the part corresponding to the tool call.
|
||||||
# NOTE: This use case should not happen if the model is trained
|
# NOTE: This use case should not happen if the model is trained
|
||||||
# correctly. It's an easy possible fix so it's included, but
|
# correctly. It's an easy possible fix so it's included, but
|
||||||
# can be brittle for very complex / highly nested tool calls
|
# can be brittle for very complex / highly nested tool calls
|
||||||
raw_tool_call = self.tool_call_regex.findall(tool_content)[0]
|
try:
|
||||||
function_call_arr = json.loads(raw_tool_call)
|
raw_tool_call = self.tool_call_regex.findall(
|
||||||
|
stringified_tool_calls
|
||||||
# Tool Call
|
)[0]
|
||||||
tool_calls: list[MistralToolCall] = [
|
tool_calls = json.loads(raw_tool_call)
|
||||||
MistralToolCall(
|
except (IndexError, json.JSONDecodeError):
|
||||||
type="function",
|
logger.exception("Error in extracting tool call from response: {e}")
|
||||||
function=FunctionCall(
|
# If raw decoding and decoding post regex rule fails, then just
|
||||||
name=raw_function_call["name"],
|
# return content.
|
||||||
# function call args are JSON but as a string
|
return ExtractedToolCallInformation(
|
||||||
arguments=json.dumps(
|
tools_called=False,
|
||||||
raw_function_call["arguments"], ensure_ascii=False
|
tool_calls=[],
|
||||||
|
content=stringified_tool_calls,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
tool_calls = [
|
||||||
|
{
|
||||||
|
"name": tool_call["name"],
|
||||||
|
"arguments": json.dumps(
|
||||||
|
tool_call["arguments"], ensure_ascii=False
|
||||||
),
|
),
|
||||||
),
|
}
|
||||||
)
|
for tool_call in tool_calls
|
||||||
for raw_function_call in function_call_arr
|
]
|
||||||
]
|
|
||||||
|
|
||||||
# get any content before the tool call
|
mistral_tool_calls: list[MistralToolCall] = [
|
||||||
content = model_output.split(self.bot_token)[0]
|
MistralToolCall(
|
||||||
return ExtractedToolCallInformation(
|
type="function",
|
||||||
tools_called=True,
|
function=FunctionCall(
|
||||||
tool_calls=tool_calls,
|
name=tool_call["name"],
|
||||||
content=content if len(content) > 0 else None,
|
arguments=tool_call["arguments"],
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
for tool_call in tool_calls
|
||||||
|
]
|
||||||
|
|
||||||
except Exception:
|
return ExtractedToolCallInformation(
|
||||||
logger.exception("Error in extracting tool call from response.")
|
tools_called=True,
|
||||||
# return information to just treat the tool call as regular JSON
|
tool_calls=mistral_tool_calls,
|
||||||
return ExtractedToolCallInformation(
|
content=content if len(content) > 0 else None,
|
||||||
tools_called=False, tool_calls=[], content=tool_content
|
)
|
||||||
)
|
|
||||||
|
|
||||||
def extract_tool_calls_streaming(
|
def extract_tool_calls_streaming(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@ -330,19 +330,25 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
|
|||||||
rope_theta = getattr_iter(config, names, None, warn=True)
|
rope_theta = getattr_iter(config, names, None, warn=True)
|
||||||
names = ["partial_rotary_factor", "rotary_pct", "rotary_emb_fraction"]
|
names = ["partial_rotary_factor", "rotary_pct", "rotary_emb_fraction"]
|
||||||
partial_rotary_factor = getattr_iter(config, names, None, warn=True)
|
partial_rotary_factor = getattr_iter(config, names, None, warn=True)
|
||||||
|
ompe = getattr(config, "original_max_position_embeddings", None)
|
||||||
|
|
||||||
if Version(version("transformers")) < Version("5.0.0.dev0"):
|
if Version(version("transformers")) < Version("5.0.0.dev0"):
|
||||||
# Transformers v4 installed, legacy config fields may be present
|
# Transformers v4 installed, legacy config fields may be present
|
||||||
if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
|
if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
|
||||||
config.rope_parameters = rope_scaling
|
config.rope_parameters = rope_scaling
|
||||||
if (
|
if (
|
||||||
rope_theta is not None or partial_rotary_factor is not None
|
rope_theta is not None
|
||||||
|
or partial_rotary_factor is not None
|
||||||
|
or ompe is not None
|
||||||
) and not getattr(config, "rope_parameters", None):
|
) and not getattr(config, "rope_parameters", None):
|
||||||
config.rope_parameters = {"rope_type": "default"}
|
config.rope_parameters = {"rope_type": "default"}
|
||||||
|
# Patch legacy fields into rope_parameters
|
||||||
if rope_theta is not None:
|
if rope_theta is not None:
|
||||||
config.rope_parameters["rope_theta"] = rope_theta
|
config.rope_parameters["rope_theta"] = rope_theta
|
||||||
if partial_rotary_factor is not None:
|
if partial_rotary_factor is not None:
|
||||||
config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
|
config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
|
||||||
|
if ompe is not None:
|
||||||
|
config.rope_parameters["original_max_position_embeddings"] = ompe
|
||||||
elif rope_theta is not None or getattr(config, "rope_parameters", None):
|
elif rope_theta is not None or getattr(config, "rope_parameters", None):
|
||||||
# Transformers v5 installed
|
# Transformers v5 installed
|
||||||
# Patch these fields in case they used non-standard names
|
# Patch these fields in case they used non-standard names
|
||||||
@ -358,10 +364,6 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
|
|||||||
if getattr(config, "rope_parameters", None) is None:
|
if getattr(config, "rope_parameters", None) is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Add original_max_position_embeddings if present
|
|
||||||
if ompe := getattr(config, "original_max_position_embeddings", None):
|
|
||||||
config.rope_parameters["original_max_position_embeddings"] = ompe
|
|
||||||
|
|
||||||
# Handle nested rope_parameters in interleaved sliding attention models
|
# Handle nested rope_parameters in interleaved sliding attention models
|
||||||
if is_rope_parameters_nested(config.rope_parameters):
|
if is_rope_parameters_nested(config.rope_parameters):
|
||||||
for rope_parameters_layer_type in config.rope_parameters.values():
|
for rope_parameters_layer_type in config.rope_parameters.values():
|
||||||
|
|||||||
@ -184,18 +184,42 @@ def _remap_mistral_audio_args(config: dict) -> dict:
|
|||||||
whisper_args = config["multimodal"].pop("whisper_model_args")
|
whisper_args = config["multimodal"].pop("whisper_model_args")
|
||||||
encoder_args = whisper_args["encoder_args"]
|
encoder_args = whisper_args["encoder_args"]
|
||||||
downsample_args = whisper_args["downsample_args"]
|
downsample_args = whisper_args["downsample_args"]
|
||||||
|
downsample_factor = downsample_args["downsample_factor"]
|
||||||
|
|
||||||
|
# make sure that k/v blocks can be allocated with
|
||||||
|
# unified k/v cache class and pool whisper k/v cache blocks
|
||||||
|
# with downsample_factor:1 ratio
|
||||||
|
if encoder_args.get("causal"):
|
||||||
|
block_pool_size = downsample_factor
|
||||||
|
config["projection_size"] = downsample_factor * encoder_args["dim"]
|
||||||
|
else:
|
||||||
|
block_pool_size = 1
|
||||||
|
|
||||||
|
_maybe_sliding_window = encoder_args.get("ragged_attention", None)
|
||||||
|
if _maybe_sliding_window is None:
|
||||||
|
sliding_window = None
|
||||||
|
elif _maybe_sliding_window.isdigit():
|
||||||
|
sliding_window = int(_maybe_sliding_window)
|
||||||
|
else:
|
||||||
|
raise NotImplementedError(f"Unsupported: {_maybe_sliding_window=}")
|
||||||
|
|
||||||
|
architecture = (
|
||||||
|
"VoxtralStreamingGeneration"
|
||||||
|
if encoder_args.get("causal")
|
||||||
|
else "VoxtralForConditionalGeneration"
|
||||||
|
)
|
||||||
|
|
||||||
quant_config = config.get("quantization_config")
|
quant_config = config.get("quantization_config")
|
||||||
config = {
|
config = {
|
||||||
"model_type": "whixtral",
|
"model_type": "voxtral",
|
||||||
"architectures": ["VoxtralForConditionalGeneration"],
|
"architectures": [architecture],
|
||||||
"text_config": PretrainedConfig.from_dict(config),
|
"text_config": PretrainedConfig.from_dict(config),
|
||||||
"audio_config": WhisperConfig(
|
"audio_config": WhisperConfig(
|
||||||
num_mel_bins=encoder_args["audio_encoding_args"]["num_mel_bins"],
|
num_mel_bins=encoder_args["audio_encoding_args"]["num_mel_bins"],
|
||||||
window_size=encoder_args["audio_encoding_args"]["window_size"],
|
window_size=encoder_args["audio_encoding_args"]["window_size"],
|
||||||
sampling_rate=encoder_args["audio_encoding_args"]["sampling_rate"],
|
sampling_rate=encoder_args["audio_encoding_args"]["sampling_rate"],
|
||||||
hop_length=encoder_args["audio_encoding_args"]["hop_length"],
|
hop_length=encoder_args["audio_encoding_args"]["hop_length"],
|
||||||
downsample_factor=downsample_args["downsample_factor"],
|
downsample_factor=downsample_factor,
|
||||||
d_model=encoder_args["dim"],
|
d_model=encoder_args["dim"],
|
||||||
encoder_layers=encoder_args["n_layers"],
|
encoder_layers=encoder_args["n_layers"],
|
||||||
encoder_ffn_dim=encoder_args["hidden_dim"],
|
encoder_ffn_dim=encoder_args["hidden_dim"],
|
||||||
@ -203,6 +227,10 @@ def _remap_mistral_audio_args(config: dict) -> dict:
|
|||||||
vocab_size=encoder_args["vocab_size"],
|
vocab_size=encoder_args["vocab_size"],
|
||||||
max_source_positions=encoder_args["max_source_positions"],
|
max_source_positions=encoder_args["max_source_positions"],
|
||||||
is_encoder_decoder=False, # Override WhisperConfig default
|
is_encoder_decoder=False, # Override WhisperConfig default
|
||||||
|
is_causal=encoder_args.get("causal", False),
|
||||||
|
sliding_window=sliding_window,
|
||||||
|
block_pool_size=block_pool_size,
|
||||||
|
pos_embed=encoder_args.get("pos_embed", "sinusoidal"),
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
if quant_config:
|
if quant_config:
|
||||||
|
|||||||
@ -3,17 +3,11 @@
|
|||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from vllm.attention.backends.abstract import AttentionBackend
|
from vllm.attention.backends.abstract import AttentionBackend
|
||||||
from vllm.attention.backends.utils import PAD_SLOT_ID
|
from vllm.v1.attention.backends.mamba_attn import (
|
||||||
from vllm.config import VllmConfig
|
BaseMambaAttentionMetadata,
|
||||||
from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder
|
BaseMambaAttentionMetadataBuilder,
|
||||||
from vllm.v1.attention.backends.utils import (
|
|
||||||
CommonAttentionMetadata,
|
|
||||||
split_decodes_and_prefills,
|
|
||||||
)
|
)
|
||||||
from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec
|
|
||||||
|
|
||||||
|
|
||||||
class Mamba1AttentionBackend(AttentionBackend):
|
class Mamba1AttentionBackend(AttentionBackend):
|
||||||
@ -23,137 +17,12 @@ class Mamba1AttentionBackend(AttentionBackend):
|
|||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Mamba1AttentionMetadata:
|
class Mamba1AttentionMetadata(BaseMambaAttentionMetadata):
|
||||||
query_start_loc_p: torch.Tensor
|
pass
|
||||||
state_indices_tensor: torch.Tensor
|
|
||||||
has_initial_states_p: torch.Tensor | None
|
|
||||||
num_prefills: int
|
|
||||||
num_prefill_tokens: int
|
|
||||||
num_decodes: int
|
|
||||||
num_decode_tokens: int
|
|
||||||
|
|
||||||
block_idx_last_scheduled_token: torch.Tensor # shape: [batch,]
|
|
||||||
block_idx_first_scheduled_token_p: torch.Tensor # shape: [batch,]
|
|
||||||
block_idx_last_computed_token: torch.Tensor # shape: [batch,]
|
|
||||||
num_computed_tokens_p: torch.Tensor # shape: [batch,]
|
|
||||||
|
|
||||||
|
|
||||||
class Mamba1AttentionMetadataBuilder(
|
class Mamba1AttentionMetadataBuilder(
|
||||||
BaseMambaAttentionMetadataBuilder[Mamba1AttentionMetadata]
|
BaseMambaAttentionMetadataBuilder[Mamba1AttentionMetadata]
|
||||||
):
|
):
|
||||||
def __init__(
|
metadata_cls = Mamba1AttentionMetadata
|
||||||
self,
|
supports_update_block_table: bool = False
|
||||||
kv_cache_spec: AttentionSpec,
|
|
||||||
layer_names: list[str],
|
|
||||||
vllm_config: VllmConfig,
|
|
||||||
device: torch.device,
|
|
||||||
):
|
|
||||||
super().__init__(kv_cache_spec, layer_names, vllm_config, device)
|
|
||||||
assert isinstance(kv_cache_spec, MambaSpec)
|
|
||||||
|
|
||||||
def build(
|
|
||||||
self,
|
|
||||||
common_prefix_len: int,
|
|
||||||
common_attn_metadata: CommonAttentionMetadata,
|
|
||||||
fast_build: bool = False,
|
|
||||||
) -> Mamba1AttentionMetadata:
|
|
||||||
num_reqs = common_attn_metadata.num_reqs
|
|
||||||
|
|
||||||
num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
|
|
||||||
split_decodes_and_prefills(
|
|
||||||
common_attn_metadata, decode_threshold=self.reorder_batch_threshold
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
has_initial_states_p = None
|
|
||||||
query_start_loc_p = None
|
|
||||||
num_computed_tokens, num_computed_tokens_p = None, None
|
|
||||||
block_idx_first_scheduled_token = None
|
|
||||||
block_idx_first_scheduled_token_p = None
|
|
||||||
|
|
||||||
# TODO(@Josephasafg) Mamba1 and Mamba2 have a lot of code in common here.
|
|
||||||
# We should consolidate this code
|
|
||||||
if self.vllm_config.cache_config.enable_prefix_caching:
|
|
||||||
# Return a tensor of shape (#requests, #max blocks)
|
|
||||||
state_indices_tensor = common_attn_metadata.block_table_tensor
|
|
||||||
mamba_block_size = self.kv_cache_spec.block_size
|
|
||||||
num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to(
|
|
||||||
self.device
|
|
||||||
)
|
|
||||||
(
|
|
||||||
block_idx_last_computed_token,
|
|
||||||
block_idx_first_scheduled_token,
|
|
||||||
block_idx_last_scheduled_token,
|
|
||||||
) = self._compute_prefix_caching_block_indices(
|
|
||||||
common_attn_metadata, mamba_block_size
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# Always return just a single block per each request:
|
|
||||||
state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
|
|
||||||
block_idx_last_scheduled_token = None
|
|
||||||
block_idx_last_computed_token = None
|
|
||||||
|
|
||||||
if num_prefills > 0:
|
|
||||||
query_start_loc_p = (
|
|
||||||
common_attn_metadata.query_start_loc[-num_prefills - 1 :]
|
|
||||||
- num_decode_tokens
|
|
||||||
)
|
|
||||||
has_initial_states_cpu = (
|
|
||||||
common_attn_metadata.num_computed_tokens_cpu[
|
|
||||||
num_reqs - num_prefills : num_reqs
|
|
||||||
]
|
|
||||||
> 0
|
|
||||||
)
|
|
||||||
has_initial_states_p = has_initial_states_cpu.to(
|
|
||||||
common_attn_metadata.query_start_loc.device
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.vllm_config.cache_config.enable_prefix_caching:
|
|
||||||
assert num_computed_tokens is not None
|
|
||||||
num_computed_tokens_p = num_computed_tokens[
|
|
||||||
num_reqs - num_prefills : num_reqs
|
|
||||||
]
|
|
||||||
assert block_idx_first_scheduled_token is not None
|
|
||||||
block_idx_first_scheduled_token_p = block_idx_first_scheduled_token[
|
|
||||||
num_reqs - num_prefills : num_reqs
|
|
||||||
]
|
|
||||||
|
|
||||||
elif (
|
|
||||||
num_decodes > 0
|
|
||||||
and num_decodes <= self.decode_cudagraph_max_bs
|
|
||||||
and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
|
|
||||||
):
|
|
||||||
self.state_indices_tensor[:num_decodes].copy_(
|
|
||||||
state_indices_tensor, non_blocking=True
|
|
||||||
)
|
|
||||||
state_indices_tensor = self.state_indices_tensor[:num_decode_tokens]
|
|
||||||
state_indices_tensor[num_decodes:] = PAD_SLOT_ID
|
|
||||||
|
|
||||||
if self.vllm_config.cache_config.enable_prefix_caching:
|
|
||||||
self.block_idx_last_scheduled_token[:num_decodes].copy_(
|
|
||||||
block_idx_last_scheduled_token, non_blocking=True
|
|
||||||
)
|
|
||||||
block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[
|
|
||||||
:num_decode_tokens
|
|
||||||
]
|
|
||||||
|
|
||||||
self.block_idx_last_computed_token[:num_decodes].copy_(
|
|
||||||
block_idx_last_computed_token, non_blocking=True
|
|
||||||
)
|
|
||||||
block_idx_last_computed_token = self.block_idx_last_computed_token[
|
|
||||||
:num_decode_tokens
|
|
||||||
]
|
|
||||||
|
|
||||||
return Mamba1AttentionMetadata(
|
|
||||||
query_start_loc_p=query_start_loc_p,
|
|
||||||
has_initial_states_p=has_initial_states_p,
|
|
||||||
state_indices_tensor=state_indices_tensor,
|
|
||||||
num_prefills=num_prefills,
|
|
||||||
num_prefill_tokens=num_prefill_tokens,
|
|
||||||
num_decodes=num_decodes,
|
|
||||||
num_decode_tokens=num_decode_tokens,
|
|
||||||
block_idx_last_scheduled_token=block_idx_last_scheduled_token,
|
|
||||||
block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p,
|
|
||||||
block_idx_last_computed_token=block_idx_last_computed_token,
|
|
||||||
num_computed_tokens_p=num_computed_tokens_p,
|
|
||||||
)
|
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user