mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-02 21:47:12 +08:00
Merge branch 'main' into upstream_mori_
This commit is contained in:
commit
8b5e2e69fb
@ -162,7 +162,10 @@ steps:
|
||||
- tests/entrypoints/test_chat_utils
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/test_vision_embeds.py
|
||||
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
|
||||
# TODO: Remove after next torch update
|
||||
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s entrypoints/openai/test_vision_embeds.py
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
|
||||
- label: Entrypoints Integration Test (API Server 2)
|
||||
@ -219,6 +222,9 @@ steps:
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
- tests/distributed/test_symm_mem_allreduce.py
|
||||
commands:
|
||||
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
|
||||
# TODO: Remove when the bug is fixed in a future ROCm release
|
||||
- export TORCH_NCCL_BLOCKING_WAIT=1
|
||||
# test with torchrun tp=2 and external_dp=2
|
||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with torchrun tp=2 and pp=2
|
||||
@ -267,9 +273,10 @@ steps:
|
||||
- vllm/v1/executor/uniproc_executor.py
|
||||
- vllm/v1/worker/gpu_worker.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
#- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
# test with torchrun tp=2 and dp=4 with ep
|
||||
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
|
||||
# TODO: Remove when the bug is fixed in a future ROCm release
|
||||
- export TORCH_NCCL_BLOCKING_WAIT=1
|
||||
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
||||
|
||||
- label: EPLB Algorithm Test # 5min
|
||||
@ -979,7 +986,10 @@ steps:
|
||||
- export MIOPEN_DEBUG_CONV_GEMM=0
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py
|
||||
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
|
||||
# TODO: Remove after next torch update
|
||||
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 5min
|
||||
@ -1288,6 +1298,9 @@ steps:
|
||||
- tests/v1/shutdown
|
||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||
commands:
|
||||
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
|
||||
# TODO: Remove when the bug is fixed in a future ROCm release
|
||||
- export TORCH_NCCL_BLOCKING_WAIT=1
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
@ -1341,7 +1354,9 @@ steps:
|
||||
# end platform plugin tests
|
||||
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
|
||||
- pip install -e ./plugins/prithvi_io_processor_plugin
|
||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
|
||||
# TODO: Remove after next torch update
|
||||
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||
- pip uninstall prithvi_io_processor_plugin -y
|
||||
# end io_processor plugins test
|
||||
# begin stat_logger plugins test
|
||||
@ -1510,7 +1525,7 @@ steps:
|
||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### B200 test #####
|
||||
|
||||
10
csrc/cache.h
10
csrc/cache.h
@ -9,16 +9,6 @@
|
||||
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
const torch::Tensor& block_mapping);
|
||||
|
||||
// Note: the key_caches and value_caches vectors are constant but
|
||||
// not the Tensors they contain. The vectors need to be const refs
|
||||
// in order to satisfy pytorch's C++ operator registration code.
|
||||
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
|
||||
std::vector<torch::Tensor> const& value_caches,
|
||||
const torch::Tensor& block_mapping);
|
||||
|
||||
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
|
||||
const torch::Tensor& block_mapping);
|
||||
|
||||
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
||||
torch::Tensor& key_cache, torch::Tensor& value_cache,
|
||||
torch::Tensor& slot_mapping,
|
||||
|
||||
@ -119,94 +119,6 @@ __global__ void copy_blocks_mla_kernel(
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
// Note: the key_caches and value_caches vectors are constant but
|
||||
// not the Tensors they contain. The vectors need to be const refs
|
||||
// in order to satisfy pytorch's C++ operator registration code.
|
||||
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
|
||||
std::vector<torch::Tensor> const& value_caches,
|
||||
const torch::Tensor& block_mapping) {
|
||||
int num_layers = key_caches.size();
|
||||
TORCH_CHECK(num_layers == value_caches.size());
|
||||
if (num_layers == 0) {
|
||||
return;
|
||||
}
|
||||
torch::Device cache_device = key_caches[0].device();
|
||||
TORCH_CHECK(cache_device.is_cuda());
|
||||
|
||||
// Create data structures for the kernel.
|
||||
// Create an array of pointers to the key and value caches.
|
||||
int64_t key_cache_ptrs[num_layers];
|
||||
int64_t value_cache_ptrs[num_layers];
|
||||
for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
|
||||
key_cache_ptrs[layer_idx] =
|
||||
reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
|
||||
value_cache_ptrs[layer_idx] =
|
||||
reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
|
||||
}
|
||||
|
||||
// block_mapping is a 2D tensor with shape (num_pairs, 2).
|
||||
int num_pairs = block_mapping.size(0);
|
||||
|
||||
// Move the data structures to the GPU.
|
||||
// NOTE: This synchronizes the CPU and GPU.
|
||||
torch::Tensor key_cache_ptrs_tensor =
|
||||
torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64)
|
||||
.to(cache_device);
|
||||
torch::Tensor value_cache_ptrs_tensor =
|
||||
torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64)
|
||||
.to(cache_device);
|
||||
|
||||
// Launch the kernel.
|
||||
const int numel_per_block = key_caches[0][0].numel();
|
||||
dim3 grid(num_layers, num_pairs);
|
||||
dim3 block(std::min(1024, numel_per_block));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(cache_device);
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
|
||||
key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
|
||||
vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
||||
key_cache_ptrs_tensor.data_ptr<int64_t>(),
|
||||
value_cache_ptrs_tensor.data_ptr<int64_t>(),
|
||||
block_mapping.data_ptr<int64_t>(), numel_per_block);
|
||||
}));
|
||||
}
|
||||
|
||||
// copy blocks kernel for MLA (assumes a joint KV-cache)
|
||||
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
|
||||
const torch::Tensor& block_mapping) {
|
||||
int num_layers = kv_caches.size();
|
||||
if (num_layers == 0) {
|
||||
return;
|
||||
}
|
||||
torch::Device cache_device = kv_caches[0].device();
|
||||
TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA");
|
||||
|
||||
std::vector<int64_t> cache_ptrs(num_layers);
|
||||
for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
|
||||
cache_ptrs[layer_idx] =
|
||||
reinterpret_cast<int64_t>(kv_caches[layer_idx].data_ptr());
|
||||
}
|
||||
torch::Tensor cache_ptrs_tensor =
|
||||
torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64)
|
||||
.to(cache_device);
|
||||
|
||||
int num_pairs = block_mapping.size(0);
|
||||
// We use the stride instead of numel in case the cache is padded for memory
|
||||
// alignment reasons, we assume the blocks data (inclusive of any padding)
|
||||
// is contiguous in memory
|
||||
int mem_footprint_per_block = kv_caches[0].stride(0);
|
||||
dim3 grid(num_layers, num_pairs);
|
||||
dim3 block(std::min(1024, mem_footprint_per_block));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(cache_device);
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
|
||||
kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] {
|
||||
vllm::copy_blocks_mla_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
||||
cache_ptrs_tensor.data_ptr<int64_t>(),
|
||||
block_mapping.data_ptr<int64_t>(), mem_footprint_per_block);
|
||||
}));
|
||||
}
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Used to copy/convert one element
|
||||
@ -539,9 +451,6 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
||||
for (int i = 0; i < VEC_SIZE; i++) {
|
||||
amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
|
||||
}
|
||||
#ifndef USE_ROCM
|
||||
__syncwarp();
|
||||
#endif
|
||||
|
||||
// Reduced amax
|
||||
for (int mask = 16; mask > 0; mask /= 2) {
|
||||
@ -551,9 +460,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
||||
amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
|
||||
#endif
|
||||
}
|
||||
#ifndef USE_ROCM
|
||||
__syncwarp();
|
||||
#endif
|
||||
|
||||
#if defined(__gfx942__)
|
||||
float scale = fmaxf(amax, 1e-4) / 224.0f;
|
||||
#else
|
||||
|
||||
@ -35,7 +35,7 @@ template <typename Int>
|
||||
__host__ __device__ inline Int round_up(Int x, Int y) {
|
||||
static_assert(std::is_integral_v<Int>,
|
||||
"round_up argument must be integral type");
|
||||
return (x + y - 1) / y * y;
|
||||
return ((x + y - 1) / y) * y;
|
||||
}
|
||||
|
||||
// Compute effective rows for grid configuration with swizzled SF layouts.
|
||||
@ -61,37 +61,47 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
int sf_m = round_up<int>(numRows, 128);
|
||||
int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
|
||||
int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
|
||||
for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) {
|
||||
// Each thread writes 4 uint32_t elements.
|
||||
for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int;
|
||||
col += blockDim.x * 4) {
|
||||
SFout[row * sf_n_int + col] = 0x00;
|
||||
}
|
||||
}
|
||||
int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE;
|
||||
|
||||
// Get the global scaling factor, which will be applied to the SF.
|
||||
// Note SFScale is the same as next GEMM's alpha, which is
|
||||
// (448.f / (Alpha_A / 6.f)).
|
||||
float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
|
||||
|
||||
// Input tensor row/col loops.
|
||||
for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
|
||||
for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
|
||||
// Iterate over all rows and cols including padded ones -
|
||||
// ensures we visit every single scale factor address to initialize it.
|
||||
for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) {
|
||||
for (int colIdx = threadIdx.x;
|
||||
colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD;
|
||||
colIdx += blockDim.x) {
|
||||
int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
|
||||
|
||||
PackedVec in_vec;
|
||||
int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
|
||||
PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
|
||||
// Get the output tensor offset.
|
||||
// Same as inOffset because 8 elements are packed into one uint32_t.
|
||||
int64_t outOffset = inOffset;
|
||||
auto& out_pos = out[outOffset];
|
||||
|
||||
// If we are outside valid rows OR outside valid columns -> Use Zeros
|
||||
if (rowIdx >= numRows || elem_idx >= numCols) {
|
||||
memset(&in_vec, 0, sizeof(PackedVec));
|
||||
|
||||
} else {
|
||||
// Valid Region: Load actual data
|
||||
in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
|
||||
}
|
||||
|
||||
auto sf_out =
|
||||
cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
|
||||
CVT_FP4_NUM_THREADS_PER_SF>(
|
||||
rowIdx, colIdx, numKTiles, SFout);
|
||||
|
||||
out_pos =
|
||||
auto out_val =
|
||||
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
|
||||
|
||||
// We do NOT write output for padding because the 'out' tensor is not
|
||||
// padded.
|
||||
if (rowIdx < numRows && elem_idx < numCols) {
|
||||
// Same as inOffset because 8 elements are packed into one uint32_t.
|
||||
out[inOffset] = out_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -134,4 +144,4 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
|
||||
m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr),
|
||||
reinterpret_cast<uint32_t*>(sf_out));
|
||||
});
|
||||
}
|
||||
}
|
||||
@ -685,16 +685,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
||||
"swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
|
||||
cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks);
|
||||
|
||||
// Copy the cache blocks from src to dst.
|
||||
cache_ops.def(
|
||||
"copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
|
||||
"Tensor block_mapping) -> ()");
|
||||
cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
|
||||
|
||||
cache_ops.def(
|
||||
"copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()");
|
||||
cache_ops.impl("copy_blocks_mla", torch::kCUDA, &copy_blocks_mla);
|
||||
|
||||
// Reshape the key and value tensors and cache them.
|
||||
cache_ops.def(
|
||||
"reshape_and_cache(Tensor key, Tensor value,"
|
||||
|
||||
@ -183,7 +183,7 @@ ARG nvcc_threads=8
|
||||
ENV NVCC_THREADS=$nvcc_threads
|
||||
|
||||
ARG USE_SCCACHE
|
||||
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
|
||||
ARG SCCACHE_DOWNLOAD_URL
|
||||
ARG SCCACHE_ENDPOINT
|
||||
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
|
||||
ARG SCCACHE_REGION_NAME=us-west-2
|
||||
@ -201,10 +201,16 @@ ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
echo "Installing sccache..." \
|
||||
&& case "${TARGETPLATFORM}" in \
|
||||
linux/arm64) SCCACHE_ARCH="aarch64" ;; \
|
||||
linux/amd64) SCCACHE_ARCH="x86_64" ;; \
|
||||
*) echo "Unsupported TARGETPLATFORM for sccache: ${TARGETPLATFORM}" >&2; exit 1 ;; \
|
||||
esac \
|
||||
&& export SCCACHE_DOWNLOAD_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
|
||||
&& curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
|
||||
&& tar -xzf sccache.tar.gz \
|
||||
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
|
||||
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
|
||||
&& sudo mv sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
|
||||
&& rm -rf sccache.tar.gz sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl \
|
||||
&& if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
|
||||
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
|
||||
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
|
||||
|
||||
@ -2,4 +2,4 @@
|
||||
|
||||
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
|
||||
|
||||
Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe.
|
||||
You can use vLLM with KServe's [Hugging Face serving runtime](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) or via [`LLMInferenceService` that uses llm-d](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).
|
||||
|
||||
5
docs/deployment/integrations/llm-d.md
Normal file
5
docs/deployment/integrations/llm-d.md
Normal file
@ -0,0 +1,5 @@
|
||||
# llm-d
|
||||
|
||||
vLLM can be deployed with [llm-d](https://github.com/llm-d/llm-d), a Kubernetes-native distributed inference serving stack providing well-lit paths for anyone to serve large generative AI models at scale. It helps achieve the fastest "time to state-of-the-art (SOTA) performance" for key OSS models across most hardware accelerators and infrastructure providers.
|
||||
|
||||
You can use vLLM with llm-d directly by following [this guide](https://llm-d.ai/docs/guide) or via [KServe's LLMInferenceService](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).
|
||||
@ -12,6 +12,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
||||
|
||||
- [Helm](frameworks/helm.md)
|
||||
- [InftyAI/llmaz](integrations/llmaz.md)
|
||||
- [llm-d](integrations/llm-d.md)
|
||||
- [KAITO](integrations/kaito.md)
|
||||
- [KServe](integrations/kserve.md)
|
||||
- [Kthena](integrations/kthena.md)
|
||||
|
||||
152
setup.py
152
setup.py
@ -50,15 +50,15 @@ elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin"))
|
||||
sys.platform,
|
||||
)
|
||||
VLLM_TARGET_DEVICE = "empty"
|
||||
elif (
|
||||
sys.platform.startswith("linux")
|
||||
and torch.version.cuda is None
|
||||
and os.getenv("VLLM_TARGET_DEVICE") is None
|
||||
and torch.version.hip is None
|
||||
):
|
||||
# if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
|
||||
# fallback to cpu
|
||||
VLLM_TARGET_DEVICE = "cpu"
|
||||
elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is None:
|
||||
if torch.version.hip is not None:
|
||||
VLLM_TARGET_DEVICE = "rocm"
|
||||
logger.info("Auto-detected ROCm")
|
||||
elif torch.version.cuda is not None:
|
||||
VLLM_TARGET_DEVICE = "cuda"
|
||||
logger.info("Auto-detected CUDA")
|
||||
else:
|
||||
VLLM_TARGET_DEVICE = "cpu"
|
||||
|
||||
|
||||
def is_sccache_available() -> bool:
|
||||
@ -108,20 +108,26 @@ class cmake_build_ext(build_ext):
|
||||
num_jobs = os.cpu_count()
|
||||
|
||||
nvcc_threads = None
|
||||
if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"):
|
||||
# `nvcc_threads` is either the value of the NVCC_THREADS
|
||||
# environment variable (if defined) or 1.
|
||||
# when it is set, we reduce `num_jobs` to avoid
|
||||
# overloading the system.
|
||||
nvcc_threads = envs.NVCC_THREADS
|
||||
if nvcc_threads is not None:
|
||||
nvcc_threads = int(nvcc_threads)
|
||||
logger.info(
|
||||
"Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads
|
||||
)
|
||||
else:
|
||||
nvcc_threads = 1
|
||||
num_jobs = max(1, num_jobs // nvcc_threads)
|
||||
if _is_cuda() and CUDA_HOME is not None:
|
||||
try:
|
||||
nvcc_version = get_nvcc_cuda_version()
|
||||
if nvcc_version >= Version("11.2"):
|
||||
# `nvcc_threads` is either the value of the NVCC_THREADS
|
||||
# environment variable (if defined) or 1.
|
||||
# when it is set, we reduce `num_jobs` to avoid
|
||||
# overloading the system.
|
||||
nvcc_threads = envs.NVCC_THREADS
|
||||
if nvcc_threads is not None:
|
||||
nvcc_threads = int(nvcc_threads)
|
||||
logger.info(
|
||||
"Using NVCC_THREADS=%d as the number of nvcc threads.",
|
||||
nvcc_threads,
|
||||
)
|
||||
else:
|
||||
nvcc_threads = 1
|
||||
num_jobs = max(1, num_jobs // nvcc_threads)
|
||||
except Exception as e:
|
||||
logger.warning("Failed to get NVCC version: %s", e)
|
||||
|
||||
return num_jobs, nvcc_threads
|
||||
|
||||
@ -199,9 +205,9 @@ class cmake_build_ext(build_ext):
|
||||
# Default build tool to whatever cmake picks.
|
||||
build_tool = []
|
||||
# Make sure we use the nvcc from CUDA_HOME
|
||||
if _is_cuda():
|
||||
if _is_cuda() and CUDA_HOME is not None:
|
||||
cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
|
||||
elif _is_hip():
|
||||
elif _is_hip() and ROCM_HOME is not None:
|
||||
cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]
|
||||
|
||||
other_cmake_args = os.environ.get("CMAKE_ARGS")
|
||||
@ -339,6 +345,89 @@ class precompiled_wheel_utils:
|
||||
wheels = json.loads(resp.read().decode("utf-8"))
|
||||
return wheels, repo_url
|
||||
|
||||
@staticmethod
|
||||
def is_rocm_system() -> bool:
    """Report whether the build machine looks like a ROCm system.

    Environment and filesystem probes are evaluated first so that torch is
    only imported when none of them matches.

    Returns:
        True when ``ROCM_PATH`` is set to a non-empty value, ``/opt/rocm``
        is a directory, ``rocminfo`` is found by ``which``, or an importable
        torch reports a HIP version; False otherwise.
    """
    found_marker = (
        bool(os.getenv("ROCM_PATH"))
        or os.path.isdir("/opt/rocm")
        or which("rocminfo") is not None
    )
    if found_marker:
        return True

    # Last resort: ask torch itself. A missing torch simply means "not ROCm".
    try:
        import torch

        uses_hip = torch.version.hip is not None
    except ImportError:
        uses_hip = False
    return uses_hip
|
||||
|
||||
@staticmethod
|
||||
def find_local_rocm_wheel() -> str | None:
|
||||
"""Search for a local vllm wheel in common locations."""
|
||||
import glob
|
||||
|
||||
for pattern in ["/vllm-workspace/dist/vllm-*.whl", "./dist/vllm-*.whl"]:
|
||||
wheels = glob.glob(pattern)
|
||||
if wheels:
|
||||
return sorted(wheels)[-1]
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def fetch_wheel_from_pypi_index(index_url: str, package: str = "vllm") -> str:
    """Fetch the latest wheel URL from a PyPI-style simple index.

    Downloads ``<index_url>/<package>/``, collects every anchor whose link
    target is a ``.whl`` file, and returns the last listed link that mentions
    the current machine architecture (``platform.machine()``).

    Args:
        index_url: Base URL of the simple index; a trailing slash is optional.
        package: Project name whose index page is fetched.

    Returns:
        The wheel link. Links not starting with ``http`` are resolved
        against the index page URL. Any ``#...`` fragment present in the
        page (for example a hash) is kept in the returned URL.

    Raises:
        ValueError: No wheel link on the page mentions the current
            architecture.
        Exception: Whatever ``urlopen`` raises when the page cannot be
            fetched is propagated unchanged.
    """
    import platform
    from html.parser import HTMLParser
    from urllib.parse import urljoin, urlsplit
    from urllib.request import urlopen

    arch = platform.machine()

    class WheelLinkParser(HTMLParser):
        """Collect the ``href`` of every ``<a>`` that points at a wheel."""

        def __init__(self):
            super().__init__()
            self.wheels = []

        def handle_starttag(self, tag, attrs):
            if tag != "a":
                return
            for name, value in attrs:
                # HTMLParser reports a bare ``href`` attribute as None.
                if name != "href" or not value:
                    continue
                # Index links may carry a fragment or query after the file
                # name (e.g. ``...whl#sha256=<digest>``), so only the path
                # component is tested for the wheel suffix.
                if urlsplit(value).path.endswith(".whl"):
                    self.wheels.append(value)

    simple_url = f"{index_url.rstrip('/')}/{package}/"
    print(f"Fetching wheel list from {simple_url}")
    with urlopen(simple_url) as resp:
        html = resp.read().decode("utf-8")

    parser = WheelLinkParser()
    parser.feed(html)

    # Walk the page bottom-up so the last listed matching wheel wins.
    for wheel in reversed(parser.wheels):
        if arch in wheel:
            if wheel.startswith("http"):
                return wheel
            return urljoin(simple_url, wheel)

    raise ValueError(f"No compatible wheel found for {arch} at {simple_url}")
|
||||
|
||||
@staticmethod
|
||||
def determine_wheel_url_rocm() -> tuple[str, str | None]:
    """Pick the precompiled wheel to use on a ROCm system.

    A wheel already present on disk takes priority; otherwise the wheel is
    looked up on the index named by ``VLLM_ROCM_WHEEL_INDEX`` (defaulting to
    AMD's index).

    Returns:
        ``(wheel, download_filename)``. For a local wheel the second item is
        ``None``; for an index wheel it is the last path segment of the URL
        with any ``#...`` fragment removed.
    """
    utils = precompiled_wheel_utils

    local_wheel = utils.find_local_rocm_wheel()
    if local_wheel is None:
        # Nothing on disk: fall back to AMD's PyPI index.
        index_url = os.getenv(
            "VLLM_ROCM_WHEEL_INDEX", "https://pypi.amd.com/vllm-rocm/simple"
        )
        print(f"Fetching ROCm precompiled wheel from {index_url}")
        wheel_url = utils.fetch_wheel_from_pypi_index(index_url)
        last_segment = wheel_url.rpartition("/")[2]
        download_filename = last_segment.partition("#")[0]
        print(f"Using ROCm precompiled wheel: {wheel_url}")
        return wheel_url, download_filename

    print(f"Found local ROCm wheel: {local_wheel}")
    return local_wheel, None
|
||||
|
||||
@staticmethod
|
||||
def determine_wheel_url() -> tuple[str, str | None]:
|
||||
"""
|
||||
@ -359,6 +448,11 @@ class precompiled_wheel_utils:
|
||||
print(f"Using user-specified precompiled wheel location: {wheel_location}")
|
||||
return wheel_location, None
|
||||
else:
|
||||
# ROCm: use local wheel or AMD's PyPI index
|
||||
# TODO: When we have ROCm nightly wheels, we can update this logic.
|
||||
if precompiled_wheel_utils.is_rocm_system():
|
||||
return precompiled_wheel_utils.determine_wheel_url_rocm()
|
||||
|
||||
import platform
|
||||
|
||||
arch = platform.machine()
|
||||
@ -465,6 +559,8 @@ class precompiled_wheel_utils:
|
||||
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
|
||||
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
|
||||
"vllm/cumem_allocator.abi3.so",
|
||||
# ROCm-specific libraries
|
||||
"vllm/_rocm_C.abi3.so",
|
||||
]
|
||||
|
||||
flash_attn_regex = re.compile(
|
||||
@ -601,6 +697,8 @@ def get_rocm_version():
|
||||
# Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
|
||||
# see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
|
||||
try:
|
||||
if ROCM_HOME is None:
|
||||
return None
|
||||
librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
|
||||
if not librocm_core_file.is_file():
|
||||
return None
|
||||
@ -745,7 +843,9 @@ if _is_hip():
|
||||
|
||||
if _is_cuda():
|
||||
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
|
||||
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
|
||||
if envs.VLLM_USE_PRECOMPILED or (
|
||||
CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
|
||||
):
|
||||
# FA3 requires CUDA 12.3 or later
|
||||
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
|
||||
# Optional since this doesn't get built (produce an .so file) when
|
||||
|
||||
@ -511,6 +511,16 @@ def test_human_readable_model_len():
|
||||
args = parser.parse_args(["--max-model-len", "10.2123451234567t"])
|
||||
assert args.max_model_len == 10212345123456
|
||||
|
||||
# Special value -1 for auto-fit to GPU memory
|
||||
args = parser.parse_args(["--max-model-len", "-1"])
|
||||
assert args.max_model_len == -1
|
||||
|
||||
# 'auto' is an alias for -1
|
||||
args = parser.parse_args(["--max-model-len", "auto"])
|
||||
assert args.max_model_len == -1
|
||||
args = parser.parse_args(["--max-model-len", "AUTO"])
|
||||
assert args.max_model_len == -1
|
||||
|
||||
# Invalid (do not allow decimals with binary multipliers)
|
||||
for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]:
|
||||
with pytest.raises(ArgumentError):
|
||||
|
||||
@ -5,6 +5,30 @@ import pytest
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
|
||||
def add_attention_backend(server_args, attention_config):
    """Add ``--attention-backend <name>`` to the server arguments when set.

    Args:
        server_args: Argument list that is extended in place.
        attention_config: Mapping that may hold a ``"backend"`` entry, or a
            falsy value (e.g. ``None``) to leave ``server_args`` untouched.
    """
    if not attention_config:
        return
    if "backend" not in attention_config:
        return

    backend = attention_config["backend"]
    server_args.extend(("--attention-backend", backend))
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def rocm_aiter_fa_attention():
    """Provide the attention config used by the audio tests.

    Returns:
        ``{"backend": "ROCM_AITER_FA"}`` when the current platform reports
        ROCm, otherwise ``None``.
    """
    # Imported lazily so the platform lookup only happens when the fixture
    # is actually requested.
    from vllm.platforms import current_platform

    return {"backend": "ROCM_AITER_FA"} if current_platform.is_rocm() else None
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mary_had_lamb():
|
||||
path = AudioAsset("mary_had_lamb").get_local_path()
|
||||
|
||||
@ -8,7 +8,7 @@ import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.multimodal.utils import encode_audio_base64, fetch_audio
|
||||
from vllm.multimodal.utils import encode_audio_base64, encode_audio_url, fetch_audio
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
@ -53,6 +53,14 @@ def base64_encoded_audio() -> dict[str, str]:
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def url_encoded_audio() -> dict[str, str]:
    """Map every test audio URL to its ``encode_audio_url`` result.

    Each entry is built by fetching the audio with ``fetch_audio`` and
    passing the fetched values straight to ``encode_audio_url``.
    """
    encoded: dict[str, str] = {}
    for audio_url in TEST_AUDIO_URLS:
        fetched = fetch_audio(audio_url)
        encoded[audio_url] = encode_audio_url(*fetched)
    return encoded
|
||||
|
||||
|
||||
def dummy_messages_from_audio_url(
|
||||
audio_urls: str | list[str],
|
||||
content_text: str = "What's happening in this audio?",
|
||||
@ -149,11 +157,9 @@ async def test_single_chat_session_audio_base64encoded(
|
||||
client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
audio_url: str,
|
||||
base64_encoded_audio: dict[str, str],
|
||||
url_encoded_audio: dict[str, str],
|
||||
):
|
||||
messages = dummy_messages_from_audio_url(
|
||||
f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
|
||||
)
|
||||
messages = dummy_messages_from_audio_url(url_encoded_audio[audio_url])
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
|
||||
@ -254,12 +254,11 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
|
||||
{"role": "system", "content": "you are a helpful assistant"},
|
||||
{"role": "user", "content": "what is 1+1?"},
|
||||
]
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
max_completion_tokens=5,
|
||||
logprobs=True,
|
||||
top_logprobs=5,
|
||||
)
|
||||
@ -267,13 +266,14 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
|
||||
assert len(chat_completion.choices) == 1
|
||||
|
||||
choice = chat_completion.choices[0]
|
||||
|
||||
assert choice.finish_reason == "length"
|
||||
assert chat_completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=10, prompt_tokens=37, total_tokens=47
|
||||
completion_tokens=5, prompt_tokens=37, total_tokens=42
|
||||
)
|
||||
|
||||
message = choice.message
|
||||
assert message.content is not None and len(message.content) >= 10
|
||||
assert message.content is not None and len(message.content) >= 5
|
||||
assert message.role == "assistant"
|
||||
messages.append({"role": "assistant", "content": message.content})
|
||||
|
||||
@ -282,7 +282,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
max_completion_tokens=5,
|
||||
)
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
@ -39,6 +39,7 @@ def server(request: pytest.FixtureRequest):
|
||||
"2",
|
||||
*passed_params,
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -504,7 +504,11 @@ async def test_web_search(client: OpenAI, model_name: str):
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_code_interpreter(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
# Code interpreter may need more time for container init + code execution
|
||||
timeout_value = client.timeout * 3
|
||||
client_with_timeout = client.with_options(timeout=timeout_value)
|
||||
|
||||
response = await client_with_timeout.responses.create(
|
||||
model=model_name,
|
||||
# TODO: Ideally should be able to set max tool calls
|
||||
# to prevent multi-turn, but it is not currently supported
|
||||
@ -868,6 +872,7 @@ async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.flaky(reruns=3)
|
||||
async def test_function_call_with_previous_input_messages(
|
||||
client: OpenAI, model_name: str
|
||||
):
|
||||
|
||||
@ -93,6 +93,7 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
|
||||
add_generation_prompt=True,
|
||||
enable_thinking=False, # default with Qwen3
|
||||
)
|
||||
|
||||
for ignore_eos in [True, False]:
|
||||
payload = {
|
||||
"model": MODEL_NAME,
|
||||
@ -108,9 +109,8 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
|
||||
}
|
||||
generate_resp = await client.post(GEN_ENDPOINT, json=payload)
|
||||
generate_data = generate_resp.json()
|
||||
generate_res = tokenizer.decode(
|
||||
generate_data["choices"][0]["token_ids"], skip_special_tokens=True
|
||||
)
|
||||
gen_token_ids = generate_data["choices"][0]["token_ids"]
|
||||
generate_res = tokenizer.decode(gen_token_ids, skip_special_tokens=True)
|
||||
|
||||
payload = {
|
||||
"model": MODEL_NAME,
|
||||
@ -119,12 +119,33 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
|
||||
"temperature": 0.0,
|
||||
"stream": False,
|
||||
"ignore_eos": ignore_eos,
|
||||
"chat_template_kwargs": dict(enable_thinking=False),
|
||||
"chat_template_kwargs": {"enable_thinking": False},
|
||||
}
|
||||
completions_resp = await client.post("/v1/chat/completions", json=payload)
|
||||
completions_data = completions_resp.json()
|
||||
completions_res = completions_data["choices"][0]["message"]["content"]
|
||||
|
||||
if ignore_eos:
|
||||
# When ignoring EOS, only compare up to the first EOS token
|
||||
# Post-EOS generation is undefined and may differ
|
||||
eos_tokens = {
|
||||
tokenizer.eos_token_id,
|
||||
*tokenizer.additional_special_tokens_ids,
|
||||
}
|
||||
# Find first EOS in generated tokens
|
||||
eos_pos = None
|
||||
for i, tid in enumerate(gen_token_ids):
|
||||
if tid in eos_tokens:
|
||||
eos_pos = i
|
||||
break
|
||||
if eos_pos is not None:
|
||||
gen_token_ids_truncated = gen_token_ids[:eos_pos]
|
||||
generate_res = tokenizer.decode(
|
||||
gen_token_ids_truncated, skip_special_tokens=True
|
||||
)
|
||||
# Truncate completions_res to same length for comparison
|
||||
completions_res = completions_res[: len(generate_res)]
|
||||
|
||||
assert generate_res == completions_res
|
||||
|
||||
|
||||
|
||||
@ -9,10 +9,16 @@ import time
|
||||
import openai
|
||||
import pytest
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
|
||||
|
||||
# GPU initialization might take longer
|
||||
_IS_ROCM = current_platform.is_rocm()
|
||||
_SERVER_STARTUP_TIMEOUT = 120
|
||||
_PROCESS_EXIT_TIMEOUT = 15
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_shutdown_on_engine_failure():
|
||||
@ -45,9 +51,11 @@ async def test_shutdown_on_engine_failure():
|
||||
"2",
|
||||
"--disable-frontend-multiprocessing",
|
||||
],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
# ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when
|
||||
# stdout/stderr pipes are enabled during ROCm GPU initialization.
|
||||
stdout=None if _IS_ROCM else subprocess.PIPE,
|
||||
stderr=None if _IS_ROCM else subprocess.PIPE,
|
||||
text=None if _IS_ROCM else True,
|
||||
preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
|
||||
)
|
||||
|
||||
@ -61,7 +69,7 @@ async def test_shutdown_on_engine_failure():
|
||||
)
|
||||
|
||||
# Poll until server is ready
|
||||
while time.time() - start_time < 30:
|
||||
while time.time() - start_time < _SERVER_STARTUP_TIMEOUT:
|
||||
try:
|
||||
await client.completions.create(
|
||||
model=MODEL_NAME, prompt="Hello", max_tokens=1
|
||||
@ -70,14 +78,18 @@ async def test_shutdown_on_engine_failure():
|
||||
except Exception:
|
||||
time.sleep(0.5)
|
||||
if proc.poll() is not None:
|
||||
stdout, stderr = proc.communicate(timeout=1)
|
||||
pytest.fail(
|
||||
f"Server died during startup. stdout: {stdout}, stderr: {stderr}"
|
||||
)
|
||||
if _IS_ROCM:
|
||||
pytest.fail(f"Server died during startup: {proc.returncode}")
|
||||
else:
|
||||
stdout, stderr = proc.communicate(timeout=1)
|
||||
pytest.fail(
|
||||
f"Server died during startup. "
|
||||
f"stdout: {stdout}, stderr: {stderr}"
|
||||
)
|
||||
else:
|
||||
proc.terminate()
|
||||
proc.wait(timeout=5)
|
||||
pytest.fail("Server failed to start in 30 seconds")
|
||||
proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
|
||||
pytest.fail(f"Server failed to start in {_SERVER_STARTUP_TIMEOUT} seconds")
|
||||
|
||||
# Kill server to simulate crash
|
||||
proc.terminate()
|
||||
@ -89,5 +101,5 @@ async def test_shutdown_on_engine_failure():
|
||||
model=MODEL_NAME, prompt="This should fail", max_tokens=1
|
||||
)
|
||||
|
||||
return_code = proc.wait(timeout=5)
|
||||
return_code = proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
|
||||
assert return_code is not None
|
||||
|
||||
@ -7,6 +7,7 @@ import json
|
||||
import pytest
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from .conftest import add_attention_backend
|
||||
|
||||
MISTRAL_FORMAT_ARGS = [
|
||||
"--tokenizer_mode",
|
||||
@ -20,12 +21,14 @@ MISTRAL_FORMAT_ARGS = [
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
|
||||
async def test_basic_audio(mary_had_lamb, model_name):
|
||||
async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
|
||||
server_args = ["--enforce-eager"]
|
||||
|
||||
if model_name.startswith("mistralai"):
|
||||
server_args += MISTRAL_FORMAT_ARGS
|
||||
|
||||
add_attention_backend(server_args, rocm_aiter_fa_attention)
|
||||
|
||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
@ -44,8 +47,13 @@ async def test_basic_audio(mary_had_lamb, model_name):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_basic_audio_with_lora(mary_had_lamb):
|
||||
async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
|
||||
"""Ensure STT (transcribe) requests can pass LoRA through to generate."""
|
||||
# ROCm SPECIFIC CONFIGURATION:
|
||||
# To ensure the test passes on ROCm, we modify the max model length to 512.
|
||||
# We DO NOT apply this to other platforms to maintain strict upstream parity.
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
model_name = "ibm-granite/granite-speech-3.3-2b"
|
||||
lora_model_name = "speech"
|
||||
server_args = [
|
||||
@ -56,11 +64,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
|
||||
"--lora-modules",
|
||||
f"{lora_model_name}={model_name}",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"512" if current_platform.is_rocm() else "2048",
|
||||
"--max-num-seqs",
|
||||
"1",
|
||||
]
|
||||
|
||||
add_attention_backend(server_args, rocm_aiter_fa_attention)
|
||||
|
||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
@ -79,12 +89,14 @@ async def test_basic_audio_with_lora(mary_had_lamb):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_basic_audio_gemma(foscolo):
|
||||
async def test_basic_audio_gemma(foscolo, rocm_aiter_fa_attention):
|
||||
# Gemma accuracy on some of the audio samples we use is particularly bad,
|
||||
# hence we use a different one here. WER is evaluated separately.
|
||||
model_name = "google/gemma-3n-E2B-it"
|
||||
server_args = ["--enforce-eager"]
|
||||
|
||||
add_attention_backend(server_args, rocm_aiter_fa_attention)
|
||||
|
||||
with RemoteOpenAIServer(
|
||||
model_name, server_args, max_wait_seconds=480
|
||||
) as remote_server:
|
||||
|
||||
@ -14,16 +14,26 @@ import pytest_asyncio
|
||||
import soundfile as sf
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from .conftest import add_attention_backend
|
||||
|
||||
SERVER_ARGS = ["--enforce-eager"]
|
||||
|
||||
|
||||
def _get_server_args(attention_config):
    """Return the server CLI arguments to use for a test server.

    A fresh copy of the module-level ``SERVER_ARGS`` is made so the shared
    list is never modified, and that copy is handed to
    ``add_attention_backend`` together with ``attention_config`` before being
    returned (the helper is expected to add the attention backend flags to
    the list in place when a backend is specified).
    """
    server_args = list(SERVER_ARGS)
    add_attention_backend(server_args, attention_config)
    return server_args
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
|
||||
)
|
||||
def server(request):
|
||||
def server(request, rocm_aiter_fa_attention):
|
||||
# Parametrize over model name
|
||||
with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
|
||||
with RemoteOpenAIServer(
|
||||
request.param, _get_server_args(rocm_aiter_fa_attention)
|
||||
) as remote_server:
|
||||
yield remote_server, request.param
|
||||
|
||||
|
||||
@ -35,10 +45,12 @@ async def client_and_model(server):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_non_asr_model(foscolo):
|
||||
async def test_non_asr_model(foscolo, rocm_aiter_fa_attention):
|
||||
# text to text model
|
||||
model_name = "JackFram/llama-68m"
|
||||
with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
|
||||
with RemoteOpenAIServer(
|
||||
model_name, _get_server_args(rocm_aiter_fa_attention)
|
||||
) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
res = await client.audio.translations.create(
|
||||
model=model_name, file=foscolo, temperature=0.0
|
||||
@ -49,8 +61,13 @@ async def test_non_asr_model(foscolo):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_basic_audio_with_lora(mary_had_lamb):
|
||||
async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
|
||||
"""Ensure STT (translate) requests can pass LoRA through to generate."""
|
||||
# ROCm SPECIFIC CONFIGURATION:
|
||||
# To ensure the test passes on ROCm, we modify the max model length to 512.
|
||||
# We DO NOT apply this to other platforms to maintain strict upstream parity.
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# NOTE - careful to call this test before the module scoped server
|
||||
# fixture, otherwise it'll OOMkill the CI
|
||||
model_name = "ibm-granite/granite-speech-3.3-2b"
|
||||
@ -63,11 +80,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
|
||||
"--lora-modules",
|
||||
f"{lora_model_name}={model_name}",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"512" if current_platform.is_rocm() else "2048",
|
||||
"--max-num-seqs",
|
||||
"1",
|
||||
]
|
||||
|
||||
add_attention_backend(server_args, rocm_aiter_fa_attention)
|
||||
|
||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
|
||||
@ -7,7 +7,8 @@ import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from vllm.multimodal.utils import encode_video_base64, fetch_video
|
||||
from vllm.multimodal.utils import encode_video_url, fetch_video
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
@ -37,7 +38,16 @@ def server():
|
||||
json.dumps({"video": MAXIMUM_VIDEOS}),
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
# ROCm: Increase timeouts to handle potential network delays and slower
|
||||
# video processing when downloading multiple videos from external sources
|
||||
env_overrides = {}
|
||||
if current_platform.is_rocm():
|
||||
env_overrides = {
|
||||
"VLLM_VIDEO_FETCH_TIMEOUT": "120",
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
|
||||
}
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -48,9 +58,9 @@ async def client(server):
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def base64_encoded_video() -> dict[str, str]:
|
||||
def url_encoded_video() -> dict[str, str]:
|
||||
return {
|
||||
video_url: encode_video_base64(fetch_video(video_url)[0])
|
||||
video_url: encode_video_url(fetch_video(video_url)[0])
|
||||
for video_url in TEST_VIDEO_URLS
|
||||
}
|
||||
|
||||
@ -175,11 +185,9 @@ async def test_single_chat_session_video_base64encoded(
|
||||
client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
video_url: str,
|
||||
base64_encoded_video: dict[str, str],
|
||||
url_encoded_video: dict[str, str],
|
||||
):
|
||||
messages = dummy_messages_from_video_url(
|
||||
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
|
||||
)
|
||||
messages = dummy_messages_from_video_url(url_encoded_video[video_url])
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
@ -223,11 +231,9 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
|
||||
client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
video_url: str,
|
||||
base64_encoded_video: dict[str, str],
|
||||
url_encoded_video: dict[str, str],
|
||||
):
|
||||
messages = dummy_messages_from_video_url(
|
||||
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
|
||||
)
|
||||
messages = dummy_messages_from_video_url(url_encoded_video[video_url])
|
||||
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
@ -291,6 +297,11 @@ async def test_chat_streaming_video(
|
||||
@pytest.mark.parametrize(
|
||||
"video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]
|
||||
)
|
||||
@pytest.mark.flaky(
|
||||
reruns=2,
|
||||
reruns_delay=5,
|
||||
condition=current_platform.is_rocm(),
|
||||
)
|
||||
async def test_multi_video_input(
|
||||
client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
|
||||
):
|
||||
|
||||
@ -9,7 +9,8 @@ import pytest_asyncio
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from vllm.multimodal.base import MediaWithBytes
|
||||
from vllm.multimodal.utils import encode_image_base64, fetch_image
|
||||
from vllm.multimodal.utils import encode_image_url, fetch_image
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
@ -35,7 +36,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
|
||||
],
|
||||
[
|
||||
"The image shows a Venn diagram with three over",
|
||||
"The image shows a colorful Venn diagram with",
|
||||
"The image displays a Venn diagram with three over",
|
||||
],
|
||||
[
|
||||
"This image displays a gradient of colors ranging from",
|
||||
@ -43,6 +44,27 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
|
||||
],
|
||||
]
|
||||
|
||||
EXPECTED_MM_BEAM_SEARCH_RES_ROCM = [
|
||||
# MultiHeadAttention attn_backend: FLASH_ATTN
|
||||
# with Triton Attention backend
|
||||
[
|
||||
"The image shows a wooden boardwalk leading through a",
|
||||
"The image shows a wooden boardwalk extending into a",
|
||||
],
|
||||
[
|
||||
"The image shows two parrots perched on",
|
||||
"The image shows two birds perched on a cur",
|
||||
],
|
||||
[
|
||||
"The image shows a Venn diagram with three over",
|
||||
"The image contains a Venn diagram with three over",
|
||||
],
|
||||
[
|
||||
"This image displays a gradient of colors ranging from",
|
||||
"This image displays a gradient of colors transitioning from",
|
||||
],
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
@ -59,7 +81,16 @@ def server():
|
||||
json.dumps({"image": MAXIMUM_IMAGES}),
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
# ROCm: Increase timeouts to handle potential network delays and slower
|
||||
# image processing when downloading multiple images from external sources
|
||||
env_overrides = {}
|
||||
if current_platform.is_rocm():
|
||||
env_overrides = {
|
||||
"VLLM_VIDEO_FETCH_TIMEOUT": "120",
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
|
||||
}
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -70,11 +101,9 @@ async def client(server):
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
||||
def url_encoded_image(local_asset_server) -> dict[str, str]:
|
||||
return {
|
||||
image_asset: encode_image_base64(
|
||||
local_asset_server.get_image_asset(image_asset)
|
||||
)
|
||||
image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset))
|
||||
for image_asset in TEST_IMAGE_ASSETS
|
||||
}
|
||||
|
||||
@ -234,11 +263,11 @@ async def test_single_chat_session_image_base64encoded(
|
||||
model_name: str,
|
||||
raw_image_url: str,
|
||||
image_url: str,
|
||||
base64_encoded_image: dict[str, str],
|
||||
url_encoded_image: dict[str, str],
|
||||
):
|
||||
content_text = "What's in this image?"
|
||||
messages = dummy_messages_from_image_url(
|
||||
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
|
||||
url_encoded_image[raw_image_url],
|
||||
content_text,
|
||||
)
|
||||
|
||||
@ -288,15 +317,20 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
|
||||
client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
image_idx: int,
|
||||
base64_encoded_image: dict[str, str],
|
||||
url_encoded_image: dict[str, str],
|
||||
):
|
||||
# ROCm: Switch expected results based on platform
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# NOTE: This test also validates that we pass MM data through beam search
|
||||
raw_image_url = TEST_IMAGE_ASSETS[image_idx]
|
||||
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
|
||||
|
||||
messages = dummy_messages_from_image_url(
|
||||
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
|
||||
)
|
||||
if current_platform.is_rocm():
|
||||
expected_res = EXPECTED_MM_BEAM_SEARCH_RES_ROCM[image_idx]
|
||||
else:
|
||||
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
|
||||
|
||||
messages = dummy_messages_from_image_url(url_encoded_image[raw_image_url])
|
||||
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
|
||||
@ -33,6 +33,7 @@ def _terratorch_dummy_messages():
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
|
||||
)
|
||||
|
||||
@ -9,11 +9,6 @@ from vllm import LLM, PoolingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
|
||||
PROMPTS = [
|
||||
@ -35,6 +30,12 @@ TOKEN_IDS = [
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm():
|
||||
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
|
||||
# that supports encoder-only models on ROCm.
|
||||
attention_config = None
|
||||
if current_platform.is_rocm():
|
||||
attention_config = {"backend": "FLEX_ATTENTION"}
|
||||
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(
|
||||
@ -44,6 +45,7 @@ def llm():
|
||||
gpu_memory_utilization=0.75,
|
||||
enforce_eager=True,
|
||||
seed=0,
|
||||
attention_config=attention_config,
|
||||
)
|
||||
|
||||
yield weakref.proxy(llm)
|
||||
|
||||
@ -9,11 +9,6 @@ import pytest_asyncio
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
|
||||
max_model_len = 128
|
||||
|
||||
@ -44,6 +39,10 @@ def server():
|
||||
str(max_model_len),
|
||||
]
|
||||
|
||||
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||
if current_platform.is_rocm():
|
||||
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
28
tests/entrypoints/pooling/embed/conftest.py
Normal file
28
tests/entrypoints/pooling/embed/conftest.py
Normal file
@ -0,0 +1,28 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Pytest configuration for vLLM pooling embed tests."""
|
||||
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(config, items):
    """Apply ROCm-only PyTorch SDP settings from the pytest collection hook.

    ``config`` and ``items`` are accepted to match the hook signature; neither
    is read here. On any platform other than ROCm the hook does nothing.
    """
    if current_platform.is_rocm():
        # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
        # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
        # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
        sdp_backends = torch.backends.cuda
        sdp_backends.enable_flash_sdp(False)
        sdp_backends.enable_mem_efficient_sdp(False)
        sdp_backends.enable_math_sdp(True)

        # Surface the changed global state in the test output.
        warnings.warn(
            "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
            "to avoid HuggingFace Transformers accuracy issues",
            UserWarning,
            stacklevel=1,
        )
|
||||
@ -13,11 +13,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
|
||||
|
||||
MODEL_NAME = "intfloat/e5-small"
|
||||
@ -28,6 +23,10 @@ MAIN_SCORE = 0.7422994752439667
|
||||
def server():
|
||||
args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
|
||||
|
||||
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||
if current_platform.is_rocm():
|
||||
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -11,11 +11,6 @@ from vllm import LLM, PoolingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
|
||||
prompts = ["The chef prepared a delicious meal."]
|
||||
@ -23,6 +18,12 @@ prompts = ["The chef prepared a delicious meal."]
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm():
|
||||
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
|
||||
# that supports encoder-only models on ROCm.
|
||||
attention_config = None
|
||||
if current_platform.is_rocm():
|
||||
attention_config = {"backend": "FLEX_ATTENTION"}
|
||||
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(
|
||||
@ -32,6 +33,7 @@ def llm():
|
||||
gpu_memory_utilization=0.75,
|
||||
enforce_eager=True,
|
||||
seed=0,
|
||||
attention_config=attention_config,
|
||||
)
|
||||
|
||||
yield weakref.proxy(llm)
|
||||
|
||||
@ -28,16 +28,20 @@ from vllm.utils.serial_utils import (
|
||||
decode_pooling_output,
|
||||
)
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
|
||||
DTYPE = "bfloat16"
|
||||
|
||||
|
||||
if current_platform.is_rocm():
|
||||
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
|
||||
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
|
||||
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
|
||||
torch.backends.cuda.enable_flash_sdp(False)
|
||||
torch.backends.cuda.enable_mem_efficient_sdp(False)
|
||||
torch.backends.cuda.enable_math_sdp(True)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
@ -53,6 +57,10 @@ def server():
|
||||
DUMMY_CHAT_TEMPLATE,
|
||||
]
|
||||
|
||||
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||
if current_platform.is_rocm():
|
||||
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -14,11 +14,6 @@ from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODELS = [
|
||||
EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
|
||||
EmbedModelInfo(
|
||||
@ -62,6 +57,10 @@ def server(model_info, dtype: str):
|
||||
["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}']
|
||||
)
|
||||
|
||||
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||
if current_platform.is_rocm():
|
||||
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||
|
||||
with RemoteOpenAIServer(model_info.name, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -18,11 +18,6 @@ from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
|
||||
def _generate_random_text(word_count: int) -> str:
|
||||
"""Generate random text with approximately the specified word count."""
|
||||
@ -228,6 +223,10 @@ def server_with_chunked_processing():
|
||||
"0.8",
|
||||
]
|
||||
|
||||
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||
if current_platform.is_rocm():
|
||||
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -10,7 +10,7 @@ from transformers import AutoProcessor
|
||||
from tests.utils import VLLM_PATH, RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
||||
from vllm.multimodal.base import MediaWithBytes
|
||||
from vllm.multimodal.utils import encode_image_base64, fetch_image
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
|
||||
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
|
||||
MAXIMUM_IMAGES = 2
|
||||
@ -48,14 +48,6 @@ def server():
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
||||
return {
|
||||
image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
|
||||
for image_url in TEST_IMAGE_ASSETS
|
||||
}
|
||||
|
||||
|
||||
def get_hf_prompt_tokens(model_name, content, image_url):
|
||||
processor = AutoProcessor.from_pretrained(
|
||||
model_name, trust_remote_code=True, num_crops=4
|
||||
|
||||
@ -15,11 +15,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
|
||||
|
||||
MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
||||
@ -30,6 +25,10 @@ st_main_score = 0.33457
|
||||
def server():
|
||||
args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
|
||||
|
||||
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||
if current_platform.is_rocm():
|
||||
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -11,16 +11,17 @@ from vllm import LLM, PoolingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm():
|
||||
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
|
||||
# that supports encoder-only models on ROCm.
|
||||
attention_config = None
|
||||
if current_platform.is_rocm():
|
||||
attention_config = {"backend": "FLEX_ATTENTION"}
|
||||
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(
|
||||
@ -30,6 +31,7 @@ def llm():
|
||||
gpu_memory_utilization=0.75,
|
||||
enforce_eager=True,
|
||||
seed=0,
|
||||
attention_config=attention_config,
|
||||
)
|
||||
|
||||
yield weakref.proxy(llm)
|
||||
|
||||
@ -11,11 +11,6 @@ from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
|
||||
from vllm.entrypoints.pooling.score.protocol import RerankResponse
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "BAAI/bge-reranker-base"
|
||||
DTYPE = "bfloat16"
|
||||
|
||||
@ -24,6 +19,10 @@ DTYPE = "bfloat16"
|
||||
def server():
|
||||
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
|
||||
|
||||
# ROCm: Use Flex Attention to support encoder-only self-attention.
|
||||
if current_platform.is_rocm():
|
||||
args.extend(["--attention-backend", "FLEX_ATTENTION"])
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -12,11 +12,6 @@ from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.score.protocol import ScoreResponse
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODELS = [
|
||||
{"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},
|
||||
{"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False},
|
||||
@ -44,6 +39,10 @@ def model(request):
|
||||
def server(model: dict[str, Any]):
    """Launch a ``RemoteOpenAIServer`` for the given model entry and yield it.

    Args:
        model: Mapping describing the model under test; only ``model["name"]``
            is read here and passed to the server as the model to serve.

    The server is started with ``--enforce-eager``, a ``--max-model-len`` of
    100 and the module-level ``DTYPE``. It is yielded from inside the ``with``
    block, so the context manager is only exited once the generator is resumed.
    """
    cli_args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]

    # ROCm: select Flex Attention to support encoder-only self-attention.
    on_rocm = current_platform.is_rocm()
    if on_rocm:
        cli_args += ["--attention-backend", "FLEX_ATTENTION"]

    with RemoteOpenAIServer(model["name"], cli_args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@ -25,9 +25,9 @@ from vllm.entrypoints.chat_utils import (
|
||||
)
|
||||
from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
|
||||
from vllm.multimodal.utils import (
|
||||
encode_audio_base64,
|
||||
encode_image_base64,
|
||||
encode_video_base64,
|
||||
encode_audio_url,
|
||||
encode_image_url,
|
||||
encode_video_url,
|
||||
)
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
@ -141,22 +141,19 @@ def mistral_model_config():
|
||||
@pytest.fixture(scope="module")
def image_url():
    """Return the "cherry_blossom" image asset encoded as a URL string.

    The encoding (including any ``data:`` prefix) is delegated to
    ``encode_image_url`` instead of being assembled by hand here, so there is
    a single return path and no local shadowing the ``base64`` module name.
    """
    image = ImageAsset("cherry_blossom")
    return encode_image_url(image.pil_image)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def video_url():
    """Return the "baby_reading" video asset encoded as a URL string.

    The asset is built with ``VideoAsset("baby_reading", 1)`` and its
    ``np_ndarrays`` are handed to ``encode_video_url``, which owns the URL
    format; the hand-built f-string return that preceded it is dropped so the
    function has a single, reachable return.
    """
    video = VideoAsset("baby_reading", 1)
    return encode_video_url(video.np_ndarrays)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def audio_url():
    """Return the "mary_had_lamb" audio asset encoded as a URL string.

    ``audio.audio_and_sample_rate`` is unpacked into the positional arguments
    of ``encode_audio_url``, which owns the URL format; the hand-built
    f-string return that preceded it is dropped so the function has a single,
    reachable return.
    """
    audio = AudioAsset("mary_had_lamb")
    return encode_audio_url(*audio.audio_and_sample_rate)
|
||||
|
||||
|
||||
def _assert_mm_data_is_image_input(
|
||||
|
||||
11
tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
Normal file
11
tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
Normal file
@ -0,0 +1,11 @@
|
||||
model_name: "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8"
|
||||
accuracy_threshold: 0.85
|
||||
num_questions: 1319
|
||||
num_fewshot: 5
|
||||
server_args: >-
|
||||
--max-model-len 4096
|
||||
--tensor-parallel-size 2
|
||||
--enable-expert-parallel
|
||||
--async-scheduling
|
||||
env:
|
||||
VLLM_USE_FLASHINFER_MOE_FP8: "1"
|
||||
@ -4,3 +4,4 @@ Qwen1.5-MoE-W4A16-CT.yaml
|
||||
DeepSeek-V2-Lite-Instruct-FP8.yaml
|
||||
Qwen3-30B-A3B-NVFP4.yaml
|
||||
Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
|
||||
Qwen3-Next-FP8-EP2.yaml
|
||||
|
||||
@ -71,6 +71,7 @@ def test_gsm8k_correctness(config_filename):
|
||||
print(f"Number of questions: {eval_config['num_questions']}")
|
||||
print(f"Number of few-shot examples: {eval_config['num_fewshot']}")
|
||||
print(f"Server args: {' '.join(server_args)}")
|
||||
print(f"Environment variables: {env_dict}")
|
||||
|
||||
# Launch server and run evaluation
|
||||
with RemoteOpenAIServer(
|
||||
|
||||
@ -40,93 +40,6 @@ KV_CACHE_DTYPE = ["auto", "fp8"]
|
||||
RESHAPE_FLASH_IMPLEMENTATIONS = ["cuda", "triton"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
def test_copy_blocks(
    kv_cache_factory,
    num_mappings: int,
    num_layers: int,
    num_heads: int,
    head_size: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    seed: int,
    kv_cache_dtype: str,
    device: str,
) -> None:
    """Compare ``ops.copy_blocks`` with a block-by-block reference copy.

    Each randomly chosen source block is mapped to two distinct destination
    blocks. The kernel is run on the caches returned by ``kv_cache_factory``
    while the same mapping is replayed with ``Tensor.copy_`` on clones of
    those caches; the two results must match under
    ``torch.testing.assert_close``.
    """
    # The fp8 cache variant is skipped for head sizes that are not a
    # multiple of 16.
    if kv_cache_dtype == "fp8" and head_size % 16:
        pytest.skip()
    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    torch.cuda.set_device(device)

    # Every source needs two destinations, all drawn from disjoint blocks.
    assert 2 * num_mappings <= num_blocks
    sources = random.sample(range(num_blocks), num_mappings)
    unused_blocks = list(set(range(num_blocks)) - set(sources))
    targets = random.sample(unused_blocks, 2 * num_mappings)
    block_mapping: list[tuple[int, int]] = [
        (source, target)
        for idx, source in enumerate(sources)
        for target in (targets[2 * idx], targets[2 * idx + 1])
    ]

    # Caches the kernel will modify in place.
    key_caches, value_caches = kv_cache_factory(
        num_blocks,
        block_size,
        num_layers,
        num_heads,
        head_size,
        kv_cache_dtype,
        dtype,
        seed,
        device,
    )

    # Independent copies that receive the reference result.
    expected_key_caches = [cache.clone() for cache in key_caches]
    expected_value_caches = [cache.clone() for cache in value_caches]

    # The kernel takes the mapping as an (N, 2) int64 tensor.
    mapping_pairs = torch.tensor(block_mapping, dtype=torch.int64, device=device)
    block_mapping_tensor = mapping_pairs.view(-1, 2)

    opcheck(
        torch.ops._C_cache_ops.copy_blocks,
        (key_caches, value_caches, block_mapping_tensor),
        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
        cond=(head_size == HEAD_SIZES[0]),
    )
    ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)

    # Reference: replay the mapping on the clones, key caches first and
    # value caches second for every (src, dst) pair.
    for source, target in block_mapping:
        for expected in (*expected_key_caches, *expected_value_caches):
            expected[target].copy_(expected[source])

    # Kernel output must match the reference for every layer.
    for actual, expected in zip(key_caches, expected_key_caches):
        torch.testing.assert_close(actual, expected)
    for actual, expected in zip(value_caches, expected_value_caches):
        torch.testing.assert_close(actual, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@ -763,73 +676,6 @@ def test_concat_and_cache_ds_mla(
|
||||
torch.testing.assert_close(kv_rope, ref_rope, atol=0.001, rtol=0.1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
def test_copy_blocks_mla(
    kv_lora_rank: int,
    qk_rope_head_dim: int,
    block_size: int,
    num_blocks: int,
    num_layers: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
    kv_cache_dtype: str,
) -> None:
    """Compare ``ops.copy_blocks_mla`` with a block-by-block reference copy.

    One cache per layer is created with ``_create_mla_cache`` and filled with
    ``_fill_mla_cache``. Up to two source blocks are each mapped to two
    destination blocks; the mapping is replayed with ``Tensor.copy_`` on
    clones before the kernel runs, and the kernel result must match the
    clones under ``torch.testing.assert_close``.
    """
    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    torch.cuda.set_device(device)

    # Width of one cache entry: the two parametrized dimensions summed.
    entry_size = kv_lora_rank + qk_rope_head_dim

    kv_caches = []
    for _layer in range(num_layers):
        layer_cache = _create_mla_cache(
            num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device
        )
        _fill_mla_cache(layer_cache, kv_cache_dtype=kv_cache_dtype)
        kv_caches.append(layer_cache)

    # Clones receive the reference result.
    ref_caches = [cache.clone() for cache in kv_caches]

    # At most two sources, each paired with two destinations drawn from the
    # blocks that are not sources.
    num_mappings = min(2, num_blocks // 2)
    sources = random.sample(range(num_blocks), num_mappings)
    unused_blocks = list(set(range(num_blocks)) - set(sources))
    targets = random.sample(unused_blocks, 2 * num_mappings)
    block_mapping = [
        (source, target)
        for idx, source in enumerate(sources)
        for target in (targets[2 * idx], targets[2 * idx + 1])
    ]
    mapping_pairs = torch.tensor(block_mapping, dtype=torch.int64, device=device)
    block_mapping_tensor = mapping_pairs.view(-1, 2)

    # Reference copy is applied before the kernel is invoked.
    for source, target in block_mapping:
        for expected in ref_caches:
            expected[target].copy_(expected[source])

    opcheck(
        torch.ops._C_cache_ops.copy_blocks_mla,
        (kv_caches, block_mapping_tensor),
        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
    )
    ops.copy_blocks_mla(kv_caches, block_mapping_tensor)

    for actual, expected in zip(kv_caches, ref_caches):
        torch.testing.assert_close(actual, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
|
||||
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
|
||||
|
||||
@ -19,7 +19,7 @@ def pytest_collection_modifyitems(config, items):
|
||||
return
|
||||
|
||||
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
|
||||
# accuracy issues
|
||||
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
|
||||
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
|
||||
torch.backends.cuda.enable_flash_sdp(False)
|
||||
torch.backends.cuda.enable_mem_efficient_sdp(False)
|
||||
|
||||
@ -513,6 +513,7 @@ VLM_TEST_SETTINGS = {
|
||||
max_model_len=8192,
|
||||
use_tokenizer_eos=True,
|
||||
patch_hf_runner=model_utils.internvl_patch_hf_runner,
|
||||
num_logprobs=10 if current_platform.is_rocm() else 5,
|
||||
),
|
||||
"intern_vl-hf": VLMTestInfo(
|
||||
models=["OpenGVLab/InternVL3-1B-hf"],
|
||||
|
||||
@ -8,7 +8,7 @@ from PIL.Image import Image
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.multimodal.utils import encode_image_base64
|
||||
from vllm.multimodal.utils import encode_image_url
|
||||
|
||||
MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"
|
||||
|
||||
@ -31,10 +31,7 @@ def test_keye_vl(
|
||||
question: str,
|
||||
):
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
image_urls = [
|
||||
f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
|
||||
]
|
||||
image_urls = [encode_image_url(image) for image in images]
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=MODEL_NAME,
|
||||
|
||||
@ -15,7 +15,7 @@ from transformers import AutoProcessor
|
||||
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.attention.backends.registry import AttentionBackendEnum
|
||||
from vllm.multimodal.utils import encode_image_base64
|
||||
from vllm.multimodal.utils import encode_image_url
|
||||
from vllm.multimodal.video import sample_frames_from_video
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
@ -178,8 +178,7 @@ def build_dots_ocr_prompt(images, config):
|
||||
"""Build Dots.OCR specific prompt with OCR instructions."""
|
||||
# Use only stop_sign image for Dots.OCR
|
||||
image = images[0] # Already filtered to stop_sign
|
||||
|
||||
image_url = f"data:image/jpeg;base64,{encode_image_base64(image)}"
|
||||
image_url = encode_image_url(image)
|
||||
|
||||
placeholders = [{"type": "image_url", "image_url": {"url": image_url}}]
|
||||
messages = [
|
||||
@ -204,9 +203,7 @@ def build_processor_prompt(images, config):
|
||||
config["model_name"], trust_remote_code=True
|
||||
)
|
||||
|
||||
image_urls = [
|
||||
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
|
||||
]
|
||||
image_urls = [encode_image_url(img) for img in images]
|
||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||
messages = [
|
||||
{
|
||||
@ -225,9 +222,7 @@ def build_processor_prompt(images, config):
|
||||
|
||||
def build_ovis_prompt(images, config):
|
||||
"""Build Ovis2.5 specific prompt with custom format."""
|
||||
image_urls = [
|
||||
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
|
||||
]
|
||||
image_urls = [encode_image_url(img) for img in images]
|
||||
|
||||
placeholders = "\n".join(
|
||||
f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
|
||||
|
||||
@ -111,4 +111,5 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
|
||||
|
||||
assert len(chat_completion.choices) == 1
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.message.content == "In the first audio clip, you hear a brief"
|
||||
assert choice.finish_reason == "length"
|
||||
|
||||
@ -860,6 +860,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
# disable this temporarily until we support HF format
|
||||
is_available_online=False,
|
||||
),
|
||||
"VoxtralStreamingGeneration": _HfExamplesInfo(
|
||||
"<place-holder>",
|
||||
# disable this temporarily until we support HF format
|
||||
is_available_online=False,
|
||||
),
|
||||
# [Encoder-decoder]
|
||||
"WhisperForConditionalGeneration": _HfExamplesInfo(
|
||||
"openai/whisper-large-v3-turbo",
|
||||
|
||||
@ -38,7 +38,7 @@ def test_inference(
|
||||
max_num_seqs=32,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
vllm_output = vllm_model.llm.encode(prompt)
|
||||
vllm_output = vllm_model.llm.encode(prompt, pooling_task="plugin")
|
||||
assert torch.equal(
|
||||
torch.isnan(vllm_output[0].outputs.data).any(), torch.tensor(False)
|
||||
)
|
||||
|
||||
@ -4,6 +4,11 @@
|
||||
set -e
|
||||
set -x
|
||||
|
||||
if command -v rocminfo >/dev/null 2>&1; then
|
||||
echo "Skipping test for ROCm platform"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
cd /vllm-workspace/
|
||||
|
||||
rm -rf .venv
|
||||
@ -36,7 +41,7 @@ if diff before.txt after.txt; then
|
||||
echo "torch version not overridden."
|
||||
else
|
||||
echo "torch version overridden by nightly_torch_test.txt, \
|
||||
if the dependency is not triggered by the pytroch nightly test,\
|
||||
if the dependency is not triggered by the pytorch nightly test,\
|
||||
please add the dependency to the list 'white_list' in tools/pre_commit/generate_nightly_torch_test.py"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@ -281,6 +281,8 @@ def test_extract_tool_calls_pre_v11_tokenizer(
|
||||
"single_tool_add",
|
||||
"single_tool_weather",
|
||||
"multiple_tool_calls",
|
||||
"complex",
|
||||
"wrong_json",
|
||||
],
|
||||
argnames=["model_output", "expected_tool_calls", "expected_content"],
|
||||
argvalues=[
|
||||
@ -326,6 +328,36 @@ def test_extract_tool_calls_pre_v11_tokenizer(
|
||||
],
|
||||
None,
|
||||
),
|
||||
(
|
||||
# Complex
|
||||
"""hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')""", # noqa: E501
|
||||
[
|
||||
ToolCall(
|
||||
function=FunctionCall(
|
||||
name="bash",
|
||||
arguments=json.dumps(
|
||||
{"command": "print(\"hello world!\")\nre.compile(r'{}')"}
|
||||
)[:-2],
|
||||
)
|
||||
)
|
||||
],
|
||||
"hi{hi",
|
||||
),
|
||||
(
|
||||
# Wrong json
|
||||
"""hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501
|
||||
[
|
||||
ToolCall(
|
||||
function=FunctionCall(
|
||||
name="bash",
|
||||
arguments=json.dumps(
|
||||
{"command": "print(\"hello world!\")\nre.compile(r'{}')"}
|
||||
),
|
||||
)
|
||||
)
|
||||
],
|
||||
"hi{hi",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_extract_tool_calls(
|
||||
@ -673,7 +705,7 @@ def test_extract_tool_calls_streaming(
|
||||
),
|
||||
(
|
||||
# Complex
|
||||
"""[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501
|
||||
"""hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501
|
||||
[
|
||||
ToolCall(
|
||||
function=FunctionCall(
|
||||
@ -684,7 +716,7 @@ def test_extract_tool_calls_streaming(
|
||||
)
|
||||
)
|
||||
],
|
||||
"",
|
||||
"hi{hi",
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
@ -106,6 +106,7 @@ class RemoteOpenAIServer:
|
||||
env.update(env_dict)
|
||||
serve_cmd = ["vllm", "serve", model, *vllm_serve_args]
|
||||
print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}")
|
||||
print(f"Environment variables: {env}")
|
||||
self.proc: subprocess.Popen = subprocess.Popen(
|
||||
serve_cmd,
|
||||
env=env,
|
||||
|
||||
@ -1798,3 +1798,60 @@ def test_request_with_prompt_embeds_and_mm_inputs(hash_fn: Callable[[Any], bytes
|
||||
)
|
||||
)
|
||||
assert block_hashes[1] == expected_hash2
|
||||
|
||||
|
||||
def test_auto_fit_max_model_len():
    """Test that max_model_len=-1 auto-fits to available GPU memory.

    Two scenarios are run against the same two-layer KV cache spec: with
    ample memory ``max_model_len`` must stay at 1024, and with only 32 blocks
    of memory it must end up strictly between 0 and 1024.
    """

    def _auto_fit_config():
        # Build the config with a concrete length, then mark it as if the
        # user had passed -1 by setting original_max_model_len.
        fresh_model_config = ModelConfig(max_model_len=1024)
        fresh_model_config.original_max_model_len = -1
        return VllmConfig(model_config=fresh_model_config)

    vllm_config = _auto_fit_config()

    bytes_per_block_per_layer = 16 * 2 * 64 * 4 * 2  # 16KB per block per layer
    kv_cache_specs = {
        "layer_1": new_kv_cache_spec(),
        "layer_2": new_kv_cache_spec(),
    }

    # Scenario 1: plenty of memory, so the length is left untouched.
    ample_memory = bytes_per_block_per_layer * 2 * 1024
    get_kv_cache_configs(vllm_config, [kv_cache_specs], [ample_memory])
    assert vllm_config.model_config.max_model_len == 1024

    # Scenario 2: start again from a fresh config.
    vllm_config = _auto_fit_config()

    # 32 blocks worth of memory for 2 layers = can fit 32*16=512 tokens,
    # which is less than the configured 1024.
    scarce_memory = bytes_per_block_per_layer * 2 * 32
    get_kv_cache_configs(vllm_config, [kv_cache_specs], [scarce_memory])

    # The length must have been lowered, but not to zero or below.
    fitted_len = vllm_config.model_config.max_model_len
    assert fitted_len < 1024
    assert fitted_len > 0
|
||||
|
||||
|
||||
def test_auto_fit_max_model_len_not_triggered():
    """Test that auto-fit is not triggered when original_max_model_len is not -1.

    ``original_max_model_len`` is left at whatever ``ModelConfig`` sets it to
    (the original comment states this is ``None``), and ``max_model_len``
    must still be 16 after the KV cache configs are computed.
    """
    # No override of original_max_model_len here, unlike the auto-fit test.
    vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16))

    bytes_per_block_per_layer = 16 * 2 * 64 * 4 * 2
    kv_cache_specs = {
        "layer_1": new_kv_cache_spec(),
        "layer_2": new_kv_cache_spec(),
    }

    # Memory for 32 blocks across both layers; this should work normally
    # without auto-fit.
    available_memory = bytes_per_block_per_layer * 2 * 32
    get_kv_cache_configs(vllm_config, [kv_cache_specs], [available_memory])

    assert vllm_config.model_config.max_model_len == 16
|
||||
|
||||
@ -1356,6 +1356,69 @@ def test_kv_cache_events(blocks_to_cache: int):
|
||||
assert len(manager.block_pool.cached_block_hash_to_block) == 0
|
||||
|
||||
|
||||
def test_null_parent_block_hash():
    """Check the parent hash reported when the physical parent is the null block.

    A four-token request (block size 1) is cached with the first two blocks
    treated as already cached, the second of which is ``pool.null_block``.
    The test verifies that exactly one ``BlockStored`` event is emitted, that
    its ``parent_block_hash`` equals the converted request hash at index
    ``num_cached_blocks - 1`` (and is not ``None``), that its ``block_hashes``
    are the converted hashes of the newly cached blocks, and that the null
    block itself never receives a hash.
    """
    block_size = 1
    num_cached_blocks = 2
    num_full_blocks = 4

    pool = BlockPool(
        num_gpu_blocks=8,
        enable_caching=True,
        hash_block_size=block_size,
        enable_kv_cache_events=True,
    )

    req = make_request(
        "req_null_parent",
        prompt_token_ids=[10, 11, 12, 13],
        block_size=block_size,
        hash_fn=sha256,
    )
    assert len(req.block_hashes) == num_full_blocks

    # Physical parent is `null_block` (no hash), while the logical parent hash
    # still exists in `request.block_hashes[num_cached_blocks - 1]`.
    assert pool.null_block.block_hash is None
    new_blocks = pool.get_new_blocks(num_full_blocks - 1)
    # Both slices are unpacked so `blocks` is a flat list of block objects.
    # Without the leading `*`, index 0 would hold a nested list instead of a
    # block; the list length (and therefore every index used below) is the
    # same either way because that slice has exactly one element.
    blocks = [
        *new_blocks[: num_cached_blocks - 1],
        pool.null_block,  # physical parent
        *new_blocks[num_cached_blocks - 1 :],
    ]

    pool.cache_full_blocks(
        request=req,
        blocks=blocks,
        num_cached_blocks=num_cached_blocks,
        num_full_blocks=num_full_blocks,
        block_size=block_size,
        kv_cache_group_id=0,
    )

    events = pool.take_events()
    assert len(events) == 1
    event = events[0]
    assert isinstance(event, BlockStored)

    # The parent hash must come from the request's logical hash chain.
    expected_parent = kv_cache_utils.maybe_convert_block_hash(
        req.block_hashes[num_cached_blocks - 1]
    )
    assert event.parent_block_hash == expected_parent
    assert event.parent_block_hash is not None

    expected_new_hashes = [
        kv_cache_utils.maybe_convert_block_hash(h)
        for h in req.block_hashes[num_cached_blocks:num_full_blocks]
    ]
    assert event.block_hashes == expected_new_hashes

    # Ensure we didn't accidentally assign a hash to the null block.
    assert pool.null_block.block_hash is None
    # Sanity check: newly cached physical blocks should have hashes assigned.
    assert blocks[num_cached_blocks].block_hash is not None
    assert blocks[num_full_blocks - 1].block_hash is not None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("blocks_to_cache", [2, 3, 10])
|
||||
def test_kv_cache_events_with_lora(blocks_to_cache: int):
|
||||
"""Test BlockStored events contain correct lora_id when using LoRA requests."""
|
||||
|
||||
@ -31,7 +31,7 @@ import openai
|
||||
import requests
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.multimodal.utils import encode_image_base64
|
||||
from vllm.multimodal.utils import encode_image_url
|
||||
|
||||
MAX_OUTPUT_LEN = 256
|
||||
|
||||
@ -49,9 +49,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image;base64,{encode_image_base64(image_1)}"
|
||||
},
|
||||
"image_url": {"url": encode_image_url(image_1)},
|
||||
},
|
||||
{"type": "text", "text": "What's in this image?"},
|
||||
],
|
||||
@ -66,9 +64,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image;base64,{encode_image_base64(image_2)}"
|
||||
},
|
||||
"image_url": {"url": encode_image_url(image_2)},
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
|
||||
@ -260,7 +260,7 @@ async def test_multi_abort(output_kind: RequestOutputKind):
|
||||
|
||||
# Use multi-abort to abort multiple requests at once
|
||||
abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT]
|
||||
await engine.abort(abort_request_ids)
|
||||
await engine.abort(abort_request_ids, internal=False)
|
||||
|
||||
# Wait for all tasks to complete
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
@ -609,7 +609,7 @@ async def test_abort_final_output(output_kind: RequestOutputKind):
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# Abort the request
|
||||
await engine.abort(request_id)
|
||||
await engine.abort(request_id, internal=False)
|
||||
|
||||
# Wait for generation to complete and return final output
|
||||
final_output = await generated
|
||||
|
||||
@ -40,10 +40,16 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||
PROMPT = "I am Gyoubu Masataka Oniwa"
|
||||
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
|
||||
|
||||
_REQUEST_COUNTER = 0
|
||||
|
||||
|
||||
def make_request() -> EngineCoreRequest:
|
||||
global _REQUEST_COUNTER
|
||||
_REQUEST_COUNTER += 1
|
||||
request_id = f"request-{_REQUEST_COUNTER}"
|
||||
return EngineCoreRequest(
|
||||
request_id=str(uuid.uuid4()),
|
||||
request_id=request_id,
|
||||
external_req_id=f"{request_id}-{uuid.uuid4()}",
|
||||
prompt_token_ids=PROMPT_TOKENS,
|
||||
mm_features=None,
|
||||
sampling_params=SamplingParams(),
|
||||
|
||||
@ -45,6 +45,8 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||
PROMPT = "Hello my name is Robert and I love quantization kernels"
|
||||
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
|
||||
|
||||
_REQUEST_COUNTER = 0
|
||||
|
||||
|
||||
def make_request(
|
||||
params: SamplingParams, prompt_tokens_ids: list[int] | None = None
|
||||
@ -52,8 +54,12 @@ def make_request(
|
||||
if not prompt_tokens_ids:
|
||||
prompt_tokens_ids = PROMPT_TOKENS
|
||||
|
||||
global _REQUEST_COUNTER
|
||||
_REQUEST_COUNTER += 1
|
||||
request_id = f"request-{_REQUEST_COUNTER}"
|
||||
return EngineCoreRequest(
|
||||
request_id=str(uuid.uuid4()),
|
||||
request_id=request_id,
|
||||
external_req_id=f"{request_id}-{uuid.uuid4()}",
|
||||
prompt_token_ids=prompt_tokens_ids,
|
||||
mm_features=None,
|
||||
sampling_params=params,
|
||||
|
||||
@ -27,6 +27,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
|
||||
params = SamplingParams(skip_special_tokens=True)
|
||||
request = EngineCoreRequest(
|
||||
request_id="test",
|
||||
external_req_id="test-ext",
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
mm_features=None,
|
||||
sampling_params=params,
|
||||
|
||||
@ -58,12 +58,12 @@ def test_incremental_detokenization(
|
||||
output_processor = OutputProcessor(
|
||||
dummy_test_vectors.tokenizer, log_stats=False, stream_interval=stream_interval
|
||||
)
|
||||
engine_core = MockEngineCore(tokens_list=dummy_test_vectors.generation_tokens)
|
||||
|
||||
# Make N requests.
|
||||
requests = [
|
||||
EngineCoreRequest(
|
||||
request_id=f"request-{idx}",
|
||||
request_id=f"request-{idx}-int",
|
||||
external_req_id=f"request-{idx}",
|
||||
prompt_token_ids=prompt_tokens,
|
||||
mm_features=None,
|
||||
eos_token_id=None,
|
||||
@ -83,6 +83,11 @@ def test_incremental_detokenization(
|
||||
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
|
||||
]
|
||||
|
||||
engine_core = MockEngineCore(
|
||||
tokens_list=dummy_test_vectors.generation_tokens,
|
||||
request_ids=[req.request_id for req in requests],
|
||||
)
|
||||
|
||||
# Add requests to the detokenizer.
|
||||
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
|
||||
output_processor.add_request(request, prompt)
|
||||
@ -438,15 +443,6 @@ def test_logprobs_processor(
|
||||
dummy_test_vectors,
|
||||
):
|
||||
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
|
||||
engine_core = MockEngineCore(
|
||||
tokens_list=dummy_test_vectors.generation_tokens,
|
||||
generated_logprobs_raw=None
|
||||
if num_sample_logprobs is None
|
||||
else dummy_test_vectors.generation_logprobs,
|
||||
prompt_logprobs_raw=None
|
||||
if num_prompt_logprobs is None
|
||||
else dummy_test_vectors.prompt_logprobs,
|
||||
)
|
||||
|
||||
# Make N requests.
|
||||
request_id_list = [
|
||||
@ -454,7 +450,8 @@ def test_logprobs_processor(
|
||||
]
|
||||
requests = [
|
||||
EngineCoreRequest(
|
||||
request_id=request_id_list[idx],
|
||||
request_id=request_id_list[idx] + "-int",
|
||||
external_req_id=request_id_list[idx],
|
||||
prompt_token_ids=prompt_tokens,
|
||||
mm_features=None,
|
||||
eos_token_id=None,
|
||||
@ -476,6 +473,17 @@ def test_logprobs_processor(
|
||||
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
|
||||
]
|
||||
|
||||
engine_core = MockEngineCore(
|
||||
tokens_list=dummy_test_vectors.generation_tokens,
|
||||
generated_logprobs_raw=None
|
||||
if num_sample_logprobs is None
|
||||
else dummy_test_vectors.generation_logprobs,
|
||||
prompt_logprobs_raw=None
|
||||
if num_prompt_logprobs is None
|
||||
else dummy_test_vectors.prompt_logprobs,
|
||||
request_ids=[req.request_id for req in requests],
|
||||
)
|
||||
|
||||
# Add requests to the detokenizer.
|
||||
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
|
||||
output_processor.add_request(request, prompt)
|
||||
@ -621,19 +629,12 @@ def test_stop_token(
|
||||
]
|
||||
prompt_string = dummy_test_vectors.prompt_strings[0]
|
||||
prompt_tokens = dummy_test_vectors.prompt_tokens[0]
|
||||
engine_core = MockEngineCore(
|
||||
tokens_list=[generation_tokens],
|
||||
generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
|
||||
prompt_logprobs_raw=None,
|
||||
eos_token_id=eos_token_id,
|
||||
stop_token_ids=stop_token_ids,
|
||||
ignore_eos=ignore_eos,
|
||||
)
|
||||
|
||||
# Make request.
|
||||
request_id = "request-0"
|
||||
request = EngineCoreRequest(
|
||||
request_id=request_id,
|
||||
external_req_id=request_id + "-ext",
|
||||
prompt_token_ids=prompt_tokens,
|
||||
mm_features=None,
|
||||
eos_token_id=eos_token_id,
|
||||
@ -655,6 +656,16 @@ def test_stop_token(
|
||||
pooling_params=None,
|
||||
)
|
||||
|
||||
engine_core = MockEngineCore(
|
||||
tokens_list=[generation_tokens],
|
||||
generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
|
||||
prompt_logprobs_raw=None,
|
||||
eos_token_id=eos_token_id,
|
||||
stop_token_ids=stop_token_ids,
|
||||
ignore_eos=ignore_eos,
|
||||
request_ids=[request.request_id],
|
||||
)
|
||||
|
||||
# Add request to the detokenizer.
|
||||
output_processor.add_request(request, prompt_string)
|
||||
|
||||
@ -720,13 +731,6 @@ def test_stop_string(
|
||||
dummy_test_vectors,
|
||||
):
|
||||
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
|
||||
engine_core = MockEngineCore(
|
||||
tokens_list=dummy_test_vectors.generation_tokens,
|
||||
generated_logprobs_raw=dummy_test_vectors.generation_logprobs
|
||||
if num_sample_logprobs
|
||||
else None,
|
||||
prompt_logprobs_raw=None,
|
||||
)
|
||||
|
||||
# Make N requests.
|
||||
request_id_list = [
|
||||
@ -734,7 +738,8 @@ def test_stop_string(
|
||||
]
|
||||
requests = [
|
||||
EngineCoreRequest(
|
||||
request_id=request_id_list[idx],
|
||||
request_id=request_id_list[idx] + "-int",
|
||||
external_req_id=request_id_list[idx],
|
||||
prompt_token_ids=prompt_tokens,
|
||||
mm_features=None,
|
||||
eos_token_id=None,
|
||||
@ -756,6 +761,15 @@ def test_stop_string(
|
||||
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
|
||||
]
|
||||
|
||||
engine_core = MockEngineCore(
|
||||
tokens_list=dummy_test_vectors.generation_tokens,
|
||||
generated_logprobs_raw=dummy_test_vectors.generation_logprobs
|
||||
if num_sample_logprobs
|
||||
else None,
|
||||
prompt_logprobs_raw=None,
|
||||
request_ids=[req.request_id for req in requests],
|
||||
)
|
||||
|
||||
# Add requests to the detokenizer.
|
||||
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
|
||||
output_processor.add_request(request, prompt)
|
||||
@ -813,9 +827,12 @@ def test_stop_string(
|
||||
for idx, (ref_gen_str, stop_str) in enumerate(
|
||||
zip(dummy_test_vectors.generation_strings, STOP_STRINGS)
|
||||
):
|
||||
# Request should be aborted.
|
||||
# Request should be aborted (check internal ID in abort list).
|
||||
internal_request_id = f"request-{idx}-int"
|
||||
assert internal_request_id in aborted
|
||||
|
||||
# Use external ID for collecting outputs
|
||||
request_id = f"request-{idx}"
|
||||
assert request_id in aborted
|
||||
|
||||
# Collected values that were generated.
|
||||
gen_str = gen_strings[request_id]
|
||||
@ -848,13 +865,13 @@ def test_stop_string(
|
||||
|
||||
def test_iteration_stats(dummy_test_vectors):
|
||||
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
|
||||
engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
|
||||
engine_core_timestamp = time.monotonic()
|
||||
|
||||
# Make N requests.
|
||||
requests = [
|
||||
EngineCoreRequest(
|
||||
request_id=f"request-{idx}",
|
||||
external_req_id=f"request-{idx}-ext",
|
||||
prompt_token_ids=prompt_tokens,
|
||||
mm_features=None,
|
||||
eos_token_id=None,
|
||||
@ -868,6 +885,11 @@ def test_iteration_stats(dummy_test_vectors):
|
||||
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
|
||||
]
|
||||
|
||||
engine_core = MockEngineCore(
|
||||
dummy_test_vectors.generation_tokens,
|
||||
request_ids=[req.request_id for req in requests],
|
||||
)
|
||||
|
||||
# Add all requests except one to the OutputProcessor.
|
||||
num_active = len(dummy_test_vectors.generation_tokens) - 1
|
||||
for request in requests[:num_active]:
|
||||
@ -922,7 +944,6 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
|
||||
output_processor = OutputProcessor(
|
||||
dummy_test_vectors.tokenizer, log_stats=log_stats
|
||||
)
|
||||
engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
|
||||
engine_core_timestamp = time.monotonic()
|
||||
|
||||
# Create LoRA requests
|
||||
@ -936,7 +957,8 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
|
||||
lora_assignments = [lora1, lora2, None]
|
||||
requests = [
|
||||
EngineCoreRequest(
|
||||
request_id=f"request-{idx}",
|
||||
request_id=f"request-{idx}-int",
|
||||
external_req_id=f"request-{idx}",
|
||||
prompt_token_ids=prompt_tokens,
|
||||
mm_features=None,
|
||||
eos_token_id=None,
|
||||
@ -950,6 +972,11 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
|
||||
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
|
||||
]
|
||||
|
||||
engine_core = MockEngineCore(
|
||||
dummy_test_vectors.generation_tokens,
|
||||
request_ids=[req.request_id for req in requests],
|
||||
)
|
||||
|
||||
# Add all requests to the OutputProcessor
|
||||
for request in requests:
|
||||
output_processor.add_request(request, None)
|
||||
@ -1015,9 +1042,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
|
||||
outputs = EngineCoreOutputs(
|
||||
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
|
||||
)
|
||||
# Find and mark request-0 as finished (it uses lora-1)
|
||||
# Find and mark request-0-int as finished (it uses lora-1)
|
||||
for output in outputs.outputs:
|
||||
if output.request_id == "request-0":
|
||||
if output.request_id == "request-0-int":
|
||||
output.finish_reason = FinishReason.LENGTH
|
||||
break
|
||||
|
||||
@ -1040,9 +1067,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
|
||||
outputs = EngineCoreOutputs(
|
||||
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
|
||||
)
|
||||
# Find and mark request-1 as finished (it uses lora-2)
|
||||
# Find and mark request-1-int as finished (it uses lora-2)
|
||||
for output in outputs.outputs:
|
||||
if output.request_id == "request-1":
|
||||
if output.request_id == "request-1-int":
|
||||
output.finish_reason = FinishReason.LENGTH
|
||||
break
|
||||
|
||||
@ -1064,9 +1091,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
|
||||
outputs = EngineCoreOutputs(
|
||||
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
|
||||
)
|
||||
# Find and mark request-2 as finished (it has no LoRA)
|
||||
# Find and mark request-2-int as finished (it has no LoRA)
|
||||
for output in outputs.outputs:
|
||||
if output.request_id == "request-2":
|
||||
if output.request_id == "request-2-int":
|
||||
output.finish_reason = FinishReason.LENGTH
|
||||
break
|
||||
|
||||
@ -1107,7 +1134,9 @@ async def test_request_output_collector():
|
||||
for idx in range(NUM_REQS)
|
||||
]
|
||||
|
||||
collector = RequestOutputCollector(RequestOutputKind.DELTA)
|
||||
collector = RequestOutputCollector(
|
||||
RequestOutputKind.DELTA, request_id="my-request-id-int"
|
||||
)
|
||||
|
||||
# CASE 1: Put then get.
|
||||
outputs = make_outputs()
|
||||
@ -1163,7 +1192,9 @@ async def test_request_output_collector():
|
||||
@pytest.mark.asyncio
|
||||
async def test_cumulative_output_collector_n():
|
||||
"""Test collector correctly handles multiple outputs by index."""
|
||||
collector = RequestOutputCollector(RequestOutputKind.CUMULATIVE)
|
||||
collector = RequestOutputCollector(
|
||||
RequestOutputKind.CUMULATIVE, request_id="my-request-id-int"
|
||||
)
|
||||
outputs = [
|
||||
RequestOutput(
|
||||
request_id="my-request-id",
|
||||
@ -1242,11 +1273,13 @@ async def test_cumulative_output_collector_n():
|
||||
|
||||
|
||||
@pytest.mark.parametrize("runner", ["generate", "pooling"])
|
||||
def test_abort_requests(runner: str, dummy_test_vectors):
|
||||
@pytest.mark.parametrize("abort_by", ["internal", "external"])
|
||||
def test_abort_requests(runner: str, abort_by: str, dummy_test_vectors):
|
||||
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
|
||||
requests = [
|
||||
EngineCoreRequest(
|
||||
request_id=f"request-{idx}",
|
||||
external_req_id=f"external-{idx}",
|
||||
prompt_token_ids=prompt_tokens,
|
||||
mm_features=None,
|
||||
eos_token_id=None,
|
||||
@ -1265,8 +1298,13 @@ def test_abort_requests(runner: str, dummy_test_vectors):
|
||||
output_kind = request.sampling_params.output_kind
|
||||
else:
|
||||
output_kind = request.pooling_params.output_kind
|
||||
queue = RequestOutputCollector(output_kind=output_kind)
|
||||
queue = RequestOutputCollector(
|
||||
output_kind=output_kind, request_id=request.request_id
|
||||
)
|
||||
output_processor.add_request(request, None, queue=queue)
|
||||
|
||||
for request in requests:
|
||||
output_processor.abort_requests([request.request_id])
|
||||
if abort_by == "internal":
|
||||
output_processor.abort_requests([request.request_id], internal=True)
|
||||
else:
|
||||
output_processor.abort_requests([request.external_req_id], internal=False)
|
||||
|
||||
@ -4,11 +4,12 @@
|
||||
from vllm import SamplingParams
|
||||
from vllm.outputs import CompletionOutput
|
||||
from vllm.sampling_params import RequestOutputKind
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.parallel_sampling import ParentRequest
|
||||
|
||||
|
||||
def test_parent_request_to_output_stream() -> None:
|
||||
parent_request = ParentRequest("parent_id", SamplingParams(n=2))
|
||||
parent_request = ParentRequest(make_request(SamplingParams(n=2)))
|
||||
parent_request.child_requests = {"child_id_0", "child_id_1"}
|
||||
output_0 = CompletionOutput(
|
||||
index=0, text="child 0", token_ids=[], cumulative_logprob=None, logprobs=None
|
||||
@ -17,51 +18,31 @@ def test_parent_request_to_output_stream() -> None:
|
||||
index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
|
||||
)
|
||||
# Request not finished
|
||||
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
|
||||
"child_id_0", output_0
|
||||
)
|
||||
assert ("parent_id", [output_1], False) == parent_request.get_outputs(
|
||||
"child_id_1", output_1
|
||||
)
|
||||
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
|
||||
"child_id_0", output_0
|
||||
)
|
||||
assert ("parent_id", [output_1], False) == parent_request.get_outputs(
|
||||
"child_id_1", output_1
|
||||
)
|
||||
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
|
||||
assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
|
||||
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
|
||||
assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
|
||||
|
||||
# output_1 finished
|
||||
output_1.finish_reason = "ended"
|
||||
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
|
||||
"child_id_0", output_0
|
||||
)
|
||||
assert ("parent_id", [output_1], False) == parent_request.get_outputs(
|
||||
"child_id_1", output_1
|
||||
)
|
||||
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
|
||||
assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
|
||||
# Finished output_1 has already been returned; do NOT return it again
|
||||
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
|
||||
"child_id_0", output_0
|
||||
)
|
||||
assert parent_request.get_outputs("child_id_1", output_1) == (
|
||||
"parent_id",
|
||||
[],
|
||||
False,
|
||||
)
|
||||
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
|
||||
assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
|
||||
|
||||
# output_0 finished
|
||||
output_0.finish_reason = "ended"
|
||||
assert ("parent_id", [output_0], True) == parent_request.get_outputs(
|
||||
"child_id_0", output_0
|
||||
)
|
||||
assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
|
||||
assert ([output_0], True) == parent_request.get_outputs("child_id_0", output_0)
|
||||
assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
|
||||
# Finished output_0 has already been returned; do NOT return it again
|
||||
assert parent_request.get_outputs("child_id_0", output_0) == ("parent_id", [], True)
|
||||
assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
|
||||
assert parent_request.get_outputs("child_id_0", output_0) == ([], True)
|
||||
assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
|
||||
|
||||
|
||||
def test_parent_request_to_output_final_only() -> None:
|
||||
parent_request = ParentRequest(
|
||||
"parent_id", SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY)
|
||||
make_request(SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY))
|
||||
)
|
||||
parent_request.child_requests = {"child_id_0", "child_id_1"}
|
||||
output_0 = CompletionOutput(
|
||||
@ -71,33 +52,33 @@ def test_parent_request_to_output_final_only() -> None:
|
||||
index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
|
||||
)
|
||||
# Request not finished, return nothing
|
||||
assert parent_request.get_outputs("child_id_0", output_0) == (
|
||||
"parent_id",
|
||||
[],
|
||||
False,
|
||||
)
|
||||
assert parent_request.get_outputs("child_id_1", output_1) == (
|
||||
"parent_id",
|
||||
[],
|
||||
False,
|
||||
)
|
||||
assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
|
||||
assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
|
||||
# output_1 finished, but outputs won't be returned until all child requests finished
|
||||
output_1.finish_reason = "ended"
|
||||
assert parent_request.get_outputs("child_id_0", output_0) == (
|
||||
"parent_id",
|
||||
[],
|
||||
False,
|
||||
)
|
||||
assert parent_request.get_outputs("child_id_1", output_1) == (
|
||||
"parent_id",
|
||||
[],
|
||||
False,
|
||||
)
|
||||
assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
|
||||
assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
|
||||
# output_0 finished, as all child requests finished, the output would be returned
|
||||
output_0.finish_reason = "ended"
|
||||
assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
|
||||
assert ([output_0, output_1], True) == parent_request.get_outputs(
|
||||
"child_id_0", output_0
|
||||
)
|
||||
assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
|
||||
assert ([output_0, output_1], True) == parent_request.get_outputs(
|
||||
"child_id_1", output_1
|
||||
)
|
||||
|
||||
|
||||
def make_request(sampling_params: SamplingParams) -> EngineCoreRequest:
    """Build a minimal EngineCoreRequest carrying the given sampling params.

    The internal ID is fixed to ``"parent_id"`` and the external ID to
    ``"ext_parent_id"``. ``arrival_time`` is ``0.0`` and every remaining
    field is ``None``.
    """
    fields = dict(
        request_id="parent_id",
        external_req_id="ext_parent_id",
        prompt_token_ids=None,
        mm_features=None,
        sampling_params=sampling_params,
        pooling_params=None,
        eos_token_id=None,
        arrival_time=0.0,
        lora_request=None,
        cache_salt=None,
        data_parallel_rank=None,
    )
    return EngineCoreRequest(**fields)
|
||||
|
||||
@ -6,6 +6,7 @@ import pytest
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
|
||||
from vllm.multimodal import MultiModalUUIDDict
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.engine import input_processor as input_processor_mod
|
||||
from vllm.v1.engine.input_processor import InputProcessor
|
||||
@ -166,7 +167,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
|
||||
monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False
|
||||
)
|
||||
|
||||
captured: dict[str, object] = {}
|
||||
captured: dict[str, MultiModalUUIDDict] = {}
|
||||
|
||||
def fake_preprocess(
|
||||
prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None
|
||||
@ -196,7 +197,16 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
|
||||
)
|
||||
|
||||
# Expect request-id-based overrides are passed through
|
||||
assert captured["mm_uuids"] == {
|
||||
"image": [f"{request_id}-image-0", f"{request_id}-image-1"],
|
||||
"video": [f"{request_id}-video-0"],
|
||||
}
|
||||
mm_uuids = captured["mm_uuids"]
|
||||
assert set(mm_uuids.keys()) == {"image", "video"}
|
||||
assert len(mm_uuids["image"]) == 2
|
||||
assert len(mm_uuids["video"]) == 1
|
||||
assert mm_uuids["image"][0].startswith(f"{request_id}-image-") and mm_uuids[
|
||||
"image"
|
||||
][0].endswith("-0")
|
||||
assert mm_uuids["image"][1].startswith(f"{request_id}-image-") and mm_uuids[
|
||||
"image"
|
||||
][1].endswith("-1")
|
||||
assert mm_uuids["video"][0].startswith(f"{request_id}-video-") and mm_uuids[
|
||||
"video"
|
||||
][0].endswith("-0")
|
||||
|
||||
@ -343,6 +343,7 @@ class MockEngineCore:
|
||||
eos_token_id: int | None = None,
|
||||
stop_token_ids: list[int] | None = None,
|
||||
ignore_eos: bool = False,
|
||||
request_ids: list[str] | None = None,
|
||||
) -> None:
|
||||
self.num_requests = len(tokens_list)
|
||||
self.tokens_list = tokens_list
|
||||
@ -355,6 +356,11 @@ class MockEngineCore:
|
||||
self.eos_token_id = eos_token_id
|
||||
self.stop_token_ids = stop_token_ids
|
||||
self.ignore_eos = ignore_eos
|
||||
self.request_ids = (
|
||||
request_ids
|
||||
if request_ids is not None
|
||||
else [f"request-{i}" for i in range(self.num_requests)]
|
||||
)
|
||||
|
||||
def get_outputs(self) -> list[EngineCoreOutput]:
|
||||
do_logprobs = self.do_logprobs
|
||||
@ -386,7 +392,7 @@ class MockEngineCore:
|
||||
prompt_logprobs = None
|
||||
new_token_id = token_ids[token_idx]
|
||||
output = EngineCoreOutput(
|
||||
request_id=f"request-{req_idx}",
|
||||
request_id=self.request_ids[req_idx],
|
||||
new_token_ids=[new_token_id],
|
||||
new_logprobs=logprobs,
|
||||
new_prompt_logprobs_tensors=prompt_logprobs,
|
||||
|
||||
@ -8,7 +8,7 @@ import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.multimodal.utils import encode_image_base64
|
||||
from vllm.multimodal.utils import encode_image_url
|
||||
|
||||
# Use a small vision model for testing
|
||||
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
@ -52,9 +52,9 @@ async def client(image_server):
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
||||
def url_encoded_image(local_asset_server) -> dict[str, str]:
|
||||
return {
|
||||
image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
|
||||
image_url: encode_image_url(local_asset_server.get_image_asset(image_url))
|
||||
for image_url in TEST_IMAGE_ASSETS
|
||||
}
|
||||
|
||||
@ -95,7 +95,7 @@ async def test_single_chat_session_image_base64encoded(
|
||||
client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
raw_image_url: str,
|
||||
base64_encoded_image: dict[str, str],
|
||||
url_encoded_image: dict[str, str],
|
||||
):
|
||||
content_text = "What's in this image?"
|
||||
messages = [
|
||||
@ -104,7 +104,7 @@ async def test_single_chat_session_image_base64encoded(
|
||||
"content": [
|
||||
{
|
||||
"type": "input_image",
|
||||
"image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", # noqa: E501
|
||||
"image_url": url_encoded_image[raw_image_url],
|
||||
"detail": "auto",
|
||||
},
|
||||
{"type": "input_text", "text": content_text},
|
||||
|
||||
@ -9,7 +9,7 @@ from PIL import Image
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.config import KVTransferConfig
|
||||
from vllm.multimodal.utils import encode_image_base64
|
||||
from vllm.multimodal.utils import encode_image_url
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODEL_NAME = "RedHatAI/Qwen2.5-VL-3B-Instruct-quantized.w8a8"
|
||||
@ -74,7 +74,7 @@ def process_prompt(processor, llm: LLM, question: str, image_urls: list[Image]):
|
||||
placeholders = [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image;base64,{encode_image_base64(image_pil)}"},
|
||||
"image_url": {"url": encode_image_url(image_pil)},
|
||||
}
|
||||
for image_pil in image_urls
|
||||
]
|
||||
|
||||
@ -41,10 +41,13 @@ from vllm.distributed.kv_transfer.kv_transfer_state import (
|
||||
has_kv_transfer_group,
|
||||
)
|
||||
from vllm.forward_context import ForwardContext
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.platforms.interface import Platform
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.output_processor import OutputProcessor
|
||||
from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
|
||||
from vllm.v1.request import RequestStatus
|
||||
|
||||
@ -1265,6 +1268,22 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
|
||||
run_test_and_cleanup()
|
||||
|
||||
|
||||
class RequestIdMapper:
    """Translate external request IDs into internal request IDs.

    On construction the instance replaces ``output_processor.add_request``
    with a recording wrapper. Each request passing through the wrapper has
    its ``external_req_id -> request_id`` pair stored before the call is
    forwarded to the original method. Calling the instance with an external
    ID returns the recorded internal ID.
    """

    def __init__(self, output_processor: OutputProcessor):
        # Grab the real bound method first so the wrapper can delegate to it.
        self.original_add_request = output_processor.add_request
        self.req_id_mapping: dict[str, str] = {}
        output_processor.add_request = self._add_request

    def _add_request(self, request: EngineCoreRequest, *args, **kwargs):
        """Record the request's ID pair, then forward to the original method."""
        internal_id = request.request_id
        self.req_id_mapping[request.external_req_id] = internal_id
        return self.original_add_request(request, *args, **kwargs)

    def __call__(self, external_req_id: str) -> str:
        """Return the internal ID recorded for ``external_req_id``.

        Raises ``KeyError`` if no request with that external ID was recorded.
        """
        internal_id = self.req_id_mapping[external_req_id]
        return internal_id
|
||||
|
||||
|
||||
def _run_abort_timeout_test(llm: LLM, timeout: int):
|
||||
"""Helper function to run the abort timeout test logic."""
|
||||
remote_prefill_opts = {
|
||||
@ -1286,24 +1305,34 @@ def _run_abort_timeout_test(llm: LLM, timeout: int):
|
||||
0
|
||||
].req_to_blocks
|
||||
|
||||
id_mapper = RequestIdMapper(llm.llm_engine.output_processor)
|
||||
|
||||
def req_id(outputs: list[RequestOutput]) -> str:
|
||||
assert len(outputs) == 1
|
||||
return id_mapper(outputs[0].request_id)
|
||||
|
||||
# The literals must be parenthesized: without the parentheses only the first
# literal is assigned to `padding` and the following ones are separate no-op
# expression statements, so the prompt ends up far shorter than intended.
padding = (
    "Just making this request a little longer so that we're sure "
    "we're not hitting the small-request lower bound beneath which we don't "
    "actually trigger the whole kv transfer, but rather just recompute the "
    "blocks on D."
)
|
||||
_ = llm.generate([f"What is the capital of Japan? {padding}"], sampling_params)
|
||||
req0_id = req_id(
|
||||
llm.generate([f"What is the capital of Japan? {padding}"], sampling_params)
|
||||
)
|
||||
|
||||
# Request finished but not freed
|
||||
assert "0" in scheduler.finished_req_ids and "0" in req_to_blocks
|
||||
assert req0_id in scheduler.finished_req_ids and req0_id in req_to_blocks
|
||||
# Some other request, 0 still not freed
|
||||
_ = llm.generate([f"What is the capital of Italy? {padding}"], sampling_params)
|
||||
assert "0" in req_to_blocks
|
||||
assert "1" in scheduler.finished_req_ids and "1" in req_to_blocks
|
||||
req1_id = req_id(
|
||||
llm.generate([f"What is the capital of Italy? {padding}"], sampling_params)
|
||||
)
|
||||
assert req0_id in req_to_blocks
|
||||
assert req1_id in scheduler.finished_req_ids and req1_id in req_to_blocks
|
||||
|
||||
# Wait for timeout and trigger another scheduler loop
|
||||
time.sleep(timeout)
|
||||
_ = llm.generate([f"What is the capital of France? {padding}"], sampling_params)
|
||||
# Request-0 times out and is cleared!
|
||||
assert "0" not in req_to_blocks
|
||||
assert req0_id not in req_to_blocks
|
||||
# Need to shutdown the background thread to release NIXL side channel port
|
||||
llm.llm_engine.engine_core.shutdown()
|
||||
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
import openai
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal.utils import encode_image_base64
|
||||
from vllm.multimodal.utils import encode_image_url
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ...entrypoints.openai.test_vision import TEST_IMAGE_ASSETS
|
||||
@ -12,11 +12,9 @@ from ...utils import RemoteOpenAIServer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
||||
def url_encoded_image(local_asset_server) -> dict[str, str]:
|
||||
return {
|
||||
image_asset: encode_image_base64(
|
||||
local_asset_server.get_image_asset(image_asset)
|
||||
)
|
||||
image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset))
|
||||
for image_asset in TEST_IMAGE_ASSETS
|
||||
}
|
||||
|
||||
@ -24,19 +22,16 @@ def base64_encoded_image(local_asset_server) -> dict[str, str]:
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skipif(not current_platform.is_tpu(), reason="This test needs a TPU")
|
||||
@pytest.mark.parametrize("model_name", ["llava-hf/llava-1.5-7b-hf"])
|
||||
async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, str]):
|
||||
async def test_basic_vision(model_name: str, url_encoded_image: dict[str, str]):
|
||||
pytest.skip("Skip this test until it's fixed.")
|
||||
|
||||
def whats_in_this_image_msg(b64):
|
||||
def whats_in_this_image_msg(url):
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "What's in this image?"},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/jpeg;base64,{b64}"},
|
||||
},
|
||||
{"type": "image_url", "image_url": {"url": url}},
|
||||
],
|
||||
}
|
||||
]
|
||||
@ -63,14 +58,14 @@ async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, str
|
||||
|
||||
# Other requests now should be much faster
|
||||
for image_url in TEST_IMAGE_ASSETS:
|
||||
image_base64 = base64_encoded_image[image_url]
|
||||
chat_completion_from_base64 = await client.chat.completions.create(
|
||||
image_url = url_encoded_image[image_url]
|
||||
chat_completion_from_url = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=whats_in_this_image_msg(image_base64),
|
||||
messages=whats_in_this_image_msg(image_url),
|
||||
max_completion_tokens=24,
|
||||
temperature=0.0,
|
||||
)
|
||||
result = chat_completion_from_base64
|
||||
result = chat_completion_from_url
|
||||
assert result
|
||||
choice = result.choices[0]
|
||||
assert choice.finish_reason == "length"
|
||||
|
||||
@ -380,6 +380,31 @@ def _rocm_aiter_gemm_a8w8_fake(
|
||||
return Y
|
||||
|
||||
|
||||
def _rocm_aiter_triton_gemm_a8w8_blockscale_impl(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
    """Run AITER's Triton ``gemm_a8w8_blockscale`` kernel.

    ``A``, ``B``, ``As`` and ``Bs`` are passed through positionally and
    ``output_dtype`` is forwarded as the kernel's ``dtype`` argument.
    """
    # Imported lazily so that `aiter` is only needed when this is called.
    from aiter.ops.triton.gemm_a8w8_blockscale import (
        gemm_a8w8_blockscale as triton_blockscale_gemm,
    )

    result = triton_blockscale_gemm(A, B, As, Bs, dtype=output_dtype)
    return result
|
||||
|
||||
|
||||
def _rocm_aiter_triton_gemm_a8w8_blockscale_fake(
|
||||
A: torch.Tensor,
|
||||
B: torch.Tensor,
|
||||
As: torch.Tensor,
|
||||
Bs: torch.Tensor,
|
||||
output_dtype: torch.dtype = torch.float16,
|
||||
) -> torch.Tensor:
|
||||
m = A.shape[0]
|
||||
n = B.shape[0]
|
||||
Y = torch.empty(m, n, dtype=output_dtype, device=A.device)
|
||||
return Y
|
||||
|
||||
|
||||
def _rocm_aiter_gemm_a8w8_blockscale_impl(
|
||||
A: torch.Tensor,
|
||||
B: torch.Tensor,
|
||||
@ -964,6 +989,12 @@ class rocm_aiter_ops:
|
||||
dispatch_key=current_platform.dispatch_key,
|
||||
)
|
||||
|
||||
direct_register_custom_op(
|
||||
op_name="rocm_aiter_triton_gemm_a8w8_blockscale",
|
||||
op_func=_rocm_aiter_triton_gemm_a8w8_blockscale_impl,
|
||||
fake_impl=_rocm_aiter_triton_gemm_a8w8_blockscale_fake,
|
||||
)
|
||||
|
||||
direct_register_custom_op(
|
||||
op_name="rocm_aiter_gemm_a8w8_blockscale",
|
||||
op_func=_rocm_aiter_gemm_a8w8_blockscale_impl,
|
||||
@ -1102,6 +1133,19 @@ class rocm_aiter_ops:
|
||||
) -> torch.Tensor:
|
||||
return torch.ops.vllm.rocm_aiter_gemm_a8w8(A, B, As, Bs, bias, output_dtype)
|
||||
|
||||
@staticmethod
def triton_gemm_a8w8_blockscale(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    block_size: list[int],
    output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
    """Dispatch to the ``rocm_aiter_triton_gemm_a8w8_blockscale`` custom op.

    ``block_size`` is accepted as part of the signature but is not forwarded
    to the op.
    """
    triton_blockscale_op = torch.ops.vllm.rocm_aiter_triton_gemm_a8w8_blockscale
    return triton_blockscale_op(A, B, As, Bs, output_dtype)
|
||||
|
||||
@staticmethod
|
||||
def gemm_a8w8_blockscale(
|
||||
A: torch.Tensor,
|
||||
@ -1373,19 +1417,6 @@ class rocm_aiter_ops:
|
||||
config=config,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def triton_gemm_a8w8_blockscale(
|
||||
A: torch.Tensor,
|
||||
B: torch.Tensor,
|
||||
As: torch.Tensor,
|
||||
Bs: torch.Tensor,
|
||||
block_size: list[int],
|
||||
output_dtype: torch.dtype = torch.float16,
|
||||
) -> torch.Tensor:
|
||||
from aiter.ops.triton.gemm_a8w8_blockscale import gemm_a8w8_blockscale
|
||||
|
||||
return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype)
|
||||
|
||||
@staticmethod
|
||||
def group_fp8_quant(
|
||||
input_2d: torch.Tensor,
|
||||
|
||||
@ -2328,18 +2328,6 @@ def concat_and_cache_mla(
|
||||
)
|
||||
|
||||
|
||||
def copy_blocks(
|
||||
key_caches: list[torch.Tensor],
|
||||
value_caches: list[torch.Tensor],
|
||||
block_mapping: torch.Tensor,
|
||||
) -> None:
|
||||
torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)
|
||||
|
||||
|
||||
def copy_blocks_mla(kv_caches: list[torch.Tensor], block_mapping: torch.Tensor) -> None:
    """Forward to the ``_C_cache_ops.copy_blocks_mla`` op; returns nothing."""
    cache_ops = torch.ops._C_cache_ops
    cache_ops.copy_blocks_mla(kv_caches, block_mapping)
|
||||
|
||||
|
||||
def swap_blocks(
|
||||
src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor
|
||||
) -> None:
|
||||
|
||||
@ -383,18 +383,6 @@ class ipex_ops:
|
||||
)
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def copy_blocks(
|
||||
key_caches: list[torch.Tensor],
|
||||
value_caches: list[torch.Tensor],
|
||||
block_mapping: torch.Tensor,
|
||||
) -> None:
|
||||
torch.xpu.copy_blocks( # type: ignore
|
||||
key_caches,
|
||||
value_caches,
|
||||
block_mapping,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def swap_blocks(
|
||||
src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor
|
||||
|
||||
@ -136,7 +136,7 @@ class MMEncoderAttention(CustomOp):
|
||||
cu_seqlens=cu_seqlens,
|
||||
)
|
||||
if is_reshaped:
|
||||
output = output.view(bsz, q_len, -1)
|
||||
output = output.reshape(bsz, q_len, -1)
|
||||
return output
|
||||
|
||||
def _forward_fa(
|
||||
@ -174,7 +174,7 @@ class MMEncoderAttention(CustomOp):
|
||||
fa_version=self._fa_version,
|
||||
)
|
||||
if is_reshaped:
|
||||
output = output.view(bsz, q_len, -1)
|
||||
output = output.reshape(bsz, q_len, -1)
|
||||
return output
|
||||
|
||||
def forward_native(
|
||||
|
||||
@ -1847,7 +1847,6 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
|
||||
random_seed=args.seed,
|
||||
dataset_path=args.dataset_path,
|
||||
disable_shuffle=args.disable_shuffle,
|
||||
prefix_len=args.common_prefix_len,
|
||||
).sample(
|
||||
tokenizer=tokenizer,
|
||||
num_requests=args.num_prompts,
|
||||
|
||||
@ -1281,12 +1281,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
||||
help="Repetition penalty sampling parameter. Only has effect on "
|
||||
"openai-compatible backends.",
|
||||
)
|
||||
sampling_group.add_argument(
|
||||
"--common-prefix-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Common prefix length shared by all prompts (used by random dataset)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--served-model-name",
|
||||
|
||||
@ -172,7 +172,10 @@ class ModelConfig:
|
||||
format. Examples:\n
|
||||
- 1k -> 1000\n
|
||||
- 1K -> 1024\n
|
||||
- 25.6k -> 25,600"""
|
||||
- 25.6k -> 25,600\n
|
||||
- -1 or 'auto' -> Automatically choose the maximum model length that fits in
|
||||
GPU memory. This will use the model's maximum context length if it fits,
|
||||
otherwise it will find the largest length that can be accommodated."""
|
||||
spec_target_max_model_len: int | None = None
|
||||
"""Specify the maximum length for spec decoding draft models."""
|
||||
quantization: QuantizationMethods | str | None = None
|
||||
@ -1094,11 +1097,10 @@ class ModelConfig:
|
||||
# The size of inputs_embeds is usually identical to the size
|
||||
# of the hidden states, however there are exceptions, such as
|
||||
# embedding models like CLIP and SigLIP
|
||||
for target_attr in ("projection_dim", "projection_size"):
|
||||
if hasattr(self.hf_text_config, target_attr):
|
||||
return getattr(self.hf_text_config, target_attr)
|
||||
|
||||
return self.get_hidden_size()
|
||||
names = ("projection_dim", "projection_size")
|
||||
return getattr_iter(
|
||||
self.hf_text_config, names, default_factory=self.get_hidden_size
|
||||
)
|
||||
|
||||
@property
|
||||
def is_deepseek_mla(self) -> bool:
|
||||
@ -1231,14 +1233,12 @@ class ModelConfig:
|
||||
# For ChatGLM:
|
||||
"multi_query_group_num",
|
||||
]
|
||||
for attr in attributes:
|
||||
num_kv_heads = getattr(self.hf_text_config, attr, None)
|
||||
if num_kv_heads is not None:
|
||||
return num_kv_heads
|
||||
|
||||
# For non-grouped-query attention models, the number of KV heads is
|
||||
# equal to the number of attention heads.
|
||||
return self.hf_text_config.num_attention_heads
|
||||
default_factory = lambda: self.hf_text_config.num_attention_heads
|
||||
return getattr_iter(
|
||||
self.hf_text_config, attributes, default_factory=default_factory
|
||||
)
|
||||
|
||||
def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int:
|
||||
"""Returns the number of KV heads per GPU."""
|
||||
@ -1542,6 +1542,10 @@ class ModelConfig:
|
||||
def is_multimodal_raw_input_only_model(self) -> bool:
|
||||
return self._model_info.supports_multimodal_raw_input_only
|
||||
|
||||
@property
def requires_raw_input_tokens(self) -> bool:
    """The ``requires_raw_input_tokens`` flag reported by the model info."""
    model_info = self._model_info
    return model_info.requires_raw_input_tokens
|
||||
|
||||
@property
|
||||
def is_cross_encoder(self) -> bool:
|
||||
return (
|
||||
@ -2150,9 +2154,10 @@ def _get_and_verify_max_len(
|
||||
if encoder_config and "max_seq_length" in encoder_config:
|
||||
derived_max_model_len = encoder_config["max_seq_length"]
|
||||
|
||||
# If the user didn't specify `max_model_len`, then use that derived from
|
||||
# the model config as a default value.
|
||||
if max_model_len is None:
|
||||
# If the user didn't specify `max_model_len` or specified -1 (auto-fit),
|
||||
# then use that derived from the model config as a default value.
|
||||
# When -1 is specified, the engine will later auto-fit to available memory.
|
||||
if max_model_len is None or max_model_len == -1:
|
||||
# For LongRoPE, default to original_max_position_embeddings to avoid
|
||||
# performance degradation for shorter sequences
|
||||
if rope_parameters is not None and any(
|
||||
|
||||
@ -465,6 +465,7 @@ class ParallelConfig:
|
||||
# Derived/runtime topology, networking, or launch details
|
||||
"data_parallel_rank",
|
||||
"data_parallel_rank_local",
|
||||
"data_parallel_size_local",
|
||||
"data_parallel_backend",
|
||||
"data_parallel_external_lb",
|
||||
"data_parallel_hybrid_lb",
|
||||
|
||||
@ -9,7 +9,7 @@ import inspect
|
||||
import json
|
||||
import pathlib
|
||||
import textwrap
|
||||
from collections.abc import Iterable, Mapping, Sequence, Set
|
||||
from collections.abc import Callable, Iterable, Mapping, Sequence, Set
|
||||
from dataclasses import MISSING, Field, dataclass, field, fields, is_dataclass, replace
|
||||
from itertools import pairwise
|
||||
from typing import TYPE_CHECKING, Any, Protocol, TypeVar
|
||||
@ -74,7 +74,11 @@ def get_field(cls: ConfigType, name: str) -> Field:
|
||||
|
||||
|
||||
def getattr_iter(
|
||||
object: object, names: Iterable[str], default: Any, warn: bool = False
|
||||
object: object,
|
||||
names: Iterable[str],
|
||||
default: Any | None = None,
|
||||
default_factory: Callable[[], Any] | None = None,
|
||||
warn: bool = False,
|
||||
) -> Any:
|
||||
"""
|
||||
A helper function that retrieves an attribute from an object which may
|
||||
@ -96,7 +100,7 @@ def getattr_iter(
|
||||
names[0],
|
||||
)
|
||||
return getattr(object, name)
|
||||
return default
|
||||
return default_factory() if default_factory is not None else default
|
||||
|
||||
|
||||
def contains_object_print(text: str) -> bool:
|
||||
|
||||
@ -408,7 +408,13 @@ class MooncakeConnectorWorker:
|
||||
|
||||
self.engine = TransferEngine()
|
||||
self.hostname = get_ip()
|
||||
ret_value = self.engine.initialize(self.hostname, "P2PHANDSHAKE", "rdma", "")
|
||||
protocol = self.vllm_config.kv_transfer_config.kv_connector_extra_config.get( # type: ignore[union-attr]
|
||||
"mooncake_protocol", "rdma"
|
||||
)
|
||||
logger.info(
|
||||
"The Mooncake Transfer Engine is using %s as its protocol.", protocol
|
||||
)
|
||||
ret_value = self.engine.initialize(self.hostname, "P2PHANDSHAKE", protocol, "")
|
||||
if ret_value != 0:
|
||||
raise RuntimeError("Mooncake Transfer Engine initialization failed.")
|
||||
|
||||
|
||||
@ -2045,13 +2045,20 @@ def _raise_unsupported_error(feature_name: str):
|
||||
def human_readable_int(value):
|
||||
"""Parse human-readable integers like '1k', '2M', etc.
|
||||
Including decimal values with decimal multipliers.
|
||||
Also accepts -1 or 'auto' as a special value for auto-detection.
|
||||
|
||||
Examples:
|
||||
- '1k' -> 1,000
|
||||
- '1K' -> 1,024
|
||||
- '25.6k' -> 25,600
|
||||
- '-1' or 'auto' -> -1 (special value for auto-detection)
|
||||
"""
|
||||
value = value.strip()
|
||||
|
||||
# Handle -1 or 'auto' as a special value for auto-detection
|
||||
if value == "-1" or value.lower() == "auto":
|
||||
return -1
|
||||
|
||||
match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
|
||||
if match:
|
||||
decimal_multiplier = {
|
||||
|
||||
@ -1621,7 +1621,7 @@ class LLM:
|
||||
added_request_ids.append(request_id)
|
||||
except Exception as e:
|
||||
if added_request_ids:
|
||||
self.llm_engine.abort_request(added_request_ids)
|
||||
self.llm_engine.abort_request(added_request_ids, internal=True)
|
||||
raise e
|
||||
|
||||
def _validate_mm_data_and_uuids(
|
||||
@ -1731,7 +1731,7 @@ class LLM:
|
||||
priority=priority,
|
||||
prompt_text=prompt_text,
|
||||
)
|
||||
return request_id
|
||||
return engine_request.request_id
|
||||
|
||||
def _run_engine(
|
||||
self, *, use_tqdm: bool | Callable[..., tqdm] = True
|
||||
|
||||
28
vllm/entrypoints/pooling/embed/conftest.py
Normal file
28
vllm/entrypoints/pooling/embed/conftest.py
Normal file
@ -0,0 +1,28 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Pytest configuration for vLLM pooling embed tests."""
|
||||
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(config, items):
    """Force the math SDP backend when the test session runs on ROCm.

    Pytest invokes this hook once collection has finished. Neither ``config``
    nor ``items`` is inspected here; the hook is only used as a convenient
    point to adjust the global torch SDP switches. On any non-ROCm platform
    the hook does nothing.
    """
    if current_platform.is_rocm():
        # Flash/MemEfficient SDP on ROCm leads to HF Transformers accuracy
        # issues: https://github.com/vllm-project/vllm/issues/30167
        # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
        sdp_backends = torch.backends.cuda
        sdp_backends.enable_flash_sdp(False)
        sdp_backends.enable_mem_efficient_sdp(False)
        sdp_backends.enable_math_sdp(True)

        message = (
            "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
            "to avoid HuggingFace Transformers accuracy issues"
        )
        warnings.warn(message, UserWarning, stacklevel=1)
|
||||
@ -118,6 +118,7 @@ class ShortConv(MambaBase, CustomOp):
|
||||
conv_state = self_kv_cache[0].transpose(-1, -2)
|
||||
state_indices_tensor = attn_metadata.state_indices_tensor
|
||||
has_initial_states_p = attn_metadata.has_initial_states_p
|
||||
query_start_loc_p = attn_metadata.query_start_loc_p
|
||||
|
||||
BCx, _ = self.in_proj(hidden_states)
|
||||
|
||||
@ -165,11 +166,6 @@ class ShortConv(MambaBase, CustomOp):
|
||||
[num_decodes, num_prefills],
|
||||
dim=0,
|
||||
)
|
||||
query_start_loc_p = (
|
||||
attn_metadata.query_start_loc[-num_prefills - 1 :] - num_decodes
|
||||
if has_prefill
|
||||
else None
|
||||
)
|
||||
|
||||
conv_output_list = []
|
||||
|
||||
|
||||
@ -111,7 +111,7 @@ class AudioFlamingo3EmbeddingInputs(TensorSchema):
|
||||
|
||||
audio_embeds: Annotated[
|
||||
list[torch.Tensor],
|
||||
TensorShape("bn", "naf", "hs"),
|
||||
TensorShape("bn", "naf", "hs", dynamic_dims={"naf"}),
|
||||
]
|
||||
|
||||
|
||||
|
||||
@ -878,11 +878,14 @@ class Indexer(nn.Module):
|
||||
)
|
||||
|
||||
q_pe, k_pe = rotary_emb(positions, q_pe, k_pe.unsqueeze(1))
|
||||
# `rotary_emb` is shape-preserving; `q_pe` is already
|
||||
# [num_tokens, n_head, rope_dim].
|
||||
# Note: RoPE (NeoX) can introduce extra leading dimensions during compilation
|
||||
# so we need to reshape back to token-flattened shapes
|
||||
q_pe = q_pe.reshape(-1, self.n_head, self.rope_dim)
|
||||
k_pe = k_pe.reshape(-1, 1, self.rope_dim)
|
||||
|
||||
q = torch.cat([q_pe, q_nope], dim=-1)
|
||||
# `k_pe` is [num_tokens, 1, rope_dim] (MQA).
|
||||
k = torch.cat([k_pe.squeeze(1), k_nope], dim=-1)
|
||||
k = torch.cat([k_pe.squeeze(-2), k_nope], dim=-1)
|
||||
|
||||
# we only quant q here since k quant is fused with cache insertion
|
||||
q = q.view(-1, self.head_dim)
|
||||
@ -1595,7 +1598,11 @@ class DeepseekV2ForCausalLM(
|
||||
# Determine split axis based on op type
|
||||
# gate/up: ColumnParallel → split along dim 0
|
||||
# down: RowParallel → split along dim 1
|
||||
split_dim = 1 if "down_proj.weight" in name else 0
|
||||
split_dim = (
|
||||
1
|
||||
if ("down_proj.weight" in name and loaded_weight.ndim > 1)
|
||||
else 0
|
||||
)
|
||||
total = loaded_weight.shape[split_dim]
|
||||
assert total % num_chunks == 0, (
|
||||
f"Shared expert weight dim {total} "
|
||||
@ -1608,14 +1615,13 @@ class DeepseekV2ForCausalLM(
|
||||
weight_to_load = loaded_weight
|
||||
|
||||
if is_fusion_moe_shared_experts_layer:
|
||||
if split_dim == 0:
|
||||
weight_to_load = loaded_weight[
|
||||
j * chunk_size : (j + 1) * chunk_size, :
|
||||
]
|
||||
chunk_slice = slice(j * chunk_size, (j + 1) * chunk_size)
|
||||
if loaded_weight.ndim == 1:
|
||||
weight_to_load = loaded_weight[chunk_slice]
|
||||
elif split_dim == 0:
|
||||
weight_to_load = loaded_weight[chunk_slice, :]
|
||||
else:
|
||||
weight_to_load = loaded_weight[
|
||||
:, j * chunk_size : (j + 1) * chunk_size
|
||||
]
|
||||
weight_to_load = loaded_weight[:, chunk_slice]
|
||||
# Synthesize an expert-style name so expert mapping
|
||||
# can route it
|
||||
chunk_name = name.replace(
|
||||
|
||||
@ -94,6 +94,12 @@ class SupportsMultiModal(Protocol):
|
||||
`multimodal_config.mm_encoder_tp_mode="data"`.
|
||||
"""
|
||||
|
||||
requires_raw_input_tokens: ClassVar[bool] = False
|
||||
"""
|
||||
A flag that indicates this model processes input id tokens
|
||||
in their raw form and not input embeddings.
|
||||
"""
|
||||
|
||||
merge_by_field_config: ClassVar[bool | None] = None
|
||||
"""
|
||||
[DEPRECATED] A flag that indicates which implementation of
|
||||
@ -306,6 +312,10 @@ def supports_multimodal_raw_input_only(model: type[object] | object) -> bool:
|
||||
return getattr(model, "supports_multimodal_raw_input_only", False)
|
||||
|
||||
|
||||
def requires_raw_input_tokens(model: type[object] | object) -> bool:
|
||||
return getattr(model, "requires_raw_input_tokens", False)
|
||||
|
||||
|
||||
def supports_multimodal_encoder_tp_data(model: type[object] | object) -> bool:
|
||||
return getattr(model, "supports_encoder_tp_data", False)
|
||||
|
||||
|
||||
@ -139,7 +139,7 @@ class MiniCPMVImageEmbeddingInputs(TensorSchema):
|
||||
type: Literal["image_embeds"]
|
||||
image_embeds: Annotated[
|
||||
torch.Tensor | list[torch.Tensor],
|
||||
TensorShape("bn", "ns", "hs"),
|
||||
TensorShape("bn", "ns", "hs", dynamic_dims={"ns"}),
|
||||
]
|
||||
|
||||
|
||||
|
||||
@ -101,7 +101,7 @@ class Qwen2AudioEmbeddingInputs(TensorSchema):
|
||||
|
||||
audio_embeds: Annotated[
|
||||
list[torch.Tensor],
|
||||
TensorShape("bn", "naf", "hs"),
|
||||
TensorShape("bn", "naf", "hs", dynamic_dims={"naf"}),
|
||||
]
|
||||
|
||||
|
||||
|
||||
@ -118,7 +118,7 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
|
||||
output_lengths = (
|
||||
((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13
|
||||
)
|
||||
return feat_lengths, output_lengths
|
||||
return output_lengths
|
||||
|
||||
|
||||
class Qwen3_VisionPatchEmbed(nn.Module):
|
||||
@ -921,13 +921,11 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
|
||||
if audio_feature_lengths is None and feature_attention_mask is None:
|
||||
audio_output_lengths = []
|
||||
elif audio_feature_lengths is not None:
|
||||
_, audio_output_lens = _get_feat_extract_output_lengths(
|
||||
audio_feature_lengths
|
||||
)
|
||||
audio_output_lens = _get_feat_extract_output_lengths(audio_feature_lengths)
|
||||
audio_output_lengths = audio_output_lens.tolist()
|
||||
elif feature_attention_mask is not None:
|
||||
assert isinstance(feature_attention_mask, torch.Tensor)
|
||||
_, audio_output_lens = _get_feat_extract_output_lengths(
|
||||
audio_output_lens = _get_feat_extract_output_lengths(
|
||||
feature_attention_mask.sum(-1)
|
||||
)
|
||||
audio_output_lengths = audio_output_lens.tolist()
|
||||
@ -1111,18 +1109,16 @@ class Qwen3OmniMoeConditionalGenerationMixin(Qwen2_5OmniConditionalGenerationMix
|
||||
audio_input: Qwen2_5OmniAudioFeatureInputs,
|
||||
audio_hashes: list[str] | None = None,
|
||||
cached_audio_features: torch.Tensor | None = None,
|
||||
) -> torch.Tensor:
|
||||
) -> tuple[torch.Tensor, ...]:
|
||||
input_features = audio_input["input_features"]
|
||||
audio_feature_lengths = audio_input["audio_feature_lengths"]
|
||||
|
||||
audio_feat_lengths, audio_output_lengths = _get_feat_extract_output_lengths(
|
||||
audio_feature_lengths
|
||||
)
|
||||
audio_output_lengths = _get_feat_extract_output_lengths(audio_feature_lengths)
|
||||
|
||||
audio_outputs = self.audio_tower(
|
||||
input_features.to(self.audio_tower.dtype),
|
||||
feature_lens=audio_feature_lengths,
|
||||
aftercnn_lens=audio_feat_lengths,
|
||||
aftercnn_lens=audio_output_lengths,
|
||||
)
|
||||
audio_features = audio_outputs.last_hidden_state
|
||||
return audio_features.split(audio_output_lengths.tolist())
|
||||
@ -1579,7 +1575,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
|
||||
+ st_idx
|
||||
)
|
||||
st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
|
||||
_, audio_len = _get_feat_extract_output_lengths(
|
||||
audio_len = _get_feat_extract_output_lengths(
|
||||
audio_feature_lengths[audio_idx]
|
||||
)
|
||||
llm_pos_ids = (
|
||||
@ -1700,7 +1696,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
|
||||
llm_pos_ids_list.append(bos_block)
|
||||
llm_pos_ids_list.append(bos_block)
|
||||
st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
|
||||
_, audio_len = _get_feat_extract_output_lengths(
|
||||
audio_len = _get_feat_extract_output_lengths(
|
||||
audio_feature_lengths[audio_idx]
|
||||
)
|
||||
audio_llm_pos_ids = (
|
||||
|
||||
@ -46,6 +46,7 @@ from .interfaces import (
|
||||
has_noops,
|
||||
is_attention_free,
|
||||
is_hybrid,
|
||||
requires_raw_input_tokens,
|
||||
supports_cross_encoding,
|
||||
supports_mamba_prefix_caching,
|
||||
supports_multimodal,
|
||||
@ -422,6 +423,7 @@ _MULTIMODAL_MODELS = {
|
||||
),
|
||||
"UltravoxModel": ("ultravox", "UltravoxModel"),
|
||||
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501
|
||||
"VoxtralStreamingGeneration": ("voxtral_streaming", "VoxtralStreamingGeneration"), # noqa: E501
|
||||
# [Encoder-decoder]
|
||||
"WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501
|
||||
}
|
||||
@ -539,6 +541,7 @@ class _ModelInfo:
|
||||
supports_cross_encoding: bool
|
||||
supports_multimodal: bool
|
||||
supports_multimodal_raw_input_only: bool
|
||||
requires_raw_input_tokens: bool
|
||||
supports_multimodal_encoder_tp_data: bool
|
||||
supports_pp: bool
|
||||
has_inner_state: bool
|
||||
@ -562,6 +565,7 @@ class _ModelInfo:
|
||||
supports_multimodal_raw_input_only=supports_multimodal_raw_input_only(
|
||||
model
|
||||
),
|
||||
requires_raw_input_tokens=requires_raw_input_tokens(model),
|
||||
supports_multimodal_encoder_tp_data=supports_multimodal_encoder_tp_data(
|
||||
model
|
||||
),
|
||||
|
||||
@ -163,8 +163,10 @@ def apply_rotary_pos_emb(
|
||||
enable_fp32_compute=True,
|
||||
)
|
||||
|
||||
if is_flash_attn_backend and not current_platform.is_cuda():
|
||||
if is_flash_attn_backend and current_platform.is_cuda():
|
||||
apply_rotary_emb_func = apply_rotary_emb.forward_cuda
|
||||
elif is_flash_attn_backend and current_platform.is_rocm():
|
||||
apply_rotary_emb_func = apply_rotary_emb.forward_hip
|
||||
else:
|
||||
apply_rotary_emb_func = apply_rotary_emb.forward_native
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import inspect
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from functools import cached_property
|
||||
@ -116,10 +117,7 @@ class VoxtralProcessorAdapter:
|
||||
self,
|
||||
audio_length: int,
|
||||
) -> int:
|
||||
pad_audio_length = self._audio_processor.next_multiple_of_chunk_frames(
|
||||
audio_length, self.sampling_rate
|
||||
)
|
||||
return ceil(pad_audio_length / (self.sampling_rate // self.frame_rate))
|
||||
return ceil(audio_length / (self.sampling_rate // self.frame_rate))
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
@ -158,7 +156,14 @@ class VoxtralProcessorAdapter:
|
||||
assert audio.ndim == 1
|
||||
|
||||
# pad if necessary
|
||||
audio = self._audio_processor.pad(audio, self.sampling_rate)
|
||||
# TODO(Patrick) - remove once mistral-common is bumped
|
||||
sig = inspect.signature(self._audio_processor.pad)
|
||||
if "is_online_streaming" in sig.parameters:
|
||||
audio = self._audio_processor.pad(
|
||||
audio, self.sampling_rate, is_online_streaming=False
|
||||
)
|
||||
else:
|
||||
audio = self._audio_processor.pad(audio, self.sampling_rate)
|
||||
|
||||
audio_tokens = [self.begin_audio_token_id] + [
|
||||
self.audio_token_id
|
||||
@ -510,6 +515,7 @@ class VoxtralForConditionalGeneration(
|
||||
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
|
||||
remapping_rules = [
|
||||
(r"mm_streams_embeddings.embedding_module\.(.*)", r"\1"),
|
||||
(r"mm_whisper_embeddings\.(.*)", r"\1"),
|
||||
(r"audio_language_projection\.(.*)", r"audio_language_adapter.\1"),
|
||||
(
|
||||
@ -535,13 +541,16 @@ class VoxtralForConditionalGeneration(
|
||||
def llm_weights_generator():
|
||||
nonlocal loaded_weights
|
||||
for name, w in weights:
|
||||
is_encoder = (
|
||||
name.startswith("mm_whisper_embeddings")
|
||||
and not name.startswith("mm_whisper_embeddings.tok_embeddings")
|
||||
and not name.startswith(
|
||||
"mm_whisper_embeddings.audio_language_projection"
|
||||
is_encoder = False
|
||||
for k in [
|
||||
"mm_whisper_embeddings",
|
||||
"mm_streams_embeddings.embedding_module",
|
||||
]:
|
||||
is_encoder |= (
|
||||
name.startswith(k)
|
||||
and not name.startswith(f"{k}.tok_embeddings")
|
||||
and not name.startswith(f"{k}.audio_language_projection")
|
||||
)
|
||||
)
|
||||
|
||||
for pattern, repl in remapping_rules:
|
||||
if re.fullmatch(pattern, name):
|
||||
@ -676,6 +685,7 @@ class VoxtralEncoderModel(nn.Module):
|
||||
packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
|
||||
|
||||
mistral_remapping = [
|
||||
(r"mm_streams_embeddings.embedding_module\.(.*)", r"\1"),
|
||||
(
|
||||
r"whisper_encoder\.conv_layers\.0\.(weight|bias)",
|
||||
r"whisper_encoder.conv1.\1",
|
||||
@ -684,6 +694,14 @@ class VoxtralEncoderModel(nn.Module):
|
||||
r"whisper_encoder\.conv_layers\.1\.(weight|bias)",
|
||||
r"whisper_encoder.conv2.\1",
|
||||
),
|
||||
(
|
||||
r"whisper_encoder\.conv_layers\.0\.conv\.(weight|bias)",
|
||||
r"whisper_encoder.conv1.\1",
|
||||
), # noqa: E501
|
||||
(
|
||||
r"whisper_encoder\.conv_layers\.1\.conv\.(weight|bias)",
|
||||
r"whisper_encoder.conv2.\1",
|
||||
), # noqa: E501
|
||||
(
|
||||
r"whisper_encoder\.transformer\.layers\.(\d+)\.attention\.w([qkv])\.(weight|bias)", # noqa: E501
|
||||
r"whisper_encoder.layers.\1.self_attn.\2_proj.\3",
|
||||
|
||||
243
vllm/model_executor/models/voxtral_streaming.py
Normal file
243
vllm/model_executor/models/voxtral_streaming.py
Normal file
@ -0,0 +1,243 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import math
|
||||
from collections.abc import Mapping
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.config.vllm import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.models.interfaces import MultiModalEmbeddings
|
||||
from vllm.model_executor.models.voxtral import (
|
||||
VoxtralDummyInputsBuilder,
|
||||
VoxtralForConditionalGeneration,
|
||||
VoxtralMultiModalProcessor,
|
||||
VoxtralProcessingInfo,
|
||||
)
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.cache import _I, BaseMultiModalProcessorCache
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalKwargsOptionalItems,
|
||||
)
|
||||
from vllm.multimodal.parse import MultiModalDataItems
|
||||
from vllm.multimodal.processing import (
|
||||
MultiModalPromptUpdates,
|
||||
PlaceholderFeaturesInfo,
|
||||
)
|
||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||
from vllm.sequence import IntermediateTensors
|
||||
|
||||
from .utils import (
|
||||
_flatten_embeddings,
|
||||
)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor):
    """Voxtral multimodal processor variant used for streaming audio input.

    Differs from the parent processor in two ways visible here: the processor
    cache is always disabled, and placeholder positions are built by hand
    instead of being located in the prompt.
    """

    def __init__(
        self,
        info: _I,
        dummy_inputs: BaseDummyInputsBuilder[_I],
        *,
        cache: BaseMultiModalProcessorCache | None = None,
    ) -> None:
        """Initialise the parent processor, ignoring any ``cache`` passed in."""
        # streaming can't make use of a cache yet
        super().__init__(info, dummy_inputs, cache=None)

    def _maybe_apply_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        prompt_ids: list[int],
        mm_kwargs: MultiModalKwargsOptionalItems,
        mm_prompt_updates: MultiModalPromptUpdates,
        is_update_applied: bool,
    ) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]:
        """Return ``prompt_ids`` unchanged plus one synthetic audio placeholder.

        Only ``prompt_ids`` and ``mm_kwargs`` are used; the remaining
        parameters are accepted for signature compatibility with the parent.

        Raises:
            AssertionError: if ``mm_kwargs`` does not hold exactly one audio
                item.
        """
        # there are no placeholder audio tokens for streaming
        # so we need to build the placeholder positions manually

        # in streaming there is always only one audio input
        audio_items = mm_kwargs.get("audio", [])
        assert len(audio_items) == 1, (
            f"Expected only one audio input for streaming, got {mm_kwargs=}"
        )
        audio_config = self.info.get_tokenizer().instruct.audio_encoder.audio_config

        # first dimension of the stored waveform is taken as the sample count
        sample_count = audio_items[0]["audio_arrays"].data.shape[0]
        num_tokens = audio_config.num_audio_tokens(sample_count)

        placeholder = PlaceholderFeaturesInfo(
            modality="audio",
            item_idx=0,
            start_idx=0,
            # only used for length computation, so we can take dummy inputs
            tokens=[0] * num_tokens,
            is_embed=None,
        )
        return prompt_ids, {"audio": [placeholder]}
|
||||
|
||||
|
||||
class TimeEmbedding(torch.nn.Module):
    """Sinusoidal embedding for encoding time.

    Args:
        dim: Size of the produced embedding. The first ``dim // 2`` channels
            hold cosines and the remaining ``dim // 2`` hold sines, so an odd
            ``dim`` yields ``dim - 1`` output channels.
        theta: Base of the geometric frequency progression.
    """

    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        super().__init__()
        self.dim = dim
        self.theta = theta
        half_dim = self.dim // 2
        # inv_freq[i] = theta ** (-i / half_dim)
        inv_freq = torch.exp(
            -math.log(self.theta) * torch.arange(half_dim).float() / half_dim
        )
        # Non-persistent: derived from (dim, theta), so kept out of state_dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    def forward(self, t: torch.Tensor) -> torch.Tensor:
        """Embed time values ``t``.

        Args:
            t: Tensor of shape ``(B,)`` or ``(B, T)``. Integer or bool tensors
                are promoted to the dtype of ``inv_freq`` before embedding.

        Returns:
            Tensor of shape ``(B, D)`` or ``(B, T, D)`` with
            ``D = 2 * (dim // 2)``, in the (possibly promoted) dtype of ``t``.
        """
        # Casting inv_freq to an integer dtype would truncate every frequency
        # to 0 or 1 and silently yield a wrong embedding, so promote t instead.
        if not (t.is_floating_point() or t.is_complex()):
            t = t.to(self.inv_freq.dtype)

        t = t[..., None]  # (B,) -> (B, 1) or (B, T) -> (B, T, 1)
        inv_freq = self.inv_freq.to(device=t.device, dtype=t.dtype)
        # (B, 1) x (D/2,) -> (B, D/2) or (B, T, 1) x (D/2,) -> (B, T, D/2)
        emb = t * inv_freq
        return torch.cat((emb.cos(), emb.sin()), dim=-1)  # (B, D) or (B, T, D)
|
||||
|
||||
|
||||
@MULTIMODAL_REGISTRY.register_processor(
    VoxtralStreamingMultiModalProcessor,
    info=VoxtralProcessingInfo,
    dummy_inputs=VoxtralDummyInputsBuilder,
)
class VoxtralStreamingGeneration(VoxtralForConditionalGeneration):
    """Streaming variant of the Voxtral conditional-generation model.

    ``embed_multimodal`` produces post-conv audio embeddings,
    ``embed_input_ids`` forwards them unchanged, and ``forward`` runs the
    remaining audio encoder layers, adds the text-token embeddings and a
    time embedding, and calls the language model.
    """

    # forward() embeds the raw input ids itself, so it needs them as tokens.
    requires_raw_input_tokens = True

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config, prefix=prefix)
        self.time_embedding: TimeEmbedding = TimeEmbedding(
            dim=self.config.text_config.hidden_size
        )

        audio_config = self.audio_config
        # transcription delay expressed in audio tokens (frames)
        _n_delay_tokens = (
            audio_config.frame_rate * audio_config.transcription_delay_ms / 1000
        )
        assert _n_delay_tokens.is_integer(), (
            f"n_delay_tokens must be integer, got {_n_delay_tokens}"
        )

        self.n_delay_tokens = int(_n_delay_tokens)

    @property
    def audio_config(self):
        """Audio config taken from the tokenizer's audio encoder."""
        return self.tokenizer.instruct.audio_encoder.audio_config

    def embed_input_ids(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: MultiModalEmbeddings | None = None,
        *,
        is_multimodal: torch.Tensor | None = None,
        # Multi-modal token ID may exceed vocab size
        handle_oov_mm_token: bool = True,
    ) -> torch.Tensor:
        """Pass post-conv embeddings directly as input.

        ``input_ids``, ``is_multimodal`` and ``handle_oov_mm_token`` are not
        used here; the text tokens are embedded later in ``forward``.

        Raises:
            AssertionError: if no multimodal embeddings are provided.
        """
        # for streaming we simply flatten the multimodal embeddings
        # to be in tensor format, we treat the input ids later
        assert multimodal_embeddings is not None
        assert len(multimodal_embeddings) > 0, (
            "For streaming you must provide a multimodal_embedding at every step."
        )
        mm_embeds_flat = _flatten_embeddings(multimodal_embeddings)
        return mm_embeds_flat

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs: object,
    ) -> torch.Tensor | IntermediateTensors:
        """Combine audio and text embeddings and run the language model.

        ``inputs_embeds`` is expected to hold the pooled post-conv audio
        embeddings returned by ``embed_input_ids``; both it and ``input_ids``
        are required.
        """
        assert inputs_embeds is not None
        assert input_ids is not None

        # Undo the block pooling applied in embed_multimodal:
        # (N, H * pool_size) -> (N * pool_size, H)
        pool_size = self.config.audio_config.block_pool_size
        inputs_embeds = inputs_embeds.view(
            inputs_embeds.shape[0] * pool_size, inputs_embeds.shape[1] // pool_size
        )

        audio_hidden_states = self.whisper_encoder.whisper_encoder.forward_layers(
            inputs_embeds
        )

        # Stack `downsample_factor` consecutive frames along the feature axis.
        num_tokens, audio_hidden_size = audio_hidden_states.shape
        assert num_tokens % self.downsample_factor == 0
        audio_hidden_states = audio_hidden_states.reshape(
            num_tokens // self.downsample_factor,
            audio_hidden_size * self.downsample_factor,
        )
        audio_text_embeds = self.audio_language_adapter(audio_hidden_states)

        text_embeds = self.language_model.embed_input_ids(input_ids)

        # sum pool text and audio embeddings
        inputs_embeds = audio_text_embeds + text_embeds

        # The same delay embedding is broadcast-added to every position.
        time_tensor = torch.tensor(
            [self.n_delay_tokens],
            device=inputs_embeds.device,
            dtype=inputs_embeds.dtype,
        )
        inputs_embeds = inputs_embeds + self.time_embedding(time_tensor)

        hidden_states = self.language_model.model(
            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
        )

        return hidden_states

    def embed_multimodal(
        self, **kwargs
    ) -> list[torch.Tensor] | torch.Tensor | tuple[torch.Tensor, ...] | None:
        """Transform audio waveforms -> initial whisper post-conv embeddings.

        Returns one tensor per input audio, with ``block_pool_size``
        consecutive frames merged into a single row.

        Raises:
            AssertionError: if no audio input is given, if a waveform length
                is not a multiple of ``raw_audio_length_per_tok``, or if a
                per-sample embedding length is not a multiple of
                ``block_pool_size``.
        """
        audio_inputs = self._parse_and_validate_audio_arrays(**kwargs)

        assert audio_inputs is not None, (
            "For streaming you must provide an audio input at every step."
        )

        multiple_of = self.audio_config.raw_audio_length_per_tok
        # The walrus keeps the offending length available for the message.
        assert all(
            (this_audio := audio.shape[0]) % multiple_of == 0 for audio in audio_inputs
        ), (
            f"Every input audio waveform has to be a multiple of {multiple_of}, but"
            f" one is {this_audio} with {(this_audio / multiple_of)=}."
        )

        mel_features = [
            self.whisper_encoder.compute_whisper_melspec(audio).to(
                self.whisper_encoder.dtype
            )
            for audio in audio_inputs
        ]
        seq_lens = [mel.shape[1] for mel in mel_features]
        # [total_num_20ms_frames, hidden_size]
        audio_embeddings = self.whisper_encoder.whisper_encoder.forward_conv(
            mel_features
        )[0]
        conv_stride = self.whisper_encoder.whisper_encoder.total_stride
        audio_embeddings_per_sample = audio_embeddings.split(
            [s // conv_stride for s in seq_lens], dim=0
        )

        # audio_embeddings per sample need to be divisible by the pool size
        pool_size = self.config.audio_config.block_pool_size
        assert all(
            (this_shape := sample.shape[0]) % pool_size == 0
            for sample in audio_embeddings_per_sample
        ), (
            f"Every audio embedding has to be a multiple of {pool_size}, "
            f"but one is {this_shape}."
        )

        # (N, H) -> (N / pool_size, H * pool_size); reversed again in forward()
        audio_embeddings_per_sample = [
            e.view(e.shape[0] // pool_size, e.shape[1] * pool_size)
            for e in audio_embeddings_per_sample
        ]
        return audio_embeddings_per_sample
|
||||
@ -1,9 +1,11 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import enum
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from contextlib import nullcontext
|
||||
from functools import partial
|
||||
from typing import Annotated, Literal, cast
|
||||
|
||||
import numpy as np
|
||||
@ -16,7 +18,10 @@ from transformers import (
|
||||
)
|
||||
from transformers.models.whisper.modeling_whisper import sinusoids
|
||||
|
||||
from vllm.attention.layer import Attention, AttentionType
|
||||
from vllm.attention.backends.abstract import (
|
||||
AttentionType,
|
||||
)
|
||||
from vllm.attention.layer import Attention
|
||||
from vllm.attention.layers.cross_attention import CrossAttention
|
||||
from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
|
||||
from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
|
||||
@ -34,6 +39,11 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from vllm.model_executor.models.whisper_utils import (
|
||||
ISO639_1_SUPPORTED_LANGS,
|
||||
WhisperAttentionWithBlockPooling,
|
||||
WhisperCausalConv1d,
|
||||
)
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalDataDict,
|
||||
@ -64,67 +74,11 @@ from .utils import (
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages
|
||||
|
||||
ISO639_1_SUPPORTED_LANGS = {
|
||||
"af": "Afrikaans",
|
||||
"ar": "Arabic",
|
||||
"hy": "Armenian",
|
||||
"az": "Azerbaijani",
|
||||
"be": "Belarusian",
|
||||
"bs": "Bosnian",
|
||||
"bg": "Bulgarian",
|
||||
"ca": "Catalan",
|
||||
"zh": "Chinese",
|
||||
"hr": "Croatian",
|
||||
"cs": "Czech",
|
||||
"da": "Danish",
|
||||
"nl": "Dutch",
|
||||
"en": "English",
|
||||
"et": "Estonian",
|
||||
"fi": "Finnish",
|
||||
"fr": "French",
|
||||
"gl": "Galician",
|
||||
"de": "German",
|
||||
"el": "Greek",
|
||||
"he": "Hebrew",
|
||||
"hi": "Hindi",
|
||||
"hu": "Hungarian",
|
||||
"is": "Icelandic",
|
||||
"id": "Indonesian",
|
||||
"it": "Italian",
|
||||
"ja": "Japanese",
|
||||
"kn": "Kannada",
|
||||
"kk": "Kazakh",
|
||||
"ko": "Korean",
|
||||
"lv": "Latvian",
|
||||
"lt": "Lithuanian",
|
||||
"mk": "Macedonian",
|
||||
"ms": "Malay",
|
||||
"mr": "Marathi",
|
||||
"mi": "Maori",
|
||||
"ne": "Nepali",
|
||||
"no": "Norwegian",
|
||||
"fa": "Persian",
|
||||
"pl": "Polish",
|
||||
"pt": "Portuguese",
|
||||
"ro": "Romanian",
|
||||
"ru": "Russian",
|
||||
"sr": "Serbian",
|
||||
"sk": "Slovak",
|
||||
"sl": "Slovenian",
|
||||
"es": "Spanish",
|
||||
"sw": "Swahili",
|
||||
"sv": "Swedish",
|
||||
"tl": "Tagalog",
|
||||
"ta": "Tamil",
|
||||
"th": "Thai",
|
||||
"tr": "Turkish",
|
||||
"uk": "Ukrainian",
|
||||
"ur": "Urdu",
|
||||
"vi": "Vietnamese",
|
||||
"cy": "Welsh",
|
||||
}
|
||||
class WhisperPosEmbedType(enum.Enum):
    """Kinds of positional embedding, keyed by their config string value."""

    SINUSOIDAL = "sinusoidal"
    NOPE = "nope"
    LEARNED = "learned"
|
||||
|
||||
|
||||
class WhisperAudioInputs(TensorSchema):
|
||||
@ -184,6 +138,8 @@ class WhisperAttention(nn.Module):
|
||||
num_heads: int,
|
||||
bias: bool = True,
|
||||
attn_type: AttentionType = AttentionType.DECODER,
|
||||
per_layer_sliding_window: int | None = None,
|
||||
block_pool_size: int = 1,
|
||||
cache_config: CacheConfig | None = None,
|
||||
quant_config: QuantizationConfig | None = None,
|
||||
prefix: str = "",
|
||||
@ -242,7 +198,14 @@ class WhisperAttention(nn.Module):
|
||||
attn_type=self.attn_type,
|
||||
)
|
||||
else: # AttentionType.DECODER (regular decoder self-attention)
|
||||
self.attn = Attention(
|
||||
if block_pool_size > 1:
|
||||
attn_cls = partial(
|
||||
WhisperAttentionWithBlockPooling, block_pool_size=block_pool_size
|
||||
)
|
||||
else:
|
||||
attn_cls = Attention
|
||||
|
||||
self.attn = attn_cls(
|
||||
self.num_heads,
|
||||
self.head_dim,
|
||||
self.scaling,
|
||||
@ -251,6 +214,7 @@ class WhisperAttention(nn.Module):
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.attn",
|
||||
attn_type=self.attn_type,
|
||||
per_layer_sliding_window=per_layer_sliding_window,
|
||||
)
|
||||
|
||||
def _init_qkv(
|
||||
@ -386,6 +350,9 @@ class WhisperEncoderLayer(nn.Module):
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__()
|
||||
config = vllm_config.model_config.hf_config
|
||||
is_causal = getattr(config, "is_causal", False)
|
||||
sliding_window = getattr(config, "sliding_window", None)
|
||||
block_pool_size = getattr(config, "block_pool_size", 1)
|
||||
cache_config = vllm_config.cache_config
|
||||
quant_config = vllm_config.quant_config
|
||||
|
||||
@ -393,7 +360,9 @@ class WhisperEncoderLayer(nn.Module):
|
||||
self.self_attn = WhisperAttention(
|
||||
embed_dim=self.embed_dim,
|
||||
num_heads=config.encoder_attention_heads,
|
||||
attn_type=AttentionType.ENCODER,
|
||||
attn_type=AttentionType.DECODER if is_causal else AttentionType.ENCODER,
|
||||
block_pool_size=block_pool_size,
|
||||
per_layer_sliding_window=sliding_window,
|
||||
cache_config=cache_config,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.self_attn",
|
||||
@ -492,12 +461,21 @@ class WhisperEncoder(nn.Module):
|
||||
super().__init__()
|
||||
config = vllm_config.model_config.hf_config
|
||||
embed_dim = config.d_model
|
||||
|
||||
self.pos_embed_type = WhisperPosEmbedType(
|
||||
getattr(config, "pos_embed", "sinusoidal")
|
||||
)
|
||||
self.num_mel_bins = config.num_mel_bins
|
||||
self.max_source_positions = config.max_source_positions
|
||||
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
|
||||
|
||||
self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
|
||||
self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)
|
||||
is_causal = getattr(config, "is_causal", False)
|
||||
Conv1d = WhisperCausalConv1d if is_causal else partial(nn.Conv1d, padding=1)
|
||||
|
||||
self.conv1 = Conv1d(self.num_mel_bins, embed_dim, kernel_size=3)
|
||||
self.conv2 = Conv1d(embed_dim, embed_dim, stride=2, kernel_size=3)
|
||||
|
||||
self.total_stride = self.conv1.stride[0] * self.conv2.stride[0]
|
||||
self.start_layer, self.end_layer, self.layers = make_layers(
|
||||
config.encoder_layers,
|
||||
lambda prefix: WhisperEncoderLayer(
|
||||
@ -507,29 +485,54 @@ class WhisperEncoder(nn.Module):
|
||||
)
|
||||
self.layer_norm = nn.LayerNorm(config.d_model)
|
||||
|
||||
maybe_fp32_init_ctx = (
|
||||
set_default_torch_dtype(torch.float32) if init_in_fp32 else nullcontext()
|
||||
)
|
||||
|
||||
with (
|
||||
torch.no_grad(),
|
||||
maybe_fp32_init_ctx,
|
||||
if is_causal and self.pos_embed_type != WhisperPosEmbedType.NOPE:
|
||||
raise ValueError(
|
||||
"Only NOPE position embeddings are supported "
|
||||
f"for causal models, but got {self.pos_embed_type}"
|
||||
)
|
||||
elif self.pos_embed_type in (
|
||||
WhisperPosEmbedType.SINUSOIDAL,
|
||||
WhisperPosEmbedType.LEARNED,
|
||||
):
|
||||
self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim)
|
||||
self.embed_positions.weight.copy_(
|
||||
sinusoids(*self.embed_positions.weight.shape)
|
||||
maybe_fp32_init_ctx = (
|
||||
set_default_torch_dtype(torch.float32)
|
||||
if init_in_fp32
|
||||
else nullcontext()
|
||||
)
|
||||
|
||||
def forward(self, input_features: torch.Tensor | list[torch.Tensor]):
|
||||
with (
|
||||
torch.no_grad(),
|
||||
maybe_fp32_init_ctx,
|
||||
):
|
||||
self.embed_positions = nn.Embedding(
|
||||
self.max_source_positions, embed_dim
|
||||
)
|
||||
self.embed_positions.weight.copy_(
|
||||
sinusoids(*self.embed_positions.weight.shape)
|
||||
)
|
||||
|
||||
def forward_conv(
|
||||
self, input_features: torch.Tensor | list[torch.Tensor]
|
||||
) -> torch.Tensor:
|
||||
hidden_states = []
|
||||
input_is_batched = False
|
||||
for features in input_features:
|
||||
embeds = nn.functional.gelu(self.conv1(features))
|
||||
embeds = nn.functional.gelu(self.conv2(embeds))
|
||||
embeds = embeds.transpose(-1, -2)
|
||||
embeds = (embeds + self.embed_positions.weight[: embeds.size(-2), :]).to(
|
||||
embeds.dtype
|
||||
)
|
||||
|
||||
if self.pos_embed_type in (
|
||||
WhisperPosEmbedType.SINUSOIDAL,
|
||||
WhisperPosEmbedType.LEARNED,
|
||||
):
|
||||
embeds = embeds.transpose(-1, -2)
|
||||
embeds = (
|
||||
embeds + self.embed_positions.weight[: embeds.size(-2), :]
|
||||
).to(embeds.dtype)
|
||||
elif self.pos_embed_type == WhisperPosEmbedType.NOPE:
|
||||
embeds = embeds.transpose(-1, -2).to(embeds.dtype)
|
||||
else:
|
||||
raise ValueError(f"Unknown pos_embed_type: {self.pos_embed_type}")
|
||||
|
||||
hidden_states.append(embeds)
|
||||
input_is_batched = embeds.ndim > 2
|
||||
# Input to MHA must be B x T x D
|
||||
@ -539,12 +542,19 @@ class WhisperEncoder(nn.Module):
|
||||
else:
|
||||
hidden_states = torch.stack(hidden_states, dim=0)
|
||||
|
||||
return hidden_states
|
||||
|
||||
def forward_layers(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
for encoder_layer in self.layers:
|
||||
hidden_states = encoder_layer(hidden_states)
|
||||
|
||||
hidden_states = self.layer_norm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
def forward(self, input_features: torch.Tensor | list[torch.Tensor]):
|
||||
hidden_states = self.forward_conv(input_features)
|
||||
return self.forward_layers(hidden_states)
|
||||
|
||||
|
||||
class WhisperDecoder(nn.Module):
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
|
||||
299
vllm/model_executor/models/whisper_utils.py
Normal file
299
vllm/model_executor/models/whisper_utils.py
Normal file
@ -0,0 +1,299 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import copy
|
||||
import functools
|
||||
import math
|
||||
from dataclasses import replace
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
|
||||
from vllm.attention.backends.abstract import (
|
||||
AttentionBackend,
|
||||
AttentionMetadata,
|
||||
AttentionType,
|
||||
)
|
||||
from vllm.attention.layer import Attention
|
||||
from vllm.attention.selector import get_attn_backend
|
||||
from vllm.config import CacheConfig, VllmConfig
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
|
||||
from vllm.v1.attention.backends.utils import (
|
||||
CommonAttentionMetadata,
|
||||
subclass_attention_backend_with_overrides,
|
||||
)
|
||||
from vllm.v1.kv_cache_interface import AttentionSpec
|
||||
|
||||
# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages
|
||||
# Maps ISO 639-1 language codes to English language names.
ISO639_1_SUPPORTED_LANGS = {
    "af": "Afrikaans",
    "ar": "Arabic",
    "hy": "Armenian",
    "az": "Azerbaijani",
    "be": "Belarusian",
    "bs": "Bosnian",
    "bg": "Bulgarian",
    "ca": "Catalan",
    "zh": "Chinese",
    "hr": "Croatian",
    "cs": "Czech",
    "da": "Danish",
    "nl": "Dutch",
    "en": "English",
    "et": "Estonian",
    "fi": "Finnish",
    "fr": "French",
    "gl": "Galician",
    "de": "German",
    "el": "Greek",
    "he": "Hebrew",
    "hi": "Hindi",
    "hu": "Hungarian",
    "is": "Icelandic",
    "id": "Indonesian",
    "it": "Italian",
    "ja": "Japanese",
    "kn": "Kannada",
    "kk": "Kazakh",
    "ko": "Korean",
    "lv": "Latvian",
    "lt": "Lithuanian",
    "mk": "Macedonian",
    "ms": "Malay",
    "mr": "Marathi",
    "mi": "Maori",
    "ne": "Nepali",
    "no": "Norwegian",
    "fa": "Persian",
    "pl": "Polish",
    "pt": "Portuguese",
    "ro": "Romanian",
    "ru": "Russian",
    "sr": "Serbian",
    "sk": "Slovak",
    "sl": "Slovenian",
    "es": "Spanish",
    "sw": "Swahili",
    "sv": "Swedish",
    "tl": "Tagalog",
    "ta": "Tamil",
    "th": "Thai",
    "tr": "Turkish",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "vi": "Vietnamese",
    "cy": "Welsh",
}
|
||||
|
||||
|
||||
def _pad1d(
    x: torch.Tensor,
    paddings: tuple[int, int],
    mode: str = "constant",
    value: float = 0.0,
) -> torch.Tensor:
    """Pad the last dimension of ``x``, allowing reflect padding on short inputs.

    This is a thin wrapper around ``F.pad``. In ``"reflect"`` mode, when the
    input is not longer than the largest requested padding, zeros are first
    appended on the right so the reflection can be computed, and the same
    number of trailing elements is trimmed from the result afterwards.

    Args:
        x: Tensor whose last dimension is padded.
        paddings: ``(left, right)`` padding amounts; both must be >= 0.
        mode: Padding mode forwarded to ``F.pad``.
        value: Fill value forwarded to ``F.pad``.

    Returns:
        The padded tensor.

    Raises:
        ValueError: If either padding amount is negative.
    """
    length = x.shape[-1]
    padding_left, padding_right = paddings
    # Raise explicitly instead of asserting: asserts are stripped under
    # ``python -O`` and a negative pad would then silently crop the input.
    if padding_left < 0 or padding_right < 0:
        raise ValueError(
            f"paddings must be non-negative, got {(padding_left, padding_right)}"
        )

    if mode != "reflect":
        return F.pad(x, paddings, mode, value)

    max_pad = max(padding_left, padding_right)
    extra_pad = 0
    if length <= max_pad:
        # Temporarily lengthen the input so that it exceeds the padding.
        extra_pad = max_pad - length + 1
        x = F.pad(x, (0, extra_pad))
    padded = F.pad(x, paddings, mode, value)
    end = padded.shape[-1] - extra_pad
    return padded[..., :end]


class WhisperCausalConv1d(nn.Conv1d):
    """``nn.Conv1d`` that zero-pads its input on the left before convolving.

    The left padding is ``effective_kernel_size - stride``. The right side is
    padded only as far as needed to complete the last stride window. The
    ``padding`` argument is forwarded to ``nn.Conv1d`` unchanged and is applied
    in addition to the padding added in ``forward``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        bias: bool = True,
    ) -> None:
        super().__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            bias=bias,
        )
        self._stride = self.stride[0]
        # Span of the kernel along the input once dilation is accounted for.
        self._effective_kernel_size = (kernel_size - 1) * self.dilation[0] + 1
        self._padding_total = self._effective_kernel_size - self._stride

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Left-pad ``x`` along its last dimension and apply the convolution.

        Raises:
            ValueError: If the computed left padding is negative, i.e. the
                stride exceeds the effective kernel size.
        """
        # Frame count for the left-padded input; fractional when the input
        # length does not line up with the stride.
        n_frames = (
            x.shape[-1] - self._effective_kernel_size + self._padding_total
        ) / self._stride + 1
        # Input length needed for ``ceil(n_frames)`` complete frames.
        target_length = (math.ceil(n_frames) - 1) * self._stride + (
            self._effective_kernel_size - self._padding_total
        )
        extra_padding = target_length - x.shape[-1]
        x = _pad1d(x, (self._padding_total, extra_padding), mode="constant")
        return super().forward(x)
|
||||
|
||||
|
||||
@functools.lru_cache
def create_whisper_attention_backend_with_block_pooling(
    underlying_attn_backend: type[AttentionBackend], block_pool_size: int
) -> type[AttentionBackend]:
    """Derive a block-pooling variant of ``underlying_attn_backend``.

    The returned backend subclass overrides two hooks:

    * ``get_builder_cls`` returns a metadata builder that multiplies the
      query/sequence bookkeeping by ``block_pool_size`` and expands every
      slot-mapping entry into ``block_pool_size`` consecutive entries.
    * ``get_kv_cache_shape`` returns a cache shape whose block dimension is
      ``block_pool_size`` times larger and whose KV-head dimension is
      ``block_pool_size`` times smaller.

    Results are memoized per ``(backend, block_pool_size)`` pair.

    Args:
        underlying_attn_backend: Backend class to derive from. Only subclasses
            of ``FlashAttentionBackend`` are supported.
        block_pool_size: Pooling factor applied to blocks and KV heads.

    Returns:
        The derived attention backend class.

    Raises:
        NotImplementedError: If the backend is not a ``FlashAttentionBackend``
            subclass.
    """
    # Fail fast, before any class is derived from an unsupported backend.
    if not issubclass(underlying_attn_backend, FlashAttentionBackend):
        raise NotImplementedError(
            f"{underlying_attn_backend} is not yet supported. "
            "Contributions to support more backends are much "
            "appreciated."
        )

    prefix = "WhisperAttentionWithBlockPooling_"
    underlying_builder = underlying_attn_backend.get_builder_cls()

    class WhisperAttentionWithBlockPoolingBuilder(underlying_builder):  # type: ignore
        """Metadata builder that rescales its inputs by ``block_pool_size``."""

        def __init__(
            self,
            kv_cache_spec: AttentionSpec,
            layer_names: list[str],
            vllm_config: VllmConfig,
            device: torch.device,
        ):
            assert kv_cache_spec.num_kv_heads % block_pool_size == 0
            # Trade KV heads for block length: each block is stretched by the
            # pool size while the number of heads shrinks by the same factor.
            kv_cache_spec = replace(
                kv_cache_spec,
                block_size=kv_cache_spec.block_size * block_pool_size,
                num_kv_heads=kv_cache_spec.num_kv_heads // block_pool_size,
            )
            super().__init__(kv_cache_spec, layer_names, vllm_config, device)

        def build(
            self,
            common_prefix_len: int,
            common_attn_metadata: CommonAttentionMetadata,
            fast_build: bool = False,
        ) -> AttentionMetadata:
            # Work on a deep copy: the scaling below is done in place and must
            # not modify the metadata object owned by the caller.
            new_common_attn_metadata = copy.deepcopy(common_attn_metadata)
            new_common_attn_metadata.query_start_loc *= block_pool_size
            new_common_attn_metadata.query_start_loc_cpu *= block_pool_size
            new_common_attn_metadata.seq_lens *= block_pool_size
            new_common_attn_metadata._seq_lens_cpu *= block_pool_size
            new_common_attn_metadata._num_computed_tokens_cpu *= block_pool_size
            new_common_attn_metadata.num_actual_tokens *= block_pool_size
            new_common_attn_metadata.max_query_len *= block_pool_size
            new_common_attn_metadata.max_seq_len *= block_pool_size
            original_slot_mapping = common_attn_metadata.slot_mapping
            common_prefix_len *= block_pool_size
            # Expand slot ``s`` into ``s * pool + [0, ..., pool - 1]``.
            pool_offsets = torch.arange(
                block_pool_size, device=original_slot_mapping.device
            )
            # Negative inputs expand to values in ``[-pool, -1]``; the clamp
            # collapses them back to -1.
            # NOTE(review): -1 is presumably the padding slot id; confirm.
            new_common_attn_metadata.slot_mapping = (
                (original_slot_mapping.unsqueeze(1) * block_pool_size + pool_offsets)
                .flatten()
                .clamp(min=-1)
            )
            return super().build(
                common_prefix_len, new_common_attn_metadata, fast_build
            )

    def _get_builder_cls():
        return WhisperAttentionWithBlockPoolingBuilder

    # TODO: generalize to other backends
    def _get_kv_cache_shape(
        num_blocks, block_size, num_kv_heads, head_size, cache_dtype_str
    ):
        return (
            2,
            num_blocks,
            # we stretch each block by `block_pool_size`
            block_size * block_pool_size,
            num_kv_heads // block_pool_size,
            head_size,
        )

    attn_backend = subclass_attention_backend_with_overrides(
        name_prefix=prefix,
        attention_backend_cls=underlying_attn_backend,
        overrides={
            "get_builder_cls": _get_builder_cls,
            "get_kv_cache_shape": _get_kv_cache_shape,
        },
    )

    return attn_backend
|
||||
|
||||
|
||||
class WhisperAttentionWithBlockPooling(Attention):
    """Attention layer with block pooling.

    The layer selects the regular attention backend for its configuration and
    wraps it with ``create_whisper_attention_backend_with_block_pooling``
    before handing it to ``Attention``.
    """

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int | None = None,
        alibi_slopes: list[float] | None = None,
        cache_config: CacheConfig | None = None,
        quant_config: QuantizationConfig | None = None,
        logits_soft_cap: float | None = None,
        per_layer_sliding_window: int | None = None,
        prefix: str = "",
        attn_type: str = AttentionType.DECODER,
        kv_sharing_target_layer_name: str | None = None,
        block_pool_size: int = 1,
        attn_backend: type[AttentionBackend] | None = None,
        **extra_impl_args,
    ) -> None:
        """Build the layer on top of a block-pooling attention backend.

        Args:
            block_pool_size: Pooling factor; must be at least 1.
            attn_backend: Accepted for signature compatibility only. The value
                is replaced by the derived block-pooling backend.

        All other arguments are forwarded to ``Attention`` unchanged.

        Raises:
            ValueError: If ``block_pool_size`` is smaller than 1.
        """
        # Reject values that would otherwise fail later as a division by zero
        # when KV heads are divided by the pool size.
        if block_pool_size < 1:
            raise ValueError(
                f"block_pool_size must be a positive integer, got {block_pool_size}"
            )
        self.block_pool_size = block_pool_size
        dtype = torch.get_default_dtype()

        if cache_config is not None:
            kv_cache_dtype = cache_config.cache_dtype
            block_size = cache_config.block_size
        else:
            kv_cache_dtype = "auto"
            block_size = 16

        underlying_attn_backend = get_attn_backend(
            head_size,
            dtype,
            kv_cache_dtype,
            block_size,
            attn_type=attn_type,
        )
        # Overrides whatever the caller passed as ``attn_backend``.
        attn_backend = create_whisper_attention_backend_with_block_pooling(
            underlying_attn_backend, block_pool_size
        )

        super().__init__(
            num_heads=num_heads,
            head_size=head_size,
            scale=scale,
            num_kv_heads=num_kv_heads,
            alibi_slopes=alibi_slopes,
            cache_config=cache_config,
            quant_config=quant_config,
            logits_soft_cap=logits_soft_cap,
            per_layer_sliding_window=per_layer_sliding_window,
            prefix=prefix,
            attn_type=attn_type,
            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
            attn_backend=attn_backend,
            **extra_impl_args,
        )

    def get_kv_cache_spec(self, vllm_config: VllmConfig):
        """Return the parent's KV cache spec with ``num_kv_heads`` scaled up.

        The number of KV heads reported by ``Attention`` is multiplied by
        ``block_pool_size``.
        """
        kv_cache_spec = super().get_kv_cache_spec(vllm_config)
        assert isinstance(kv_cache_spec, AttentionSpec)
        kv_cache_spec = replace(
            kv_cache_spec,
            num_kv_heads=self.block_pool_size * kv_cache_spec.num_kv_heads,
        )
        return kv_cache_spec
|
||||
@ -111,11 +111,16 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
|
||||
def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
|
||||
return librosa.load(filepath, sr=None)
|
||||
|
||||
def encode_base64(self, media: tuple[npt.NDArray, int]) -> str:
|
||||
def encode_base64(
|
||||
self,
|
||||
media: tuple[npt.NDArray, int],
|
||||
*,
|
||||
audio_format: str = "WAV",
|
||||
) -> str:
|
||||
audio, sr = media
|
||||
|
||||
with BytesIO() as buffer:
|
||||
soundfile.write(buffer, audio, sr, format="WAV")
|
||||
soundfile.write(buffer, audio, sr, format=audio_format)
|
||||
data = buffer.getvalue()
|
||||
|
||||
return base64.b64encode(data).decode("utf-8")
|
||||
|
||||
@ -8,8 +8,12 @@ import pybase64
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from .base import MediaIO, MediaWithBytes
|
||||
|
||||
logger = init_logger(__file__)
|
||||
|
||||
|
||||
def rescale_image_size(
|
||||
image: Image.Image, size_factor: float, transpose: int = -1
|
||||
@ -104,8 +108,17 @@ class ImageMediaIO(MediaIO[Image.Image]):
|
||||
self,
|
||||
media: Image.Image,
|
||||
*,
|
||||
image_format: str = "JPEG",
|
||||
image_format: str | None = None,
|
||||
) -> str:
|
||||
if image_format is None:
|
||||
logger.warning_once(
|
||||
"The default format of `ImageMediaIO.encode_base64` will be changed "
|
||||
'from "JPEG" to "PNG" in v0.15 to avoid lossy compression. '
|
||||
"To continue using the old default, "
|
||||
'pass `format="JPEG"` explicitly to silence this warning.'
|
||||
)
|
||||
image_format = "JPEG"
|
||||
|
||||
image = media
|
||||
|
||||
with BytesIO() as buffer:
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
|
||||
import asyncio
|
||||
import atexit
|
||||
import mimetypes
|
||||
from collections.abc import Generator, Set
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from itertools import groupby
|
||||
@ -357,17 +358,31 @@ class MediaConnector:
|
||||
def encode_audio_base64(
|
||||
audio: np.ndarray,
|
||||
sampling_rate: int,
|
||||
*,
|
||||
format: str = "WAV",
|
||||
) -> str:
|
||||
"""Encode audio as base64."""
|
||||
audio_io = AudioMediaIO()
|
||||
return audio_io.encode_base64((audio, sampling_rate))
|
||||
return audio_io.encode_base64((audio, sampling_rate), audio_format=format)
|
||||
|
||||
|
||||
def encode_audio_url(
    audio: np.ndarray,
    sampling_rate: int,
    *,
    format: str = "WAV",
) -> str:
    """Encode audio as a data URL.

    Args:
        audio: Audio samples to encode.
        sampling_rate: Sampling rate passed to the encoder.
        format: Audio container format, e.g. ``"WAV"``.

    Returns:
        A ``data:<mimetype>;base64,<payload>`` URL.
    """
    audio_b64 = encode_audio_base64(audio, sampling_rate, format=format)
    subtype = format.lower()
    # A bare "audio" is not a valid media type (it has no subtype), so fall
    # back to ``audio/<format>`` when the extension is unknown.
    mimetype = mimetypes.types_map.get("." + subtype, f"audio/{subtype}")
    return f"data:{mimetype};base64,{audio_b64}"
|
||||
|
||||
|
||||
def encode_image_base64(
|
||||
image: Image.Image,
|
||||
*,
|
||||
image_mode: str = "RGB",
|
||||
format: str = "JPEG",
|
||||
format: str | None = None,
|
||||
) -> str:
|
||||
"""
|
||||
Encode a pillow image to base64 format.
|
||||
@ -378,10 +393,45 @@ def encode_image_base64(
|
||||
return image_io.encode_base64(image, image_format=format)
|
||||
|
||||
|
||||
def encode_video_base64(frames: npt.NDArray) -> str:
|
||||
def encode_image_url(
    image: Image.Image,
    *,
    image_mode: str = "RGB",
    format: str = "PNG",
) -> str:
    """
    Encode a pillow image as a data URL.

    By default, the image is converted into RGB format before being encoded.

    Args:
        image: Image to encode.
        image_mode: Image mode forwarded to ``encode_image_base64``.
        format: Image file format, e.g. ``"PNG"``.

    Returns:
        A ``data:<mimetype>;base64,<payload>`` URL.
    """
    image_b64 = encode_image_base64(image, image_mode=image_mode, format=format)
    subtype = format.lower()
    # A bare "image" is not a valid media type (it has no subtype), so fall
    # back to ``image/<format>`` when the extension is unknown.
    mimetype = mimetypes.types_map.get("." + subtype, f"image/{subtype}")
    return f"data:{mimetype};base64,{image_b64}"
|
||||
|
||||
|
||||
def encode_video_base64(
|
||||
frames: npt.NDArray,
|
||||
*,
|
||||
format: str = "JPEG",
|
||||
) -> str:
|
||||
image_io = ImageMediaIO()
|
||||
video_io = VideoMediaIO(image_io)
|
||||
return video_io.encode_base64(frames)
|
||||
return video_io.encode_base64(frames, video_format=format)
|
||||
|
||||
|
||||
def encode_video_url(
    frames: npt.NDArray,
    *,
    format: str = "JPEG",
) -> str:
    """Encode video frames as a data URL.

    Args:
        frames: Video frames to encode.
        format: Encoding format forwarded to ``encode_video_base64``.

    Returns:
        A ``data:<mimetype>;base64,<payload>`` URL. The ``"JPEG"`` format is
        always labelled ``video/jpeg``.
    """
    video_b64 = encode_video_base64(frames, format=format)
    subtype = format.lower()

    if subtype == "jpeg":
        mimetype = "video/jpeg"
    else:
        # A bare "video" is not a valid media type (it has no subtype), so
        # fall back to ``video/<format>`` when the extension is unknown.
        mimetype = mimetypes.types_map.get("." + subtype, f"video/{subtype}")

    return f"data:{mimetype};base64,{video_b64}"
|
||||
|
||||
|
||||
def argsort_mm_positions(
|
||||
|
||||
@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Optional
|
||||
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.attention.backends.registry import AttentionBackendEnum
|
||||
from vllm.logger import init_logger
|
||||
|
||||
@ -168,32 +167,6 @@ class XPUPlatform(Platform):
|
||||
if vllm_config.kv_transfer_config is not None:
|
||||
vllm_config.kv_transfer_config.enable_permute_local_kv = True
|
||||
|
||||
if parallel_config.distributed_executor_backend is None:
|
||||
if parallel_config.world_size > 1:
|
||||
parallel_config.distributed_executor_backend = "ray"
|
||||
else:
|
||||
parallel_config.distributed_executor_backend = "uni"
|
||||
elif parallel_config.distributed_executor_backend == "mp":
|
||||
# FIXME(kunshang):
|
||||
# spawn needs calling `if __name__ == '__main__':`
|
||||
# fork is not supported for xpu start new process.
|
||||
if envs.VLLM_WORKER_MULTIPROC_METHOD != "spawn":
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
logger.warning(
|
||||
"Please use spawn as start method if you want to use mp."
|
||||
)
|
||||
elif (
|
||||
parallel_config.distributed_executor_backend != "ray"
|
||||
and parallel_config.distributed_executor_backend != "uni"
|
||||
and parallel_config.distributed_executor_backend != "external_launcher"
|
||||
):
|
||||
logger.warning(
|
||||
"%s is not supported on XPU, fallback to ray distributed"
|
||||
" executor backend.",
|
||||
parallel_config.distributed_executor_backend,
|
||||
)
|
||||
parallel_config.distributed_executor_backend = "ray"
|
||||
|
||||
if model_config and model_config.use_mla:
|
||||
logger.info(
|
||||
"MLA is enabled on a non-GPU platform; forcing chunked "
|
||||
|
||||
@ -131,78 +131,105 @@ class MistralToolParser(ToolParser):
|
||||
request: ChatCompletionRequest,
|
||||
) -> ExtractedToolCallInformation:
|
||||
"""
|
||||
Extract the tool calls from a complete model response. Requires
|
||||
find-and-replacing single quotes with double quotes for JSON parsing,
|
||||
make sure your tool call arguments don't ever include quotes!
|
||||
Extract the tool calls from a complete model response.
|
||||
|
||||
Content and tool calls formatting depends on the Mistral's tokenizer version
|
||||
used to train the model:
|
||||
|
||||
- < v11: `content[BOT] [{tool_call1},{tool_call2}]`
|
||||
- >= v11: `content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2}`
|
||||
|
||||
with [BOT] the tool call token.
|
||||
|
||||
Note:
|
||||
For tokenizer versions >= v11, tool calls with arguments wrongly formatted
|
||||
are still returned as tool calls. This is to allow the model to know it
|
||||
tried to make a tool call. It reduces chance of another failure and
|
||||
prevents that the context is filled with tool calls wrongly placed in
|
||||
assistant message contents.
|
||||
"""
|
||||
|
||||
# case -- if a tool call token is not present, return a text response
|
||||
# If the tool call token is not present, return a text response
|
||||
if self.bot_token not in model_output:
|
||||
return ExtractedToolCallInformation(
|
||||
tools_called=False, tool_calls=[], content=model_output
|
||||
)
|
||||
|
||||
# first remove the BOT token
|
||||
tool_content = model_output.replace(self.bot_token, "").strip()
|
||||
content_and_raw_tool_calls = model_output.split(self.bot_token)
|
||||
content = content_and_raw_tool_calls[0]
|
||||
raw_tool_calls = content_and_raw_tool_calls[1:]
|
||||
|
||||
try:
|
||||
# >= v11: content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2}
|
||||
if not self._is_pre_v11:
|
||||
tool_calls = []
|
||||
for raw_tool_call in raw_tool_calls:
|
||||
if "{" not in raw_tool_call:
|
||||
continue
|
||||
|
||||
end_name = raw_tool_call.find("{")
|
||||
tool_name, args = (
|
||||
raw_tool_call[:end_name],
|
||||
raw_tool_call[end_name:],
|
||||
)
|
||||
|
||||
tool_calls.append({"name": tool_name, "arguments": args})
|
||||
|
||||
# < v11: content[BOT] [{tool_call1},{tool_call2}]
|
||||
else:
|
||||
if len(raw_tool_calls) != 1:
|
||||
raise ValueError(
|
||||
"Only one BOT token should have been outputted, "
|
||||
f"but got {model_output}."
|
||||
)
|
||||
stringified_tool_calls = raw_tool_calls[0].strip()
|
||||
try:
|
||||
if not self._is_pre_v11:
|
||||
function_call_arr = []
|
||||
for single_tool_content in model_output.split(self.bot_token):
|
||||
if "{" not in single_tool_content:
|
||||
continue
|
||||
|
||||
end_name = single_tool_content.find("{")
|
||||
fn_name, args = (
|
||||
single_tool_content[:end_name],
|
||||
single_tool_content[end_name:],
|
||||
)
|
||||
|
||||
# fn_name is encoded outside serialized json dump
|
||||
# only arguments are serialized
|
||||
function_call_arr.append(
|
||||
{"name": fn_name, "arguments": json.loads(args)}
|
||||
)
|
||||
else:
|
||||
function_call_arr = json.loads(tool_content)
|
||||
tool_calls = json.loads(stringified_tool_calls)
|
||||
except json.JSONDecodeError:
|
||||
# use a regex to find the part corresponding to the tool call.
|
||||
# NOTE: This use case should not happen if the model is trained
|
||||
# correctly. It's an easy possible fix so it's included, but
|
||||
# can be brittle for very complex / highly nested tool calls
|
||||
raw_tool_call = self.tool_call_regex.findall(tool_content)[0]
|
||||
function_call_arr = json.loads(raw_tool_call)
|
||||
|
||||
# Tool Call
|
||||
tool_calls: list[MistralToolCall] = [
|
||||
MistralToolCall(
|
||||
type="function",
|
||||
function=FunctionCall(
|
||||
name=raw_function_call["name"],
|
||||
# function call args are JSON but as a string
|
||||
arguments=json.dumps(
|
||||
raw_function_call["arguments"], ensure_ascii=False
|
||||
try:
|
||||
raw_tool_call = self.tool_call_regex.findall(
|
||||
stringified_tool_calls
|
||||
)[0]
|
||||
tool_calls = json.loads(raw_tool_call)
|
||||
except (IndexError, json.JSONDecodeError):
|
||||
logger.exception("Error in extracting tool call from response: {e}")
|
||||
# If raw decoding and decoding post regex rule fails, then just
|
||||
# return content.
|
||||
return ExtractedToolCallInformation(
|
||||
tools_called=False,
|
||||
tool_calls=[],
|
||||
content=stringified_tool_calls,
|
||||
)
|
||||
else:
|
||||
tool_calls = [
|
||||
{
|
||||
"name": tool_call["name"],
|
||||
"arguments": json.dumps(
|
||||
tool_call["arguments"], ensure_ascii=False
|
||||
),
|
||||
),
|
||||
)
|
||||
for raw_function_call in function_call_arr
|
||||
]
|
||||
}
|
||||
for tool_call in tool_calls
|
||||
]
|
||||
|
||||
# get any content before the tool call
|
||||
content = model_output.split(self.bot_token)[0]
|
||||
return ExtractedToolCallInformation(
|
||||
tools_called=True,
|
||||
tool_calls=tool_calls,
|
||||
content=content if len(content) > 0 else None,
|
||||
mistral_tool_calls: list[MistralToolCall] = [
|
||||
MistralToolCall(
|
||||
type="function",
|
||||
function=FunctionCall(
|
||||
name=tool_call["name"],
|
||||
arguments=tool_call["arguments"],
|
||||
),
|
||||
)
|
||||
for tool_call in tool_calls
|
||||
]
|
||||
|
||||
except Exception:
|
||||
logger.exception("Error in extracting tool call from response.")
|
||||
# return information to just treat the tool call as regular JSON
|
||||
return ExtractedToolCallInformation(
|
||||
tools_called=False, tool_calls=[], content=tool_content
|
||||
)
|
||||
return ExtractedToolCallInformation(
|
||||
tools_called=True,
|
||||
tool_calls=mistral_tool_calls,
|
||||
content=content if len(content) > 0 else None,
|
||||
)
|
||||
|
||||
def extract_tool_calls_streaming(
|
||||
self,
|
||||
|
||||
@ -330,19 +330,25 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
|
||||
rope_theta = getattr_iter(config, names, None, warn=True)
|
||||
names = ["partial_rotary_factor", "rotary_pct", "rotary_emb_fraction"]
|
||||
partial_rotary_factor = getattr_iter(config, names, None, warn=True)
|
||||
ompe = getattr(config, "original_max_position_embeddings", None)
|
||||
|
||||
if Version(version("transformers")) < Version("5.0.0.dev0"):
|
||||
# Transformers v4 installed, legacy config fields may be present
|
||||
if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
|
||||
config.rope_parameters = rope_scaling
|
||||
if (
|
||||
rope_theta is not None or partial_rotary_factor is not None
|
||||
rope_theta is not None
|
||||
or partial_rotary_factor is not None
|
||||
or ompe is not None
|
||||
) and not getattr(config, "rope_parameters", None):
|
||||
config.rope_parameters = {"rope_type": "default"}
|
||||
# Patch legacy fields into rope_parameters
|
||||
if rope_theta is not None:
|
||||
config.rope_parameters["rope_theta"] = rope_theta
|
||||
if partial_rotary_factor is not None:
|
||||
config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
|
||||
if ompe is not None:
|
||||
config.rope_parameters["original_max_position_embeddings"] = ompe
|
||||
elif rope_theta is not None or getattr(config, "rope_parameters", None):
|
||||
# Transformers v5 installed
|
||||
# Patch these fields in case they used non-standard names
|
||||
@ -358,10 +364,6 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
|
||||
if getattr(config, "rope_parameters", None) is None:
|
||||
return
|
||||
|
||||
# Add original_max_position_embeddings if present
|
||||
if ompe := getattr(config, "original_max_position_embeddings", None):
|
||||
config.rope_parameters["original_max_position_embeddings"] = ompe
|
||||
|
||||
# Handle nested rope_parameters in interleaved sliding attention models
|
||||
if is_rope_parameters_nested(config.rope_parameters):
|
||||
for rope_parameters_layer_type in config.rope_parameters.values():
|
||||
|
||||
@ -184,18 +184,42 @@ def _remap_mistral_audio_args(config: dict) -> dict:
|
||||
whisper_args = config["multimodal"].pop("whisper_model_args")
|
||||
encoder_args = whisper_args["encoder_args"]
|
||||
downsample_args = whisper_args["downsample_args"]
|
||||
downsample_factor = downsample_args["downsample_factor"]
|
||||
|
||||
# make sure that k/v blocks can be allocated with
|
||||
# unified k/v cache class and pool whisper k/v cache blocks
|
||||
# with downsample_factor:1 ratio
|
||||
if encoder_args.get("causal"):
|
||||
block_pool_size = downsample_factor
|
||||
config["projection_size"] = downsample_factor * encoder_args["dim"]
|
||||
else:
|
||||
block_pool_size = 1
|
||||
|
||||
_maybe_sliding_window = encoder_args.get("ragged_attention", None)
|
||||
if _maybe_sliding_window is None:
|
||||
sliding_window = None
|
||||
elif _maybe_sliding_window.isdigit():
|
||||
sliding_window = int(_maybe_sliding_window)
|
||||
else:
|
||||
raise NotImplementedError(f"Unsupported: {_maybe_sliding_window=}")
|
||||
|
||||
architecture = (
|
||||
"VoxtralStreamingGeneration"
|
||||
if encoder_args.get("causal")
|
||||
else "VoxtralForConditionalGeneration"
|
||||
)
|
||||
|
||||
quant_config = config.get("quantization_config")
|
||||
config = {
|
||||
"model_type": "whixtral",
|
||||
"architectures": ["VoxtralForConditionalGeneration"],
|
||||
"model_type": "voxtral",
|
||||
"architectures": [architecture],
|
||||
"text_config": PretrainedConfig.from_dict(config),
|
||||
"audio_config": WhisperConfig(
|
||||
num_mel_bins=encoder_args["audio_encoding_args"]["num_mel_bins"],
|
||||
window_size=encoder_args["audio_encoding_args"]["window_size"],
|
||||
sampling_rate=encoder_args["audio_encoding_args"]["sampling_rate"],
|
||||
hop_length=encoder_args["audio_encoding_args"]["hop_length"],
|
||||
downsample_factor=downsample_args["downsample_factor"],
|
||||
downsample_factor=downsample_factor,
|
||||
d_model=encoder_args["dim"],
|
||||
encoder_layers=encoder_args["n_layers"],
|
||||
encoder_ffn_dim=encoder_args["hidden_dim"],
|
||||
@ -203,6 +227,10 @@ def _remap_mistral_audio_args(config: dict) -> dict:
|
||||
vocab_size=encoder_args["vocab_size"],
|
||||
max_source_positions=encoder_args["max_source_positions"],
|
||||
is_encoder_decoder=False, # Override WhisperConfig default
|
||||
is_causal=encoder_args.get("causal", False),
|
||||
sliding_window=sliding_window,
|
||||
block_pool_size=block_pool_size,
|
||||
pos_embed=encoder_args.get("pos_embed", "sinusoidal"),
|
||||
),
|
||||
}
|
||||
if quant_config:
|
||||
|
||||
@ -3,17 +3,11 @@
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.attention.backends.abstract import AttentionBackend
|
||||
from vllm.attention.backends.utils import PAD_SLOT_ID
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder
|
||||
from vllm.v1.attention.backends.utils import (
|
||||
CommonAttentionMetadata,
|
||||
split_decodes_and_prefills,
|
||||
from vllm.v1.attention.backends.mamba_attn import (
|
||||
BaseMambaAttentionMetadata,
|
||||
BaseMambaAttentionMetadataBuilder,
|
||||
)
|
||||
from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec
|
||||
|
||||
|
||||
class Mamba1AttentionBackend(AttentionBackend):
|
||||
@ -23,137 +17,12 @@ class Mamba1AttentionBackend(AttentionBackend):
|
||||
|
||||
|
||||
@dataclass
|
||||
class Mamba1AttentionMetadata:
|
||||
query_start_loc_p: torch.Tensor
|
||||
state_indices_tensor: torch.Tensor
|
||||
has_initial_states_p: torch.Tensor | None
|
||||
num_prefills: int
|
||||
num_prefill_tokens: int
|
||||
num_decodes: int
|
||||
num_decode_tokens: int
|
||||
|
||||
block_idx_last_scheduled_token: torch.Tensor # shape: [batch,]
|
||||
block_idx_first_scheduled_token_p: torch.Tensor # shape: [batch,]
|
||||
block_idx_last_computed_token: torch.Tensor # shape: [batch,]
|
||||
num_computed_tokens_p: torch.Tensor # shape: [batch,]
|
||||
class Mamba1AttentionMetadata(BaseMambaAttentionMetadata):
|
||||
pass
|
||||
|
||||
|
||||
class Mamba1AttentionMetadataBuilder(
|
||||
BaseMambaAttentionMetadataBuilder[Mamba1AttentionMetadata]
|
||||
):
|
||||
def __init__(
|
||||
self,
|
||||
kv_cache_spec: AttentionSpec,
|
||||
layer_names: list[str],
|
||||
vllm_config: VllmConfig,
|
||||
device: torch.device,
|
||||
):
|
||||
super().__init__(kv_cache_spec, layer_names, vllm_config, device)
|
||||
assert isinstance(kv_cache_spec, MambaSpec)
|
||||
|
||||
def build(
|
||||
self,
|
||||
common_prefix_len: int,
|
||||
common_attn_metadata: CommonAttentionMetadata,
|
||||
fast_build: bool = False,
|
||||
) -> Mamba1AttentionMetadata:
|
||||
num_reqs = common_attn_metadata.num_reqs
|
||||
|
||||
num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
|
||||
split_decodes_and_prefills(
|
||||
common_attn_metadata, decode_threshold=self.reorder_batch_threshold
|
||||
)
|
||||
)
|
||||
|
||||
has_initial_states_p = None
|
||||
query_start_loc_p = None
|
||||
num_computed_tokens, num_computed_tokens_p = None, None
|
||||
block_idx_first_scheduled_token = None
|
||||
block_idx_first_scheduled_token_p = None
|
||||
|
||||
# TODO(@Josephasafg) Mamba1 and Mamba2 have a lot of code in common here.
|
||||
# We should consolidate this code
|
||||
if self.vllm_config.cache_config.enable_prefix_caching:
|
||||
# Return a tensor of shape (#requests, #max blocks)
|
||||
state_indices_tensor = common_attn_metadata.block_table_tensor
|
||||
mamba_block_size = self.kv_cache_spec.block_size
|
||||
num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to(
|
||||
self.device
|
||||
)
|
||||
(
|
||||
block_idx_last_computed_token,
|
||||
block_idx_first_scheduled_token,
|
||||
block_idx_last_scheduled_token,
|
||||
) = self._compute_prefix_caching_block_indices(
|
||||
common_attn_metadata, mamba_block_size
|
||||
)
|
||||
else:
|
||||
# Always return just a single block per each request:
|
||||
state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
|
||||
block_idx_last_scheduled_token = None
|
||||
block_idx_last_computed_token = None
|
||||
|
||||
if num_prefills > 0:
|
||||
query_start_loc_p = (
|
||||
common_attn_metadata.query_start_loc[-num_prefills - 1 :]
|
||||
- num_decode_tokens
|
||||
)
|
||||
has_initial_states_cpu = (
|
||||
common_attn_metadata.num_computed_tokens_cpu[
|
||||
num_reqs - num_prefills : num_reqs
|
||||
]
|
||||
> 0
|
||||
)
|
||||
has_initial_states_p = has_initial_states_cpu.to(
|
||||
common_attn_metadata.query_start_loc.device
|
||||
)
|
||||
|
||||
if self.vllm_config.cache_config.enable_prefix_caching:
|
||||
assert num_computed_tokens is not None
|
||||
num_computed_tokens_p = num_computed_tokens[
|
||||
num_reqs - num_prefills : num_reqs
|
||||
]
|
||||
assert block_idx_first_scheduled_token is not None
|
||||
block_idx_first_scheduled_token_p = block_idx_first_scheduled_token[
|
||||
num_reqs - num_prefills : num_reqs
|
||||
]
|
||||
|
||||
elif (
|
||||
num_decodes > 0
|
||||
and num_decodes <= self.decode_cudagraph_max_bs
|
||||
and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
|
||||
):
|
||||
self.state_indices_tensor[:num_decodes].copy_(
|
||||
state_indices_tensor, non_blocking=True
|
||||
)
|
||||
state_indices_tensor = self.state_indices_tensor[:num_decode_tokens]
|
||||
state_indices_tensor[num_decodes:] = PAD_SLOT_ID
|
||||
|
||||
if self.vllm_config.cache_config.enable_prefix_caching:
|
||||
self.block_idx_last_scheduled_token[:num_decodes].copy_(
|
||||
block_idx_last_scheduled_token, non_blocking=True
|
||||
)
|
||||
block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[
|
||||
:num_decode_tokens
|
||||
]
|
||||
|
||||
self.block_idx_last_computed_token[:num_decodes].copy_(
|
||||
block_idx_last_computed_token, non_blocking=True
|
||||
)
|
||||
block_idx_last_computed_token = self.block_idx_last_computed_token[
|
||||
:num_decode_tokens
|
||||
]
|
||||
|
||||
return Mamba1AttentionMetadata(
|
||||
query_start_loc_p=query_start_loc_p,
|
||||
has_initial_states_p=has_initial_states_p,
|
||||
state_indices_tensor=state_indices_tensor,
|
||||
num_prefills=num_prefills,
|
||||
num_prefill_tokens=num_prefill_tokens,
|
||||
num_decodes=num_decodes,
|
||||
num_decode_tokens=num_decode_tokens,
|
||||
block_idx_last_scheduled_token=block_idx_last_scheduled_token,
|
||||
block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p,
|
||||
block_idx_last_computed_token=block_idx_last_computed_token,
|
||||
num_computed_tokens_p=num_computed_tokens_p,
|
||||
)
|
||||
metadata_cls = Mamba1AttentionMetadata
|
||||
supports_update_block_table: bool = False
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user