mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-06 05:07:03 +08:00
Merge branch 'main' into imarkov/eplb_optimizations
This commit is contained in:
commit
691f09036c
@ -192,6 +192,7 @@ steps:
|
||||
# test with internal dp
|
||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
||||
@ -970,6 +971,7 @@ steps:
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- vllm/model_executor/layers/fused_moe/layer.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
- tests/compile/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||
@ -1116,6 +1118,7 @@ steps:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
|
||||
@ -136,7 +136,7 @@ elseif(HIP_FOUND)
|
||||
|
||||
# ROCm 5.X and 6.X
|
||||
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
|
||||
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
|
||||
Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
|
||||
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
|
||||
"expected for ROCm build, saw ${Torch_VERSION} instead.")
|
||||
endif()
|
||||
|
||||
11
csrc/cache.h
11
csrc/cache.h
@ -41,11 +41,12 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||
const double scale, const std::string& kv_cache_dtype);
|
||||
|
||||
void gather_and_maybe_dequant_cache(
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
int64_t batch_size, const std::string& kv_cache_dtype,
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS]
|
||||
int64_t num_tokens, const std::string& kv_cache_dtype,
|
||||
torch::Tensor const& scale,
|
||||
std::optional<torch::Tensor> seq_starts = std::nullopt);
|
||||
|
||||
|
||||
@ -905,91 +905,79 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||
namespace vllm {
|
||||
|
||||
// grid is launched with dimensions (batch, num_splits)
|
||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt,
|
||||
int ENTRY_SIZE, int CTA_SIZE>
|
||||
__global__ void gather_and_maybe_dequant_cache(
|
||||
const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
|
||||
// ENTRIES...]
|
||||
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
|
||||
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
||||
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
|
||||
const int32_t block_size, const int32_t entry_size,
|
||||
const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
|
||||
// ENTRIES...]
|
||||
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
|
||||
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
||||
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
|
||||
const int32_t* __restrict__ token_to_seq, // [MAX_TOKEN_ACROSS_CHUNK]
|
||||
const int32_t num_tokens, const int32_t block_size,
|
||||
const int64_t block_table_stride, const int64_t cache_block_stride,
|
||||
const int64_t cache_entry_stride, const int64_t dst_entry_stride,
|
||||
const float* __restrict__ scale,
|
||||
const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per
|
||||
// batch
|
||||
constexpr int vec_size = sizeof(float4) / sizeof(scalar_t);
|
||||
using ltype = vllm::vec_n_t<cache_t, vec_size>;
|
||||
using stype = vllm::vec_n_t<scalar_t, vec_size>;
|
||||
// We are adding this for code readability which will be optimized out when
|
||||
// build in release.
|
||||
assert(CTA_SIZE == blockDim.x);
|
||||
|
||||
const int64_t bid = blockIdx.x; // Batch ID
|
||||
const int32_t num_splits = gridDim.y;
|
||||
const int32_t split = blockIdx.y;
|
||||
const int32_t seq_start = cu_seq_lens[bid];
|
||||
const int32_t seq_end = cu_seq_lens[bid + 1];
|
||||
const int32_t seq_len = seq_end - seq_start;
|
||||
const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size);
|
||||
const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits);
|
||||
#pragma unroll
|
||||
for (int token_id = blockIdx.x; token_id < num_tokens;
|
||||
token_id += gridDim.x) {
|
||||
int64_t batch_id = token_to_seq[token_id];
|
||||
int64_t batch_start = cu_seq_lens[batch_id];
|
||||
int64_t batch_end = cu_seq_lens[batch_id + 1];
|
||||
int32_t batch_offset = token_id - batch_start;
|
||||
|
||||
const int32_t split_start = split * split_blocks;
|
||||
const int32_t split_end = min((split + 1) * split_blocks, tot_blocks);
|
||||
if (token_id >= batch_end) return;
|
||||
int32_t offset = 0;
|
||||
if (seq_starts != nullptr) {
|
||||
offset = seq_starts[batch_id];
|
||||
}
|
||||
batch_offset += offset;
|
||||
int32_t block_table_id = batch_offset / block_size;
|
||||
int32_t slot_id = batch_offset % block_size;
|
||||
int32_t block_table_offset = batch_id * block_table_stride + block_table_id;
|
||||
int32_t block_id = block_table[block_table_offset];
|
||||
int64_t cache_offset =
|
||||
block_id * cache_block_stride + slot_id * cache_entry_stride;
|
||||
constexpr int32_t vec_iter_cnt = ENTRY_SIZE / vec_size;
|
||||
scalar_t* dst_ = dst + token_id * dst_entry_stride;
|
||||
cache_t* src_ = const_cast<cache_t*>(src_cache) + cache_offset;
|
||||
|
||||
const bool is_active_split = (split_start < tot_blocks);
|
||||
const bool is_last_split = (split_end == tot_blocks);
|
||||
|
||||
if (!is_active_split) return;
|
||||
|
||||
int32_t full_blocks_end = split_end;
|
||||
int32_t partial_block_size = 0;
|
||||
|
||||
// Adjust the pointer for the block_table for this batch.
|
||||
// If seq_starts is provided, compute an offset based on (seq_starts[bid] /
|
||||
// page_size)
|
||||
const int32_t batch_offset = bid * block_table_stride;
|
||||
int32_t offset = 0;
|
||||
if (seq_starts != nullptr) {
|
||||
offset = seq_starts[bid] / block_size;
|
||||
}
|
||||
const int32_t* batch_block_table = block_table + batch_offset + offset;
|
||||
|
||||
// Adjust dst pointer based on the cumulative sequence lengths.
|
||||
dst += seq_start * dst_entry_stride;
|
||||
|
||||
if (is_last_split) {
|
||||
partial_block_size = seq_len % block_size;
|
||||
if (partial_block_size) full_blocks_end -= 1;
|
||||
}
|
||||
|
||||
auto copy_entry = [&](const cache_t* __restrict__ _src,
|
||||
scalar_t* __restrict__ _dst) {
|
||||
for (int i = threadIdx.x; i < entry_size; i += blockDim.x) {
|
||||
#pragma unroll
|
||||
for (int idx = threadIdx.x; idx < vec_iter_cnt; idx += CTA_SIZE) {
|
||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||
_dst[i] = static_cast<scalar_t>(_src[i]);
|
||||
reinterpret_cast<stype*>(dst_)[idx] =
|
||||
static_cast<stype>(reinterpret_cast<ltype*>(src_)[idx]);
|
||||
} else {
|
||||
_dst[i] =
|
||||
fp8::scaled_convert<scalar_t, cache_t, kv_dt>(_src[i], *scale);
|
||||
ltype loaded_val = reinterpret_cast<ltype*>(src_)[idx];
|
||||
stype store_val;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < vec_size; ++j) {
|
||||
store_val.val[j] = fp8::scaled_convert<scalar_t, cache_t, kv_dt>(
|
||||
loaded_val.val[j], *scale);
|
||||
}
|
||||
reinterpret_cast<stype*>(dst_)[idx] = store_val;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const auto loop_end =
|
||||
std::min((int64_t)full_blocks_end, block_table_stride - offset);
|
||||
for (int pid = split_start; pid < loop_end; ++pid) {
|
||||
auto block_id = batch_block_table[pid];
|
||||
auto block_start_ptr = src_cache + block_id * cache_block_stride;
|
||||
auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
|
||||
for (int eid = 0; eid < block_size; ++eid) {
|
||||
copy_entry(block_start_ptr + eid * cache_entry_stride,
|
||||
block_dst_ptr + eid * dst_entry_stride);
|
||||
}
|
||||
}
|
||||
|
||||
if (partial_block_size) {
|
||||
if (offset + full_blocks_end < block_table_stride) {
|
||||
auto block_id = batch_block_table[full_blocks_end];
|
||||
auto block_start_ptr = src_cache + block_id * cache_block_stride;
|
||||
auto block_dst_ptr =
|
||||
dst + full_blocks_end * block_size * dst_entry_stride;
|
||||
for (int eid = 0; eid < partial_block_size; ++eid) {
|
||||
copy_entry(block_start_ptr + eid * cache_entry_stride,
|
||||
block_dst_ptr + eid * dst_entry_stride);
|
||||
// process tail
|
||||
constexpr int32_t tail_cnt = ENTRY_SIZE % vec_size;
|
||||
dst_ = dst_ + ENTRY_SIZE - tail_cnt;
|
||||
src_ = src_ + ENTRY_SIZE - tail_cnt;
|
||||
#pragma unroll
|
||||
for (int idx = threadIdx.x; idx < tail_cnt; idx += CTA_SIZE) {
|
||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||
dst_[idx] = static_cast<scalar_t>(src_[idx]);
|
||||
} else {
|
||||
dst_[idx] =
|
||||
fp8::scaled_convert<scalar_t, cache_t, kv_dt>(src_[idx], *scale);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1001,34 +989,38 @@ __global__ void gather_and_maybe_dequant_cache(
|
||||
// SCALAR_T is the data type of the destination tensor.
|
||||
// CACHE_T is the stored data type of kv-cache.
|
||||
// KV_DTYPE is the real data type of kv-cache.
|
||||
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
|
||||
reinterpret_cast<SCALAR_T*>(dst.data_ptr()), \
|
||||
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
|
||||
block_size, entry_size, block_table_stride, cache_block_stride, \
|
||||
cache_entry_stride, dst_entry_stride, \
|
||||
reinterpret_cast<const float*>(scale.data_ptr()), seq_starts_ptr);
|
||||
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576, \
|
||||
thread_block_size> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
|
||||
reinterpret_cast<SCALAR_T*>(dst.data_ptr()), \
|
||||
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
|
||||
token_to_seq.data_ptr<int32_t>(), num_tokens, block_size, \
|
||||
block_table_stride, cache_block_stride, cache_entry_stride, \
|
||||
dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
|
||||
seq_starts_ptr);
|
||||
|
||||
// Gather sequences from the cache into the destination tensor.
|
||||
// - cu_seq_lens contains the cumulative sequence lengths for each batch
|
||||
// - block_table contains the cache block indices for each sequence
|
||||
// - token_to_seq contains the back mapping from token_id to batch_id
|
||||
// - Optionally, seq_starts (if provided) offsets the starting block index by
|
||||
// (seq_starts[bid] / page_size)
|
||||
void gather_and_maybe_dequant_cache(
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
int64_t batch_size, const std::string& kv_cache_dtype,
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS]
|
||||
int64_t num_tokens, const std::string& kv_cache_dtype,
|
||||
torch::Tensor const& scale,
|
||||
std::optional<torch::Tensor> seq_starts = std::nullopt) {
|
||||
at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
int32_t block_size = src_cache.size(1);
|
||||
int32_t entry_size = src_cache.flatten(2, -1).size(2);
|
||||
int32_t head_dim = dst.size(-1);
|
||||
|
||||
TORCH_CHECK(block_table.dtype() == torch::kInt32,
|
||||
"block_table must be int32");
|
||||
@ -1038,6 +1030,9 @@ void gather_and_maybe_dequant_cache(
|
||||
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
|
||||
"seq_starts must be int32");
|
||||
}
|
||||
TORCH_CHECK(head_dim == 576,
|
||||
"gather_and_maybe_dequant_cache only support the head_dim to 576 "
|
||||
"for better performance")
|
||||
|
||||
TORCH_CHECK(src_cache.device() == dst.device(),
|
||||
"src_cache and dst must be on the same device");
|
||||
@ -1055,10 +1050,9 @@ void gather_and_maybe_dequant_cache(
|
||||
int64_t cache_entry_stride = src_cache.stride(1);
|
||||
int64_t dst_entry_stride = dst.stride(0);
|
||||
|
||||
// Decide on the number of splits based on the batch size.
|
||||
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
|
||||
dim3 grid(batch_size, num_splits);
|
||||
dim3 block(1024);
|
||||
constexpr int32_t thread_block_size = 64;
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(thread_block_size);
|
||||
|
||||
const int32_t* seq_starts_ptr =
|
||||
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
|
||||
|
||||
@ -695,7 +695,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
||||
cache_ops.def(
|
||||
"gather_and_maybe_dequant_cache(Tensor src_cache, Tensor! dst, "
|
||||
" Tensor block_table, Tensor cu_seq_lens, "
|
||||
" int batch_size, "
|
||||
" Tensor token_to_seq, "
|
||||
" int num_tokens, "
|
||||
" str kv_cache_dtype, "
|
||||
" Tensor scale, Tensor? seq_starts) -> ()");
|
||||
cache_ops.impl("gather_and_maybe_dequant_cache", torch::kCUDA,
|
||||
|
||||
@ -20,8 +20,8 @@ ARG PYTHON_VERSION=3.12
|
||||
# glibc version is baked into the distro, and binaries built with one glibc
|
||||
# version are not backwards compatible with OSes that use an earlier version.
|
||||
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
|
||||
# TODO: Restore to base image after FlashInfer AOT wheel fixed
|
||||
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
|
||||
# Using cuda base image with minimal dependencies necessary for JIT compilation (FlashInfer, DeepGEMM, EP kernels)
|
||||
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04
|
||||
|
||||
# By parameterizing the Deadsnakes repository URL, we allow third-party to use
|
||||
# their own mirror. When doing so, we don't benefit from the transparent
|
||||
@ -85,7 +85,20 @@ ARG GET_PIP_URL
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common git curl sudo python3-pip libibverbs-dev \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
ccache \
|
||||
software-properties-common \
|
||||
git \
|
||||
curl \
|
||||
sudo \
|
||||
python3-pip \
|
||||
libibverbs-dev \
|
||||
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
|
||||
# as it was causing spam when compiling the CUTLASS kernels
|
||||
gcc-10 \
|
||||
g++-10 \
|
||||
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
|
||||
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
|
||||
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
|
||||
@ -110,10 +123,6 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
|
||||
# as it was causing spam when compiling the CUTLASS kernels
|
||||
RUN apt-get install -y gcc-10 g++-10
|
||||
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
|
||||
RUN <<EOF
|
||||
gcc --version
|
||||
EOF
|
||||
@ -268,7 +277,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
# Install libnuma-dev, required by fastsafetensors (fixes #20384)
|
||||
RUN apt-get update && apt-get install -y libnuma-dev && rm -rf /var/lib/apt/lists/*
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libnuma-dev && rm -rf /var/lib/apt/lists/*
|
||||
COPY requirements/lint.txt requirements/lint.txt
|
||||
COPY requirements/test.txt requirements/test.txt
|
||||
COPY requirements/dev.txt requirements/dev.txt
|
||||
@ -305,8 +314,15 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y software-properties-common curl sudo python3-pip \
|
||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
software-properties-common \
|
||||
curl \
|
||||
sudo \
|
||||
python3-pip \
|
||||
ffmpeg \
|
||||
libsm6 \
|
||||
libxext6 \
|
||||
libgl1 \
|
||||
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
|
||||
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
|
||||
mkdir -p -m 0755 /etc/apt/keyrings ; \
|
||||
@ -321,13 +337,30 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
done ; \
|
||||
fi \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
python${PYTHON_VERSION} \
|
||||
python${PYTHON_VERSION}-dev \
|
||||
python${PYTHON_VERSION}-venv \
|
||||
libibverbs-dev \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
|
||||
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
||||
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
|
||||
&& python3 --version && python3 -m pip --version
|
||||
|
||||
# Install CUDA development tools and build essentials for runtime JIT compilation
|
||||
# (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime)
|
||||
RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
|
||||
apt-get update -y && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
cuda-nvcc-${CUDA_VERSION_DASH} \
|
||||
cuda-cudart-${CUDA_VERSION_DASH} \
|
||||
cuda-nvrtc-${CUDA_VERSION_DASH} \
|
||||
cuda-cuobjdump-${CUDA_VERSION_DASH} \
|
||||
libcublas-${CUDA_VERSION_DASH} && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ARG PIP_INDEX_URL UV_INDEX_URL
|
||||
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||
ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base
|
||||
FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base
|
||||
|
||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
|
||||
@ -25,10 +25,14 @@ RUN apt clean && apt-get update -y && \
|
||||
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
|
||||
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
|
||||
|
||||
RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing
|
||||
RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc
|
||||
|
||||
# This oneccl contains the BMG support which is not the case for default version of oneapi 2025.2.
|
||||
RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.6/intel-oneccl-2021.15.6.9_offline.sh
|
||||
RUN bash intel-oneccl-2021.15.6.9_offline.sh -a --silent --eula accept && \
|
||||
echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc && \
|
||||
echo "source /opt/intel/oneapi/ccl/2021.15/env/vars.sh --force" >> /root/.bashrc
|
||||
|
||||
RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.4/intel-oneccl-2021.15.4.11_offline.sh
|
||||
RUN bash intel-oneccl-2021.15.4.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc
|
||||
SHELL ["bash", "-c"]
|
||||
CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
|
||||
|
||||
@ -72,6 +76,7 @@ RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
ENV NIXL_VERSION=0.7.0
|
||||
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
|
||||
|
||||
# remove torch bundled oneccl to avoid conflicts
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip uninstall oneccl oneccl-devel -y
|
||||
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 131 KiB After Width: | Height: | Size: 146 KiB |
@ -22,9 +22,6 @@ export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxab
|
||||
vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --tensor_paralel_size 8
|
||||
```
|
||||
|
||||
!!! tip
|
||||
If you are just prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which can take a long time. However, we do not recommend disabling this feature in production environments as it causes a significant performance drop.
|
||||
|
||||
!!! tip
|
||||
When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this problem, you can use the below environment variables:
|
||||
`VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.
|
||||
|
||||
@ -7,7 +7,7 @@ This document shows you some examples of the different options that are
|
||||
available to generate structured outputs.
|
||||
|
||||
!!! warning
|
||||
If you are still using the following deprecated API fields, please update your code to use `structured_outputs` as demonstrated in the rest of this document:
|
||||
If you are still using the following deprecated API fields which were removed in v0.12.0, please update your code to use `structured_outputs` as demonstrated in the rest of this document:
|
||||
|
||||
- `guided_json` -> `{"structured_outputs": {"json": ...}}` or `StructuredOutputsParams(json=...)`
|
||||
- `guided_regex` -> `{"structured_outputs": {"regex": ...}}` or `StructuredOutputsParams(regex=...)`
|
||||
|
||||
@ -680,6 +680,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ |
|
||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
|
||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
|
||||
| `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
|
||||
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
|
||||
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ |
|
||||
|
||||
15
examples/offline_inference/audio_language.py
Normal file → Executable file
15
examples/offline_inference/audio_language.py
Normal file → Executable file
@ -425,6 +425,13 @@ def parse_args():
|
||||
default=None,
|
||||
help="Set the seed when initializing `vllm.LLM`.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tensor-parallel-size",
|
||||
"-tp",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Tensor parallel size to override the model's default setting. ",
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
@ -434,6 +441,12 @@ def main(args):
|
||||
if model not in model_example_map:
|
||||
raise ValueError(f"Model type {model} is not supported.")
|
||||
|
||||
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
|
||||
raise ValueError(
|
||||
f"tensor_parallel_size must be a positive integer, "
|
||||
f"got {args.tensor_parallel_size}"
|
||||
)
|
||||
|
||||
audio_count = args.num_audios
|
||||
req_data = model_example_map[model](
|
||||
question_per_audio_count[audio_count], audio_count
|
||||
@ -446,6 +459,8 @@ def main(args):
|
||||
)
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
|
||||
if args.tensor_parallel_size is not None:
|
||||
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
# We set temperature to 0.2 so that outputs can be different
|
||||
|
||||
170
examples/offline_inference/qwen3_omni/only_thinker.py
Normal file
170
examples/offline_inference/qwen3_omni/only_thinker.py
Normal file
@ -0,0 +1,170 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This example shows how to use vLLM for running offline inference
|
||||
with the correct prompt format on Qwen2.5-Omni (thinker only).
|
||||
"""
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
class QueryResult(NamedTuple):
|
||||
inputs: dict
|
||||
limit_mm_per_prompt: dict[str, int]
|
||||
|
||||
|
||||
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
||||
# lower-end GPUs.
|
||||
# Unless specified, these settings have been tested to work on a single L4.
|
||||
|
||||
default_system = (
|
||||
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
|
||||
"Group, capable of perceiving auditory and visual inputs, as well as "
|
||||
"generating text and speech."
|
||||
)
|
||||
|
||||
|
||||
def get_mixed_modalities_query() -> QueryResult:
|
||||
question = (
|
||||
"What is recited in the audio? "
|
||||
"What is the content of this image? Why is this video funny?"
|
||||
)
|
||||
prompt = (
|
||||
f"<|im_start|>system\n{default_system}<|im_end|>\n"
|
||||
"<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>"
|
||||
"<|vision_start|><|image_pad|><|vision_end|>"
|
||||
"<|vision_start|><|video_pad|><|vision_end|>"
|
||||
f"{question}<|im_end|>\n"
|
||||
f"<|im_start|>assistant\n"
|
||||
)
|
||||
return QueryResult(
|
||||
inputs={
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {
|
||||
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
|
||||
"image": convert_image_mode(
|
||||
ImageAsset("cherry_blossom").pil_image, "RGB"
|
||||
),
|
||||
"video": VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
|
||||
},
|
||||
},
|
||||
limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1},
|
||||
)
|
||||
|
||||
|
||||
def get_use_audio_in_video_query() -> QueryResult:
|
||||
question = (
|
||||
"Describe the content of the video in details, then convert what the "
|
||||
"baby say into text."
|
||||
)
|
||||
prompt = (
|
||||
f"<|im_start|>system\n{default_system}<|im_end|>\n"
|
||||
"<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
|
||||
f"{question}<|im_end|>\n"
|
||||
f"<|im_start|>assistant\n"
|
||||
)
|
||||
asset = VideoAsset(name="baby_reading", num_frames=16)
|
||||
audio = asset.get_audio(sampling_rate=16000)
|
||||
return QueryResult(
|
||||
inputs={
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {
|
||||
"video": asset.np_ndarrays,
|
||||
"audio": audio,
|
||||
},
|
||||
"mm_processor_kwargs": {
|
||||
"use_audio_in_video": True,
|
||||
},
|
||||
},
|
||||
limit_mm_per_prompt={"audio": 1, "video": 1},
|
||||
)
|
||||
|
||||
|
||||
def get_multi_audios_query() -> QueryResult:
|
||||
question = "Are these two audio clips the same?"
|
||||
prompt = (
|
||||
f"<|im_start|>system\n{default_system}<|im_end|>\n"
|
||||
"<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>"
|
||||
"<|audio_start|><|audio_pad|><|audio_end|>"
|
||||
f"{question}<|im_end|>\n"
|
||||
f"<|im_start|>assistant\n"
|
||||
)
|
||||
return QueryResult(
|
||||
inputs={
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {
|
||||
"audio": [
|
||||
AudioAsset("winning_call").audio_and_sample_rate,
|
||||
AudioAsset("mary_had_lamb").audio_and_sample_rate,
|
||||
],
|
||||
},
|
||||
},
|
||||
limit_mm_per_prompt={
|
||||
"audio": 2,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
query_map = {
|
||||
"mixed_modalities": get_mixed_modalities_query,
|
||||
"use_audio_in_video": get_use_audio_in_video_query,
|
||||
"multi_audios": get_multi_audios_query,
|
||||
}
|
||||
|
||||
|
||||
def main(args):
|
||||
model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
|
||||
query_result = query_map[args.query_type]()
|
||||
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
max_model_len=12800,
|
||||
max_num_seqs=5,
|
||||
limit_mm_per_prompt=query_result.limit_mm_per_prompt,
|
||||
seed=args.seed,
|
||||
)
|
||||
|
||||
# We set temperature to 0.2 so that outputs can be different
|
||||
# even when all prompts are identical when running batch inference.
|
||||
sampling_params = SamplingParams(temperature=0.2, max_tokens=256)
|
||||
|
||||
outputs = llm.generate(query_result.inputs, sampling_params=sampling_params)
|
||||
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
print(generated_text)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Demo on using vLLM for offline inference with "
|
||||
"audio language models"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--query-type",
|
||||
"-q",
|
||||
type=str,
|
||||
default="mixed_modalities",
|
||||
choices=query_map.keys(),
|
||||
help="Query type.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--seed",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Set the seed when initializing `vllm.LLM`.",
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
41
examples/offline_inference/vision_language.py
Normal file → Executable file
41
examples/offline_inference/vision_language.py
Normal file → Executable file
@ -538,6 +538,31 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# HunyuanOCR
|
||||
def run_hunyuan_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "tencent/HunyuanOCR"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
placeholder = "<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>" # noqa: E501
|
||||
prompts = [
|
||||
f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>"
|
||||
for question in questions
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=None,
|
||||
)
|
||||
|
||||
|
||||
# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
|
||||
def run_hyperclovax_seed_vision(
|
||||
questions: list[str], modality: str
|
||||
@ -1820,6 +1845,7 @@ model_example_map = {
|
||||
"glm4_5v": run_glm4_5v,
|
||||
"glm4_5v_fp8": run_glm4_5v_fp8,
|
||||
"h2ovl_chat": run_h2ovl,
|
||||
"hunyuan_vl": run_hunyuan_vl,
|
||||
"hyperclovax_seed_vision": run_hyperclovax_seed_vision,
|
||||
"idefics3": run_idefics3,
|
||||
"interns1": run_interns1,
|
||||
@ -2038,6 +2064,13 @@ def parse_args():
|
||||
help="If True, will send all requests in a second batch with empty mm "
|
||||
"data to verify cache hits with UUIDs.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tensor-parallel-size",
|
||||
"-tp",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Tensor parallel size to override the model's default setting. ",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@ -2046,6 +2079,12 @@ def main(args):
|
||||
if model not in model_example_map:
|
||||
raise ValueError(f"Model type {model} is not supported.")
|
||||
|
||||
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
|
||||
raise ValueError(
|
||||
f"tensor_parallel_size must be a positive integer, "
|
||||
f"got {args.tensor_parallel_size}"
|
||||
)
|
||||
|
||||
modality = args.modality
|
||||
mm_input = get_multi_modal_input(args)
|
||||
data = mm_input["data"]
|
||||
@ -2063,6 +2102,8 @@ def main(args):
|
||||
"seed": args.seed,
|
||||
"mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
|
||||
}
|
||||
if args.tensor_parallel_size is not None:
|
||||
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
# Don't want to check the flag multiple times, so just hijack `prompts`.
|
||||
|
||||
40
examples/offline_inference/vision_language_multi_image.py
Normal file → Executable file
40
examples/offline_inference/vision_language_multi_image.py
Normal file → Executable file
@ -1352,10 +1352,18 @@ model_example_map = {
|
||||
}
|
||||
|
||||
|
||||
def run_generate(model, question: str, image_urls: list[str], seed: int | None):
|
||||
def run_generate(
|
||||
model,
|
||||
question: str,
|
||||
image_urls: list[str],
|
||||
seed: int | None,
|
||||
tensor_parallel_size: int | None,
|
||||
):
|
||||
req_data = model_example_map[model](question, image_urls)
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": seed}
|
||||
if tensor_parallel_size is not None:
|
||||
engine_args["tensor_parallel_size"] = tensor_parallel_size
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
@ -1378,7 +1386,13 @@ def run_generate(model, question: str, image_urls: list[str], seed: int | None):
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
def run_chat(model: str, question: str, image_urls: list[str], seed: int | None):
|
||||
def run_chat(
|
||||
model: str,
|
||||
question: str,
|
||||
image_urls: list[str],
|
||||
seed: int | None,
|
||||
tensor_parallel_size: int | None,
|
||||
):
|
||||
req_data = model_example_map[model](question, image_urls)
|
||||
|
||||
# Disable other modalities to save memory
|
||||
@ -1388,6 +1402,8 @@ def run_chat(model: str, question: str, image_urls: list[str], seed: int | None)
|
||||
)
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": seed}
|
||||
if tensor_parallel_size is not None:
|
||||
engine_args["tensor_parallel_size"] = tensor_parallel_size
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
sampling_params = (
|
||||
@ -1463,6 +1479,13 @@ def parse_args():
|
||||
default=2,
|
||||
help="Number of images to use for the demo.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tensor-parallel-size",
|
||||
"-tp",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Tensor parallel size to override the model's default setting. ",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@ -1470,13 +1493,20 @@ def main(args: Namespace):
|
||||
model = args.model_type
|
||||
method = args.method
|
||||
seed = args.seed
|
||||
tensor_parallel_size = args.tensor_parallel_size
|
||||
|
||||
if tensor_parallel_size is not None and tensor_parallel_size < 1:
|
||||
raise ValueError(
|
||||
f"tensor_parallel_size must be a positive integer, "
|
||||
f"got {tensor_parallel_size}"
|
||||
)
|
||||
|
||||
image_urls = IMAGE_URLS[: args.num_images]
|
||||
|
||||
if method == "generate":
|
||||
run_generate(model, QUESTION, image_urls, seed)
|
||||
run_generate(model, QUESTION, image_urls, seed, tensor_parallel_size)
|
||||
elif method == "chat":
|
||||
run_chat(model, QUESTION, image_urls, seed)
|
||||
run_chat(model, QUESTION, image_urls, seed, tensor_parallel_size)
|
||||
else:
|
||||
raise ValueError(f"Invalid method: {method}")
|
||||
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
lmcache
|
||||
nixl >= 0.6.0 # Required for disaggregated prefill
|
||||
nixl >= 0.7.1 # Required for disaggregated prefill
|
||||
|
||||
@ -45,3 +45,7 @@ multiprocess==0.70.16
|
||||
|
||||
# Plugins test
|
||||
terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
|
||||
torchgeo==0.7.0
|
||||
|
||||
# Required for suffix decoding test
|
||||
arctic-inference == 0.1.1
|
||||
|
||||
@ -10,9 +10,9 @@ wheel
|
||||
jinja2>=3.1.6
|
||||
datasets # for benchmark scripts
|
||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||
torch==2.8.0+xpu
|
||||
--extra-index-url=https://download.pytorch.org/whl/xpu
|
||||
torch==2.9.0+xpu
|
||||
torchaudio
|
||||
torchvision
|
||||
--extra-index-url=https://download.pytorch.org/whl/xpu
|
||||
|
||||
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post1%2Bxpu-cp312-cp312-linux_x86_64.whl
|
||||
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.9.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl
|
||||
|
||||
@ -111,6 +111,17 @@ if current_platform.is_cuda():
|
||||
async_tp=96, # MLP is MoE, half the fusions of dense
|
||||
),
|
||||
),
|
||||
ModelBackendTestCase(
|
||||
model_name="openai/gpt-oss-20b",
|
||||
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
|
||||
backend=AttentionBackendEnum.FLASHINFER,
|
||||
matches=Matches(
|
||||
attention_fusion=0,
|
||||
allreduce_fusion=49,
|
||||
sequence_parallel=49,
|
||||
async_tp=48,
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
elif current_platform.is_rocm():
|
||||
|
||||
@ -183,9 +183,6 @@ async def test_metrics_counts(
|
||||
EXPECTED_METRICS_V1 = [
|
||||
"vllm:num_requests_running",
|
||||
"vllm:num_requests_waiting",
|
||||
"vllm:gpu_cache_usage_perc",
|
||||
"vllm:gpu_prefix_cache_queries",
|
||||
"vllm:gpu_prefix_cache_hits",
|
||||
"vllm:kv_cache_usage_perc",
|
||||
"vllm:prefix_cache_queries",
|
||||
"vllm:prefix_cache_hits",
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import importlib
|
||||
import json
|
||||
import time
|
||||
|
||||
@ -35,6 +35,10 @@ GET_WEATHER_SCHEMA = {
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
assert importlib.util.find_spec("gpt_oss") is not None, (
|
||||
"Harmony tests require gpt_oss package to be installed"
|
||||
)
|
||||
|
||||
args = ["--enforce-eager", "--tool-server", "demo", "--max_model_len", "5000"]
|
||||
env_dict = dict(
|
||||
VLLM_ENABLE_RESPONSES_API_STORE="1",
|
||||
|
||||
@ -921,12 +921,16 @@ def test_gather_and_maybe_dequant_cache_mla(
|
||||
)
|
||||
_fill_mla_cache(src_cache, kv_cache_dtype=kv_cache_dtype)
|
||||
|
||||
seq_len_tensor = torch.randint(0, max_seq_len + 1, (batch_size,), device=device)
|
||||
seq_len_tensor = torch.randint(
|
||||
max_seq_len, max_seq_len + 1, (batch_size,), device=device
|
||||
)
|
||||
|
||||
total_tokens = seq_len_tensor.sum()
|
||||
cu_seq_lens = torch.empty((batch_size + 1), dtype=torch.int32, device=device)
|
||||
cu_seq_lens[0] = 0
|
||||
cu_seq_lens[1:] = seq_len_tensor.cumsum(dim=0).to(dtype=torch.int32)
|
||||
token_to_seq = torch.arange(0, batch_size, dtype=torch.int32, device=device)
|
||||
token_to_seq = torch.repeat_interleave(token_to_seq, seq_len_tensor)
|
||||
print("seq_len_tensor", seq_len_tensor)
|
||||
|
||||
tot_blocks_tensor = (seq_len_tensor + block_size - 1) // block_size
|
||||
@ -977,7 +981,8 @@ def test_gather_and_maybe_dequant_cache_mla(
|
||||
dst,
|
||||
block_table,
|
||||
cu_seq_lens,
|
||||
batch_size,
|
||||
token_to_seq,
|
||||
total_tokens,
|
||||
kv_cache_dtype,
|
||||
scale,
|
||||
None,
|
||||
@ -990,7 +995,8 @@ def test_gather_and_maybe_dequant_cache_mla(
|
||||
dst,
|
||||
block_table,
|
||||
cu_seq_lens,
|
||||
batch_size,
|
||||
token_to_seq,
|
||||
total_tokens,
|
||||
kv_cache_dtype,
|
||||
scale,
|
||||
None,
|
||||
|
||||
@ -11,7 +11,6 @@ from vllm.model_executor.layers.fused_moe.config import (
|
||||
fp8_w8a8_moe_quant_config,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
|
||||
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
|
||||
from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
|
||||
apply_flashinfer_per_tensor_scale_fp8,
|
||||
flashinfer_cutlass_moe_fp8,
|
||||
@ -151,14 +150,11 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
|
||||
td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=True)
|
||||
|
||||
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids = Llama4MoE.custom_routing_function(
|
||||
hidden_states=td.hidden_states,
|
||||
router_logits=score,
|
||||
use_grouped_topk=False,
|
||||
top_k=topk,
|
||||
gating_output=score,
|
||||
topk=topk,
|
||||
renormalize=False,
|
||||
custom_routing_function=Llama4MoE.custom_routing_function,
|
||||
scoring_func="softmax",
|
||||
)
|
||||
|
||||
quant_config = fp8_w8a8_moe_quant_config(
|
||||
@ -219,14 +215,11 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
|
||||
)
|
||||
|
||||
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids = Llama4MoE.custom_routing_function(
|
||||
hidden_states=td.hidden_states,
|
||||
router_logits=score,
|
||||
use_grouped_topk=False,
|
||||
top_k=topk,
|
||||
gating_output=score,
|
||||
topk=topk,
|
||||
renormalize=False,
|
||||
custom_routing_function=Llama4MoE.custom_routing_function,
|
||||
scoring_func="softmax",
|
||||
)
|
||||
|
||||
quant_config = fp8_w8a8_moe_quant_config(
|
||||
|
||||
221
tests/model_executor/test_qwen3_omni.py
Normal file
221
tests/model_executor/test_qwen3_omni.py
Normal file
@ -0,0 +1,221 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.multimodal.processing import InputProcessingContext
|
||||
|
||||
|
||||
# Helper function to print input IDs with coalesced audio/video tokens.
|
||||
def print_input_ids(input_ids):
|
||||
"""
|
||||
Print input IDs, compressing consecutive special tokens.
|
||||
- 151675: <|audio_pad|>
|
||||
- 151656: <|video_pad|>
|
||||
"""
|
||||
if not input_ids:
|
||||
print("[]")
|
||||
return
|
||||
|
||||
result = []
|
||||
i = 0
|
||||
|
||||
while i < len(input_ids):
|
||||
current_id = input_ids[i]
|
||||
|
||||
# Check if it's a special token that should be compressed
|
||||
if current_id in [151675, 151656]:
|
||||
# Count consecutive occurrences
|
||||
count = 1
|
||||
while i + count < len(input_ids) and input_ids[i + count] == current_id:
|
||||
count += 1
|
||||
|
||||
# Add compressed representation
|
||||
token_name = "<|audio_pad|>" if current_id == 151675 else "<|video_pad|>"
|
||||
result.append(f"{token_name} * {count}")
|
||||
i += count
|
||||
else:
|
||||
# Regular token, just add it
|
||||
result.append(str(current_id))
|
||||
i += 1
|
||||
|
||||
print(", ".join(result))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_qwen3_omni_config():
|
||||
"""Create a mock Qwen3OmniMoeThinker config."""
|
||||
config = Mock(spec=PretrainedConfig)
|
||||
# Token IDs from https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct/blob/main/tokenizer_config.json
|
||||
config.audio_token_id = 151675 # <|audio_pad|>
|
||||
config.video_token_id = 151656 # <|video_pad|>
|
||||
config.image_token_id = 151655 # <|image_pad|>
|
||||
config.audio_start_token_id = 151669 # <|audio_start|>
|
||||
config.audio_end_token_id = 151670 # <|audio_end|>
|
||||
config.vision_start_token_id = 151652 # <|vision_start|>
|
||||
config.position_id_per_seconds = 12.5
|
||||
|
||||
# Vision config
|
||||
vision_config = Mock()
|
||||
vision_config.spatial_merge_size = 2
|
||||
config.vision_config = vision_config
|
||||
|
||||
return config
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_processor():
|
||||
"""Create a mock HF processor."""
|
||||
from transformers.models.whisper import WhisperFeatureExtractor
|
||||
|
||||
processor = Mock()
|
||||
processor.audio_token = "<|audio_pad|>"
|
||||
processor.image_token = "<|image_pad|>"
|
||||
processor.video_token = "<|video_pad|>"
|
||||
|
||||
# Create a real WhisperFeatureExtractor instance for the feature_extractor attribute
|
||||
feature_extractor = WhisperFeatureExtractor()
|
||||
processor.feature_extractor = feature_extractor
|
||||
|
||||
return processor
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_tokenizer():
|
||||
"""Create a mock tokenizer."""
|
||||
tokenizer = Mock()
|
||||
# Token IDs from https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct/blob/main/tokenizer_config.json
|
||||
tokenizer.get_vocab = Mock(
|
||||
return_value={
|
||||
"<|audio_pad|>": 151675,
|
||||
"<|video_pad|>": 151656,
|
||||
"<|image_pad|>": 151655,
|
||||
"<|audio_start|>": 151669,
|
||||
"<|audio_end|>": 151670,
|
||||
"<|vision_start|>": 151652,
|
||||
"<|vision_end|>": 151653,
|
||||
}
|
||||
)
|
||||
tokenizer.encode = Mock(
|
||||
side_effect=lambda x: {
|
||||
"<|vision_start|>": [151652],
|
||||
"<|vision_end|>": [151653],
|
||||
"<|audio_start|>": [151669],
|
||||
"<|audio_end|>": [151670],
|
||||
"<|audio_pad|>": [151675],
|
||||
"<|image_pad|>": [151655],
|
||||
"<|video_pad|>": [151656],
|
||||
}.get(x, [0])
|
||||
)
|
||||
tokenizer.vision_bos_token = "<|vision_start|>"
|
||||
tokenizer.vision_eos_token = "<|vision_end|>"
|
||||
tokenizer.audio_bos_token = "<|audio_start|>"
|
||||
tokenizer.audio_eos_token = "<|audio_end|>"
|
||||
return tokenizer
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_image_processor():
|
||||
"""Create a mock image processor."""
|
||||
image_processor = Mock()
|
||||
image_processor.merge_size = 2
|
||||
return image_processor
|
||||
|
||||
|
||||
def test_qwen3_omni_get_updates_use_audio_in_video(
|
||||
mock_qwen3_omni_config,
|
||||
mock_processor,
|
||||
mock_tokenizer,
|
||||
mock_image_processor,
|
||||
):
|
||||
"""Test the get_updates_use_audio_in_video method directly."""
|
||||
|
||||
from vllm.model_executor.models.qwen3_omni_moe_thinker import (
|
||||
Qwen3OmniMoeThinkerMultiModalProcessor,
|
||||
Qwen3OmniMoeThinkerProcessingInfo,
|
||||
)
|
||||
|
||||
# Create a mock context
|
||||
mock_ctx = Mock(spec=InputProcessingContext)
|
||||
|
||||
# Create processing info
|
||||
info = Qwen3OmniMoeThinkerProcessingInfo(mock_ctx)
|
||||
info.get_hf_config = Mock(return_value=mock_qwen3_omni_config)
|
||||
info.get_hf_processor = Mock(return_value=mock_processor)
|
||||
info.get_tokenizer = Mock(return_value=mock_tokenizer)
|
||||
info.get_image_processor = Mock(return_value=mock_image_processor)
|
||||
|
||||
# Create a mock dummy_inputs builder
|
||||
mock_dummy_inputs = Mock()
|
||||
|
||||
# Create the processor
|
||||
processor = Qwen3OmniMoeThinkerMultiModalProcessor(info, mock_dummy_inputs)
|
||||
|
||||
# Test parameters from reference video
|
||||
# https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4
|
||||
audio_len = 85
|
||||
video_grid_thw = [6, 36, 64]
|
||||
video_second_per_grid_t = 2.0
|
||||
|
||||
# Call the method
|
||||
updates = processor.get_updates_use_audio_in_video(
|
||||
thinker_config=mock_qwen3_omni_config,
|
||||
audio_len=audio_len,
|
||||
video_grid_thw=video_grid_thw,
|
||||
video_second_per_grid_t=video_second_per_grid_t,
|
||||
)
|
||||
|
||||
# Updated input ids should align with HF implementation.
|
||||
# 151669,
|
||||
# <|video_pad|> * 576, <|audio_pad|> * 25,
|
||||
# <|video_pad|> * 576, <|audio_pad|> * 25,
|
||||
# <|video_pad|> * 576, <|audio_pad|> * 25,
|
||||
# <|video_pad|> * 576, <|audio_pad|> * 10,
|
||||
# <|video_pad|> * 1152,
|
||||
# 151670
|
||||
print_input_ids(updates)
|
||||
|
||||
# Verify structure
|
||||
assert isinstance(updates, list)
|
||||
assert len(updates) > 0
|
||||
|
||||
# Verify start and end tokens
|
||||
audio_start_token_id = mock_qwen3_omni_config.audio_start_token_id
|
||||
audio_end_token_id = mock_qwen3_omni_config.audio_end_token_id
|
||||
|
||||
assert updates[0] == audio_start_token_id
|
||||
assert updates[-1] == audio_end_token_id
|
||||
|
||||
# Verify both audio and video tokens are present
|
||||
audio_token_id = mock_qwen3_omni_config.audio_token_id
|
||||
video_token_id = mock_qwen3_omni_config.video_token_id
|
||||
|
||||
audio_count = updates.count(audio_token_id)
|
||||
video_count = updates.count(video_token_id)
|
||||
|
||||
assert audio_count == audio_len, (
|
||||
f"Expected {audio_len} audio tokens, got {audio_count}"
|
||||
)
|
||||
|
||||
# Calculate expected video token count
|
||||
spatial_merge_size = mock_qwen3_omni_config.vision_config.spatial_merge_size
|
||||
height = video_grid_thw[1] // spatial_merge_size
|
||||
width = video_grid_thw[2] // spatial_merge_size
|
||||
expected_video_count = video_grid_thw[0] * height * width
|
||||
|
||||
assert video_count == expected_video_count, (
|
||||
f"Expected {expected_video_count} video tokens, got {video_count}"
|
||||
)
|
||||
|
||||
# Total tokens should be: 1 (start) + audio_len + video_count + 1 (end)
|
||||
expected_total = 1 + audio_len + expected_video_count + 1
|
||||
assert len(updates) == expected_total, (
|
||||
f"Expected {expected_total} total tokens, got {len(updates)}"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
@ -626,6 +626,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
|
||||
trust_remote_code=True,
|
||||
),
|
||||
"HunYuanVLForConditionalGeneration": _HfExamplesInfo(
|
||||
"tencent/HunyuanOCR",
|
||||
is_available_online=False,
|
||||
),
|
||||
"Idefics3ForConditionalGeneration": _HfExamplesInfo(
|
||||
"HuggingFaceM4/Idefics3-8B-Llama3",
|
||||
extras={"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"},
|
||||
|
||||
@ -9,9 +9,16 @@ different routing strategies and analyze their performance, including
|
||||
integration tests with FusedMoE layer.
|
||||
"""
|
||||
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.distributed import (
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.routing_simulator import (
|
||||
DistributionBasedRouting,
|
||||
RoutingSimulator,
|
||||
@ -89,6 +96,28 @@ def test_routing_strategy_integration(monkeypatch, device):
|
||||
# Test different routing strategies
|
||||
strategies = RoutingSimulator.get_available_strategies()
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
with set_current_vllm_config(vllm_config):
|
||||
temp_file = tempfile.mkstemp()[1]
|
||||
init_distributed_environment(
|
||||
world_size=1,
|
||||
rank=0,
|
||||
local_rank=0,
|
||||
distributed_init_method=f"file://{temp_file}",
|
||||
)
|
||||
initialize_model_parallel(
|
||||
tensor_model_parallel_size=1,
|
||||
pipeline_model_parallel_size=1,
|
||||
)
|
||||
fused_moe = FusedMoE(
|
||||
num_experts=num_experts,
|
||||
top_k=top_k,
|
||||
hidden_size=hidden_size,
|
||||
intermediate_size=0,
|
||||
use_grouped_topk=False,
|
||||
renormalize=True,
|
||||
)
|
||||
|
||||
for strategy in strategies:
|
||||
# Set environment variable
|
||||
env_name = "VLLM_MOE_ROUTING_SIMULATION_STRATEGY"
|
||||
@ -98,13 +127,9 @@ def test_routing_strategy_integration(monkeypatch, device):
|
||||
envs.environment_variables[env_name] = lambda s=strategy: s
|
||||
|
||||
# Test the select_experts method
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = fused_moe.select_experts(
|
||||
hidden_states=hidden_states,
|
||||
router_logits=router_logits,
|
||||
top_k=top_k,
|
||||
use_grouped_topk=False,
|
||||
renormalize=True,
|
||||
indices_type=torch.long,
|
||||
)
|
||||
|
||||
# Verify output shapes
|
||||
|
||||
@ -36,6 +36,12 @@ def mock_on_gfx9():
|
||||
@pytest.mark.parametrize(
|
||||
"env_vars, selected_backend, expected_backend_path",
|
||||
[
|
||||
# Test Case: Explicit FLEX_ATTENTION backend
|
||||
(
|
||||
{},
|
||||
"FLEX_ATTENTION",
|
||||
AttentionBackendEnum.FLEX_ATTENTION.get_path(),
|
||||
),
|
||||
# Test Case 1: Default (no env vars, no explicit backend)
|
||||
(
|
||||
{},
|
||||
|
||||
@ -340,4 +340,11 @@ full_cg_backend_configs = {
|
||||
"cudagraph_mode": "FULL_AND_PIECEWISE",
|
||||
},
|
||||
),
|
||||
"RocmAttn": BackendConfig(
|
||||
name="RocmAttn",
|
||||
env_vars={"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL",
|
||||
},
|
||||
),
|
||||
}
|
||||
|
||||
@ -1436,6 +1436,65 @@ def test_get_kv_cache_config_one_worker():
|
||||
],
|
||||
)
|
||||
|
||||
# 6 full + 5 sliding, pad to 6 full + 6 sliding. This is a typical case for gpt-oss
|
||||
# eagle where there is only one more full attention layer than sliding window layers
|
||||
kv_cache_specs_hybrid = {
|
||||
"layer_1": new_kv_cache_spec(),
|
||||
"layer_2": new_kv_cache_spec(),
|
||||
"layer_3": new_kv_cache_spec(),
|
||||
"layer_4": new_kv_cache_spec(),
|
||||
"layer_5": new_kv_cache_spec(),
|
||||
"layer_6": new_kv_cache_spec(),
|
||||
"layer_7": new_sliding_window_spec(),
|
||||
"layer_8": new_sliding_window_spec(),
|
||||
"layer_9": new_sliding_window_spec(),
|
||||
"layer_10": new_sliding_window_spec(),
|
||||
"layer_11": new_sliding_window_spec(),
|
||||
}
|
||||
|
||||
kv_cache_config_hybrid = get_kv_cache_configs(
|
||||
vllm_config, [kv_cache_specs_hybrid], [mem_per_block_per_layer * 6 * 32]
|
||||
)[0]
|
||||
print(kv_cache_config_hybrid)
|
||||
assert kv_cache_config_hybrid == KVCacheConfig(
|
||||
num_blocks=32,
|
||||
kv_cache_tensors=[
|
||||
KVCacheTensor(
|
||||
size=mem_per_block_per_layer * 32,
|
||||
shared_by=["layer_1", "layer_7"],
|
||||
),
|
||||
KVCacheTensor(
|
||||
size=mem_per_block_per_layer * 32,
|
||||
shared_by=["layer_2", "layer_8"],
|
||||
),
|
||||
KVCacheTensor(
|
||||
size=mem_per_block_per_layer * 32,
|
||||
shared_by=["layer_3", "layer_9"],
|
||||
),
|
||||
KVCacheTensor(
|
||||
size=mem_per_block_per_layer * 32,
|
||||
shared_by=["layer_4", "layer_10"],
|
||||
),
|
||||
KVCacheTensor(
|
||||
size=mem_per_block_per_layer * 32,
|
||||
shared_by=["layer_5", "layer_11"],
|
||||
),
|
||||
KVCacheTensor(
|
||||
size=mem_per_block_per_layer * 32,
|
||||
shared_by=["layer_6"],
|
||||
),
|
||||
],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(
|
||||
["layer_1", "layer_2", "layer_3", "layer_4", "layer_5", "layer_6"],
|
||||
new_kv_cache_spec(),
|
||||
),
|
||||
KVCacheGroupSpec(
|
||||
["layer_7", "layer_8", "layer_9", "layer_10", "layer_11"],
|
||||
new_sliding_window_spec(),
|
||||
),
|
||||
],
|
||||
)
|
||||
# different hidden size
|
||||
kv_cache_specs_hybrid = {
|
||||
"layer_1": new_kv_cache_spec(head_size=128),
|
||||
|
||||
@ -641,6 +641,34 @@ def test_schedule_concurrent_batches(
|
||||
scheduler.update_from_output(scheduler_output1, model_runner_output)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("enable_chunked_prefill", [True, False])
|
||||
def test_schedule_order(enable_chunked_prefill: bool):
|
||||
scheduler = create_scheduler(
|
||||
max_num_batched_tokens=1024,
|
||||
max_num_seqs=3,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
)
|
||||
|
||||
# long requests
|
||||
requests = create_requests(num_requests=2, num_tokens=800)
|
||||
# short requests
|
||||
requests += create_requests(num_requests=2, num_tokens=10)
|
||||
|
||||
for request in requests:
|
||||
scheduler.add_request(request)
|
||||
|
||||
scheduler_output1 = scheduler.schedule()
|
||||
|
||||
if enable_chunked_prefill:
|
||||
# When enable chunked prefill, long requests will be chunked.
|
||||
assert len(scheduler_output1.scheduled_new_reqs) == 2
|
||||
else:
|
||||
# When disable chunked prefill, should not skip the long requests,
|
||||
# and scheduling subsequent short requests in advance,
|
||||
# even though there is still token budgets remaining.
|
||||
assert len(scheduler_output1.scheduled_new_reqs) == 1
|
||||
|
||||
|
||||
def test_preempt_during_execution():
|
||||
# NOTE(woosuk): The actual number of available blocks is 10 instead of 11
|
||||
# because block 0 is reserved as the null block.
|
||||
|
||||
@ -42,6 +42,7 @@ def create_scheduler(
|
||||
model: str = "facebook/opt-125m",
|
||||
max_num_seqs: int = 16,
|
||||
max_num_batched_tokens: int = 8192,
|
||||
enable_chunked_prefill: bool = True,
|
||||
enable_prefix_caching: bool = False,
|
||||
long_prefill_token_threshold: int = 0,
|
||||
disable_chunked_mm_input: bool = False,
|
||||
@ -76,7 +77,7 @@ def create_scheduler(
|
||||
max_model_len=max_model_len,
|
||||
long_prefill_token_threshold=long_prefill_token_threshold,
|
||||
disable_chunked_mm_input=disable_chunked_mm_input,
|
||||
enable_chunked_prefill=True,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
async_scheduling=async_scheduling,
|
||||
)
|
||||
model_config = ModelConfig(
|
||||
|
||||
@ -35,14 +35,22 @@ def temporary_environ(env_vars):
|
||||
|
||||
# test attention backend and cudagraph_mode combo
|
||||
# (backend_name, cudagraph_mode, supported)
|
||||
combo_cases_1 = [
|
||||
("FA3", "FULL", True),
|
||||
("FA3", "FULL_AND_PIECEWISE", True),
|
||||
("FA2", "FULL", True), # Should fallback to FULL_AND_PIECEWISE
|
||||
("FA2", "FULL_AND_PIECEWISE", True),
|
||||
("FlashInfer", "FULL", True), # Should fallback to FULL_AND_PIECEWISE
|
||||
("FlashInfer", "FULL_AND_PIECEWISE", True),
|
||||
]
|
||||
if current_platform.is_rocm():
|
||||
combo_cases_1 = [
|
||||
("RocmAttn", "FULL", True),
|
||||
("RocmAttn", "FULL_AND_PIECEWISE", True),
|
||||
("TritonAttn", "FULL", True),
|
||||
("TritonAttn", "FULL_AND_PIECEWISE", True),
|
||||
]
|
||||
else:
|
||||
combo_cases_1 = [
|
||||
("FA3", "FULL", True),
|
||||
("FA3", "FULL_AND_PIECEWISE", True),
|
||||
("FA2", "FULL", True), # Should fallback to FULL_AND_PIECEWISE
|
||||
("FA2", "FULL_AND_PIECEWISE", True),
|
||||
("FlashInfer", "FULL", True), # Should fallback to FULL_AND_PIECEWISE
|
||||
("FlashInfer", "FULL_AND_PIECEWISE", True),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("backend_name, cudagraph_mode, supported", combo_cases_1)
|
||||
@ -92,18 +100,32 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
|
||||
|
||||
# test cudagraph_mode with different compilation mode.
|
||||
# (backend_name, cudagraph_mode, compilation_mode, supported)
|
||||
combo_cases_2 = [
|
||||
("FA2", "FULL", CompilationMode.NONE, True),
|
||||
("FA2", "FULL", CompilationMode.VLLM_COMPILE, True),
|
||||
("FA2", "PIECEWISE", CompilationMode.NONE, False),
|
||||
("FA2", "PIECEWISE", CompilationMode.VLLM_COMPILE, True),
|
||||
("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, False),
|
||||
("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
|
||||
("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True),
|
||||
("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
|
||||
("FA2", "NONE", CompilationMode.NONE, True),
|
||||
("FA2", "NONE", CompilationMode.VLLM_COMPILE, True),
|
||||
]
|
||||
if current_platform.is_rocm():
|
||||
combo_cases_2 = [
|
||||
("RocmAttn", "FULL", CompilationMode.NONE, True),
|
||||
("RocmAttn", "FULL", CompilationMode.VLLM_COMPILE, True),
|
||||
("RocmAttn", "PIECEWISE", CompilationMode.NONE, False),
|
||||
("RocmAttn", "PIECEWISE", CompilationMode.VLLM_COMPILE, True),
|
||||
("RocmAttn", "FULL_AND_PIECEWISE", CompilationMode.NONE, False),
|
||||
("RocmAttn", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
|
||||
("RocmAttn", "FULL_DECODE_ONLY", CompilationMode.NONE, True),
|
||||
("RocmAttn", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
|
||||
("RocmAttn", "NONE", CompilationMode.NONE, True),
|
||||
("RocmAttn", "NONE", CompilationMode.VLLM_COMPILE, True),
|
||||
]
|
||||
else:
|
||||
combo_cases_2 = [
|
||||
("FA2", "FULL", CompilationMode.NONE, True),
|
||||
("FA2", "FULL", CompilationMode.VLLM_COMPILE, True),
|
||||
("FA2", "PIECEWISE", CompilationMode.NONE, False),
|
||||
("FA2", "PIECEWISE", CompilationMode.VLLM_COMPILE, True),
|
||||
("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, False),
|
||||
("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
|
||||
("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True),
|
||||
("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
|
||||
("FA2", "NONE", CompilationMode.NONE, True),
|
||||
("FA2", "NONE", CompilationMode.VLLM_COMPILE, True),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
77
tests/v1/distributed/test_eagle_dp.py
Normal file
77
tests/v1/distributed/test_eagle_dp.py
Normal file
@ -0,0 +1,77 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import asyncio
|
||||
import os
|
||||
from contextlib import AsyncExitStack
|
||||
from dataclasses import replace
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.sampling_params import RequestOutputKind
|
||||
from vllm.v1.engine.async_llm import AsyncLLM
|
||||
|
||||
DP_SIZE = int(os.getenv("DP_SIZE", 2))
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_run_eagle_dp():
|
||||
target_model = "meta-llama/Llama-3.1-8B-Instruct"
|
||||
draft_model = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
|
||||
|
||||
engine_args = AsyncEngineArgs(
|
||||
model=target_model,
|
||||
tokenizer_mode="auto",
|
||||
enforce_eager=False,
|
||||
tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
|
||||
data_parallel_size=DP_SIZE,
|
||||
data_parallel_backend="mp", # ray takes more time
|
||||
trust_remote_code=True,
|
||||
max_model_len=16384,
|
||||
)
|
||||
|
||||
eagle_engine_args = replace(
|
||||
engine_args,
|
||||
speculative_config={
|
||||
"model": draft_model,
|
||||
"method": "eagle",
|
||||
"num_speculative_tokens": 3,
|
||||
},
|
||||
)
|
||||
|
||||
prompt = "This is a test of data parallel with eagle"
|
||||
num_expected_tokens = 100
|
||||
sampling_params = SamplingParams(
|
||||
min_tokens=num_expected_tokens,
|
||||
max_tokens=num_expected_tokens,
|
||||
ignore_eos=True,
|
||||
output_kind=RequestOutputKind.FINAL_ONLY,
|
||||
temperature=0,
|
||||
)
|
||||
|
||||
async def generate_with_timeout(given_engine: AsyncLLM):
|
||||
async for out in given_engine.generate(
|
||||
request_id="test-eagle-dp", prompt=prompt, sampling_params=sampling_params
|
||||
):
|
||||
token_ids = out.outputs[0].token_ids
|
||||
assert len(token_ids) == num_expected_tokens
|
||||
return token_ids
|
||||
|
||||
async def engine_create_and_generate(engine_args: AsyncEngineArgs):
|
||||
async with AsyncExitStack() as after:
|
||||
engine = AsyncLLM.from_engine_args(engine_args)
|
||||
after.callback(engine.shutdown)
|
||||
|
||||
token_ids = await asyncio.wait_for(
|
||||
generate_with_timeout(engine), timeout=30
|
||||
)
|
||||
|
||||
assert not engine.output_processor.has_unfinished_requests()
|
||||
return token_ids
|
||||
|
||||
token_ids_with_eagle = await engine_create_and_generate(eagle_engine_args)
|
||||
token_ids_no_eagle = await engine_create_and_generate(engine_args)
|
||||
|
||||
# Test for correctness
|
||||
assert token_ids_with_eagle == token_ids_no_eagle
|
||||
@ -3,7 +3,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
from dataclasses import fields
|
||||
from enum import Enum
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
@ -21,7 +20,6 @@ from vllm.outputs import RequestOutput
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
|
||||
from vllm.sampling_params import (
|
||||
GuidedDecodingParams,
|
||||
SamplingParams,
|
||||
StructuredOutputsParams,
|
||||
)
|
||||
@ -108,23 +106,6 @@ class CarDescription(BaseModel):
|
||||
car_type: CarType
|
||||
|
||||
|
||||
def test_guided_decoding_deprecated():
|
||||
with pytest.warns(DeprecationWarning, match="GuidedDecodingParams is deprecated.*"):
|
||||
guided_decoding = GuidedDecodingParams(json_object=True)
|
||||
|
||||
structured_outputs = StructuredOutputsParams(json_object=True)
|
||||
assert fields(guided_decoding) == fields(structured_outputs)
|
||||
|
||||
with pytest.warns(DeprecationWarning, match="guided_decoding is deprecated.*"):
|
||||
sp1 = SamplingParams(guided_decoding=guided_decoding)
|
||||
|
||||
with pytest.warns(DeprecationWarning, match="guided_decoding is deprecated.*"):
|
||||
sp2 = SamplingParams.from_optional(guided_decoding=guided_decoding)
|
||||
|
||||
assert sp1 == sp2
|
||||
assert sp1.structured_outputs == guided_decoding
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, backend, tokenizer_mode, speculative_config",
|
||||
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
|
||||
@ -899,13 +880,11 @@ def test_structured_output_batched_with_non_structured_outputs_requests(
|
||||
output_json = json.loads(generated_text)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("guided_decoding_backend", ["xgrammar"])
|
||||
def test_structured_output_with_structural_tag(
|
||||
guided_decoding_backend: str,
|
||||
):
|
||||
@pytest.mark.parametrize("backend", ["xgrammar"])
|
||||
def test_structured_output_with_structural_tag(backend: str):
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
guided_decoding_backend=guided_decoding_backend,
|
||||
structured_outputs_config=StructuredOutputsConfig(backend=backend),
|
||||
)
|
||||
|
||||
structural_tag_config = {
|
||||
@ -923,7 +902,7 @@ def test_structured_output_with_structural_tag(
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.0,
|
||||
max_tokens=500,
|
||||
guided_decoding=StructuredOutputsParams(
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
structural_tag=json.dumps(structural_tag_config)
|
||||
),
|
||||
)
|
||||
|
||||
@ -294,6 +294,8 @@ def _rocm_aiter_mla_decode_fwd_impl(
|
||||
kv_last_page_lens: torch.Tensor | None = None,
|
||||
sm_scale: float = 1.0,
|
||||
logit_cap: float = 0.0,
|
||||
q_scale: torch.Tensor | None = None,
|
||||
kv_scale: torch.Tensor | None = None,
|
||||
) -> None:
|
||||
from aiter.mla import mla_decode_fwd
|
||||
|
||||
@ -308,6 +310,8 @@ def _rocm_aiter_mla_decode_fwd_impl(
|
||||
max_seqlen_qo,
|
||||
sm_scale=sm_scale,
|
||||
logit_cap=logit_cap,
|
||||
q_scale=q_scale,
|
||||
kv_scale=kv_scale,
|
||||
)
|
||||
|
||||
|
||||
@ -322,6 +326,8 @@ def _rocm_aiter_mla_decode_fwd_fake(
|
||||
kv_last_page_lens: torch.Tensor | None = None,
|
||||
sm_scale: float = 1.0,
|
||||
logit_cap: float = 0.0,
|
||||
q_scale: torch.Tensor | None = None,
|
||||
kv_scale: torch.Tensor | None = None,
|
||||
) -> None:
|
||||
pass
|
||||
|
||||
@ -806,6 +812,8 @@ class rocm_aiter_ops:
|
||||
kv_indices: torch.Tensor | None = None,
|
||||
kv_last_page_lens: torch.Tensor | None = None,
|
||||
logit_cap: float = 0.0,
|
||||
q_scale: torch.Tensor | None = None,
|
||||
kv_scale: torch.Tensor | None = None,
|
||||
):
|
||||
torch.ops.vllm.rocm_aiter_mla_decode_fwd(
|
||||
q,
|
||||
@ -818,6 +826,8 @@ class rocm_aiter_ops:
|
||||
kv_last_page_lens,
|
||||
sm_scale=sm_scale,
|
||||
logit_cap=logit_cap,
|
||||
q_scale=q_scale,
|
||||
kv_scale=kv_scale,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
|
||||
@ -2201,7 +2201,8 @@ def gather_and_maybe_dequant_cache(
|
||||
dst: torch.Tensor,
|
||||
block_table: torch.Tensor,
|
||||
cu_seq_lens: torch.Tensor,
|
||||
batch_size: int,
|
||||
token_to_seq: torch.Tensor,
|
||||
num_tokens: int,
|
||||
kv_cache_dtype: str,
|
||||
scale: torch.Tensor,
|
||||
seq_starts: torch.Tensor | None = None,
|
||||
@ -2211,7 +2212,8 @@ def gather_and_maybe_dequant_cache(
|
||||
dst,
|
||||
block_table,
|
||||
cu_seq_lens,
|
||||
batch_size,
|
||||
token_to_seq,
|
||||
num_tokens,
|
||||
kv_cache_dtype,
|
||||
scale,
|
||||
seq_starts,
|
||||
|
||||
@ -194,7 +194,6 @@ def _cp_lse_common(
|
||||
cp_attn_lse = cp_attn_lse.contiguous()
|
||||
lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses)
|
||||
out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx)
|
||||
assert out.is_contiguous()
|
||||
return out, lse
|
||||
|
||||
|
||||
|
||||
@ -63,13 +63,14 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
|
||||
else:
|
||||
logger.debug("Using InductorAdaptor")
|
||||
return InductorAdaptor()
|
||||
else:
|
||||
assert compilation_config.backend == "eager", (
|
||||
"Custom backends not supported with CompilationMode.VLLM_COMPILE"
|
||||
)
|
||||
|
||||
elif compilation_config.backend == "eager":
|
||||
logger.debug("Using EagerAdaptor")
|
||||
return EagerAdaptor()
|
||||
else:
|
||||
logger.debug("Using custom backend: %s", compilation_config.backend)
|
||||
compiler = resolve_obj_by_qualname(current_platform.get_compile_backend())()
|
||||
assert isinstance(compiler, CompilerInterface)
|
||||
return compiler
|
||||
|
||||
|
||||
class CompilerManager:
|
||||
@ -545,7 +546,10 @@ class VllmBackend:
|
||||
self.prefix = prefix or model_tag
|
||||
|
||||
# Passes to run on the graph post-grad.
|
||||
self.post_grad_pass_manager = PostGradPassManager()
|
||||
self.pass_manager = resolve_obj_by_qualname(
|
||||
current_platform.get_pass_manager_cls()
|
||||
)()
|
||||
self.pass_key = current_platform.pass_key
|
||||
|
||||
self.sym_tensor_indices = []
|
||||
self.input_buffers = []
|
||||
@ -562,24 +566,20 @@ class VllmBackend:
|
||||
|
||||
def configure_post_pass(self):
|
||||
config = self.compilation_config
|
||||
self.post_grad_pass_manager.configure(self.vllm_config)
|
||||
self.pass_manager.configure(self.vllm_config)
|
||||
|
||||
# Post-grad custom passes are run using the post_grad_custom_post_pass
|
||||
# hook. If a pass for that hook exists, add it to the pass manager.
|
||||
inductor_config = config.inductor_compile_config
|
||||
PASS_KEY = "post_grad_custom_post_pass"
|
||||
if PASS_KEY in inductor_config:
|
||||
if isinstance(inductor_config[PASS_KEY], PostGradPassManager):
|
||||
if self.pass_key in inductor_config:
|
||||
if isinstance(inductor_config[self.pass_key], PostGradPassManager):
|
||||
# PassManager already added to config, make sure it's correct
|
||||
assert (
|
||||
inductor_config[PASS_KEY].uuid()
|
||||
== self.post_grad_pass_manager.uuid()
|
||||
)
|
||||
assert inductor_config[self.pass_key].uuid() == self.pass_manager.uuid()
|
||||
else:
|
||||
# Config should automatically wrap all inductor passes
|
||||
assert isinstance(inductor_config[PASS_KEY], InductorPass)
|
||||
self.post_grad_pass_manager.add(inductor_config[PASS_KEY])
|
||||
inductor_config[PASS_KEY] = self.post_grad_pass_manager
|
||||
assert isinstance(inductor_config[self.pass_key], InductorPass)
|
||||
self.pass_manager.add(inductor_config[self.pass_key])
|
||||
inductor_config[self.pass_key] = self.pass_manager
|
||||
|
||||
def __call__(
|
||||
self, graph: fx.GraphModule, example_inputs
|
||||
|
||||
@ -331,9 +331,9 @@ class CompilationConfig:
|
||||
We use string to avoid serialization issues when using compilation in a
|
||||
distributed setting. When the compilation mode is 1 or 2, the backend is
|
||||
used for the compilation directly (it sees the whole graph). When the
|
||||
compilation mode is 3, the backend is used for the piecewise compilation
|
||||
(it sees a part of the graph). The backend can not be custom for compilation
|
||||
mode 3, i.e. the backend must be either eager or inductor. Furthermore,
|
||||
compilation mode is 3, the backend supports both whole graph and piecewise
|
||||
compilation, available backends include eager, inductor, and custom backends,
|
||||
the latter of which can be defined via `get_compile_backend`. Furthermore,
|
||||
compilation is only piecewise if splitting ops is set accordingly and
|
||||
use_inductor_graph_partition is off. Note that the default options for
|
||||
splitting ops are sufficient for piecewise compilation.
|
||||
@ -768,7 +768,7 @@ class CompilationConfig:
|
||||
self.backend = "inductor" if self.use_inductor else "eager"
|
||||
|
||||
if self.backend == "":
|
||||
self.backend = current_platform.simple_compile_backend
|
||||
self.backend = current_platform.get_compile_backend()
|
||||
|
||||
def init_backend(self, vllm_config: "VllmConfig") -> str | Callable:
|
||||
"""
|
||||
@ -800,9 +800,7 @@ class CompilationConfig:
|
||||
|
||||
assert self.mode == CompilationMode.VLLM_COMPILE
|
||||
if self.backend not in ["eager", "inductor"]:
|
||||
raise ValueError(
|
||||
f"Invalid backend for piecewise compilation: {self.backend}"
|
||||
)
|
||||
logger.info("Using OOT custom backend for compilation.")
|
||||
|
||||
from vllm.compilation.backends import VllmBackend
|
||||
|
||||
|
||||
@ -33,6 +33,7 @@ from vllm.transformers_utils.config import (
|
||||
try_get_safetensors_metadata,
|
||||
try_get_tokenizer_config,
|
||||
uses_mrope,
|
||||
uses_xdrope_dim,
|
||||
)
|
||||
from vllm.transformers_utils.gguf_utils import (
|
||||
maybe_patch_hf_config_from_gguf,
|
||||
@ -585,16 +586,26 @@ class ModelConfig:
|
||||
else: # task == "auto"
|
||||
pass
|
||||
else:
|
||||
debug_info = {
|
||||
"architectures": architectures,
|
||||
"is_generative_model": is_generative_model,
|
||||
"is_pooling_model": is_pooling_model,
|
||||
}
|
||||
raise AssertionError(
|
||||
"The model should be a generative or "
|
||||
"pooling model when task is set to "
|
||||
f"{self.task!r}. Found: {debug_info}"
|
||||
)
|
||||
# Neither generative nor pooling model - try to convert if possible
|
||||
if is_pooling_task:
|
||||
runner = "pooling"
|
||||
convert = _task_to_convert(self.task)
|
||||
msg_hint = (
|
||||
"Please replace this option with `--runner pooling "
|
||||
f"--convert {convert}` to continue using this model "
|
||||
"as a pooling model."
|
||||
)
|
||||
else:
|
||||
debug_info = {
|
||||
"architectures": architectures,
|
||||
"is_generative_model": is_generative_model,
|
||||
"is_pooling_model": is_pooling_model,
|
||||
}
|
||||
raise AssertionError(
|
||||
"The model should be a generative or "
|
||||
"pooling model when task is set to "
|
||||
f"{self.task!r}. Found: {debug_info}"
|
||||
)
|
||||
|
||||
self.runner = runner
|
||||
self.convert = convert
|
||||
@ -1605,6 +1616,10 @@ class ModelConfig:
|
||||
def uses_mrope(self) -> bool:
|
||||
return uses_mrope(self.hf_config)
|
||||
|
||||
@property
|
||||
def uses_xdrope_dim(self) -> int:
|
||||
return uses_xdrope_dim(self.hf_config)
|
||||
|
||||
@property
|
||||
def is_multimodal_model(self) -> bool:
|
||||
return self.multimodal_config is not None
|
||||
|
||||
@ -141,22 +141,6 @@ class ParallelConfig:
|
||||
- "deepep_high_throughput": Use deepep high-throughput kernels
|
||||
- "deepep_low_latency": Use deepep low-latency kernels
|
||||
- "flashinfer_all2allv": Use flashinfer alltoallv kernels for mnnvl"""
|
||||
num_redundant_experts: int | None = None
|
||||
"""`num_redundant_experts` is deprecated and has been replaced with
|
||||
`eplb_config.num_redundant_experts`. This will be removed in v0.12.0.
|
||||
Please use `eplb_config.num_redundant_experts` instead."""
|
||||
eplb_window_size: int | None = None
|
||||
"""`eplb_window_size` is deprecated and has been replaced with
|
||||
`eplb_config.window_size`. This will be removed in v0.12.0.
|
||||
Please use `eplb_config.window_size` instead."""
|
||||
eplb_step_interval: int | None = None
|
||||
"""`eplb_step_interval` is deprecated and has been replaced with
|
||||
`eplb_config.step_interval`. This will be removed in v0.12.0.
|
||||
Please use `eplb_config.step_interval` instead."""
|
||||
eplb_log_balancedness: bool | None = None
|
||||
"""`eplb_log_balancedness` is deprecated and has been replaced with
|
||||
`eplb_config.log_balancedness`. This will be removed in v0.12.0.
|
||||
Please use `eplb_config.log_balancedness` instead."""
|
||||
|
||||
max_parallel_loading_workers: int | None = None
|
||||
"""Maximum number of parallel loading workers when loading model
|
||||
@ -516,40 +500,6 @@ class ParallelConfig:
|
||||
"--all2all-backend command-line argument instead."
|
||||
)
|
||||
|
||||
# Forward deprecated fields to their new location
|
||||
if self.num_redundant_experts is not None:
|
||||
self.eplb_config.num_redundant_experts = self.num_redundant_experts
|
||||
logger.warning_once(
|
||||
"num_redundant_experts is deprecated and has been replaced "
|
||||
"with eplb_config.num_redundant_experts. This will be removed "
|
||||
"in v0.12.0. Changing this field after initialization will "
|
||||
"have no effect."
|
||||
)
|
||||
if self.eplb_window_size is not None:
|
||||
self.eplb_config.window_size = self.eplb_window_size
|
||||
logger.warning_once(
|
||||
"eplb_window_size is deprecated and has been replaced "
|
||||
"with eplb_config.window_size. This will be removed "
|
||||
"in v0.12.0. Changing this field after initialization will "
|
||||
"have no effect."
|
||||
)
|
||||
if self.eplb_step_interval is not None:
|
||||
self.eplb_config.step_interval = self.eplb_step_interval
|
||||
logger.warning_once(
|
||||
"eplb_step_interval is deprecated and has been replaced "
|
||||
"with eplb_config.step_interval. This will be removed "
|
||||
"in v0.12.0. Changing this field after initialization will "
|
||||
"have no effect."
|
||||
)
|
||||
if self.eplb_log_balancedness is not None:
|
||||
self.eplb_config.log_balancedness = self.eplb_log_balancedness
|
||||
logger.warning_once(
|
||||
"eplb_log_balancedness is deprecated and has been replaced "
|
||||
"with eplb_config.log_balancedness. This will be removed "
|
||||
"in v0.12.0. Changing this field after initialization will "
|
||||
"have no effect."
|
||||
)
|
||||
|
||||
# Continue with the rest of the initialization
|
||||
self.world_size = (
|
||||
self.pipeline_parallel_size
|
||||
|
||||
@ -131,7 +131,7 @@ class SymmMemCommunicator:
|
||||
return None
|
||||
if out is None:
|
||||
out = torch.empty_like(inp)
|
||||
self.buffer[: inp.numel()].copy_(inp.view(-1))
|
||||
self.buffer[: inp.numel()].copy_(inp.reshape(-1))
|
||||
|
||||
# Determine which algorithm to use
|
||||
use_multimem = False
|
||||
|
||||
@ -4,7 +4,6 @@ import contextlib
|
||||
import copy
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
@ -810,9 +809,6 @@ class NixlConnectorWorker:
|
||||
self.nixl_backends = vllm_config.kv_transfer_config.get_from_extra_config(
|
||||
"backends", ["UCX"]
|
||||
)
|
||||
# TODO temporary, once nixl allows for telemetry flag in config
|
||||
# (next release), we can remove this env var.
|
||||
os.environ["NIXL_TELEMETRY_ENABLE"] = "1"
|
||||
|
||||
# Agent.
|
||||
non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"]
|
||||
@ -828,10 +824,11 @@ class NixlConnectorWorker:
|
||||
if nixl_agent_config is None:
|
||||
config = None
|
||||
else:
|
||||
# Enable telemetry by default for NIXL 0.7.1 and above.
|
||||
config = (
|
||||
nixl_agent_config(backends=self.nixl_backends)
|
||||
nixl_agent_config(backends=self.nixl_backends, capture_telemetry=True)
|
||||
if len(non_ucx_backends) > 0
|
||||
else nixl_agent_config(num_threads=num_threads)
|
||||
else nixl_agent_config(num_threads=num_threads, capture_telemetry=True)
|
||||
)
|
||||
|
||||
self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), config)
|
||||
|
||||
@ -502,11 +502,6 @@ class EngineArgs:
|
||||
)
|
||||
reasoning_parser: str = StructuredOutputsConfig.reasoning_parser
|
||||
reasoning_parser_plugin: str | None = None
|
||||
# Deprecated guided decoding fields
|
||||
guided_decoding_backend: str | None = None
|
||||
guided_decoding_disable_fallback: bool | None = None
|
||||
guided_decoding_disable_any_whitespace: bool | None = None
|
||||
guided_decoding_disable_additional_properties: bool | None = None
|
||||
|
||||
logits_processor_pattern: str | None = ModelConfig.logits_processor_pattern
|
||||
|
||||
@ -725,19 +720,6 @@ class EngineArgs:
|
||||
"--reasoning-parser-plugin",
|
||||
**structured_outputs_kwargs["reasoning_parser_plugin"],
|
||||
)
|
||||
# Deprecated guided decoding arguments
|
||||
for arg, type in [
|
||||
("--guided-decoding-backend", str),
|
||||
("--guided-decoding-disable-fallback", bool),
|
||||
("--guided-decoding-disable-any-whitespace", bool),
|
||||
("--guided-decoding-disable-additional-properties", bool),
|
||||
]:
|
||||
structured_outputs_group.add_argument(
|
||||
arg,
|
||||
type=type,
|
||||
help=(f"[DEPRECATED] {arg} will be removed in v0.12.0."),
|
||||
deprecated=True,
|
||||
)
|
||||
|
||||
# Parallel arguments
|
||||
parallel_kwargs = get_kwargs(ParallelConfig)
|
||||
@ -855,30 +837,6 @@ class EngineArgs:
|
||||
"--expert-placement-strategy",
|
||||
**parallel_kwargs["expert_placement_strategy"],
|
||||
)
|
||||
parallel_group.add_argument(
|
||||
"--num-redundant-experts",
|
||||
type=int,
|
||||
help="[DEPRECATED] --num-redundant-experts will be removed in v0.12.0.",
|
||||
deprecated=True,
|
||||
)
|
||||
parallel_group.add_argument(
|
||||
"--eplb-window-size",
|
||||
type=int,
|
||||
help="[DEPRECATED] --eplb-window-size will be removed in v0.12.0.",
|
||||
deprecated=True,
|
||||
)
|
||||
parallel_group.add_argument(
|
||||
"--eplb-step-interval",
|
||||
type=int,
|
||||
help="[DEPRECATED] --eplb-step-interval will be removed in v0.12.0.",
|
||||
deprecated=True,
|
||||
)
|
||||
parallel_group.add_argument(
|
||||
"--eplb-log-balancedness",
|
||||
action=argparse.BooleanOptionalAction,
|
||||
help="[DEPRECATED] --eplb-log-balancedness will be removed in v0.12.0.",
|
||||
deprecated=True,
|
||||
)
|
||||
|
||||
parallel_group.add_argument(
|
||||
"--max-parallel-loading-workers",
|
||||
@ -1612,6 +1570,12 @@ class EngineArgs:
|
||||
model_config.skip_tokenizer_init = True
|
||||
logger.info("Skipping tokenizer initialization for tokens-only mode.")
|
||||
|
||||
if self.async_scheduling and not self.disable_nccl_for_dp_synchronization:
|
||||
logger.info(
|
||||
"Disabling NCCL for DP synchronization when using async scheduling."
|
||||
)
|
||||
self.disable_nccl_for_dp_synchronization = True
|
||||
|
||||
# Forward the deprecated CLI args to the EPLB config.
|
||||
if self.num_redundant_experts is not None:
|
||||
self.eplb_config.num_redundant_experts = self.num_redundant_experts
|
||||
@ -1736,21 +1700,6 @@ class EngineArgs:
|
||||
self.reasoning_parser_plugin
|
||||
)
|
||||
|
||||
# Forward the deprecated CLI args to the StructuredOutputsConfig
|
||||
so_config = self.structured_outputs_config
|
||||
if self.guided_decoding_backend is not None:
|
||||
so_config.guided_decoding_backend = self.guided_decoding_backend
|
||||
if self.guided_decoding_disable_fallback is not None:
|
||||
so_config.disable_fallback = self.guided_decoding_disable_fallback
|
||||
if self.guided_decoding_disable_any_whitespace is not None:
|
||||
so_config.disable_any_whitespace = (
|
||||
self.guided_decoding_disable_any_whitespace
|
||||
)
|
||||
if self.guided_decoding_disable_additional_properties is not None:
|
||||
so_config.disable_additional_properties = (
|
||||
self.guided_decoding_disable_additional_properties
|
||||
)
|
||||
|
||||
observability_config = ObservabilityConfig(
|
||||
show_hidden_metrics_for_version=self.show_hidden_metrics_for_version,
|
||||
otlp_traces_endpoint=self.otlp_traces_endpoint,
|
||||
|
||||
@ -652,62 +652,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
|
||||
default=None,
|
||||
description="Additional kwargs for structured outputs",
|
||||
)
|
||||
guided_json: str | dict | BaseModel | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"`guided_json` is deprecated. "
|
||||
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
|
||||
"Please pass `json` to `structured_outputs` instead."
|
||||
),
|
||||
)
|
||||
guided_regex: str | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"`guided_regex` is deprecated. "
|
||||
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
|
||||
"Please pass `regex` to `structured_outputs` instead."
|
||||
),
|
||||
)
|
||||
guided_choice: list[str] | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"`guided_choice` is deprecated. "
|
||||
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
|
||||
"Please pass `choice` to `structured_outputs` instead."
|
||||
),
|
||||
)
|
||||
guided_grammar: str | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"`guided_grammar` is deprecated. "
|
||||
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
|
||||
"Please pass `grammar` to `structured_outputs` instead."
|
||||
),
|
||||
)
|
||||
structural_tag: str | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"`structural_tag` is deprecated. "
|
||||
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
|
||||
"Please pass `structural_tag` to `structured_outputs` instead."
|
||||
),
|
||||
)
|
||||
guided_decoding_backend: str | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"`guided_decoding_backend` is deprecated. "
|
||||
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
|
||||
"Please remove it from your request."
|
||||
),
|
||||
)
|
||||
guided_whitespace_pattern: str | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"`guided_whitespace_pattern` is deprecated. "
|
||||
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
|
||||
"Please pass `whitespace_pattern` to `structured_outputs` instead."
|
||||
),
|
||||
)
|
||||
priority: int = Field(
|
||||
default=0,
|
||||
description=(
|
||||
@ -717,7 +661,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
|
||||
),
|
||||
)
|
||||
request_id: str = Field(
|
||||
default_factory=lambda: f"{random_uuid()}",
|
||||
default_factory=random_uuid,
|
||||
description=(
|
||||
"The request_id related to this request. If the caller does "
|
||||
"not set it, a random_uuid will be generated. This id is used "
|
||||
@ -841,20 +785,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
|
||||
if prompt_logprobs is None and self.echo:
|
||||
prompt_logprobs = self.top_logprobs
|
||||
|
||||
# Forward deprecated guided_* parameters to structured_outputs
|
||||
if self.structured_outputs is None:
|
||||
kwargs = dict[str, Any](
|
||||
json=self.guided_json,
|
||||
regex=self.guided_regex,
|
||||
choice=self.guided_choice,
|
||||
grammar=self.guided_grammar,
|
||||
whitespace_pattern=self.guided_whitespace_pattern,
|
||||
structural_tag=self.structural_tag,
|
||||
)
|
||||
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
||||
if len(kwargs) > 0:
|
||||
self.structured_outputs = StructuredOutputsParams(**kwargs)
|
||||
|
||||
response_format = self.response_format
|
||||
if response_format is not None:
|
||||
# If structured outputs wasn't already enabled,
|
||||
@ -863,24 +793,23 @@ class ChatCompletionRequest(OpenAIBaseModel):
|
||||
self.structured_outputs = StructuredOutputsParams()
|
||||
|
||||
# Set structured output params for response format
|
||||
if response_format is not None:
|
||||
if response_format.type == "json_object":
|
||||
self.structured_outputs.json_object = True
|
||||
elif response_format.type == "json_schema":
|
||||
json_schema = response_format.json_schema
|
||||
assert json_schema is not None
|
||||
self.structured_outputs.json = json_schema.json_schema
|
||||
elif response_format.type == "structural_tag":
|
||||
structural_tag = response_format
|
||||
assert structural_tag is not None and isinstance(
|
||||
structural_tag,
|
||||
(
|
||||
LegacyStructuralTagResponseFormat,
|
||||
StructuralTagResponseFormat,
|
||||
),
|
||||
)
|
||||
s_tag_obj = structural_tag.model_dump(by_alias=True)
|
||||
self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
|
||||
if response_format.type == "json_object":
|
||||
self.structured_outputs.json_object = True
|
||||
elif response_format.type == "json_schema":
|
||||
json_schema = response_format.json_schema
|
||||
assert json_schema is not None
|
||||
self.structured_outputs.json = json_schema.json_schema
|
||||
elif response_format.type == "structural_tag":
|
||||
structural_tag = response_format
|
||||
assert structural_tag is not None and isinstance(
|
||||
structural_tag,
|
||||
(
|
||||
LegacyStructuralTagResponseFormat,
|
||||
StructuralTagResponseFormat,
|
||||
),
|
||||
)
|
||||
s_tag_obj = structural_tag.model_dump(by_alias=True)
|
||||
self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
|
||||
|
||||
extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
|
||||
if self.kv_transfer_params:
|
||||
@ -1140,58 +1069,6 @@ class CompletionRequest(OpenAIBaseModel):
|
||||
default=None,
|
||||
description="Additional kwargs for structured outputs",
|
||||
)
|
||||
guided_json: str | dict | BaseModel | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"`guided_json` is deprecated. "
|
||||
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
|
||||
"Please pass `json` to `structured_outputs` instead."
|
||||
),
|
||||
)
|
||||
guided_regex: str | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"`guided_regex` is deprecated. "
|
||||
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
|
||||
"Please pass `regex` to `structured_outputs` instead."
|
||||
),
|
||||
)
|
||||
guided_choice: list[str] | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"`guided_choice` is deprecated. "
|
||||
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
|
||||
"Please pass `choice` to `structured_outputs` instead."
|
||||
),
|
||||
)
|
||||
guided_grammar: str | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"`guided_grammar` is deprecated. "
|
||||
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
|
||||
"Please pass `grammar` to `structured_outputs` instead."
|
||||
),
|
||||
)
|
||||
structural_tag: str | None = Field(
|
||||
default=None,
|
||||
description=("If specified, the output will follow the structural tag schema."),
|
||||
)
|
||||
guided_decoding_backend: str | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"`guided_decoding_backend` is deprecated. "
|
||||
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
|
||||
"Please remove it from your request."
|
||||
),
|
||||
)
|
||||
guided_whitespace_pattern: str | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"`guided_whitespace_pattern` is deprecated. "
|
||||
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
|
||||
"Please pass `whitespace_pattern` to `structured_outputs` instead."
|
||||
),
|
||||
)
|
||||
priority: int = Field(
|
||||
default=0,
|
||||
description=(
|
||||
@ -1201,7 +1078,7 @@ class CompletionRequest(OpenAIBaseModel):
|
||||
),
|
||||
)
|
||||
request_id: str = Field(
|
||||
default_factory=lambda: f"{random_uuid()}",
|
||||
default_factory=random_uuid,
|
||||
description=(
|
||||
"The request_id related to this request. If the caller does "
|
||||
"not set it, a random_uuid will be generated. This id is used "
|
||||
@ -1336,35 +1213,31 @@ class CompletionRequest(OpenAIBaseModel):
|
||||
|
||||
echo_without_generation = self.echo and self.max_tokens == 0
|
||||
|
||||
guided_json_object = None
|
||||
if self.response_format is not None:
|
||||
if self.response_format.type == "json_object":
|
||||
guided_json_object = True
|
||||
elif self.response_format.type == "json_schema":
|
||||
json_schema = self.response_format.json_schema
|
||||
response_format = self.response_format
|
||||
if response_format is not None:
|
||||
# If structured outputs wasn't already enabled,
|
||||
# we must enable it for these features to work
|
||||
if self.structured_outputs is None:
|
||||
self.structured_outputs = StructuredOutputsParams()
|
||||
|
||||
# Set structured output params for response format
|
||||
if response_format.type == "json_object":
|
||||
self.structured_outputs.json_object = True
|
||||
elif response_format.type == "json_schema":
|
||||
json_schema = response_format.json_schema
|
||||
assert json_schema is not None
|
||||
self.guided_json = json_schema.json_schema
|
||||
elif self.response_format.type == "structural_tag":
|
||||
structural_tag = self.response_format
|
||||
self.structured_outputs.json = json_schema.json_schema
|
||||
elif response_format.type == "structural_tag":
|
||||
structural_tag = response_format
|
||||
assert structural_tag is not None and isinstance(
|
||||
structural_tag, StructuralTagResponseFormat
|
||||
structural_tag,
|
||||
(
|
||||
LegacyStructuralTagResponseFormat,
|
||||
StructuralTagResponseFormat,
|
||||
),
|
||||
)
|
||||
s_tag_obj = structural_tag.model_dump(by_alias=True)
|
||||
self.structural_tag = json.dumps(s_tag_obj)
|
||||
|
||||
# Forward deprecated guided_* parameters to structured_outputs
|
||||
if self.structured_outputs is None:
|
||||
kwargs = dict[str, Any](
|
||||
json=self.guided_json,
|
||||
json_object=guided_json_object,
|
||||
regex=self.guided_regex,
|
||||
choice=self.guided_choice,
|
||||
grammar=self.guided_grammar,
|
||||
whitespace_pattern=self.guided_whitespace_pattern,
|
||||
)
|
||||
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
||||
if len(kwargs) > 0:
|
||||
self.structured_outputs = StructuredOutputsParams(**kwargs)
|
||||
self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
|
||||
|
||||
extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
|
||||
if self.kv_transfer_params:
|
||||
@ -1502,7 +1375,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
|
||||
),
|
||||
)
|
||||
request_id: str = Field(
|
||||
default_factory=lambda: f"{random_uuid()}",
|
||||
default_factory=random_uuid,
|
||||
description=(
|
||||
"The request_id related to this request. If the caller does "
|
||||
"not set it, a random_uuid will be generated. This id is used "
|
||||
@ -1597,7 +1470,7 @@ class EmbeddingChatRequest(OpenAIBaseModel):
|
||||
),
|
||||
)
|
||||
request_id: str = Field(
|
||||
default_factory=lambda: f"{random_uuid()}",
|
||||
default_factory=random_uuid,
|
||||
description=(
|
||||
"The request_id related to this request. If the caller does "
|
||||
"not set it, a random_uuid will be generated. This id is used "
|
||||
@ -2019,7 +1892,7 @@ class ClassificationCompletionRequest(OpenAIBaseModel):
|
||||
),
|
||||
)
|
||||
request_id: str = Field(
|
||||
default_factory=lambda: f"{random_uuid()}",
|
||||
default_factory=random_uuid,
|
||||
description=(
|
||||
"The request_id related to this request. If the caller does "
|
||||
"not set it, a random_uuid will be generated. This id is used "
|
||||
@ -2110,7 +1983,7 @@ class ClassificationChatRequest(OpenAIBaseModel):
|
||||
)
|
||||
|
||||
request_id: str = Field(
|
||||
default_factory=lambda: f"{random_uuid()}",
|
||||
default_factory=random_uuid,
|
||||
description=(
|
||||
"The request_id related to this request. If the caller does "
|
||||
"not set it, a random_uuid will be generated. This id is used "
|
||||
@ -3221,7 +3094,7 @@ class TranslationResponseVerbose(OpenAIBaseModel):
|
||||
####### Tokens IN <> Tokens OUT #######
|
||||
class GenerateRequest(BaseModel):
|
||||
request_id: str = Field(
|
||||
default_factory=lambda: f"{random_uuid()}",
|
||||
default_factory=random_uuid,
|
||||
description=(
|
||||
"The request_id related to this request. If the caller does "
|
||||
"not set it, a random_uuid will be generated. This id is used "
|
||||
@ -3278,7 +3151,7 @@ class GenerateResponseChoice(BaseModel):
|
||||
|
||||
class GenerateResponse(BaseModel):
|
||||
request_id: str = Field(
|
||||
default_factory=lambda: f"{random_uuid()}",
|
||||
default_factory=random_uuid,
|
||||
description=(
|
||||
"The request_id related to this request. If the caller does "
|
||||
"not set it, a random_uuid will be generated. This id is used "
|
||||
|
||||
@ -273,6 +273,11 @@ class OpenAIServingChat(OpenAIServing):
|
||||
try:
|
||||
for i, engine_prompt in enumerate(engine_prompts):
|
||||
prompt_text, _, _ = self._get_prompt_components(request_prompts[i])
|
||||
# If we are creating sub requests for multiple prompts, ensure that they
|
||||
# have unique request ids.
|
||||
sub_request_id = (
|
||||
request_id if len(engine_prompts) == 1 else f"{request_id}_{i}"
|
||||
)
|
||||
|
||||
if self.default_sampling_params is None:
|
||||
self.default_sampling_params = {}
|
||||
@ -301,7 +306,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
)
|
||||
|
||||
self._log_inputs(
|
||||
request_id,
|
||||
sub_request_id,
|
||||
request_prompts[i],
|
||||
params=sampling_params,
|
||||
lora_request=lora_request,
|
||||
@ -316,14 +321,14 @@ class OpenAIServingChat(OpenAIServing):
|
||||
if isinstance(sampling_params, BeamSearchParams):
|
||||
generator = self.beam_search(
|
||||
prompt=engine_prompt,
|
||||
request_id=request_id,
|
||||
request_id=sub_request_id,
|
||||
params=sampling_params,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
)
|
||||
else:
|
||||
engine_request, tokenization_kwargs = await self._process_inputs(
|
||||
request_id,
|
||||
sub_request_id,
|
||||
engine_prompt,
|
||||
sampling_params,
|
||||
lora_request=lora_request,
|
||||
@ -334,7 +339,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
generator = self.engine_client.generate(
|
||||
engine_request,
|
||||
sampling_params,
|
||||
request_id,
|
||||
sub_request_id,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
priority=request.priority,
|
||||
|
||||
@ -1242,16 +1242,19 @@ class OpenAIServing:
|
||||
):
|
||||
prompt_text, _, _ = self._get_prompt_components(request_prompt)
|
||||
orig_priority = priority
|
||||
sub_request = 0
|
||||
while True:
|
||||
# Ensure that each sub-request has a unique request id.
|
||||
sub_request_id = f"{request_id}_{sub_request}"
|
||||
self._log_inputs(
|
||||
request_id,
|
||||
sub_request_id,
|
||||
request_prompt,
|
||||
params=sampling_params,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
trace_headers = kwargs.get("trace_headers")
|
||||
engine_request, tokenization_kwargs = await self._process_inputs(
|
||||
request_id,
|
||||
sub_request_id,
|
||||
engine_prompt,
|
||||
sampling_params,
|
||||
lora_request=lora_request,
|
||||
@ -1262,7 +1265,7 @@ class OpenAIServing:
|
||||
generator = self.engine_client.generate(
|
||||
engine_request,
|
||||
sampling_params,
|
||||
request_id,
|
||||
sub_request_id,
|
||||
lora_request=lora_request,
|
||||
priority=priority,
|
||||
prompt_text=prompt_text,
|
||||
@ -1295,6 +1298,7 @@ class OpenAIServing:
|
||||
sampling_params.max_tokens = self.max_model_len - len(prompt_token_ids)
|
||||
# OPTIMIZATION
|
||||
priority = orig_priority - 1
|
||||
sub_request += 1
|
||||
|
||||
def _get_prompt_components(
|
||||
self,
|
||||
@ -1345,11 +1349,12 @@ class OpenAIServing:
|
||||
raw_request: Request | None, default: str | None = None
|
||||
) -> str | None:
|
||||
"""Pulls the request id to use from a header, if provided"""
|
||||
default = default or random_uuid()
|
||||
if raw_request is None:
|
||||
return default
|
||||
if raw_request is not None and (
|
||||
(req_id := raw_request.headers.get("X-Request-Id")) is not None
|
||||
):
|
||||
return req_id
|
||||
|
||||
return raw_request.headers.get("X-Request-Id", default)
|
||||
return random_uuid() if default is None else default
|
||||
|
||||
@staticmethod
|
||||
def _get_data_parallel_rank(raw_request: Request | None) -> int | None:
|
||||
|
||||
@ -201,10 +201,10 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
self.engine_client.generate(
|
||||
prompt,
|
||||
sampling_params,
|
||||
request_id,
|
||||
f"{request_id}_{i}",
|
||||
lora_request=lora_request,
|
||||
)
|
||||
for prompt in prompts
|
||||
for i, prompt in enumerate(prompts)
|
||||
]
|
||||
except ValueError as e:
|
||||
# TODO: Use a vllm-specific Validation Error
|
||||
|
||||
@ -812,19 +812,19 @@ def override_envs_for_invariance():
|
||||
# "TRITON_MLA",
|
||||
]
|
||||
if curr_attn_backend not in supported_backends:
|
||||
warning = (
|
||||
"Forcibly updating attention backend to"
|
||||
f" {supported_backends[0]} for batch_invariant. "
|
||||
f" Supported backends: {supported_backends}."
|
||||
error = (
|
||||
"VLLM batch_invariant mode requires an attention backend in "
|
||||
f"{supported_backends}, but got '{curr_attn_backend}'. "
|
||||
"Please set the 'VLLM_ATTENTION_BACKEND' environment variable "
|
||||
"to one of the supported backends before enabling batch_invariant."
|
||||
)
|
||||
logger.warning_once(warning)
|
||||
os.environ["VLLM_ATTENTION_BACKEND"] = supported_backends[0]
|
||||
raise RuntimeError(error)
|
||||
if os.environ["VLLM_ATTENTION_BACKEND"] != supported_backends[0]:
|
||||
warning = (
|
||||
"You are using a decode-invariant form of batch invariance. "
|
||||
"This will not be invariant between prefill and decode."
|
||||
)
|
||||
logger.warning_once(warning)
|
||||
logger.warning_once(warning, scope="local")
|
||||
os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
|
||||
|
||||
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
|
||||
|
||||
@ -6,22 +6,7 @@ import torch
|
||||
from torch.nn import functional as F
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
|
||||
|
||||
def silu_and_mul(x: torch.Tensor) -> torch.Tensor:
|
||||
d = x.shape[-1] // 2
|
||||
return F.silu(x[..., :d]) * x[..., d:]
|
||||
|
||||
|
||||
def swigluoai_and_mul(
|
||||
x: torch.Tensor, alpha: float = 1.702, limit: float = 7.0
|
||||
) -> torch.Tensor:
|
||||
d = x.shape[-1] // 2
|
||||
gate, up = x[..., :d], x[..., d:]
|
||||
gate = gate.clamp(max=limit)
|
||||
up = up.clamp(min=-limit, max=limit)
|
||||
glu = gate * torch.sigmoid(alpha * gate)
|
||||
return (up + 1) * glu
|
||||
from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul
|
||||
|
||||
|
||||
def grouped_topk(
|
||||
@ -227,6 +212,11 @@ class CPUFusedMOE:
|
||||
layer.w13_weight = torch.nn.Parameter(torch.empty(0), requires_grad=False)
|
||||
layer.w2_weight = torch.nn.Parameter(torch.empty(0), requires_grad=False)
|
||||
|
||||
self.act_to_impl = {
|
||||
"silu": SiluAndMul(),
|
||||
"swigluoai": SwigluOAIAndMul(),
|
||||
}
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
@ -246,7 +236,7 @@ class CPUFusedMOE:
|
||||
apply_router_weight_on_input: bool = False,
|
||||
activation: str = "silu",
|
||||
) -> torch.Tensor:
|
||||
assert activation in {"silu", "swigluoai"}, f"{activation} is not supported."
|
||||
assert activation in self.act_to_impl, f"{activation} is not supported."
|
||||
assert not apply_router_weight_on_input
|
||||
topk_weights, topk_ids = select_experts(
|
||||
hidden_states=x,
|
||||
@ -283,10 +273,7 @@ class CPUFusedMOE:
|
||||
tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
|
||||
|
||||
gate_up = layer.gate_up_linear[i](tokens_for_this_expert)
|
||||
if activation == "swigluoai":
|
||||
gate_up = swigluoai_and_mul(gate_up)
|
||||
else:
|
||||
gate_up = silu_and_mul(gate_up)
|
||||
gate_up = self.act_to_impl[activation].forward_native(gate_up)
|
||||
expert_out = layer.down_linear[i](gate_up)
|
||||
outputs.append(expert_out)
|
||||
start_idx = end_idx
|
||||
|
||||
@ -90,10 +90,14 @@ class FusedMoEMethodBase(QuantizeMethodBase):
|
||||
def allow_inplace(self) -> bool:
|
||||
return False
|
||||
|
||||
@property
|
||||
def method_name(self) -> str:
|
||||
return self.__class__.__name__
|
||||
|
||||
@abstractmethod
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
|
||||
@ -66,6 +66,10 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
|
||||
def allow_inplace(self) -> bool:
|
||||
return self.old_quant_method.allow_inplace
|
||||
|
||||
@property
|
||||
def method_name(self) -> str:
|
||||
return self.old_quant_method.method_name
|
||||
|
||||
def create_weights(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
@ -84,7 +88,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -105,42 +109,9 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
# Is getattr needed?
|
||||
zero_expert_num = getattr(layer, "zero_expert_num", 0)
|
||||
zero_expert_type = getattr(layer, "zero_expert_type", None)
|
||||
|
||||
if enable_eplb:
|
||||
if self.supports_eplb:
|
||||
assert expert_load_view is not None
|
||||
assert logical_to_physical_map is not None
|
||||
assert logical_replica_count is not None
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"EPLB is not supported for "
|
||||
f"{self.old_quant_method.__class__.__name__}."
|
||||
)
|
||||
|
||||
topk_weights, topk_ids, zero_expert_result = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
enable_eplb=enable_eplb,
|
||||
expert_map=expert_map,
|
||||
expert_load_view=expert_load_view,
|
||||
logical_to_physical_map=logical_to_physical_map,
|
||||
logical_replica_count=logical_replica_count,
|
||||
global_num_experts=global_num_experts,
|
||||
zero_expert_num=zero_expert_num,
|
||||
zero_expert_type=zero_expert_type,
|
||||
)
|
||||
|
||||
result = self.fused_experts(
|
||||
@ -156,7 +127,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
|
||||
expert_map=None if self.disable_expert_map else expert_map,
|
||||
)
|
||||
|
||||
if zero_expert_num != 0 and zero_expert_type is not None:
|
||||
if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
|
||||
assert not isinstance(result, tuple), (
|
||||
"Shared + zero experts are mutually exclusive not yet supported"
|
||||
)
|
||||
|
||||
@ -1510,30 +1510,11 @@ class FusedMoE(CustomOp):
|
||||
logits_shape, dtype=moe.in_dtype, device=torch.cuda.current_device()
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def select_experts(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
use_grouped_topk: bool,
|
||||
renormalize: bool,
|
||||
topk_group: int | None = None,
|
||||
num_expert_group: int | None = None,
|
||||
custom_routing_function: Callable | None = None,
|
||||
scoring_func: str = "softmax",
|
||||
routed_scaling_factor: float = 1.0,
|
||||
e_score_correction_bias: torch.Tensor | None = None,
|
||||
indices_type: torch.dtype | None = None,
|
||||
enable_eplb: bool = False,
|
||||
expert_map: torch.Tensor | None = None,
|
||||
expert_load_view: torch.Tensor | None = None,
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
global_num_experts: int | None = None,
|
||||
zero_expert_num: int | None = None,
|
||||
zero_expert_type: str | None = None,
|
||||
num_fused_shared_experts: int = 0,
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||
"""
|
||||
Route the input hidden states to the top-k experts based on the
|
||||
router logits.
|
||||
@ -1552,6 +1533,27 @@ class FusedMoE(CustomOp):
|
||||
fused_topk_bias,
|
||||
)
|
||||
|
||||
if self.enable_eplb:
|
||||
if self.quant_method.supports_eplb:
|
||||
if self.expert_load_view is None:
|
||||
raise ValueError(
|
||||
"enable_eplb=True requiere expert_load_view != None"
|
||||
)
|
||||
if self.logical_to_physical_map is None:
|
||||
raise ValueError(
|
||||
"enable_eplb=True requiere logical_to_physical_map != None"
|
||||
)
|
||||
if self.logical_replica_count is None:
|
||||
raise ValueError(
|
||||
"enable_eplb=True requiere logical_replica_count != None"
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"EPLB is not supported for {self.quant_method.method_name}."
|
||||
)
|
||||
|
||||
indices_type = self.quant_method.topk_indices_dtype
|
||||
|
||||
# Check if we should use a routing simulation strategy
|
||||
routing_strategy = envs.VLLM_MOE_ROUTING_SIMULATION_STRATEGY
|
||||
if routing_strategy != "":
|
||||
@ -1559,20 +1561,20 @@ class FusedMoE(CustomOp):
|
||||
hidden_states=hidden_states,
|
||||
router_logits=router_logits,
|
||||
strategy_name=routing_strategy,
|
||||
top_k=top_k,
|
||||
top_k=self.top_k,
|
||||
indices_type=indices_type,
|
||||
)
|
||||
|
||||
# DeepSeekv2 uses grouped_top_k
|
||||
elif use_grouped_topk:
|
||||
assert topk_group is not None
|
||||
assert num_expert_group is not None
|
||||
elif self.use_grouped_topk:
|
||||
assert self.topk_group is not None
|
||||
assert self.num_expert_group is not None
|
||||
if rocm_aiter_ops.is_fused_moe_enabled():
|
||||
if not rocm_aiter_ops.is_fusion_moe_shared_experts_enabled():
|
||||
assert num_fused_shared_experts == 0
|
||||
assert self.num_fused_shared_experts == 0
|
||||
grouped_topk_impl = partial(
|
||||
rocm_aiter_grouped_topk,
|
||||
num_fused_shared_experts=num_fused_shared_experts,
|
||||
num_fused_shared_experts=self.num_fused_shared_experts,
|
||||
)
|
||||
else:
|
||||
grouped_topk_impl = grouped_topk
|
||||
@ -1580,50 +1582,46 @@ class FusedMoE(CustomOp):
|
||||
topk_weights, topk_ids = grouped_topk_impl(
|
||||
hidden_states=hidden_states,
|
||||
gating_output=router_logits,
|
||||
topk=top_k,
|
||||
renormalize=renormalize,
|
||||
num_expert_group=num_expert_group,
|
||||
topk_group=topk_group,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
topk=self.top_k,
|
||||
renormalize=self.renormalize,
|
||||
num_expert_group=self.num_expert_group,
|
||||
topk_group=self.topk_group,
|
||||
scoring_func=self.scoring_func,
|
||||
routed_scaling_factor=self.routed_scaling_factor,
|
||||
e_score_correction_bias=self.e_score_correction_bias,
|
||||
)
|
||||
elif e_score_correction_bias is not None:
|
||||
elif self.e_score_correction_bias is not None:
|
||||
topk_weights, topk_ids = fused_topk_bias(
|
||||
hidden_states=hidden_states,
|
||||
gating_output=router_logits,
|
||||
e_score_correction_bias=e_score_correction_bias.data,
|
||||
topk=top_k,
|
||||
renormalize=renormalize,
|
||||
e_score_correction_bias=self.e_score_correction_bias.data,
|
||||
topk=self.top_k,
|
||||
renormalize=self.renormalize,
|
||||
)
|
||||
if routed_scaling_factor != 1.0:
|
||||
topk_weights *= routed_scaling_factor
|
||||
elif custom_routing_function is None:
|
||||
if self.routed_scaling_factor != 1.0:
|
||||
topk_weights *= self.routed_scaling_factor
|
||||
elif self.custom_routing_function is None:
|
||||
topk_weights, topk_ids, token_expert_indices = fused_topk(
|
||||
hidden_states=hidden_states,
|
||||
gating_output=router_logits,
|
||||
topk=top_k,
|
||||
renormalize=renormalize,
|
||||
topk=self.top_k,
|
||||
renormalize=self.renormalize,
|
||||
indices_type=indices_type,
|
||||
)
|
||||
else:
|
||||
topk_weights, topk_ids = custom_routing_function(
|
||||
topk_weights, topk_ids = self.custom_routing_function(
|
||||
hidden_states=hidden_states,
|
||||
gating_output=router_logits,
|
||||
topk=top_k,
|
||||
renormalize=renormalize,
|
||||
topk=self.top_k,
|
||||
renormalize=self.renormalize,
|
||||
)
|
||||
|
||||
if enable_eplb:
|
||||
assert expert_load_view is not None
|
||||
assert logical_to_physical_map is not None
|
||||
assert logical_replica_count is not None
|
||||
|
||||
if self.enable_eplb:
|
||||
topk_ids = eplb_map_to_physical_and_record(
|
||||
topk_ids=topk_ids,
|
||||
expert_load_view=expert_load_view,
|
||||
logical_to_physical_map=logical_to_physical_map,
|
||||
logical_replica_count=logical_replica_count,
|
||||
expert_load_view=self.expert_load_view,
|
||||
logical_to_physical_map=self.logical_to_physical_map,
|
||||
logical_replica_count=self.logical_replica_count,
|
||||
)
|
||||
|
||||
if (indices_type is not None) and topk_ids.dtype != indices_type:
|
||||
@ -1633,16 +1631,16 @@ class FusedMoE(CustomOp):
|
||||
|
||||
# Compute zero expert result if needed
|
||||
if (
|
||||
zero_expert_num is not None
|
||||
and zero_expert_num > 0
|
||||
and zero_expert_type is not None
|
||||
and global_num_experts is not None
|
||||
self.zero_expert_num is not None
|
||||
and self.zero_expert_num > 0
|
||||
and self.zero_expert_type is not None
|
||||
and self.global_num_experts is not None
|
||||
):
|
||||
zero_expert_result = zero_experts_compute_triton(
|
||||
expert_indices=topk_ids,
|
||||
expert_scales=topk_weights,
|
||||
num_experts=global_num_experts,
|
||||
zero_expert_type=zero_expert_type,
|
||||
num_experts=self.global_num_experts,
|
||||
zero_expert_type=self.zero_expert_type,
|
||||
hidden_states=hidden_states,
|
||||
)
|
||||
else:
|
||||
@ -1692,6 +1690,10 @@ class FusedMoE(CustomOp):
|
||||
)
|
||||
|
||||
def reduce_output(states: torch.Tensor) -> torch.Tensor:
|
||||
# Slice before all_reduce to enable possible fusion
|
||||
if self.hidden_size != og_hidden_states:
|
||||
states = states[..., :og_hidden_states]
|
||||
|
||||
if (
|
||||
not self.is_sequence_parallel
|
||||
and not self.use_dp_chunking
|
||||
@ -1714,11 +1716,12 @@ class FusedMoE(CustomOp):
|
||||
if self.zero_expert_num is not None and self.zero_expert_num > 0:
|
||||
assert isinstance(fused_output, tuple)
|
||||
fused_output, zero_expert_result = fused_output
|
||||
return (reduce_output(fused_output) + zero_expert_result)[
|
||||
..., :og_hidden_states
|
||||
]
|
||||
return (
|
||||
reduce_output(fused_output)
|
||||
+ zero_expert_result[..., :og_hidden_states]
|
||||
)
|
||||
else:
|
||||
return reduce_output(fused_output)[..., :og_hidden_states]
|
||||
return reduce_output(fused_output)
|
||||
else:
|
||||
if current_platform.is_tpu():
|
||||
# TODO: Once the OOM issue for the TPU backend is resolved, we
|
||||
@ -1731,8 +1734,8 @@ class FusedMoE(CustomOp):
|
||||
hidden_states, router_logits, self.layer_name
|
||||
)
|
||||
return (
|
||||
reduce_output(shared_output)[..., :og_hidden_states],
|
||||
reduce_output(fused_output)[..., :og_hidden_states],
|
||||
reduce_output(shared_output),
|
||||
reduce_output(fused_output),
|
||||
)
|
||||
|
||||
def forward_cuda(
|
||||
|
||||
@ -331,7 +331,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
|
||||
|
||||
def forward_cuda(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
|
||||
x: torch.Tensor,
|
||||
use_grouped_topk: bool,
|
||||
top_k: int,
|
||||
@ -352,31 +352,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
zero_expert_num = getattr(layer, "zero_expert_num", 0)
|
||||
zero_expert_type = getattr(layer, "zero_expert_type", None)
|
||||
|
||||
topk_weights, topk_ids, zero_expert_result = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
enable_eplb=enable_eplb,
|
||||
expert_map=expert_map,
|
||||
expert_load_view=expert_load_view,
|
||||
logical_to_physical_map=logical_to_physical_map,
|
||||
logical_replica_count=logical_replica_count,
|
||||
global_num_experts=global_num_experts,
|
||||
zero_expert_num=zero_expert_num,
|
||||
zero_expert_type=zero_expert_type,
|
||||
num_fused_shared_experts=layer.num_fused_shared_experts,
|
||||
)
|
||||
|
||||
if self.rocm_aiter_moe_enabled:
|
||||
@ -415,7 +393,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
|
||||
expert_map=expert_map,
|
||||
)
|
||||
|
||||
if zero_expert_num != 0 and zero_expert_type is not None:
|
||||
if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
|
||||
assert not isinstance(result, tuple), (
|
||||
"Shared + zero experts are mutually exclusive not yet supported"
|
||||
)
|
||||
@ -425,7 +403,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
|
||||
|
||||
def forward_cpu(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
|
||||
x: torch.Tensor,
|
||||
use_grouped_topk: bool,
|
||||
top_k: int,
|
||||
@ -474,7 +452,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
|
||||
|
||||
def forward_xpu(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
|
||||
x: torch.Tensor,
|
||||
use_grouped_topk: bool,
|
||||
top_k: int,
|
||||
@ -515,7 +493,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
|
||||
|
||||
def forward_tpu(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
|
||||
x: torch.Tensor,
|
||||
use_grouped_topk: bool,
|
||||
top_k: int,
|
||||
|
||||
@ -597,7 +597,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -618,24 +618,11 @@ class AWQMoEMethod(FusedMoEMethodBase):
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
if enable_eplb:
|
||||
raise NotImplementedError("EPLB not supported for `AWQMoEMethod` yet.")
|
||||
|
||||
assert activation == "silu", "Only SiLU activation is supported."
|
||||
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
)
|
||||
|
||||
return fused_marlin_moe(
|
||||
|
||||
@ -495,7 +495,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -518,25 +518,11 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
from vllm.model_executor.layers.fused_moe import fused_experts
|
||||
|
||||
if enable_eplb:
|
||||
raise NotImplementedError(
|
||||
"EPLB not supported for `BitsAndBytesMoEMethod` yet."
|
||||
)
|
||||
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
)
|
||||
# TODO(bnell): Do these need to be called on the hot path?
|
||||
if self.quant_config.load_in_8bit:
|
||||
w13, w2 = self._apply_8bit_dequant(layer)
|
||||
else:
|
||||
|
||||
@ -511,7 +511,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -532,16 +532,17 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
if enable_eplb:
|
||||
raise NotImplementedError(
|
||||
"EPLB not supported for `CompressedTensorsW4A4MoeMethod` yet."
|
||||
)
|
||||
assert activation == "silu", "Only SiLU activation is supported."
|
||||
|
||||
if (
|
||||
self.allow_flashinfer
|
||||
and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
|
||||
):
|
||||
if enable_eplb:
|
||||
raise NotImplementedError(
|
||||
"EPLB not supported for `CompressedTensorsW4A4MoeMethod` yet."
|
||||
)
|
||||
|
||||
return flashinfer_trtllm_fp4_moe(
|
||||
layer=layer,
|
||||
x=x,
|
||||
@ -554,19 +555,9 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
)
|
||||
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
)
|
||||
|
||||
if self.use_marlin:
|
||||
@ -1109,7 +1100,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -1130,31 +1121,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
if enable_eplb:
|
||||
assert expert_load_view is not None
|
||||
assert logical_to_physical_map is not None
|
||||
assert logical_replica_count is not None
|
||||
assert isinstance(layer, FusedMoE)
|
||||
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
num_fused_shared_experts=layer.num_fused_shared_experts,
|
||||
enable_eplb=enable_eplb,
|
||||
expert_map=expert_map,
|
||||
expert_load_view=expert_load_view,
|
||||
logical_to_physical_map=logical_to_physical_map,
|
||||
logical_replica_count=logical_replica_count,
|
||||
)
|
||||
|
||||
per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN
|
||||
@ -1377,7 +1346,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -1398,26 +1367,11 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
if enable_eplb:
|
||||
raise NotImplementedError(
|
||||
"EPLB not supported for `CompressedTensorsW8A8Int8MoEMethod` yet."
|
||||
)
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import fused_experts
|
||||
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
)
|
||||
|
||||
return fused_experts(
|
||||
@ -1738,7 +1692,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -1759,26 +1713,11 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
if enable_eplb:
|
||||
raise NotImplementedError(
|
||||
"EPLB not supported for `CompressedTensorsWNA16MarlinMoEMethod` yet."
|
||||
)
|
||||
|
||||
assert activation == "silu", f"{activation} not supported for Marlin MoE."
|
||||
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
)
|
||||
|
||||
return fused_marlin_moe(
|
||||
@ -2001,7 +1940,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -2022,43 +1961,11 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
if enable_eplb:
|
||||
if expert_load_view is None:
|
||||
raise ValueError("enable_eplb=True requiere expert_load_view != None")
|
||||
if logical_to_physical_map is None:
|
||||
raise ValueError(
|
||||
"enable_eplb=True requiere logical_to_physical_map != None"
|
||||
)
|
||||
if logical_replica_count is None:
|
||||
raise ValueError(
|
||||
"enable_eplb=True requiere logical_replica_count != None"
|
||||
)
|
||||
if not isinstance(layer, FusedMoE):
|
||||
raise TypeError(
|
||||
"EPLB is only supported when `layer` is a instance of FusedMoE."
|
||||
)
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import fused_experts
|
||||
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
num_fused_shared_experts=getattr(layer, "num_fused_shared_experts", 0),
|
||||
enable_eplb=enable_eplb,
|
||||
expert_map=expert_map,
|
||||
expert_load_view=expert_load_view,
|
||||
logical_to_physical_map=logical_to_physical_map,
|
||||
logical_replica_count=logical_replica_count,
|
||||
)
|
||||
|
||||
return fused_experts(
|
||||
|
||||
@ -137,7 +137,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -158,26 +158,11 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
if enable_eplb:
|
||||
raise NotImplementedError(
|
||||
"EPLB not supported for `ExpertsInt8MoEMethod` yet."
|
||||
)
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import fused_experts
|
||||
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
)
|
||||
|
||||
return fused_experts(
|
||||
|
||||
@ -28,6 +28,7 @@ from vllm.model_executor.layers.fused_moe import (
|
||||
FusedMoeWeightScaleSupported,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
FusedMoEParallelConfig,
|
||||
FusedMoEQuantConfig,
|
||||
RoutingMethodType,
|
||||
fp8_w8a8_moe_quant_config,
|
||||
@ -118,7 +119,9 @@ class Fp8MoeBackend(Enum):
|
||||
TRITON = 6
|
||||
|
||||
|
||||
def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
|
||||
def get_fp8_moe_backend(
|
||||
block_quant: bool, moe_parallel_config: FusedMoEParallelConfig
|
||||
) -> Fp8MoeBackend:
|
||||
"""
|
||||
Select the primary FP8 MoE backend
|
||||
Note: Shape-specific fallbacks may still occur at runtime.
|
||||
@ -159,8 +162,19 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
|
||||
logger.info_once("Using Marlin backend for FP8 MoE")
|
||||
return Fp8MoeBackend.MARLIN
|
||||
|
||||
# deepGEMM on supported platforms with block-quantized weights
|
||||
if envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM and block_quant:
|
||||
# Determine if we should use DeepGEMM with block-quantized weights:
|
||||
# - If explicitly set by user, respect their choice
|
||||
# - If not explicitly set (default), disable when TP size is >= 8
|
||||
moe_use_deep_gemm = envs.VLLM_MOE_USE_DEEP_GEMM
|
||||
if not envs.is_set("VLLM_MOE_USE_DEEP_GEMM") and moe_parallel_config.tp_size >= 8:
|
||||
moe_use_deep_gemm = False
|
||||
logger.info_once(
|
||||
"DeepGEMM MoE is disabled by default when TP size is >= 8. "
|
||||
"Set VLLM_MOE_USE_DEEP_GEMM=1 to enable it.",
|
||||
scope="local",
|
||||
)
|
||||
|
||||
if envs.VLLM_USE_DEEP_GEMM and moe_use_deep_gemm and block_quant:
|
||||
if not has_deep_gemm():
|
||||
logger.warning_once(
|
||||
"DeepGEMM backend requested but not available.", scope="local"
|
||||
@ -641,7 +655,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
||||
self.quant_config = quant_config
|
||||
self.weight_block_size = self.quant_config.weight_block_size
|
||||
self.block_quant: bool = self.weight_block_size is not None
|
||||
self.fp8_backend = get_fp8_moe_backend(self.block_quant)
|
||||
self.fp8_backend = get_fp8_moe_backend(
|
||||
self.block_quant, layer.moe_parallel_config
|
||||
)
|
||||
|
||||
self.use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN
|
||||
self.flashinfer_moe_backend: FlashinferMoeBackend | None = None
|
||||
@ -1140,7 +1156,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -1216,31 +1232,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
||||
apply_router_weight_on_input=apply_router_weight_on_input,
|
||||
)
|
||||
|
||||
zero_expert_num = getattr(layer, "zero_expert_num", 0)
|
||||
zero_expert_type = getattr(layer, "zero_expert_type", None)
|
||||
|
||||
select_result = FusedMoE.select_experts(
|
||||
select_result = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
enable_eplb=enable_eplb,
|
||||
expert_map=expert_map,
|
||||
expert_load_view=expert_load_view,
|
||||
logical_to_physical_map=logical_to_physical_map,
|
||||
logical_replica_count=logical_replica_count,
|
||||
global_num_experts=global_num_experts,
|
||||
zero_expert_num=zero_expert_num,
|
||||
zero_expert_type=zero_expert_type,
|
||||
num_fused_shared_experts=layer.num_fused_shared_experts,
|
||||
)
|
||||
|
||||
topk_weights, topk_ids, zero_expert_result = select_result
|
||||
@ -1322,7 +1316,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
||||
self.allow_cutlass_block_scaled_grouped_gemm
|
||||
),
|
||||
)
|
||||
if zero_expert_num != 0 and zero_expert_type is not None:
|
||||
|
||||
if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
|
||||
assert not isinstance(result, tuple), (
|
||||
"Shared + zero experts are mutually exclusive not yet supported"
|
||||
)
|
||||
|
||||
@ -621,7 +621,7 @@ class GGUFMoEMethod(FusedMoEMethodBase):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -642,9 +642,6 @@ class GGUFMoEMethod(FusedMoEMethodBase):
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
if enable_eplb:
|
||||
raise NotImplementedError("EPLB not supported for `GGUFMoEMethod` yet.")
|
||||
|
||||
assert activation == "silu", "Only SiLU activation is supported."
|
||||
if apply_router_weight_on_input:
|
||||
raise NotImplementedError(
|
||||
@ -652,19 +649,9 @@ class GGUFMoEMethod(FusedMoEMethodBase):
|
||||
"fused GGUF MoE method."
|
||||
)
|
||||
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
)
|
||||
return fused_moe_gguf(
|
||||
x,
|
||||
|
||||
@ -722,7 +722,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -743,26 +743,11 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
if enable_eplb:
|
||||
raise NotImplementedError(
|
||||
"EPLB not supported for `GPTQMarlinMoEMethod` yet."
|
||||
)
|
||||
|
||||
assert activation == "silu", "Only SiLU activation is supported."
|
||||
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
)
|
||||
|
||||
return fused_marlin_moe(
|
||||
|
||||
@ -696,7 +696,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -717,12 +717,11 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
if enable_eplb:
|
||||
raise NotImplementedError(
|
||||
"EPLB not supported for `ModelOptFp8MoEMethod` yet."
|
||||
)
|
||||
|
||||
if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
|
||||
if layer.enable_eplb:
|
||||
raise NotImplementedError(
|
||||
"EPLB not supported for `ModelOptFp8MoEMethod` yet."
|
||||
)
|
||||
assert activation == "silu", (
|
||||
f"Expected 'silu' activation but got {activation}"
|
||||
)
|
||||
@ -740,19 +739,9 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
|
||||
)
|
||||
|
||||
# Expert selection
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
)
|
||||
|
||||
if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
|
||||
@ -1459,7 +1448,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -1480,16 +1469,16 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
if enable_eplb:
|
||||
raise NotImplementedError(
|
||||
"EPLB not supported for `ModelOptNvFp4FusedMoE` yet."
|
||||
)
|
||||
assert activation == "silu", "Only SiLU activation is supported."
|
||||
|
||||
if (
|
||||
self.allow_flashinfer
|
||||
and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
|
||||
):
|
||||
if enable_eplb:
|
||||
raise NotImplementedError(
|
||||
"EPLB not supported for `ModelOptNvFp4FusedMoE` yet."
|
||||
)
|
||||
return flashinfer_trtllm_fp4_moe(
|
||||
layer=layer,
|
||||
x=x,
|
||||
@ -1502,19 +1491,9 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
)
|
||||
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
)
|
||||
|
||||
if self.use_marlin:
|
||||
|
||||
@ -359,7 +359,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -380,25 +380,12 @@ class MoeWNA16Method(FusedMoEMethodBase):
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
if enable_eplb:
|
||||
raise NotImplementedError("EPLB not supported for `MoeWNA16Method` yet.")
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import fused_experts
|
||||
|
||||
assert activation == "silu", "Only SiLU activation is supported."
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
)
|
||||
|
||||
return fused_experts(
|
||||
|
||||
@ -132,12 +132,15 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
|
||||
)
|
||||
|
||||
# If FlashInfer is not available, try either Marlin or Triton
|
||||
if (
|
||||
envs.VLLM_MXFP4_USE_MARLIN
|
||||
or current_platform.get_device_capability()[0] < 9
|
||||
or not has_triton_kernels()
|
||||
or not is_torch_equal_or_newer("2.8.0")
|
||||
):
|
||||
triton_kernels_supported = (
|
||||
has_triton_kernels()
|
||||
and is_torch_equal_or_newer("2.8.0")
|
||||
# NOTE: triton_kernels are only confirmed to work on SM90 and SM100
|
||||
# SM110 fails with this error: https://github.com/vllm-project/vllm/issues/29317
|
||||
# SM120 needs this fix: https://github.com/triton-lang/triton/pull/8498
|
||||
and (9, 0) <= current_platform.get_device_capability() < (11, 0)
|
||||
)
|
||||
if envs.VLLM_MXFP4_USE_MARLIN or not triton_kernels_supported:
|
||||
logger.info_once("Using Marlin backend")
|
||||
return Mxfp4Backend.MARLIN
|
||||
else:
|
||||
@ -862,7 +865,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -887,18 +890,9 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
||||
raise NotImplementedError("EPLB is not supported for mxfp4")
|
||||
|
||||
if self.mxfp4_backend == Mxfp4Backend.MARLIN:
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
)
|
||||
|
||||
return fused_marlin_moe(
|
||||
@ -989,17 +983,9 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
||||
):
|
||||
from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe
|
||||
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
)
|
||||
|
||||
# Backend-specific preparation
|
||||
|
||||
@ -334,7 +334,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -355,24 +355,9 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
if enable_eplb:
|
||||
raise NotImplementedError(
|
||||
"EPLB not supported for `QuarkW8A8Fp8MoEMethod` yet."
|
||||
)
|
||||
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
)
|
||||
|
||||
if self.rocm_aiter_moe_enabled:
|
||||
@ -609,7 +594,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -630,24 +615,9 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
if enable_eplb:
|
||||
raise NotImplementedError(
|
||||
"EPLB not supported for `QuarkOCP_MX_MoEMethod` yet."
|
||||
)
|
||||
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
)
|
||||
|
||||
if not self.emulate:
|
||||
|
||||
@ -356,7 +356,7 @@ class RTNMoEMethod(FusedMoEMethodBase):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
layer: FusedMoE,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
@ -377,22 +377,9 @@ class RTNMoEMethod(FusedMoEMethodBase):
|
||||
logical_to_physical_map: torch.Tensor | None = None,
|
||||
logical_replica_count: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
if enable_eplb:
|
||||
raise NotImplementedError("EPLB not supported for `RTNMoEMethod` yet.")
|
||||
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
use_grouped_topk=use_grouped_topk,
|
||||
top_k=top_k,
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
e_score_correction_bias=e_score_correction_bias,
|
||||
indices_type=self.topk_indices_dtype,
|
||||
)
|
||||
|
||||
return fused_marlin_moe(
|
||||
|
||||
@ -17,6 +17,7 @@ from .llama4_vision_rope import Llama4VisionRotaryEmbedding
|
||||
from .mrope import MRotaryEmbedding
|
||||
from .ntk_scaling_rope import NTKScalingRotaryEmbedding
|
||||
from .phi3_long_rope_scaled_rope import Phi3LongRoPEScaledRotaryEmbedding
|
||||
from .xdrope import XDRotaryEmbedding
|
||||
from .yarn_scaling_rope import YaRNScalingRotaryEmbedding
|
||||
|
||||
_ROPE_DICT: dict[tuple, RotaryEmbedding] = {}
|
||||
@ -184,6 +185,18 @@ def get_rope(
|
||||
raise ValueError(
|
||||
"Dynamic rope scaling must contain either 'alpha' or 'factor' field"
|
||||
)
|
||||
elif scaling_type == "xdrope":
|
||||
scaling_alpha = rope_parameters["alpha"]
|
||||
rotary_emb = XDRotaryEmbedding(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
max_position,
|
||||
base,
|
||||
is_neox_style,
|
||||
scaling_alpha,
|
||||
dtype,
|
||||
xdrope_section=rope_parameters["xdrope_section"],
|
||||
)
|
||||
elif scaling_type == "yarn":
|
||||
scaling_factor = rope_parameters["factor"]
|
||||
original_max_position = rope_parameters["original_max_position_embeddings"]
|
||||
|
||||
102
vllm/model_executor/layers/rotary_embedding/xdrope.py
Normal file
102
vllm/model_executor/layers/rotary_embedding/xdrope.py
Normal file
@ -0,0 +1,102 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from .common import apply_rotary_emb_dispatch
|
||||
from .dynamic_ntk_alpha_rope import DynamicNTKAlphaRotaryEmbedding
|
||||
|
||||
|
||||
class XDRotaryEmbedding(DynamicNTKAlphaRotaryEmbedding):
|
||||
"""DynamicNTKAlphaRotaryEmbedding extended with MultiModal(XD) Sections.
|
||||
|
||||
Based on the original DynamicNTKAlphaRotaryEmbedding implementation.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
max_position_embeddings: int,
|
||||
base: float,
|
||||
is_neox_style: bool,
|
||||
scaling_alpha: float,
|
||||
dtype: torch.dtype,
|
||||
xdrope_section: list[int],
|
||||
) -> None:
|
||||
self.xdrope_section = xdrope_section
|
||||
super().__init__(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
max_position_embeddings,
|
||||
base,
|
||||
is_neox_style,
|
||||
scaling_alpha,
|
||||
dtype,
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
offsets: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
"""PyTorch-native implementation equivalent to forward().
|
||||
|
||||
Args:
|
||||
positions:
|
||||
[4, num_tokens] (P/W/H/T positions with multimodal inputs)
|
||||
query: [num_tokens, num_heads * head_size]
|
||||
key: [num_tokens, num_kv_heads * head_size]
|
||||
"""
|
||||
assert positions.ndim == 2
|
||||
assert key is not None
|
||||
|
||||
num_tokens = positions.shape[-1]
|
||||
cos_sin = self.cos_sin_cache[positions]
|
||||
cos, sin = cos_sin.chunk(2, dim=-1)
|
||||
cos = torch.cat(
|
||||
[m[i] for i, m in enumerate(cos.split(self.xdrope_section, dim=-1))], dim=-1
|
||||
)
|
||||
sin = torch.cat(
|
||||
[m[i] for i, m in enumerate(sin.split(self.xdrope_section, dim=-1))], dim=-1
|
||||
)
|
||||
|
||||
query_shape = query.shape
|
||||
query = query.view(num_tokens, -1, self.head_size)
|
||||
query_rot = query[..., : self.rotary_dim]
|
||||
query_pass = query[..., self.rotary_dim :]
|
||||
query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin, self.is_neox_style)
|
||||
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
|
||||
|
||||
key_shape = key.shape
|
||||
key = key.view(num_tokens, -1, self.head_size)
|
||||
key_rot = key[..., : self.rotary_dim]
|
||||
key_pass = key[..., self.rotary_dim :]
|
||||
key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin, self.is_neox_style)
|
||||
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
|
||||
return query, key
|
||||
|
||||
@staticmethod
|
||||
def get_next_input_positions(
|
||||
context_len: int,
|
||||
seq_len: int,
|
||||
xd_sections: int = 4,
|
||||
) -> list[list[int]]:
|
||||
return [list(range(context_len, seq_len)) for _ in range(xd_sections)]
|
||||
|
||||
@staticmethod
|
||||
def get_next_input_positions_tensor(
|
||||
out: np.ndarray,
|
||||
out_offset: int,
|
||||
context_len: int,
|
||||
num_new_tokens: int,
|
||||
):
|
||||
values = np.arange(
|
||||
context_len,
|
||||
context_len + num_new_tokens,
|
||||
dtype=out.dtype,
|
||||
)
|
||||
out[:, out_offset : out_offset + num_new_tokens] = values
|
||||
@ -576,7 +576,16 @@ class HunYuanDecoderLayer(nn.Module):
|
||||
return hidden_states, residual, ori_kv_states
|
||||
|
||||
|
||||
@support_torch_compile
|
||||
@support_torch_compile(
|
||||
dynamic_arg_dims={
|
||||
"input_ids": 0,
|
||||
# positions is of shape (xd, seq_len) if xdrope is enabled for hunyuan-vl,
|
||||
# otherwise (seq_len, ).
|
||||
"positions": -1,
|
||||
"intermediate_tensors": 0,
|
||||
"inputs_embeds": 0,
|
||||
}
|
||||
)
|
||||
class HunYuanModel(nn.Module):
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__()
|
||||
|
||||
1028
vllm/model_executor/models/hunyuan_vision.py
Normal file
1028
vllm/model_executor/models/hunyuan_vision.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -1047,7 +1047,7 @@ class SupportsMRoPE(Protocol):
|
||||
supports_mrope: ClassVar[Literal[True]] = True
|
||||
"""
|
||||
A flag that indicates this model supports M-RoPE.
|
||||
|
||||
|
||||
Note:
|
||||
There is no need to redefine this flag if this class is in the
|
||||
MRO of your model class.
|
||||
@ -1088,3 +1088,52 @@ def supports_mrope(
|
||||
model: type[object] | object,
|
||||
) -> TypeIs[type[SupportsMRoPE]] | TypeIs[SupportsMRoPE]:
|
||||
return isinstance(model, SupportsMRoPE)
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class SupportsXDRoPE(Protocol):
|
||||
"""The interface required for all models that support XD-RoPE."""
|
||||
|
||||
supports_xdrope: ClassVar[Literal[True]] = True
|
||||
"""
|
||||
A flag that indicates this model supports XD-RoPE.
|
||||
|
||||
Note:
|
||||
There is no need to redefine this flag if this class is in the
|
||||
XDRope of your model class.
|
||||
"""
|
||||
|
||||
def get_xdrope_input_positions(
|
||||
self,
|
||||
input_tokens: list[int],
|
||||
mm_features: list["MultiModalFeatureSpec"],
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Get XD-RoPE input positions and delta value for this specific model.
|
||||
|
||||
This method should be implemented by each model that supports XD-RoPE
|
||||
to provide model-specific logic for computing input positions.
|
||||
|
||||
Args:
|
||||
input_tokens: List of input token IDs
|
||||
mm_features: Information about each multi-modal data item
|
||||
|
||||
Returns:
|
||||
llm_positions: Tensor of shape `[xdrope_dim, num_tokens]` with
|
||||
4D(P/W/H/T) or 3D(W/H/T) positions.
|
||||
"""
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def supports_xdrope(model: type[object]) -> TypeIs[type[SupportsXDRoPE]]: ...
|
||||
|
||||
|
||||
@overload
|
||||
def supports_xdrope(model: object) -> TypeIs[SupportsXDRoPE]: ...
|
||||
|
||||
|
||||
def supports_xdrope(
|
||||
model: type[object] | object,
|
||||
) -> TypeIs[type[SupportsXDRoPE]] | TypeIs[SupportsXDRoPE]:
|
||||
return isinstance(model, SupportsXDRoPE)
|
||||
|
||||
@ -142,6 +142,12 @@ class LlamaModel(nn.Module):
|
||||
# Get drafter's quantization config
|
||||
self.quant_config = get_draft_quant_config(vllm_config)
|
||||
|
||||
eagle_config = getattr(self.config, "eagle_config", None)
|
||||
if eagle_config is not None and "use_aux_hidden_state" in eagle_config:
|
||||
self.use_aux_hidden_state = eagle_config["use_aux_hidden_state"]
|
||||
else:
|
||||
self.use_aux_hidden_state = True
|
||||
|
||||
current_vllm_config = get_current_vllm_config()
|
||||
|
||||
self.embed_tokens = VocabParallelEmbedding(
|
||||
@ -161,20 +167,20 @@ class LlamaModel(nn.Module):
|
||||
for layer_idx in range(self.config.num_hidden_layers)
|
||||
]
|
||||
)
|
||||
if hasattr(self.config, "target_hidden_size"):
|
||||
fc_input_size = self.config.target_hidden_size * 3
|
||||
else:
|
||||
fc_input_size = self.config.hidden_size * 3
|
||||
self.fc = ReplicatedLinear(
|
||||
input_size=fc_input_size,
|
||||
output_size=self.config.hidden_size,
|
||||
bias=False,
|
||||
params_dtype=vllm_config.model_config.dtype,
|
||||
quant_config=self.quant_config,
|
||||
prefix=maybe_prefix(prefix, "fc"),
|
||||
return_bias=False,
|
||||
)
|
||||
|
||||
if self.use_aux_hidden_state:
|
||||
if hasattr(self.config, "target_hidden_size"):
|
||||
fc_input_size = self.config.target_hidden_size * 3
|
||||
else:
|
||||
fc_input_size = self.config.hidden_size * 3
|
||||
self.fc = ReplicatedLinear(
|
||||
input_size=fc_input_size,
|
||||
output_size=self.config.hidden_size,
|
||||
bias=False,
|
||||
params_dtype=vllm_config.model_config.dtype,
|
||||
quant_config=self.quant_config,
|
||||
prefix=maybe_prefix(prefix, "fc"),
|
||||
return_bias=False,
|
||||
)
|
||||
self.norm = RMSNorm(
|
||||
self.config.hidden_size,
|
||||
eps=self.config.rms_norm_eps,
|
||||
@ -332,6 +338,8 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
if not self.model.use_aux_hidden_state:
|
||||
return hidden_states
|
||||
# combine multiple auxiliary hidden states returned by eagle3
|
||||
return self.model.fc(hidden_states)
|
||||
|
||||
@ -357,6 +365,8 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
|
||||
skip_substrs.append("draft_id_to_target_id")
|
||||
if not includes_embed_tokens:
|
||||
skip_substrs.append("embed_tokens")
|
||||
if not self.model.use_aux_hidden_state:
|
||||
skip_substrs.append("fc.")
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=None,
|
||||
|
||||
@ -56,10 +56,13 @@ from transformers.utils import is_flash_attn_2_available
|
||||
from vllm.model_executor.layers.conv import Conv2dLayer
|
||||
from vllm.model_executor.layers.linear import ReplicatedLinear
|
||||
from vllm.model_executor.models.utils import maybe_prefix
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.configs.moonvit import MoonViTConfig
|
||||
|
||||
if is_flash_attn_2_available():
|
||||
from flash_attn import flash_attn_varlen_func
|
||||
elif current_platform.is_xpu():
|
||||
from vllm.attention.utils.fa_utils import flash_attn_varlen_func
|
||||
else:
|
||||
flash_attn_varlen_func = None
|
||||
|
||||
@ -106,10 +109,10 @@ def multihead_attention(
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
q_cu_seqlens,
|
||||
k_cu_seqlens,
|
||||
max_seqlen_q,
|
||||
max_seqlen_k,
|
||||
cu_seqlens_q=q_cu_seqlens,
|
||||
cu_seqlens_k=k_cu_seqlens,
|
||||
max_seqlen_q=max_seqlen_q,
|
||||
max_seqlen_k=max_seqlen_k,
|
||||
causal=False,
|
||||
)
|
||||
attn_out = attn_out.flatten(start_dim=-2)
|
||||
@ -291,7 +294,12 @@ class Rope2DPosEmb(nn.Module):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, dim: int, max_height: int, max_width: int, theta_base=10000, device="cuda"
|
||||
self,
|
||||
dim: int,
|
||||
max_height: int,
|
||||
max_width: int,
|
||||
theta_base=10000,
|
||||
device=current_platform.device_type,
|
||||
):
|
||||
super().__init__()
|
||||
self.dim = dim
|
||||
@ -437,7 +445,7 @@ class MoonVitEncoderLayer(nn.Module):
|
||||
self.hidden_size_per_attention_head = self.hidden_dim // self.num_heads
|
||||
self.attn_implementation = attn_implementation
|
||||
# use fa2 in vllm by default
|
||||
if is_flash_attn_2_available():
|
||||
if is_flash_attn_2_available() or current_platform.is_xpu():
|
||||
self.attn_implementation = "flash_attention_2"
|
||||
|
||||
self.norm0 = nn.LayerNorm(hidden_dim)
|
||||
|
||||
@ -23,7 +23,6 @@
|
||||
"""Inference-only Qwen2.5-Omni model (thinker part)."""
|
||||
|
||||
from collections.abc import Callable, Iterable, Mapping, Sequence
|
||||
from copy import copy
|
||||
from functools import partial
|
||||
from typing import Annotated, Any, Literal
|
||||
|
||||
@ -387,15 +386,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
self._validate_mm_kwargs(mm_kwargs, mm_item_counts)
|
||||
self._validate_mm_updates(mm_prompt_updates, mm_item_counts)
|
||||
|
||||
use_audio_in_video = False
|
||||
if "video" in mm_kwargs:
|
||||
video_items = [item for item in mm_kwargs["video"] if item is not None]
|
||||
# only check video items (if there are any)
|
||||
if video_items:
|
||||
use_audio_in_video = all(
|
||||
item["use_audio_in_video"].data for item in video_items
|
||||
)
|
||||
|
||||
if is_update_applied:
|
||||
mm_placeholders = self._find_mm_placeholders(
|
||||
prompt_ids,
|
||||
@ -404,7 +394,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
self._validate_mm_placeholders(
|
||||
mm_placeholders,
|
||||
mm_item_counts,
|
||||
use_audio_in_video=use_audio_in_video,
|
||||
)
|
||||
else:
|
||||
prompt_ids, mm_placeholders = self._apply_prompt_updates(
|
||||
@ -414,7 +403,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
self._validate_mm_placeholders(
|
||||
mm_placeholders,
|
||||
mm_item_counts,
|
||||
use_audio_in_video=use_audio_in_video,
|
||||
)
|
||||
|
||||
return prompt_ids, mm_placeholders
|
||||
@ -640,19 +628,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
|
||||
return mm_processed_data
|
||||
|
||||
def _validate_mm_placeholders(
|
||||
self,
|
||||
mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
|
||||
mm_item_counts: Mapping[str, int],
|
||||
use_audio_in_video: bool = False,
|
||||
) -> None:
|
||||
if use_audio_in_video:
|
||||
mm_item_counts = copy(mm_item_counts)
|
||||
if "video" in mm_item_counts:
|
||||
assert "audio" in mm_item_counts
|
||||
mm_item_counts["audio"] -= mm_item_counts["video"]
|
||||
super()._validate_mm_placeholders(mm_placeholders, mm_item_counts)
|
||||
|
||||
|
||||
class Qwen2_5OmniConditionalGenerationMixin:
|
||||
def _parse_and_validate_audio_input(
|
||||
|
||||
@ -68,11 +68,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems
|
||||
from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataItems
|
||||
from vllm.multimodal.processing import (
|
||||
BaseMultiModalProcessor,
|
||||
MultiModalPromptUpdates,
|
||||
PlaceholderFeaturesInfo,
|
||||
PromptReplacement,
|
||||
PromptUpdate,
|
||||
PromptUpdateDetails,
|
||||
)
|
||||
from vllm.sequence import IntermediateTensors
|
||||
|
||||
@ -87,7 +87,6 @@ from .qwen2_5_omni_thinker import (
|
||||
Qwen2_5OmniConditionalGenerationMixin,
|
||||
Qwen2_5OmniThinkerDummyInputsBuilder,
|
||||
Qwen2_5OmniThinkerMultiModalProcessor,
|
||||
Qwen2_5OmniThinkerProcessingInfo,
|
||||
)
|
||||
from .qwen2_5_vl import (
|
||||
Qwen2_5_VisionAttention,
|
||||
@ -807,24 +806,8 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
|
||||
else:
|
||||
use_audio_in_video = False
|
||||
|
||||
if use_audio_in_video and "video" in mm_item_counts:
|
||||
assert "audio" in mm_item_counts
|
||||
mm_item_counts["audio"] -= mm_item_counts["video"]
|
||||
|
||||
# Special case with `use_audio_in_video=True`
|
||||
if use_audio_in_video:
|
||||
if is_update_applied:
|
||||
prompt_ids = self._get_raw_input_ids(prompt_ids, use_audio_in_video)
|
||||
(
|
||||
prompt_ids,
|
||||
mm_placeholders,
|
||||
) = self._apply_prompt_updates(
|
||||
prompt_ids,
|
||||
mm_prompt_updates,
|
||||
)
|
||||
self._validate_mm_placeholders(mm_placeholders, mm_item_counts)
|
||||
# normal case with `use_audio_in_video=False`
|
||||
elif is_update_applied:
|
||||
if is_update_applied:
|
||||
mm_placeholders = self._find_mm_placeholders(
|
||||
prompt_ids,
|
||||
mm_prompt_updates,
|
||||
@ -834,10 +817,24 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
|
||||
mm_item_counts,
|
||||
)
|
||||
else:
|
||||
prompt_ids, mm_placeholders = self._apply_prompt_updates(
|
||||
prompt_ids,
|
||||
mm_prompt_updates,
|
||||
)
|
||||
if use_audio_in_video and "audio" in mm_prompt_updates:
|
||||
filtered_updates = {
|
||||
k: v for k, v in mm_prompt_updates.items() if k != "audio"
|
||||
}
|
||||
prompt_ids, mm_placeholders = self._apply_prompt_updates(
|
||||
prompt_ids,
|
||||
filtered_updates,
|
||||
)
|
||||
# Derive audio placeholders from video placeholders
|
||||
mm_placeholders = self._derive_audio_from_video_placeholders(
|
||||
mm_placeholders, mm_prompt_updates
|
||||
)
|
||||
else:
|
||||
prompt_ids, mm_placeholders = self._apply_prompt_updates(
|
||||
prompt_ids,
|
||||
mm_prompt_updates,
|
||||
)
|
||||
|
||||
self._validate_mm_placeholders(
|
||||
mm_placeholders,
|
||||
mm_item_counts,
|
||||
@ -962,7 +959,9 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
|
||||
|
||||
def get_replacement_qwen2_use_audio_in_video(item_idx: int):
|
||||
nonlocal audio_in_video_item_idx
|
||||
audio_num_features = audio_output_lengths[audio_item_idx + item_idx]
|
||||
audio_num_features = audio_output_lengths[
|
||||
audio_in_video_item_idx + item_idx
|
||||
]
|
||||
video_grid_thw = out_mm_data["video_grid_thw"][item_idx]
|
||||
|
||||
audio_in_video_item_idx += 1
|
||||
@ -971,14 +970,17 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
|
||||
if second_per_grid_ts:
|
||||
video_second_per_grid_t = second_per_grid_ts[item_idx]
|
||||
else:
|
||||
video_second_per_grid_t = 1.0
|
||||
video_second_per_grid_t = 2.0
|
||||
|
||||
return self.get_updates_use_audio_in_video(
|
||||
placeholder = self.get_updates_use_audio_in_video(
|
||||
thinker_config=thinker_config,
|
||||
audio_len=audio_num_features,
|
||||
video_grid_thw=video_grid_thw,
|
||||
video_second_per_grid_t=video_second_per_grid_t,
|
||||
)
|
||||
return PromptUpdateDetails.select_token_id(
|
||||
placeholder, embed_token_id=video_token_id
|
||||
)
|
||||
|
||||
video_replacement_fn = (
|
||||
get_replacement_qwen2_use_audio_in_video
|
||||
@ -1004,14 +1006,50 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
|
||||
),
|
||||
]
|
||||
|
||||
def _validate_mm_placeholders(
|
||||
def _derive_audio_from_video_placeholders(
|
||||
self,
|
||||
mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
|
||||
mm_item_counts: Mapping[str, int],
|
||||
) -> None:
|
||||
BaseMultiModalProcessor[
|
||||
Qwen2_5OmniThinkerProcessingInfo
|
||||
]._validate_mm_placeholders(self, mm_placeholders, mm_item_counts)
|
||||
placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
|
||||
mm_prompt_updates: MultiModalPromptUpdates,
|
||||
) -> Mapping[str, list[PlaceholderFeaturesInfo]]:
|
||||
"""
|
||||
Helper to derive audio placeholders from video placeholders when
|
||||
use_audio_in_video=True.
|
||||
"""
|
||||
if "video" not in placeholders:
|
||||
return placeholders
|
||||
|
||||
# Validate audio and video counts match
|
||||
num_videos = len(placeholders["video"])
|
||||
num_audios = len(mm_prompt_updates.get("audio", []))
|
||||
if num_audios != num_videos:
|
||||
raise ValueError(
|
||||
f"use_audio_in_video requires equal number of audio and video items, "
|
||||
f"got {num_audios=}, {num_videos=}"
|
||||
)
|
||||
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
processor = self.info.get_hf_processor()
|
||||
audio_token_id = tokenizer.get_vocab()[processor.audio_token]
|
||||
|
||||
result_placeholders = dict(placeholders)
|
||||
audio_placeholders = []
|
||||
|
||||
# Each video is paired with one audio
|
||||
for video_idx, video_placeholder in enumerate(placeholders["video"]):
|
||||
# Create is_embed mask selecting only audio tokens
|
||||
audio_is_embed = torch.tensor(video_placeholder.tokens) == audio_token_id
|
||||
|
||||
audio_placeholder = PlaceholderFeaturesInfo(
|
||||
modality="audio",
|
||||
item_idx=video_idx,
|
||||
start_idx=video_placeholder.start_idx,
|
||||
tokens=video_placeholder.tokens,
|
||||
is_embed=audio_is_embed,
|
||||
)
|
||||
audio_placeholders.append(audio_placeholder)
|
||||
|
||||
result_placeholders["audio"] = audio_placeholders
|
||||
return result_placeholders
|
||||
|
||||
def _get_raw_input_ids(
|
||||
self,
|
||||
@ -1454,7 +1492,11 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
|
||||
)
|
||||
|
||||
if not len(second_per_grid_ts) and len(video_grid_thw):
|
||||
second_per_grids = torch.ones(len(video_grid_thw), dtype=torch.float32)
|
||||
second_per_grid_ts = 2.0
|
||||
second_per_grids = (
|
||||
torch.ones(len(video_grid_thw), dtype=torch.float32)
|
||||
* second_per_grid_ts
|
||||
)
|
||||
else:
|
||||
second_per_grids = torch.tensor(second_per_grid_ts, dtype=torch.float32)
|
||||
|
||||
|
||||
@ -287,6 +287,10 @@ _MULTIMODAL_MODELS = {
|
||||
"GraniteSpeechForConditionalGeneration",
|
||||
),
|
||||
"H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
|
||||
"HunYuanVLForConditionalGeneration": (
|
||||
"hunyuan_vision",
|
||||
"HunYuanVLForConditionalGeneration",
|
||||
),
|
||||
"InternVLChatModel": ("internvl", "InternVLChatModel"),
|
||||
"NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
|
||||
"OpenCUAForConditionalGeneration": (
|
||||
|
||||
@ -134,6 +134,11 @@ class Platform:
|
||||
|
||||
_global_graph_pool: Any | None = None
|
||||
|
||||
@property
|
||||
def pass_key(self) -> str:
|
||||
"""Inductor config key for the PassManager custom pass"""
|
||||
return "post_grad_custom_post_pass"
|
||||
|
||||
@property
|
||||
def supported_dtypes(self) -> list[torch.dtype]:
|
||||
"""Returns the supported dtypes for the current platform."""
|
||||
@ -177,6 +182,21 @@ class Platform:
|
||||
# all ROCm platforms for now.
|
||||
return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
|
||||
|
||||
@classmethod
|
||||
def get_pass_manager_cls(cls) -> str:
|
||||
"""
|
||||
Get the pass manager class for this platform.
|
||||
It will be registered as a custom pass under the current_platform.pass_key.
|
||||
"""
|
||||
return "vllm.compilation.pass_manager.PostGradPassManager"
|
||||
|
||||
@classmethod
|
||||
def get_compile_backend(cls) -> str:
|
||||
"""
|
||||
Get the custom compile backend for current platform.
|
||||
"""
|
||||
return cls.simple_compile_backend
|
||||
|
||||
@classmethod
|
||||
def device_id_to_physical_device_id(cls, device_id: int):
|
||||
# Treat empty device control env var as unset. This is a valid
|
||||
|
||||
@ -262,6 +262,10 @@ class RocmPlatform(Platform):
|
||||
f"is not MLA type while requested for MLA backend."
|
||||
)
|
||||
|
||||
if selected_backend == AttentionBackendEnum.FLEX_ATTENTION:
|
||||
logger.info("Using FlexAttention backend.")
|
||||
return AttentionBackendEnum.FLEX_ATTENTION.get_path()
|
||||
|
||||
if selected_backend == AttentionBackendEnum.TRITON_ATTN:
|
||||
logger.info("Using Triton Attention backend on V1 engine.")
|
||||
return AttentionBackendEnum.TRITON_ATTN.get_path()
|
||||
@ -317,8 +321,8 @@ class RocmPlatform(Platform):
|
||||
return AttentionBackendEnum.TRITON_ATTN.get_path()
|
||||
|
||||
raise RuntimeError(
|
||||
"V0 attention backends have been removed. Set VLLM_USE_V1=1 "
|
||||
"to select a supported backend."
|
||||
f"Attention backend {selected_backend.name} is not supported on "
|
||||
"ROCm. Note that V0 attention backends have been removed."
|
||||
)
|
||||
|
||||
@classmethod
|
||||
|
||||
@ -57,7 +57,7 @@ class PoolingParams(
|
||||
## Internal use only
|
||||
task: PoolingTask | None = None
|
||||
requires_token_ids: bool = False
|
||||
skip_reading_prefix_cache: bool = None
|
||||
skip_reading_prefix_cache: bool | None = None
|
||||
extra_kwargs: dict[str, Any] | None = None
|
||||
output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY
|
||||
|
||||
|
||||
@ -3,7 +3,6 @@
|
||||
"""Sampling parameters for text generation."""
|
||||
|
||||
import copy
|
||||
import warnings
|
||||
from dataclasses import field
|
||||
from enum import Enum, IntEnum
|
||||
from functools import cached_property
|
||||
@ -100,19 +99,6 @@ class StructuredOutputsParams:
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class GuidedDecodingParams(StructuredOutputsParams):
|
||||
def __post_init__(self):
|
||||
warnings.warn(
|
||||
"GuidedDecodingParams is deprecated. This will be removed in "
|
||||
"v0.12.0 or v1.0.0, which ever is soonest. Please use "
|
||||
"StructuredOutputsParams instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return super().__post_init__()
|
||||
|
||||
|
||||
class RequestOutputKind(Enum):
|
||||
# Return entire output so far in every RequestOutput
|
||||
CUMULATIVE = 0
|
||||
@ -234,8 +220,6 @@ class SamplingParams(
|
||||
# Fields used to construct logits processors
|
||||
structured_outputs: StructuredOutputsParams | None = None
|
||||
"""Parameters for configuring structured outputs."""
|
||||
guided_decoding: GuidedDecodingParams | None = None
|
||||
"""Deprecated alias for structured_outputs."""
|
||||
logit_bias: dict[int, float] | None = None
|
||||
"""If provided, the engine will construct a logits processor that applies
|
||||
these logit biases."""
|
||||
@ -254,7 +238,7 @@ class SamplingParams(
|
||||
generated token can complete the sequence."""
|
||||
_bad_words_token_ids: list[list[int]] | None = None
|
||||
|
||||
skip_reading_prefix_cache: bool = None
|
||||
skip_reading_prefix_cache: bool | None = None
|
||||
|
||||
@staticmethod
|
||||
def from_optional(
|
||||
@ -283,7 +267,6 @@ class SamplingParams(
|
||||
truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None,
|
||||
output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE,
|
||||
structured_outputs: StructuredOutputsParams | None = None,
|
||||
guided_decoding: GuidedDecodingParams | None = None,
|
||||
logit_bias: dict[int, float] | dict[str, float] | None = None,
|
||||
allowed_token_ids: list[int] | None = None,
|
||||
extra_args: dict[str, Any] | None = None,
|
||||
@ -295,16 +278,6 @@ class SamplingParams(
|
||||
int(token): min(100.0, max(-100.0, bias))
|
||||
for token, bias in logit_bias.items()
|
||||
}
|
||||
if guided_decoding is not None:
|
||||
warnings.warn(
|
||||
"guided_decoding is deprecated. This will be removed in "
|
||||
"v0.12.0 or v1.0.0, which ever is soonest. Please use "
|
||||
"structured_outputs instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
structured_outputs = guided_decoding
|
||||
guided_decoding = None
|
||||
|
||||
return SamplingParams(
|
||||
n=1 if n is None else n,
|
||||
@ -387,17 +360,6 @@ class SamplingParams(
|
||||
# eos_token_id is added to this by the engine
|
||||
self._all_stop_token_ids.update(self.stop_token_ids)
|
||||
|
||||
if self.guided_decoding is not None:
|
||||
warnings.warn(
|
||||
"guided_decoding is deprecated. This will be removed in "
|
||||
"v0.12.0 or v1.0.0, which ever is soonest. Please use "
|
||||
"structured_outputs instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
self.structured_outputs = self.guided_decoding
|
||||
self.guided_decoding = None
|
||||
|
||||
if self.skip_reading_prefix_cache is None:
|
||||
# If prefix caching is enabled,
|
||||
# the output of prompt logprobs may less than n_prompt_tokens,
|
||||
|
||||
@ -86,6 +86,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
|
||||
deepseek_vl_v2="DeepseekVLV2Config",
|
||||
deepseek_v32="DeepseekV3Config",
|
||||
flex_olmo="FlexOlmoConfig",
|
||||
hunyuan_vl="HunYuanVLConfig",
|
||||
kimi_linear="KimiLinearConfig",
|
||||
kimi_vl="KimiVLConfig",
|
||||
RefinedWeb="RWConfig", # For tiiuae/falcon-40b(-instruct)
|
||||
@ -549,6 +550,23 @@ def thinker_uses_mrope(config: PretrainedConfig) -> bool:
|
||||
return uses_mrope(thinker_text_config)
|
||||
|
||||
|
||||
def uses_xdrope_dim(config: PretrainedConfig) -> int:
|
||||
"""Detect if the model with this config uses XD-ROPE."""
|
||||
xdrope_section = getattr(config, "xdrope_section", None)
|
||||
if xdrope_section is not None and isinstance(xdrope_section, list):
|
||||
return len(xdrope_section)
|
||||
rope_scaling = getattr(config, "rope_scaling", None)
|
||||
if rope_scaling is None:
|
||||
return 0
|
||||
|
||||
if isinstance(rope_scaling, dict) and "xdrope_section" in rope_scaling:
|
||||
xdrope_section = rope_scaling["xdrope_section"]
|
||||
if xdrope_section is not None and isinstance(xdrope_section, list):
|
||||
return len(xdrope_section)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def is_encoder_decoder(config: PretrainedConfig) -> bool:
|
||||
"""Detect if the model with this config is used as an encoder/decoder."""
|
||||
|
||||
|
||||
@ -23,6 +23,11 @@ from vllm.transformers_utils.configs.eagle import EAGLEConfig
|
||||
# `FalconConfig` class from the official HuggingFace transformers library.
|
||||
from vllm.transformers_utils.configs.falcon import RWConfig
|
||||
from vllm.transformers_utils.configs.flex_olmo import FlexOlmoConfig
|
||||
from vllm.transformers_utils.configs.hunyuan_vl import (
|
||||
HunYuanVLConfig,
|
||||
HunYuanVLTextConfig,
|
||||
HunYuanVLVisionConfig,
|
||||
)
|
||||
from vllm.transformers_utils.configs.jais import JAISConfig
|
||||
from vllm.transformers_utils.configs.kimi_linear import KimiLinearConfig
|
||||
from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig
|
||||
@ -53,6 +58,9 @@ __all__ = [
|
||||
"DotsOCRConfig",
|
||||
"EAGLEConfig",
|
||||
"FlexOlmoConfig",
|
||||
"HunYuanVLConfig",
|
||||
"HunYuanVLTextConfig",
|
||||
"HunYuanVLVisionConfig",
|
||||
"RWConfig",
|
||||
"JAISConfig",
|
||||
"Lfm2MoeConfig",
|
||||
|
||||
322
vllm/transformers_utils/configs/hunyuan_vl.py
Normal file
322
vllm/transformers_utils/configs/hunyuan_vl.py
Normal file
@ -0,0 +1,322 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# adapted from https://github.com/ManaEstras/transformers/blob/v4.57.1.hyvl/src/transformers/models/hunyuan_vl/configuration_hunyuan_vl.py
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class HunYuanVLVisionConfig(PretrainedConfig):
|
||||
model_type = "hunyuan_vl"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_act="gelu",
|
||||
hidden_size=1152,
|
||||
intermediate_size=4304,
|
||||
interpolate_mode="bilinear",
|
||||
rms_norm_eps=1e-05,
|
||||
learnable_mlp_pooling_size=0,
|
||||
num_attention_heads=16,
|
||||
num_key_value_heads=None,
|
||||
num_channels=3,
|
||||
num_hidden_layers=27,
|
||||
out_hidden_size=4096,
|
||||
patch_size=16,
|
||||
remove_prenorm=True,
|
||||
spatial_merge_size=2,
|
||||
temporal_patch_size=1,
|
||||
resize_resolution=2048,
|
||||
img_max_token_num=4096,
|
||||
max_image_size=2048,
|
||||
video_max_image_size=768,
|
||||
video_min_image_size=256,
|
||||
min_image_size=512,
|
||||
anyres_vit_max_image_size=2048,
|
||||
max_vit_seq_len=16384,
|
||||
text_hidden_size=3072,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.hidden_act = hidden_act
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.interpolate_mode = interpolate_mode
|
||||
self.learnable_mlp_pooling_size = learnable_mlp_pooling_size
|
||||
self.num_attention_heads = num_attention_heads
|
||||
if not num_key_value_heads:
|
||||
self.num_key_value_heads = num_attention_heads
|
||||
else:
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.num_channels = num_channels
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.out_hidden_size = out_hidden_size
|
||||
self.patch_size = patch_size
|
||||
self.remove_prenorm = remove_prenorm
|
||||
self.spatial_merge_size = spatial_merge_size
|
||||
self.temporal_patch_size = temporal_patch_size
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
|
||||
self.resize_resolution = resize_resolution
|
||||
self.img_max_token_num = img_max_token_num
|
||||
self.max_image_size = max_image_size
|
||||
self.min_image_size = min_image_size
|
||||
self.video_max_image_size = video_max_image_size
|
||||
self.video_min_image_size = video_min_image_size
|
||||
self.anyres_vit_max_image_size = anyres_vit_max_image_size
|
||||
self.max_vit_seq_len = max_vit_seq_len
|
||||
self.text_hidden_size = text_hidden_size
|
||||
|
||||
|
||||
class HunYuanVLTextConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`HunYuanVLTextConfig`]. It is used to instantiate an
|
||||
HunYuan model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
||||
with the defaults will yield a similar configuration to that of the HunYuan-7B.
|
||||
Hunyuan-7B-Instruct [tencent/Hunyuan-7B-Instruct](https://huggingface.co/tencent/Hunyuan-7B-Instruct).
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 290943):
|
||||
Vocabulary size of the HunYuan model. Defines the number of different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`HunYuanVLTextConfig`]
|
||||
hidden_size (`int`, *optional*, defaults to 4096):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 11008):
|
||||
Dimension of the MLP representations or shared MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 32):
|
||||
Number of hidden layers in the Transformer decoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer in the Transformer decoder.
|
||||
num_key_value_heads (`int`, *optional*):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group. For more details checkout [this
|
||||
paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
|
||||
`num_attention_heads`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||
The non-linear activation function (function or string) in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 2048):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the rms normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
pad_token_id (`int`, *optional*, defaults to 0):
|
||||
Padding token id.
|
||||
bos_token_id (`int`, *optional*, defaults to 1):
|
||||
Beginning of stream token id.
|
||||
eos_token_id (`int`, *optional*, defaults to 2):
|
||||
End of stream token id.
|
||||
eod_token_id (int, *optional*, defaults to 3):
|
||||
Token ID representing the end-of-document marker. Used to indicate the termination of a text sequence.
|
||||
Example: In multi-document processing, this token helps the model distinguish between separate documents.
|
||||
pretraining_tp (`int`, *optional*, defaults to 1):
|
||||
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
|
||||
document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
|
||||
necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
|
||||
issue](https://github.com/pytorch/pytorch/issues/76232).
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether to tie weight embeddings
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
rope_scaling (`Dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
|
||||
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
|
||||
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
|
||||
`max_position_embeddings` to the expected new maximum. See the following thread for more information on how
|
||||
these scaling strategies behave:
|
||||
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
|
||||
experimental feature, subject to breaking API changes in future versions.
|
||||
attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in the query, key, value and output projection layers during self-attention.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
head_dim (`int`, *optional*, defaults to 128):
|
||||
The attention head dimension.
|
||||
""" # noqa: E501
|
||||
|
||||
model_type = "hunyuan_vl_text"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=290943,
|
||||
hidden_size=4096,
|
||||
intermediate_size: int = 11008,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=2048,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-5,
|
||||
use_cache=True,
|
||||
pad_token_id=0,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
eod_token_id=3,
|
||||
pretraining_tp=1,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
head_dim=None,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.head_dim = head_dim
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.pretraining_tp = pretraining_tp
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
# self._rope_scaling_validation() # TODO: Need validation?
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _rope_scaling_validation(self):
|
||||
"""
|
||||
Validate the `rope_scaling` configuration.
|
||||
"""
|
||||
if self.rope_scaling is None:
|
||||
return
|
||||
|
||||
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
|
||||
raise ValueError(
|
||||
"`rope_scaling` must be a dictionary with with two fields, `type` and "
|
||||
f"`factor` or `type` and `alpha`, got {self.rope_scaling}"
|
||||
)
|
||||
rope_scaling_type = self.rope_scaling.get("type", None)
|
||||
rope_scaling_factor = self.rope_scaling.get("factor", None)
|
||||
rope_scaling_alpha = self.rope_scaling.get("alpha", None)
|
||||
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
|
||||
raise ValueError(
|
||||
"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], "
|
||||
f"got {rope_scaling_type}"
|
||||
)
|
||||
if rope_scaling_factor is None and rope_scaling_alpha is None:
|
||||
raise ValueError(
|
||||
"`rope_scaling`'s factor or alpha field must be have one, "
|
||||
"got both of none"
|
||||
)
|
||||
if rope_scaling_factor is not None and (
|
||||
not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0
|
||||
):
|
||||
raise ValueError(
|
||||
"`rope_scaling`'s factor field must be a float > 1.0, "
|
||||
f"got {rope_scaling_factor}"
|
||||
)
|
||||
if rope_scaling_alpha is not None and (
|
||||
not isinstance(rope_scaling_alpha, float) or rope_scaling_alpha <= 1.0
|
||||
):
|
||||
raise ValueError(
|
||||
"`rope_scaling`'s alpha field must be a float > 1.0, "
|
||||
f"got {rope_scaling_alpha}"
|
||||
)
|
||||
|
||||
|
||||
class HunYuanVLConfig(PretrainedConfig):
    """Composite configuration for HunYuanVL (vision tower + text decoder).

    Reads and writes of attributes that live on the text config are
    transparently delegated to `self.text_config` via `__setattr__` /
    `__getattribute__`, so callers can treat this object like a flat config.
    """

    model_type = "hunyuan_vl"
    sub_configs = {
        "vision_config": HunYuanVLVisionConfig,
        "text_config": HunYuanVLTextConfig,
    }
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        im_start_id=120118,
        im_end_id=120119,
        image_token_id=120120,
        im_newline_id=120121,
        video_start_id=120122,
        video_end_id=120123,
        **kwargs,
    ):
        # We need to init super() here so that it does not reset values
        # that are in text config to the BaseClass defaults. The Base
        # config has many text related defaults and not all defaults are
        # same as for `HunYuanVLTextConfig`.
        super().__init__(**kwargs)

        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        else:
            # Fix: accept an already-constructed config instance; previously
            # this case left `self.vision_config` unset, crashing below.
            self.vision_config = vision_config

        if isinstance(text_config, dict):
            self.text_config = self.sub_configs["text_config"](**text_config)
        elif text_config is None:
            # For BC use all kwargs to init `TextConfig`
            self.text_config = self.sub_configs["text_config"](**kwargs)
        else:
            # Fix: accept an already-constructed config instance.
            self.text_config = text_config

        self.image_token_id = image_token_id
        self.im_start_id = im_start_id
        self.im_end_id = im_end_id
        self.im_newline_id = im_newline_id
        self.video_start_id = video_start_id
        self.video_end_id = video_end_id

        # The vision adaptor projects into the text model's hidden size.
        self.vision_config.text_hidden_size = self.text_config.hidden_size

        # Attention implementation to use. It sets it recursively on sub-configs
        # so we call it again in the end.
        self._attn_implementation = kwargs.pop("attn_implementation", None)

    def __setattr__(self, key, value):
        # Delegate writes of text-config attributes to the sub-config so the
        # flat view and `text_config` never diverge.
        if (
            (text_config := super().__getattribute__("__dict__").get("text_config"))
            is not None
            and key not in ["dtype", "_attn_implementation_internal"]
            and key in text_config.__dict__
        ):
            setattr(text_config, key, value)
        else:
            super().__setattr__(key, value)

    def __getattribute__(self, key):
        # Delegate reads of text-config attributes, except a small set of
        # identity/implementation keys that must stay on the composite config.
        if "text_config" in super().__getattribute__("__dict__") and key not in [
            "_name_or_path",
            "model_type",
            "dtype",
            "_attn_implementation_internal",
        ]:
            text_config = super().__getattribute__("text_config")
            if key in text_config.__dict__:
                return getattr(text_config, key)

        return super().__getattribute__(key)
||||
@ -9,7 +9,15 @@ reasons:
|
||||
"""
|
||||
|
||||
from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor
from vllm.transformers_utils.processors.hunyuan_vl_image import HunYuanVLImageProcessor
from vllm.transformers_utils.processors.ovis import OvisProcessor
from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor

# Public re-exports of the custom processors bundled with vLLM.
# Fix: removed a stale duplicate `__all__` assignment (diff residue) that
# preceded this list.
__all__ = [
    "DeepseekVLV2Processor",
    "HunYuanVLProcessor",
    "HunYuanVLImageProcessor",
    "OvisProcessor",
    "Ovis2_5Processor",
]
||||
233
vllm/transformers_utils/processors/hunyuan_vl.py
Normal file
233
vllm/transformers_utils/processors/hunyuan_vl.py
Normal file
@ -0,0 +1,233 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# adapted from https://github.com/ManaEstras/transformers/blob/v4.57.1.hyvl/src/transformers/models/hunyuan_vl/processing_hunyuan_vl.py
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoProcessor
|
||||
from transformers.feature_extraction_utils import BatchFeature
|
||||
from transformers.image_utils import ImageInput
|
||||
from transformers.processing_utils import ProcessorMixin
|
||||
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
from transformers.video_utils import VideoInput
|
||||
|
||||
|
||||
class HunYuanVLProcessor(ProcessorMixin):
    """Combines a HunYuanVL image processor and tokenizer into one processor.

    `__call__` expands each image placeholder in the text to the per-image
    token count and builds 4-channel position ids (sequence/w/h/t) for the
    image regions.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"  # ("AutoTokenizer", None)

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        video_processor=None,
        chat_template=None,
        **kwargs,
    ):
        # TODO Fix the init
        self.tokenizer = tokenizer
        # NOTE(review): special token ids are hard-coded instead of being read
        # from the tokenizer -- confirm they match the checkpoint's vocab.
        self.image_token_id = 120120  # self.tokenizer.image_token_id
        self.image_token = self.tokenizer.convert_ids_to_tokens(self.image_token_id)
        self.im_start_token_id = 120118  # self.tokenizer.im_start_id
        self.im_start_token = self.tokenizer.convert_ids_to_tokens(
            self.im_start_token_id
        )
        self.im_end_token_id = 120119  # self.tokenizer.im_end_id
        self.im_end_token = self.tokenizer.convert_ids_to_tokens(self.im_end_token_id)
        # The last vocab entry is used as a temporary marker while expanding
        # image placeholders (swapped back to the image token afterwards).
        self.placeholder_token = self.tokenizer.convert_ids_to_tokens(
            self.tokenizer.vocab_size - 1
        )
        self.pad_id = 120002  # self.tokenizer.pad_token_id

        super().__init__(
            image_processor, tokenizer, video_processor, chat_template=chat_template
        )

    def __call__(
        self,
        images: ImageInput = None,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput] = None,
        videos: VideoInput = None,
        **kwargs,
    ) -> BatchFeature:
        """Tokenize `text` with image placeholders expanded, preprocess
        `images`, and return a merged `BatchFeature` that also carries
        `position_ids`, `attention_mask`, and `imgs_pos`.

        NOTE(review): the position-id construction below indexes
        `input_ids[0]` with torch ops, so the tokenizer call is presumably
        expected to return tensors (e.g. return_tensors="pt" in kwargs) --
        confirm against callers.
        """
        image_inputs = {}
        if images is not None:
            image_inputs = self.image_processor(images=images)
            image_grid_thw = image_inputs["image_grid_thw"]

        if not isinstance(text, list):
            text = [text]

        text = text.copy()  # below lines change text in-place

        # Running total of image tokens; used later to locate each image's
        # first placeholder position in the token stream.
        image_tokens_cumsum = [0]
        if images is not None:
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
                    grid_h, grid_w = image_grid_thw[index][-2:]
                    patch_h = grid_h // self.image_processor.merge_size
                    patch_w = grid_w // self.image_processor.merge_size
                    # One token per merged patch, +1 per row, +2 boundaries.
                    num_image_tokens = patch_h * (patch_w + 1) + 2
                    image_tokens_cumsum.append(
                        image_tokens_cumsum[-1] + num_image_tokens
                    )
                    # text[i] = text[i].replace(self.image_token, self.im_start_token + self.placeholder_token * num_image_tokens + self.im_end_token, 1) # noqa: E501
                    text[i] = text[i].replace(
                        self.image_token, self.placeholder_token * num_image_tokens, 1
                    )
                    index += 1
                # Swap the temporary markers back to real image tokens.
                text[i] = text[i].replace(self.placeholder_token, self.image_token)
                # text[i] = self.tokenizer.bos_token + text[i]

        text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)
        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])

        # Four parallel position-id channels: sequential, width, height, time.
        input_ids = text_inputs["input_ids"]
        position_ids = torch.arange(len(input_ids[0]))
        position_ids_w = torch.arange(len(input_ids[0]))
        position_ids_h = torch.arange(len(input_ids[0]))
        position_ids_t = torch.arange(len(input_ids[0]))

        if images is not None:
            image_token_pos_indices = torch.where(input_ids[0] == self.image_token_id)[
                0
            ]
            for i in range(len(image_grid_thw)):
                grid_h, grid_w = image_grid_thw[i][-2:]
                patch_h = grid_h // self.image_processor.merge_size
                patch_w = grid_w // self.image_processor.merge_size
                start_pos = image_token_pos_indices[image_tokens_cumsum[i]].item() + 1
                replace_num = (patch_w + 1) * patch_h
                # Width channel cycles 0..patch_w within each row.
                position_ids_w[start_pos : start_pos + replace_num] = torch.tensor(
                    list(range(patch_w + 1)) * patch_h, dtype=torch.int64
                )
                # Height channel repeats the row index across each row.
                patch_h_list = []
                for h in range(patch_h):
                    patch_h_list += [h] * (patch_w + 1)
                position_ids_h[start_pos : start_pos + replace_num] = torch.tensor(
                    patch_h_list, dtype=torch.int64
                )
                # Time channel is zeroed for (static) image regions.
                position_ids_t[start_pos : start_pos + replace_num] = 0

        position_ids = torch.stack(
            [position_ids, position_ids_w, position_ids_h, position_ids_t]
        ).unsqueeze(0)
        text_inputs["position_ids"] = position_ids

        attention_mask = input_ids.ne(self.pad_id)
        text_inputs["attention_mask"] = attention_mask
        text_inputs["imgs_pos"] = [self.get_imgs_pos(input_ids)]
        # image_inputs["imgs"] = [[image_inputs["pixel_values"]]]

        return_tensors = kwargs.pop("return_tensors", None)
        return BatchFeature(
            data={**text_inputs, **image_inputs},
            tensor_type=return_tensors,
        )

    def batch_decode(self, *args, **kwargs):
        # Forward to the tokenizer's batch_decode.
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        # Forward to the tokenizer's decode.
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(
        self,
        generated_outputs,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        # NOTE(review): intentionally unimplemented -- fails loudly if called.
        assert 0

    def apply_chat_template(self, *args, **kwargs):
        # Forward chat-template rendering to the tokenizer.
        token_ids = self.tokenizer.apply_chat_template(*args, **kwargs)
        return token_ids

    def get_imgs_pos(self, doc_ids):
        """Return [start, end) index pairs for each image span in `doc_ids`,
        located via the im_start/im_end token ids (start excludes the
        im_start token itself)."""
        doc_ids = np.array(doc_ids, dtype=np.int64)
        img_begin_index = np.where(doc_ids == self.im_start_token_id)[0]
        img_end_index = np.where(doc_ids == self.im_end_token_id)[0]
        imgs_pos = np.concatenate(
            (
                np.reshape(img_begin_index + 1, (-1, 1)),
                np.reshape(img_end_index, (-1, 1)),
            ),
            axis=-1,
        ).tolist()
        return imgs_pos

    @property
    def model_input_names(self):
        # Union of tokenizer and image-processor input names, order-preserving
        # and de-duplicated.
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
||||
|
||||
def split_image_into_patch_blocks(
    pixel_values: torch.Tensor,  # shape: [batch_size, 3, H, W]
    patch_size: int = 16,  # e.g. 16
    adaptor_patch_div: int = 4,  # each patch axis is subdivided by this factor
) -> torch.Tensor:
    """Cut a batch of images into `patch_size` tiles and regroup each tile's
    (patch_size // adaptor_patch_div)-sized sub-regions, returning a tensor of
    shape [N, 3, patch_size, patch_size] with
    N = batch_size * H * W / patch_size**2.

    Args:
        pixel_values: Input image tensor of shape [batch_size, 3, H, W].
        patch_size: Side length of the large patch, e.g. 16.
        adaptor_patch_div: Subdivision factor applied to each patch axis.

    Returns:
        Tensor of shape [N, 3, patch_size, patch_size] holding the regrouped
        pixel data.
    """
    batch, channels, img_h, img_w = pixel_values.shape
    assert channels == 3, "Pixel values must have 3 channels in dim=1"
    assert img_h % patch_size == 0 and img_w % patch_size == 0, (
        "H and W must be divisible by patch_size"
    )

    rows = img_h // patch_size
    cols = img_w // patch_size
    sub = patch_size // adaptor_patch_div

    # One reshape exposes every axis at once:
    # [B, 3, rows, sub, aps, cols, sub, aps]
    grid = pixel_values.reshape(
        batch, 3, rows, sub, adaptor_patch_div, cols, sub, adaptor_patch_div
    )

    # Bring the region axes forward and the channel/pixel axes to the back:
    # [B, rows, cols, sub, sub, 3, aps, aps]
    grid = grid.permute(0, 2, 5, 3, 6, 1, 4, 7)

    # Collapse everything into [N, 3, patch_size, patch_size].
    return grid.reshape(-1, 3, patch_size, patch_size)
|
||||
|
||||
# Make HunYuanVLProcessor resolvable through AutoProcessor.
AutoProcessor.register("HunYuanVLProcessor", HunYuanVLProcessor)
|
||||
477
vllm/transformers_utils/processors/hunyuan_vl_image.py
Normal file
477
vllm/transformers_utils/processors/hunyuan_vl_image.py
Normal file
@ -0,0 +1,477 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# adapted from https://github.com/ManaEstras/transformers/blob/v4.57.1.hyvl/src/transformers/models/hunyuan_vl/image_processing_hunyuan_vl.py
|
||||
"""Image processor class for HunYuanVL."""
|
||||
|
||||
# isort conflicts with ruff for transformers imports
|
||||
# isort: skip_file
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
import torchvision.transforms as transforms
|
||||
from transformers import AutoImageProcessor
|
||||
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
|
||||
from transformers.image_transforms import (
|
||||
convert_to_rgb,
|
||||
)
|
||||
from transformers.image_utils import (
|
||||
OPENAI_CLIP_MEAN,
|
||||
OPENAI_CLIP_STD,
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
make_flat_list_of_images,
|
||||
make_list_of_images,
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
)
|
||||
from transformers.utils import TensorType, logging
|
||||
from transformers.video_utils import VideoInput, make_batched_videos
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
def smart_resize(
    height: int,
    width: int,
    factor: int = 16,
    min_pixels: int = 512 * 512,
    max_pixels: int = 2048 * 2048,
):
    """Pick a resize target for (height, width) such that:

    1. Both output dimensions are divisible by `factor`.
    2. The output area lies within [`min_pixels`, `max_pixels`].
    3. The aspect ratio is preserved as closely as possible.

    Returns:
        Tuple ``(h_bar, w_bar)`` of the adjusted dimensions.

    Raises:
        ValueError: if the aspect ratio exceeds 200.
    """
    longer = max(height, width)
    shorter = min(height, width)
    if longer / shorter > 200:
        raise ValueError(
            "absolute aspect ratio must be smaller than 200, got "
            f"{max(height, width) / min(height, width)}"
        )

    # Start from the nearest multiples of `factor`.
    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    area = h_bar * w_bar

    if area > max_pixels:
        # Shrink proportionally, rounding DOWN so we stay under the cap.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = max(factor, math.floor(height / beta / factor) * factor)
        w_bar = max(factor, math.floor(width / beta / factor) * factor)
    elif area < min_pixels:
        # Grow proportionally, rounding UP so we reach the floor.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor

    return h_bar, w_bar
|
||||
|
||||
|
||||
class HunYuanVLImageProcessor(BaseImageProcessor):
    """Image processor for HunYuanVL: resizes via `smart_resize`, normalizes,
    and patchifies images into flattened patch rows plus a (t, h, w) grid."""

    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        size: dict[str, int] | None = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: int | float = 1 / 255,
        do_normalize: bool = True,
        image_mean: float | list[float] | None = None,
        image_std: float | list[float] | None = None,
        do_convert_rgb: bool = True,
        min_pixels: int | None = None,
        max_pixels: int | None = None,
        patch_size: int = 16,
        temporal_patch_size: int = 2,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        # Fix: the previous `if size is not None and (<missing keys>)` /
        # `else: size = defaults` structure silently DISCARDED a valid
        # user-supplied `size`, because a valid dict fell into the else branch.
        if size is not None:
            if "shortest_edge" not in size or "longest_edge" not in size:
                raise ValueError(
                    "size must contain 'shortest_edge' and 'longest_edge' keys."
                )
        else:
            size = {"shortest_edge": 512 * 512, "longest_edge": 2048 * 2048}
        # backward compatibility: override size with min_pixels and max_pixels
        # if they are provided.
        if min_pixels is not None:
            size["shortest_edge"] = min_pixels
        if max_pixels is not None:
            size["longest_edge"] = max_pixels
        self.min_pixels = size["shortest_edge"]
        self.max_pixels = size["longest_edge"]
        self.size = size

        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD

        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.do_convert_rgb = do_convert_rgb

        # hard-code
|
||||
    def _preprocess(
        self,
        images: ImageInput | VideoInput,
        do_resize: bool | None = None,
        size: dict[str, int] | None = None,
        resample: PILImageResampling = None,
        do_rescale: bool | None = None,
        rescale_factor: float | None = None,
        do_normalize: bool | None = None,
        image_mean: float | list[float] | None = None,
        image_std: float | list[float] | None = None,
        patch_size: int = 16,
        temporal_patch_size: int = 2,
        merge_size: int = 2,
        do_convert_rgb: bool | None = None,
        data_format: ChannelDimension | None = ChannelDimension.FIRST,
        input_data_format: str | ChannelDimension | None = None,
    ):
        """Resize, normalize and patchify one image (or the frames of one video).

        Returns:
            Tuple ``(flatten_patches, (grid_t, grid_h, grid_w))`` where
            ``flatten_patches`` has shape
            ``[grid_h * grid_w, C * patch_size**2]``.

        NOTE(review): `resample`, `do_rescale`, `rescale_factor`,
        `image_mean`, `image_std`, `size`, `data_format` and
        `input_data_format` are accepted but not used by this body
        (normalization reads `self.image_mean` / `self.image_std`, resizing
        reads `self.min_pixels` / `self.max_pixels`) -- confirm intended.
        """
        images = make_list_of_images(images)

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # NOTE(review): the resize target is derived from the FIRST image
        # only; all frames passed in one call are assumed to share its size.
        width, height = images[0].width, images[0].height
        resized_width, resized_height = width, height
        processed_images = []
        for image in images:
            if do_resize:
                # smart_resize is symmetric in its two leading positional
                # args; here it is fed (width, height) and unpacked in the
                # same order, so roles remain consistent.
                resized_width, resized_height = smart_resize(
                    width,
                    height,
                    factor=patch_size * merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
                image = image.resize((resized_width, resized_height))

            if do_normalize:
                # ToTensor also rescales 0-255 to 0-1, so the `do_rescale` /
                # `rescale_factor` parameters are effectively superseded here.
                image = transforms.Compose(
                    [
                        transforms.ToTensor(),
                        transforms.Normalize(self.image_mean, self.image_std),
                    ]
                )(image)
            processed_images.append(image)

        patches = np.array(processed_images)
        channel = patches.shape[1]
        # NOTE(review): for a single image this is 1 // 2 == 0; `grid_t` is
        # only returned, never used in the reshape below -- confirm intended.
        grid_t = patches.shape[0] // temporal_patch_size
        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
        # Expose merge blocks and patch pixels as separate axes ...
        patches = patches.reshape(
            1,
            channel,
            grid_h // merge_size,
            merge_size,
            patch_size,
            grid_w // merge_size,
            merge_size,
            patch_size,
        )
        # ... then order them so each output row is one spatial patch.
        patches = patches.transpose(0, 2, 3, 5, 6, 1, 4, 7)
        flatten_patches = patches.reshape(
            1 * grid_h * grid_w, channel * patch_size * patch_size
        )

        return flatten_patches, (grid_t, grid_h, grid_w)
|
||||
    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        do_resize: bool | None = None,
        size: dict[str, int] | None = None,
        min_pixels: int | None = None,
        max_pixels: int | None = None,
        resample: PILImageResampling = None,
        do_rescale: bool | None = None,
        rescale_factor: float | None = None,
        do_normalize: bool | None = None,
        image_mean: float | list[float] | None = None,
        image_std: float | list[float] | None = None,
        patch_size: int | None = None,
        temporal_patch_size: int | None = None,
        merge_size: int | None = None,
        do_convert_rgb: bool | None = None,
        return_tensors: str | TensorType | None = None,
        data_format: ChannelDimension | None = ChannelDimension.FIRST,
        input_data_format: str | ChannelDimension | None = None,
    ):
        """Preprocess a batch of images (and, deprecated, videos) into
        `pixel_values` plus `image_grid_thw` entries of a `BatchFeature`.

        Per-call arguments override the processor's defaults; any argument
        left as None falls back to the corresponding `self.*` attribute.
        `return_tensors` selects the output container type.
        """
        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        max_pixels = max_pixels if max_pixels is not None else self.max_pixels

        if size is not None:
            if "shortest_edge" not in size or "longest_edge" not in size:
                raise ValueError(
                    "size must contain 'shortest_edge' and 'longest_edge' keys."
                )
            # NOTE(review): only min_pixels is taken from `size` here;
            # max_pixels keeps its earlier value. Confirm whether
            # size["longest_edge"] should also be applied.
            min_pixels = size["shortest_edge"]
        elif min_pixels is not None and max_pixels is not None:
            # backward compatibility: override size with min_pixels and max_pixels
            # if they are provided.
            size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
        else:
            size = {**self.size}

        do_resize = do_resize if do_resize is not None else self.do_resize

        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = (
            rescale_factor if rescale_factor is not None else self.rescale_factor
        )
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        patch_size = patch_size if patch_size is not None else self.patch_size
        temporal_patch_size = (
            temporal_patch_size
            if temporal_patch_size is not None
            else self.temporal_patch_size
        )
        merge_size = merge_size if merge_size is not None else self.merge_size
        do_convert_rgb = (
            do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        )

        if images is not None:
            images = make_flat_list_of_images(images)

        if images is not None and not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )

        data = {}
        if images is not None:
            pixel_values, vision_grid_thws = [], []
            for image in images:
                patches, image_grid_thw = self._preprocess(
                    image,
                    do_resize=do_resize,
                    size=size,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    patch_size=patch_size,
                    temporal_patch_size=temporal_patch_size,
                    merge_size=merge_size,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data.update(
                {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
            )

        # kept for BC only and should be removed after v5.0
        if videos is not None:
            logger.warning(
                "`HunYuanVLV1ImageProcessor` works only with image inputs "
                "and doesn't process videos anymore. "
                "This is a deprecated behavior and will be removed in v5.0. "
                "Your videos should be forwarded to `HunYuanVLV1VideoProcessor`. "
            )
            videos = make_batched_videos(videos)
            pixel_values_videos, vision_grid_thws_videos = [], []
            for images in videos:
                patches, video_grid_thw = self._preprocess(
                    images,
                    do_resize=do_resize,
                    size=size,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    patch_size=patch_size,
                    temporal_patch_size=temporal_patch_size,
                    merge_size=merge_size,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                )
                pixel_values_videos.extend(patches)
                vision_grid_thws_videos.append(video_grid_thw)
            data.update(
                {
                    "pixel_values_videos": np.array(pixel_values_videos),
                    "video_grid_thw": np.array(vision_grid_thws_videos),
                }
            )

        return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
|
||||
"""
|
||||
A utility that returns number of image patches for a given image size.
|
||||
|
||||
Args:
|
||||
height (`int`):
|
||||
Height of the input image.
|
||||
width (`int`):
|
||||
Width of the input image.
|
||||
images_kwargs (`dict`, *optional*):
|
||||
Any kwargs to override defaults of the image processor.
|
||||
Returns:
|
||||
`int`: Number of image patches per image.
|
||||
"""
|
||||
min_pixels = (
|
||||
images_kwargs["min_pixels"]
|
||||
if "min_pixels" in images_kwargs
|
||||
else self.size["shortest_edge"]
|
||||
)
|
||||
max_pixels = (
|
||||
images_kwargs["max_pixels"]
|
||||
if "max_pixels" in images_kwargs
|
||||
else self.size["longest_edge"]
|
||||
)
|
||||
patch_size = images_kwargs.get("patch_size", self.patch_size)
|
||||
merge_size = images_kwargs.get("merge_size", self.merge_size)
|
||||
|
||||
factor = patch_size * merge_size
|
||||
resized_height, resized_width = smart_resize(
|
||||
height, width, factor, min_pixels=min_pixels, max_pixels=max_pixels
|
||||
)
|
||||
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
|
||||
return grid_h * (grid_w + 1) + 2
|
||||
|
||||
|
||||
AutoImageProcessor.register("HunYuanVLImageProcessor", HunYuanVLImageProcessor)
|
||||
@ -52,9 +52,11 @@ STR_FLASHINFER_ATTN_VAL: str = "FLASHINFER"
|
||||
STR_FLASH_ATTN_VAL: str = "FLASH_ATTN"
|
||||
STR_INVALID_VAL: str = "INVALID"
|
||||
|
||||
MASK_64_BITS = (1 << 64) - 1
|
||||
|
||||
|
||||
def random_uuid() -> str:
|
||||
return str(uuid.uuid4().hex)
|
||||
return f"{uuid.uuid4().int & MASK_64_BITS:016x}" # 16 hex chars
|
||||
|
||||
|
||||
def length_from_prompt_token_ids_or_embeds(
|
||||
|
||||
@ -7,6 +7,7 @@ import torch
|
||||
from vllm.attention.backends.abstract import AttentionBackend
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.v1.attention.backends.utils import (
|
||||
AttentionCGSupport,
|
||||
AttentionMetadataBuilder,
|
||||
CommonAttentionMetadata,
|
||||
split_decodes_and_prefills,
|
||||
@ -35,6 +36,8 @@ class LinearAttentionMetadata:
|
||||
class LinearAttentionMetadataBuilder(AttentionMetadataBuilder[LinearAttentionMetadata]):
|
||||
reorder_batch_threshold: int = 1
|
||||
|
||||
_cudagraph_support = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
kv_cache_spec: AttentionSpec,
|
||||
|
||||
@ -340,6 +340,8 @@ class MLACommonPrefillMetadata:
|
||||
max_seq_lens: list[int]
|
||||
seq_lens: torch.Tensor
|
||||
workspace: torch.Tensor
|
||||
token_to_seq: torch.Tensor
|
||||
chunk_total_token: list[int]
|
||||
|
||||
# for mla DCP
|
||||
padded_local_chunk_seq_lens: list[list[int]] | None = None
|
||||
@ -839,6 +841,19 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
|
||||
torch.cumsum(
|
||||
chunk_seq_lens, dim=1, out=cu_seq_lens_cpu[:, 1:], dtype=torch.int32
|
||||
)
|
||||
chunk_total_token = cu_seq_lens_cpu[:, -1]
|
||||
|
||||
max_token_num_over_chunk = chunk_total_token.max().item()
|
||||
token_to_seq_tensor_cpu = torch.zeros(
|
||||
[num_chunks, max_token_num_over_chunk], dtype=torch.int32
|
||||
)
|
||||
range_idx = torch.arange(num_prefills, dtype=torch.int32)
|
||||
for i in range(num_chunks):
|
||||
chunk_token_to_seq_tensor = torch.repeat_interleave(
|
||||
range_idx, chunk_seq_lens[i]
|
||||
)
|
||||
chunk_len = chunk_token_to_seq_tensor.shape[0]
|
||||
token_to_seq_tensor_cpu[i, :chunk_len] = chunk_token_to_seq_tensor
|
||||
|
||||
if self.dcp_world_size > 1:
|
||||
local_context_lens_allranks = get_dcp_local_seq_lens(
|
||||
@ -906,6 +921,10 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
|
||||
seq_tot=padded_local_chunk_seq_lens.sum(dim=1).tolist(),
|
||||
max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
|
||||
seq_lens=chunk_seq_lens,
|
||||
token_to_seq=token_to_seq_tensor_cpu.to(
|
||||
device, non_blocking=True
|
||||
),
|
||||
chunk_total_token=chunk_total_token.tolist(),
|
||||
workspace=self.chunked_prefill_workspace,
|
||||
padded_local_chunk_seq_lens=padded_local_chunk_seq_lens.tolist(),
|
||||
local_context_lens_allranks=local_context_lens_allranks.tolist(),
|
||||
@ -922,6 +941,10 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
|
||||
seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
|
||||
max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
|
||||
seq_lens=chunk_seq_lens,
|
||||
token_to_seq=token_to_seq_tensor_cpu.to(
|
||||
device, non_blocking=True
|
||||
),
|
||||
chunk_total_token=chunk_total_token,
|
||||
workspace=self.chunked_prefill_workspace,
|
||||
)
|
||||
|
||||
@ -1638,16 +1661,15 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
|
||||
output = None
|
||||
iters = len(prefill_metadata.chunked_context.seq_tot)
|
||||
workspace = prefill_metadata.chunked_context.workspace
|
||||
|
||||
for i in range(iters):
|
||||
toks = prefill_metadata.chunked_context.seq_tot[i]
|
||||
|
||||
ops.gather_and_maybe_dequant_cache(
|
||||
src_cache=kv_c_and_k_pe_cache,
|
||||
dst=workspace,
|
||||
block_table=prefill_metadata.block_table,
|
||||
cu_seq_lens=prefill_metadata.chunked_context.cu_seq_lens[i],
|
||||
batch_size=attn_metadata.num_prefills,
|
||||
token_to_seq=prefill_metadata.chunked_context.token_to_seq[i],
|
||||
num_tokens=prefill_metadata.chunked_context.chunk_total_token[i],
|
||||
kv_cache_dtype=self.kv_cache_dtype,
|
||||
scale=k_scale,
|
||||
seq_starts=prefill_metadata.chunked_context.starts[i],
|
||||
|
||||
@ -49,6 +49,8 @@ class AiterMLADecodeMetadata(MLACommonDecodeMetadata):
|
||||
paged_kv_last_page_len: torch.Tensor | None = None
|
||||
# The query indptr, shape : [num_decode + 1]
|
||||
qo_indptr: torch.Tensor | None = None
|
||||
# The dtype of MLA out tensor
|
||||
attn_out_dtype: torch.dtype = torch.bfloat16
|
||||
|
||||
|
||||
class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]):
|
||||
@ -74,6 +76,7 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
|
||||
)
|
||||
|
||||
self.compilation_config = vllm_config.compilation_config
|
||||
self.decode_attn_out_dtype = vllm_config.model_config.dtype
|
||||
# kernel block size is always 1.
|
||||
max_num_pages_per_req = vllm_config.model_config.max_model_len
|
||||
max_num_reqs = vllm_config.scheduler_config.max_num_seqs
|
||||
@ -162,6 +165,7 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
|
||||
paged_kv_last_page_len=paged_kv_last_page_len,
|
||||
qo_indptr=qo_indptr,
|
||||
dcp_tot_seq_lens=dcp_tot_seq_lens_device,
|
||||
attn_out_dtype=self.decode_attn_out_dtype,
|
||||
)
|
||||
|
||||
return attn_metadata
|
||||
@ -242,7 +246,11 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
|
||||
assert isinstance(q, torch.Tensor)
|
||||
B = q.shape[0]
|
||||
o = torch.zeros(
|
||||
B, self.num_heads, self.kv_lora_rank, dtype=q.dtype, device=q.device
|
||||
B,
|
||||
self.num_heads,
|
||||
self.kv_lora_rank,
|
||||
dtype=attn_metadata.decode.attn_out_dtype,
|
||||
device=q.device,
|
||||
)
|
||||
|
||||
kv_buffer = kv_c_and_k_pe_cache.unsqueeze(2)
|
||||
@ -260,6 +268,8 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
|
||||
attn_metadata.decode.paged_kv_indptr,
|
||||
attn_metadata.decode.paged_kv_indices,
|
||||
attn_metadata.decode.paged_kv_last_page_len,
|
||||
q_scale=layer._q_scale,
|
||||
kv_scale=layer._k_scale,
|
||||
)
|
||||
|
||||
return o, None
|
||||
|
||||
@ -971,7 +971,16 @@ def _get_kv_cache_groups_uniform_page_size(
|
||||
# is the minimum number of layers among all attention types. Need a better
|
||||
# strategy if we want to support more complex patterns (e.g., 20 full + 30
|
||||
# sw, where the group size should be 10).
|
||||
group_size = min([len(layers) for layers in same_type_layers.values()])
|
||||
min_num_layers = min([len(layers) for layers in same_type_layers.values()])
|
||||
group_size = min_num_layers
|
||||
max_num_layers = max([len(layers) for layers in same_type_layers.values()])
|
||||
if max_num_layers < min_num_layers * 1.25:
|
||||
# If the number of layers is not much larger than the minimum number of layers,
|
||||
# use the maximum number of layers as the group size to avoid too many padding
|
||||
# layers. A typical example is gpt-oss-20b + eagle, with 12 sw + 13 full. We
|
||||
# pad it to (13 sw, 13 full) instead of (12 sw, 24 full). 1.25 is just a
|
||||
# magic number to avoid too many padding layers.
|
||||
group_size = max_num_layers
|
||||
grouped_layers = []
|
||||
for layers in same_type_layers.values():
|
||||
num_padding_layers = group_size - len(layers) % group_size
|
||||
|
||||
@ -508,9 +508,9 @@ class Scheduler(SchedulerInterface):
|
||||
not self.scheduler_config.enable_chunked_prefill
|
||||
and num_new_tokens > token_budget
|
||||
):
|
||||
self.waiting.pop_request()
|
||||
skipped_waiting_requests.prepend_request(request)
|
||||
continue
|
||||
# If chunked_prefill is disabled,
|
||||
# we can stop the scheduling here.
|
||||
break
|
||||
|
||||
num_new_tokens = min(num_new_tokens, token_budget)
|
||||
assert num_new_tokens > 0
|
||||
|
||||
@ -440,57 +440,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
|
||||
# Setting default values
|
||||
self.record_sleep_state()
|
||||
|
||||
# GPU cache
|
||||
#
|
||||
# Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
|
||||
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
|
||||
# TODO: remove in 0.12.0
|
||||
if self.show_hidden_metrics:
|
||||
gauge_gpu_cache_usage = self._gauge_cls(
|
||||
name="vllm:gpu_cache_usage_perc",
|
||||
documentation=(
|
||||
"GPU KV-cache usage. 1 means 100 percent usage."
|
||||
"DEPRECATED: Use vllm:kv_cache_usage_perc instead."
|
||||
),
|
||||
multiprocess_mode="mostrecent",
|
||||
labelnames=labelnames,
|
||||
)
|
||||
self.gauge_gpu_cache_usage = make_per_engine(
|
||||
gauge_gpu_cache_usage, engine_indexes, model_name
|
||||
)
|
||||
|
||||
# Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
|
||||
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
|
||||
# TODO: remove in 0.12.0
|
||||
if self.show_hidden_metrics:
|
||||
counter_gpu_prefix_cache_queries = self._counter_cls(
|
||||
name="vllm:gpu_prefix_cache_queries",
|
||||
documentation=(
|
||||
"GPU prefix cache queries, in terms of number of queried"
|
||||
"tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
|
||||
),
|
||||
labelnames=labelnames,
|
||||
)
|
||||
self.counter_gpu_prefix_cache_queries = make_per_engine(
|
||||
counter_gpu_prefix_cache_queries, engine_indexes, model_name
|
||||
)
|
||||
|
||||
# Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
|
||||
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
|
||||
# TODO: remove in 0.12.0
|
||||
if self.show_hidden_metrics:
|
||||
counter_gpu_prefix_cache_hits = self._counter_cls(
|
||||
name="vllm:gpu_prefix_cache_hits",
|
||||
documentation=(
|
||||
"GPU prefix cache hits, in terms of number of cached "
|
||||
"tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."
|
||||
),
|
||||
labelnames=labelnames,
|
||||
)
|
||||
self.counter_gpu_prefix_cache_hits = make_per_engine(
|
||||
counter_gpu_prefix_cache_hits, engine_indexes, model_name
|
||||
)
|
||||
|
||||
gauge_kv_cache_usage = self._gauge_cls(
|
||||
name="vllm:kv_cache_usage_perc",
|
||||
documentation="KV-cache usage. 1 means 100 percent usage.",
|
||||
@ -735,39 +684,41 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
|
||||
)
|
||||
|
||||
# Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
|
||||
# TODO: in 0.12, only enable if show_hidden_metrics=True
|
||||
histogram_time_per_output_token = self._histogram_cls(
|
||||
name="vllm:time_per_output_token_seconds",
|
||||
documentation=(
|
||||
"Histogram of time per output token in seconds."
|
||||
"DEPRECATED: Use vllm:inter_token_latency_seconds instead."
|
||||
),
|
||||
buckets=[
|
||||
0.01,
|
||||
0.025,
|
||||
0.05,
|
||||
0.075,
|
||||
0.1,
|
||||
0.15,
|
||||
0.2,
|
||||
0.3,
|
||||
0.4,
|
||||
0.5,
|
||||
0.75,
|
||||
1.0,
|
||||
2.5,
|
||||
5.0,
|
||||
7.5,
|
||||
10.0,
|
||||
20.0,
|
||||
40.0,
|
||||
80.0,
|
||||
],
|
||||
labelnames=labelnames,
|
||||
)
|
||||
self.histogram_time_per_output_token = make_per_engine(
|
||||
histogram_time_per_output_token, engine_indexes, model_name
|
||||
)
|
||||
# With 0.12.x you can enable with --show-hidden-metrics-for-version=0.11
|
||||
# TODO: remove in 0.13.0
|
||||
if self.show_hidden_metrics:
|
||||
histogram_time_per_output_token = self._histogram_cls(
|
||||
name="vllm:time_per_output_token_seconds",
|
||||
documentation=(
|
||||
"Histogram of time per output token in seconds."
|
||||
"DEPRECATED: Use vllm:inter_token_latency_seconds instead."
|
||||
),
|
||||
buckets=[
|
||||
0.01,
|
||||
0.025,
|
||||
0.05,
|
||||
0.075,
|
||||
0.1,
|
||||
0.15,
|
||||
0.2,
|
||||
0.3,
|
||||
0.4,
|
||||
0.5,
|
||||
0.75,
|
||||
1.0,
|
||||
2.5,
|
||||
5.0,
|
||||
7.5,
|
||||
10.0,
|
||||
20.0,
|
||||
40.0,
|
||||
80.0,
|
||||
],
|
||||
labelnames=labelnames,
|
||||
)
|
||||
self.histogram_time_per_output_token = make_per_engine(
|
||||
histogram_time_per_output_token, engine_indexes, model_name
|
||||
)
|
||||
|
||||
histogram_inter_token_latency = self._histogram_cls(
|
||||
name="vllm:inter_token_latency_seconds",
|
||||
@ -966,20 +917,8 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
|
||||
self.gauge_scheduler_waiting[engine_idx].set(
|
||||
scheduler_stats.num_waiting_reqs
|
||||
)
|
||||
if self.show_hidden_metrics:
|
||||
self.gauge_gpu_cache_usage[engine_idx].set(
|
||||
scheduler_stats.kv_cache_usage
|
||||
)
|
||||
self.gauge_kv_cache_usage[engine_idx].set(scheduler_stats.kv_cache_usage)
|
||||
|
||||
if self.show_hidden_metrics:
|
||||
self.counter_gpu_prefix_cache_queries[engine_idx].inc(
|
||||
scheduler_stats.prefix_cache_stats.queries
|
||||
)
|
||||
self.counter_gpu_prefix_cache_hits[engine_idx].inc(
|
||||
scheduler_stats.prefix_cache_stats.hits
|
||||
)
|
||||
|
||||
self.counter_prefix_cache_queries[engine_idx].inc(
|
||||
scheduler_stats.prefix_cache_stats.queries
|
||||
)
|
||||
@ -1050,7 +989,8 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
|
||||
self.histogram_time_to_first_token[engine_idx].observe(ttft)
|
||||
for itl in iteration_stats.inter_token_latencies_iter:
|
||||
self.histogram_inter_token_latency[engine_idx].observe(itl)
|
||||
self.histogram_time_per_output_token[engine_idx].observe(itl)
|
||||
if self.show_hidden_metrics:
|
||||
self.histogram_time_per_output_token[engine_idx].observe(itl)
|
||||
|
||||
for finished_request in iteration_stats.finished_requests:
|
||||
self.counter_request_success[finished_request.finish_reason][
|
||||
|
||||
@ -40,6 +40,7 @@ from vllm.v1.sample.metadata import SamplingMetadata
|
||||
from vllm.v1.sample.sampler import _SAMPLING_EPS
|
||||
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
|
||||
from vllm.v1.utils import CpuGpuBuffer
|
||||
from vllm.v1.worker.dp_utils import coordinate_batch_across_dp
|
||||
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -65,6 +66,7 @@ class EagleProposer:
|
||||
self.dtype = vllm_config.model_config.dtype
|
||||
self.max_model_len = vllm_config.model_config.max_model_len
|
||||
self.block_size = vllm_config.cache_config.block_size
|
||||
self.dp_rank = vllm_config.parallel_config.data_parallel_rank
|
||||
self.num_speculative_tokens = self.speculative_config.num_speculative_tokens
|
||||
self.max_num_tokens = vllm_config.scheduler_config.max_num_batched_tokens
|
||||
self.token_arange_np = np.arange(self.max_num_tokens)
|
||||
@ -83,6 +85,9 @@ class EagleProposer:
|
||||
self.draft_indexer_metadata_builder: AttentionMetadataBuilder | None = None
|
||||
self.attn_layer_names: list[str] = []
|
||||
self.indexer_layer_names: list[str] = []
|
||||
self.eagle3_use_aux_hidden_state: bool = (
|
||||
self._get_eagle3_use_aux_hidden_state_from_config()
|
||||
)
|
||||
|
||||
self.use_cuda_graph = False
|
||||
|
||||
@ -268,15 +273,24 @@ class EagleProposer:
|
||||
assert draft_indexer_metadata is not None
|
||||
per_layer_attn_metadata[layer_name] = draft_indexer_metadata
|
||||
|
||||
num_tokens_dp_padded, num_tokens_across_dp = self._pad_batch_across_dp(
|
||||
num_tokens_unpadded=num_tokens,
|
||||
num_tokens_padded=num_tokens,
|
||||
)
|
||||
|
||||
cudagraph_runtime_mode = CUDAGraphMode.NONE
|
||||
if (
|
||||
self.use_cuda_graph
|
||||
and num_tokens <= self.compilation_config.max_cudagraph_capture_size
|
||||
and num_tokens_dp_padded
|
||||
<= self.compilation_config.max_cudagraph_capture_size
|
||||
):
|
||||
num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
|
||||
num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens_dp_padded)
|
||||
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
||||
else:
|
||||
num_input_tokens = num_tokens
|
||||
num_input_tokens = num_tokens_dp_padded
|
||||
if num_tokens_across_dp is not None:
|
||||
num_tokens_across_dp[self.dp_rank] = num_input_tokens
|
||||
|
||||
# copy inputs to buffer for cudagraph
|
||||
self._set_positions(num_tokens, target_positions)
|
||||
self.hidden_states[:num_tokens] = target_hidden_states
|
||||
@ -300,6 +314,7 @@ class EagleProposer:
|
||||
per_layer_attn_metadata,
|
||||
self.vllm_config,
|
||||
num_tokens=num_input_tokens,
|
||||
num_tokens_across_dp=num_tokens_across_dp,
|
||||
cudagraph_runtime_mode=cudagraph_runtime_mode,
|
||||
):
|
||||
ret_hidden_states = self.model(
|
||||
@ -362,15 +377,23 @@ class EagleProposer:
|
||||
# Generate the remaining draft tokens.
|
||||
draft_token_ids_list = [draft_token_ids]
|
||||
|
||||
batch_size_dp_padded, batch_size_across_dp = self._pad_batch_across_dp(
|
||||
num_tokens_unpadded=batch_size,
|
||||
num_tokens_padded=batch_size,
|
||||
)
|
||||
|
||||
if (
|
||||
self.use_cuda_graph
|
||||
and batch_size <= self.compilation_config.max_cudagraph_capture_size
|
||||
and batch_size_dp_padded
|
||||
<= self.compilation_config.max_cudagraph_capture_size
|
||||
):
|
||||
input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size)
|
||||
input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size_dp_padded)
|
||||
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
|
||||
else:
|
||||
input_batch_size = batch_size
|
||||
input_batch_size = batch_size_dp_padded
|
||||
cudagraph_runtime_mode = CUDAGraphMode.NONE
|
||||
if batch_size_across_dp is not None:
|
||||
batch_size_across_dp[self.dp_rank] = input_batch_size
|
||||
|
||||
common_attn_metadata.num_actual_tokens = batch_size
|
||||
common_attn_metadata.max_query_len = 1
|
||||
@ -471,6 +494,7 @@ class EagleProposer:
|
||||
per_layer_attn_metadata,
|
||||
self.vllm_config,
|
||||
num_tokens=input_batch_size,
|
||||
num_tokens_across_dp=batch_size_across_dp,
|
||||
cudagraph_runtime_mode=cudagraph_runtime_mode,
|
||||
):
|
||||
ret_hidden_states = self.model(
|
||||
@ -1113,36 +1137,56 @@ class EagleProposer:
|
||||
self,
|
||||
num_tokens: int,
|
||||
use_cudagraphs=True,
|
||||
is_graph_capturing=False,
|
||||
) -> None:
|
||||
# Determine if CUDA graphs should be used for this run.
|
||||
cudagraphs_enabled = use_cudagraphs and self.use_cuda_graph
|
||||
if (
|
||||
cudagraphs_enabled
|
||||
and num_tokens <= self.compilation_config.max_cudagraph_capture_size
|
||||
):
|
||||
num_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
|
||||
|
||||
with set_forward_context(
|
||||
None,
|
||||
self.vllm_config,
|
||||
num_tokens=num_tokens,
|
||||
cudagraph_runtime_mode=(
|
||||
CUDAGraphMode.PIECEWISE if cudagraphs_enabled else CUDAGraphMode.NONE
|
||||
),
|
||||
# FIXME: when using tree-based specdec, adjust number of forward-passes
|
||||
# according to the depth of the tree.
|
||||
for fwd_idx in range(
|
||||
self.num_speculative_tokens if not is_graph_capturing else 1
|
||||
):
|
||||
if self.supports_mm_inputs:
|
||||
input_ids = None
|
||||
inputs_embeds = self.inputs_embeds[:num_tokens]
|
||||
else:
|
||||
input_ids = self.input_ids[:num_tokens]
|
||||
inputs_embeds = None
|
||||
if fwd_idx <= 1:
|
||||
num_tokens_dp_padded, num_tokens_across_dp = self._pad_batch_across_dp(
|
||||
num_tokens_unpadded=num_tokens,
|
||||
num_tokens_padded=num_tokens,
|
||||
)
|
||||
if (
|
||||
cudagraphs_enabled
|
||||
and num_tokens_dp_padded
|
||||
<= self.compilation_config.max_cudagraph_capture_size
|
||||
):
|
||||
num_input_tokens = self.vllm_config.pad_for_cudagraph(
|
||||
num_tokens_dp_padded
|
||||
)
|
||||
else:
|
||||
num_input_tokens = num_tokens_dp_padded
|
||||
if num_tokens_across_dp is not None:
|
||||
num_tokens_across_dp[self.dp_rank] = num_input_tokens
|
||||
|
||||
self.model(
|
||||
input_ids=input_ids,
|
||||
positions=self._get_positions(num_tokens),
|
||||
hidden_states=self.hidden_states[:num_tokens],
|
||||
inputs_embeds=inputs_embeds,
|
||||
)
|
||||
with set_forward_context(
|
||||
None,
|
||||
self.vllm_config,
|
||||
num_tokens=num_input_tokens,
|
||||
num_tokens_across_dp=num_tokens_across_dp,
|
||||
cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE
|
||||
if cudagraphs_enabled
|
||||
else CUDAGraphMode.NONE,
|
||||
):
|
||||
if self.supports_mm_inputs:
|
||||
input_ids = None
|
||||
inputs_embeds = self.inputs_embeds[:num_input_tokens]
|
||||
else:
|
||||
input_ids = self.input_ids[:num_input_tokens]
|
||||
inputs_embeds = None
|
||||
|
||||
self.model(
|
||||
input_ids=input_ids,
|
||||
positions=self._get_positions(num_input_tokens),
|
||||
hidden_states=self.hidden_states[:num_input_tokens],
|
||||
inputs_embeds=inputs_embeds,
|
||||
)
|
||||
|
||||
def _get_attention_metadata_builder(self) -> AttentionMetadataBuilder:
|
||||
"""Find and return the attention metadata builders for EAGLE layers.
|
||||
@ -1169,6 +1213,22 @@ class EagleProposer:
|
||||
)
|
||||
return builder
|
||||
|
||||
def _get_eagle3_use_aux_hidden_state_from_config(self) -> bool:
|
||||
"""
|
||||
Some eagle3 heads (e.g., nvidia/gpt-oss-120b-Eagle3-v2) do not use auxiliary
|
||||
hidden states and directly uses the last layer output just like eagle1.
|
||||
They might indicate this by setting "use_aux_hidden_state" to False
|
||||
inside the "eagle_config" dict of their hf_config.
|
||||
"""
|
||||
if self.method != "eagle3":
|
||||
return False
|
||||
# Assume that eagle3 heads use aux hidden states by default
|
||||
use_aux_hidden_state = True
|
||||
eagle_config = getattr(self.draft_model_config.hf_config, "eagle_config", None)
|
||||
if eagle_config is not None:
|
||||
use_aux_hidden_state = eagle_config.get("use_aux_hidden_state", True)
|
||||
return use_aux_hidden_state
|
||||
|
||||
def validate_same_kv_cache_group(self, kv_cache_config: KVCacheConfig) -> None:
|
||||
"""
|
||||
Validate that all eagle layers belong to the same KVCacheGroup.
|
||||
@ -1192,6 +1252,28 @@ class EagleProposer:
|
||||
== 1
|
||||
), "All eagle layers should belong to the same kv cache group"
|
||||
|
||||
def _pad_batch_across_dp(
|
||||
self,
|
||||
num_tokens_unpadded: int,
|
||||
num_tokens_padded: int,
|
||||
) -> tuple[int, torch.Tensor]:
|
||||
# TODO(Flechman): support DBO ubatching
|
||||
ubatch_slices, num_toks_across_dp = coordinate_batch_across_dp(
|
||||
num_tokens_unpadded=num_tokens_unpadded,
|
||||
parallel_config=self.vllm_config.parallel_config,
|
||||
allow_microbatching=False,
|
||||
allow_dp_padding=self.use_cuda_graph,
|
||||
num_tokens_padded=num_tokens_padded,
|
||||
uniform_decode=None,
|
||||
num_scheduled_tokens_per_request=None,
|
||||
)
|
||||
assert ubatch_slices is None, "DBO ubatching not implemented for EAGLE"
|
||||
|
||||
num_tokens_dp_padded = num_tokens_padded
|
||||
if num_toks_across_dp is not None:
|
||||
num_tokens_dp_padded = int(num_toks_across_dp[self.dp_rank].item())
|
||||
return num_tokens_dp_padded, num_toks_across_dp
|
||||
|
||||
|
||||
# NOTE(woosuk): Currently, the below code is not used and we always use argmax
|
||||
# to sample the draft tokens. We will use this after we find a way to manage
|
||||
|
||||
@ -4,7 +4,6 @@ from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
import numba
|
||||
import numba.types as types
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
@ -37,6 +36,9 @@ class InputBuffers:
|
||||
self.seq_lens = torch.zeros(max_num_reqs, dtype=torch.int32, device=device)
|
||||
self.cu_num_logits = self._make_buffer(max_num_reqs + 1, dtype=torch.int32)
|
||||
|
||||
# Spec decoding.
|
||||
self.next_prefill_tokens = self._make_buffer(max_num_reqs, dtype=torch.int32)
|
||||
|
||||
# Structured outputs.
|
||||
self.bitmask_indices = self._make_buffer(max_num_reqs, dtype=torch.int32)
|
||||
self.grammar_bitmask = self._make_buffer(
|
||||
@ -144,80 +146,42 @@ class InputBatch:
|
||||
)
|
||||
|
||||
|
||||
# NOTE: With the type annotations, this function is pre-compiled
|
||||
# before the first call.
|
||||
@numba.jit(
|
||||
[
|
||||
types.none(
|
||||
types.int32[:], # idx_mapping
|
||||
types.int32[:], # num_scheduled_tokens
|
||||
types.int32[:, :], # prefill_token_ids
|
||||
types.int32[:], # num_computed_prefill_tokens
|
||||
types.int32[:], # prefill_len
|
||||
types.int32[:], # input_ids
|
||||
types.int32[:], # query_start_loc
|
||||
)
|
||||
],
|
||||
nopython=True,
|
||||
cache=True,
|
||||
)
|
||||
@numba.njit(cache=True)
|
||||
def _prepare_prefill_inputs(
|
||||
idx_mapping: np.ndarray, # batch_idx -> req_idx
|
||||
num_scheduled_tokens: np.ndarray, # [B]
|
||||
idx_mapping: np.ndarray, # [B]
|
||||
query_lens: np.ndarray, # [B]
|
||||
query_start_loc: np.ndarray, # [B + 1]
|
||||
prefill_token_ids: np.ndarray, # [N, max_model_len]
|
||||
num_computed_prefill_tokens: np.ndarray, # [N]
|
||||
prefill_len: np.ndarray, # [N]
|
||||
input_ids: np.ndarray, # [num_input_tokens]
|
||||
query_start_loc: np.ndarray, # [B + 1]
|
||||
) -> None:
|
||||
num_reqs = num_scheduled_tokens.shape[0]
|
||||
query_start_loc[0] = 0
|
||||
|
||||
cu_num_tokens = 0
|
||||
num_reqs = idx_mapping.shape[0]
|
||||
query_starts = query_start_loc[:num_reqs]
|
||||
query_ends = query_start_loc[1 : num_reqs + 1]
|
||||
starts = num_computed_prefill_tokens[idx_mapping]
|
||||
ends = starts + query_lens
|
||||
for i in range(num_reqs):
|
||||
req_idx = idx_mapping[i]
|
||||
query_len = num_scheduled_tokens[i]
|
||||
|
||||
start = num_computed_prefill_tokens[req_idx]
|
||||
end = min(start + query_len, prefill_len[req_idx])
|
||||
n = end - start
|
||||
|
||||
start_idx = cu_num_tokens
|
||||
input_ids[start_idx : start_idx + n] = prefill_token_ids[req_idx, start:end]
|
||||
|
||||
cu_num_tokens = start_idx + query_len
|
||||
query_start_loc[i + 1] = cu_num_tokens
|
||||
|
||||
# Pad the inputs for CUDA graphs.
|
||||
# Note: pad query_start_loc to be non-decreasing, as kernels
|
||||
# like FlashAttention requires that
|
||||
query_start_loc[num_reqs + 1 :].fill(cu_num_tokens)
|
||||
input_ids[query_starts[i] : query_ends[i]] = prefill_token_ids[
|
||||
idx_mapping[i], starts[i] : ends[i]
|
||||
]
|
||||
|
||||
|
||||
def prepare_prefill_inputs(
|
||||
idx_mapping: np.ndarray,
|
||||
num_scheduled_tokens: np.ndarray,
|
||||
total_num_tokens: int,
|
||||
query_start_loc: np.ndarray,
|
||||
prefill_token_ids: np.ndarray,
|
||||
num_computed_prefill_tokens: np.ndarray,
|
||||
prefill_len: np.ndarray,
|
||||
input_ids: CpuGpuBuffer,
|
||||
query_start_loc: CpuGpuBuffer,
|
||||
input_ids: np.ndarray,
|
||||
) -> None:
|
||||
_prepare_prefill_inputs(
|
||||
idx_mapping,
|
||||
num_scheduled_tokens,
|
||||
query_start_loc,
|
||||
prefill_token_ids,
|
||||
num_computed_prefill_tokens,
|
||||
prefill_len,
|
||||
input_ids.np,
|
||||
query_start_loc.np,
|
||||
input_ids,
|
||||
)
|
||||
input_ids.copy_to_gpu(total_num_tokens)
|
||||
# NOTE(woosuk): We should copy the whole query_start_loc and seq_lens
|
||||
# tensors from CPU to GPU, because they may include paddings needed
|
||||
# for full CUDA graph mode.
|
||||
query_start_loc.copy_to_gpu()
|
||||
|
||||
|
||||
@triton.jit
|
||||
@ -380,8 +344,8 @@ def _post_update_kernel(
|
||||
sampled_tokens_ptr,
|
||||
sampled_tokens_stride,
|
||||
num_sampled_ptr,
|
||||
num_rejected_ptr,
|
||||
query_start_loc_ptr,
|
||||
cu_num_logits_ptr,
|
||||
):
|
||||
req_id = tl.program_id(0)
|
||||
req_state_idx = tl.load(idx_mapping_ptr + req_id)
|
||||
@ -396,17 +360,10 @@ def _post_update_kernel(
|
||||
query_start = tl.load(query_start_loc_ptr + req_id)
|
||||
query_end = tl.load(query_start_loc_ptr + req_id + 1)
|
||||
query_len = query_end - query_start
|
||||
num_rejected = tl.load(num_rejected_ptr + req_id)
|
||||
|
||||
num_computed = tl.load(num_computed_tokens_ptr + req_state_idx)
|
||||
num_computed += query_len
|
||||
# Consider the rejected tokens in spec decoding.
|
||||
if num_sampled > 0:
|
||||
# NOTE(woosuk): We must skip num_sampled == 0 to account for chunked prefills.
|
||||
logits_start = tl.load(cu_num_logits_ptr + req_id)
|
||||
logits_end = tl.load(cu_num_logits_ptr + req_id + 1)
|
||||
num_logits = logits_end - logits_start
|
||||
num_rejected = num_logits - num_sampled
|
||||
num_computed -= num_rejected
|
||||
num_computed += query_len - num_rejected
|
||||
tl.store(num_computed_tokens_ptr + req_state_idx, num_computed)
|
||||
|
||||
|
||||
@ -421,10 +378,10 @@ def post_update(
|
||||
sampled_tokens: torch.Tensor,
|
||||
# [num_reqs]
|
||||
num_sampled: torch.Tensor,
|
||||
# [num_reqs]
|
||||
num_rejected: torch.Tensor,
|
||||
# [num_reqs + 1]
|
||||
query_start_loc: torch.Tensor,
|
||||
# [num_reqs + 1]
|
||||
cu_num_logits: torch.Tensor,
|
||||
) -> None:
|
||||
num_reqs = idx_mapping.shape[0]
|
||||
_post_update_kernel[(num_reqs,)](
|
||||
@ -434,7 +391,7 @@ def post_update(
|
||||
sampled_tokens,
|
||||
sampled_tokens.stride(0),
|
||||
num_sampled,
|
||||
num_rejected,
|
||||
query_start_loc,
|
||||
cu_num_logits,
|
||||
num_warps=1,
|
||||
)
|
||||
|
||||
@ -45,7 +45,11 @@ from vllm.v1.worker.gpu.input_batch import (
|
||||
prepare_prefill_inputs,
|
||||
)
|
||||
from vllm.v1.worker.gpu.sampler import Sampler, compute_prompt_logprobs
|
||||
from vllm.v1.worker.gpu.spec_decode.rejection_sample import rejection_sample
|
||||
from vllm.v1.worker.gpu.spec_decode import init_speculator
|
||||
from vllm.v1.worker.gpu.spec_decode.rejection_sample import (
|
||||
get_num_rejected,
|
||||
rejection_sample,
|
||||
)
|
||||
from vllm.v1.worker.gpu.states import RequestState, SamplingMetadata
|
||||
from vllm.v1.worker.gpu.structured_outputs import apply_grammar_bitmask
|
||||
from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin
|
||||
@ -97,16 +101,20 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
if self.use_async_scheduling:
|
||||
self.input_prep_event = torch.cuda.Event()
|
||||
self.structured_outputs_event = torch.cuda.Event()
|
||||
self.spec_decode_event = torch.cuda.Event()
|
||||
else:
|
||||
self.input_prep_event = None
|
||||
self.structured_outputs_event = None
|
||||
self.spec_decode_event = None
|
||||
|
||||
if self.speculative_config is not None:
|
||||
self.do_spec_decode = True
|
||||
self.num_speculative_steps = self.speculative_config.num_speculative_tokens
|
||||
self.speculator = init_speculator(self.vllm_config, self.device)
|
||||
else:
|
||||
self.do_spec_decode = False
|
||||
self.num_speculative_steps = 0
|
||||
self.speculator = None
|
||||
|
||||
self.req_states = RequestState(
|
||||
max_num_reqs=self.max_num_reqs,
|
||||
@ -153,6 +161,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
self.vllm_config,
|
||||
self.device,
|
||||
)
|
||||
if self.do_spec_decode:
|
||||
self.speculator.load_model(self.model)
|
||||
time_after_load = time.perf_counter()
|
||||
|
||||
self.model_memory_usage = m.consumed_memory
|
||||
@ -285,6 +295,35 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
logits = self.model.compute_logits(hidden_states)
|
||||
self.sampler(logits, sampling_metadata)
|
||||
|
||||
@torch.inference_mode()
def _dummy_speculator_run(
    self,
    hidden_states: torch.Tensor,
    aux_hidden_states: list[torch.Tensor] | None,
) -> None:
    """Exercise the draft-proposal path on dummy inputs.

    Used for warmup/memory profiling (called from ``profile_run``): it builds a
    dummy batch sized from ``hidden_states`` and runs ``propose_draft`` once so
    the speculator's allocations and kernels are triggered.

    Args:
        hidden_states: Target-model hidden states; its first dimension defines
            the number of dummy tokens. (Assumed [num_tokens, hidden_size] —
            TODO confirm against the caller.)
        aux_hidden_states: Optional per-layer hidden states forwarded to the
            speculator (``None`` in the profile_run call path).
    """
    num_tokens = hidden_states.shape[0]
    # One request per token, capped by the scheduler's request limit.
    num_reqs = min(num_tokens, self.max_num_reqs)
    input_batch = InputBatch.make_dummy(
        num_reqs=num_reqs,
        num_tokens=num_tokens,
        input_buffers=self.input_buffers,
        device=self.device,
    )
    sampling_metadata = SamplingMetadata.make_dummy(
        num_reqs=num_reqs,
        device=self.device,
    )
    # Pretend every request sampled exactly one token and rejected none, so
    # the proposal path runs in its common-case shape.
    num_sampled = torch.ones(num_reqs, dtype=torch.int32, device=self.device)
    num_rejected = torch.zeros(num_reqs, dtype=torch.int32, device=self.device)
    self.propose_draft(
        input_batch=input_batch,
        sampling_metadata=sampling_metadata,
        last_hidden_states=hidden_states,
        aux_hidden_states=aux_hidden_states,
        num_sampled=num_sampled,
        num_rejected=num_rejected,
    )
|
||||
|
||||
@torch.inference_mode()
|
||||
def profile_run(self) -> None:
|
||||
hidden_states, sample_hidden_states = self._dummy_run(
|
||||
@ -292,6 +331,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
skip_attn=True,
|
||||
)
|
||||
self._dummy_sampler_run(sample_hidden_states)
|
||||
if self.do_spec_decode:
|
||||
self._dummy_speculator_run(hidden_states, None)
|
||||
torch.cuda.synchronize()
|
||||
del hidden_states, sample_hidden_states
|
||||
gc.collect()
|
||||
@ -466,20 +507,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
# Block tables: num_kv_cache_groups x [num_reqs, max_num_blocks]
|
||||
block_tables = self.block_tables.gather_block_tables(idx_mapping)
|
||||
|
||||
# Copy prefill tokens from CPU to GPU and get query_start_loc.
|
||||
# Get query_start_loc.
|
||||
np.cumsum(
|
||||
num_scheduled_tokens,
|
||||
out=self.input_buffers.query_start_loc.np[1 : num_reqs + 1],
|
||||
)
|
||||
# Pad for full CUDA graph mode.
|
||||
# Some attention backends like FA3 require query_start_loc to be non-decreasing.
|
||||
self.input_buffers.query_start_loc.np[num_reqs + 1 :] = num_tokens
|
||||
self.input_buffers.query_start_loc.copy_to_gpu()
|
||||
query_start_loc_gpu = self.input_buffers.query_start_loc.gpu[: num_reqs + 1]
|
||||
query_start_loc_np = self.input_buffers.query_start_loc.np[: num_reqs + 1]
|
||||
|
||||
# Copy prefill tokens from CPU to GPU.
|
||||
prepare_prefill_inputs(
|
||||
idx_mapping_np,
|
||||
num_scheduled_tokens,
|
||||
num_tokens,
|
||||
query_start_loc_np,
|
||||
self.req_states.prefill_token_ids,
|
||||
self.req_states.num_computed_prefill_tokens,
|
||||
self.req_states.prefill_len.np,
|
||||
self.input_buffers.input_ids,
|
||||
self.input_buffers.query_start_loc,
|
||||
self.input_buffers.input_ids.np,
|
||||
)
|
||||
query_start_loc = self.input_buffers.query_start_loc
|
||||
query_start_loc_gpu = query_start_loc.gpu[: num_reqs + 1]
|
||||
query_start_loc_np = query_start_loc.np[: num_reqs + 1]
|
||||
self.input_buffers.input_ids.copy_to_gpu(num_tokens)
|
||||
|
||||
# Prepare positions and seq_lens.
|
||||
prepare_pos_seq_lens(
|
||||
@ -562,7 +611,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
input_batch: InputBatch,
|
||||
sampling_metadata: SamplingMetadata,
|
||||
grammar_output: GrammarOutput | None,
|
||||
) -> tuple[SamplerOutput, torch.Tensor]:
|
||||
) -> tuple[SamplerOutput, torch.Tensor, torch.Tensor]:
|
||||
sample_hidden_states = hidden_states[input_batch.logits_indices]
|
||||
logits = self.model.compute_logits(sample_hidden_states)
|
||||
if grammar_output is not None:
|
||||
@ -588,6 +637,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
# No draft tokens (common case).
|
||||
# 0 if chunked-prefilling, 1 if not.
|
||||
num_sampled = (~is_chunked_prefilling).int()
|
||||
num_rejected = torch.zeros_like(num_sampled)
|
||||
else:
|
||||
# Draft tokens for spec decoding.
|
||||
input_ids = input_batch.input_ids[input_batch.logits_indices]
|
||||
@ -598,9 +648,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
self.num_speculative_steps,
|
||||
)
|
||||
num_sampled *= ~is_chunked_prefilling
|
||||
num_rejected = get_num_rejected(
|
||||
input_batch.cu_num_logits,
|
||||
num_sampled,
|
||||
)
|
||||
sampler_output.sampled_token_ids = sampled_tokens
|
||||
# TODO(woosuk): Support logprobs with spec decoding.
|
||||
return sampler_output, num_sampled
|
||||
return sampler_output, num_sampled, num_rejected
|
||||
|
||||
def compute_prompt_logprobs(
|
||||
self,
|
||||
@ -706,6 +760,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
input_batch: InputBatch,
|
||||
sampled_tokens: torch.Tensor,
|
||||
num_sampled: torch.Tensor,
|
||||
num_rejected: torch.Tensor,
|
||||
) -> None:
|
||||
# Update the number of computed tokens.
|
||||
post_update(
|
||||
@ -714,8 +769,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
self.req_states.last_sampled_tokens,
|
||||
sampled_tokens,
|
||||
num_sampled,
|
||||
num_rejected,
|
||||
input_batch.query_start_loc,
|
||||
input_batch.cu_num_logits,
|
||||
)
|
||||
|
||||
# Update the number of computed prefill tokens.
|
||||
@ -727,6 +782,43 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
self.req_states.prefill_len.np[idx_mapping_np],
|
||||
)
|
||||
|
||||
@torch.inference_mode()
def propose_draft(
    self,
    input_batch: InputBatch,
    sampling_metadata: SamplingMetadata,
    last_hidden_states: torch.Tensor,
    aux_hidden_states: list[torch.Tensor] | None,
    num_sampled: torch.Tensor,
    num_rejected: torch.Tensor,
) -> torch.Tensor:
    """Run the speculator to propose draft tokens for the current batch.

    Gathers, on CPU, the next not-yet-computed prefill token for each request
    (used by the speculator when a request is still chunked-prefilling), copies
    it to GPU, invokes the speculator, and caches the resulting drafts in
    ``self.req_states.draft_tokens``.

    Args:
        input_batch: The batch the target model just ran on.
        sampling_metadata: Per-request sampling parameters.
        last_hidden_states: Target-model hidden states for the batch.
        aux_hidden_states: Optional auxiliary hidden states (e.g. for eagle3);
            may be ``None``.
        num_sampled: [num_reqs] number of tokens accepted per request.
        num_rejected: [num_reqs] number of draft tokens rejected per request.

    Returns:
        The draft tokens produced by the speculator.
    """
    num_reqs = input_batch.num_reqs
    idx_mapping_np = input_batch.idx_mapping_np
    # Guard the CPU-side buffer write with the spec-decode event so async
    # scheduling does not overwrite it while a previous copy is in flight.
    with async_barrier(self.spec_decode_event):
        # For each request, pick the first prefill token that has not been
        # computed yet; indexed via the batch->request-state mapping.
        self.input_buffers.next_prefill_tokens.np[:num_reqs] = (
            self.req_states.prefill_token_ids[
                idx_mapping_np,
                self.req_states.num_computed_prefill_tokens[idx_mapping_np],
            ]
        )
        next_prefill_tokens = self.input_buffers.next_prefill_tokens.copy_to_gpu(
            num_reqs
        )

    # Only called when spec decoding is enabled (speculator initialized).
    assert self.speculator is not None
    draft_tokens = self.speculator.propose(
        input_batch,
        sampling_metadata,
        last_hidden_states,
        aux_hidden_states,
        num_sampled,
        num_rejected,
        self.req_states.last_sampled_tokens,
        next_prefill_tokens,
    )
    # Persist drafts per request state so the next step can verify them.
    self.req_states.draft_tokens[input_batch.idx_mapping] = draft_tokens
    return draft_tokens
|
||||
|
||||
def get_cudagraph_and_dp_padding(
|
||||
self,
|
||||
scheduler_output: SchedulerOutput,
|
||||
@ -879,7 +971,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
self.execute_model_state = None # type: ignore
|
||||
assert sampling_metadata is not None
|
||||
|
||||
sampler_output, num_sampled_tokens = self.sample(
|
||||
sampler_output, num_sampled, num_rejected = self.sample(
|
||||
hidden_states, input_batch, sampling_metadata, grammar_output
|
||||
)
|
||||
prompt_logprobs_dict = self.compute_prompt_logprobs(hidden_states, input_batch)
|
||||
@ -900,7 +992,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
async_output = AsyncOutput(
|
||||
model_runner_output=model_runner_output,
|
||||
sampler_output=sampler_output,
|
||||
num_sampled_tokens=num_sampled_tokens,
|
||||
num_sampled_tokens=num_sampled,
|
||||
copy_stream=self.output_copy_stream,
|
||||
copy_event=self.output_copy_event,
|
||||
)
|
||||
@ -911,8 +1003,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
# This sequencing may slightly reduce latency as async D2H copy does not
|
||||
# need to wait for the postprocess to finish.
|
||||
self.postprocess(
|
||||
input_batch, sampler_output.sampled_token_ids, num_sampled_tokens
|
||||
input_batch, sampler_output.sampled_token_ids, num_sampled, num_rejected
|
||||
)
|
||||
if self.do_spec_decode:
|
||||
_ = self.propose_draft(
|
||||
input_batch,
|
||||
sampling_metadata,
|
||||
hidden_states,
|
||||
None, # aux_hidden_states
|
||||
num_sampled,
|
||||
num_rejected,
|
||||
)
|
||||
|
||||
if self.use_async_scheduling:
|
||||
return async_output
|
||||
|
||||
@ -100,8 +100,9 @@ def _gumbel_sample_kernel(
|
||||
mask=mask,
|
||||
other=float("-inf"),
|
||||
)
|
||||
logits = logits.to(tl.float32)
|
||||
|
||||
temp = tl.load(temp_ptr + req_idx)
|
||||
temp = tl.load(temp_ptr + req_idx).to(tl.float32)
|
||||
if temp != 0.0:
|
||||
# Calculate the seed for gumbel noise.
|
||||
seed = tl.load(seeds_ptr + req_idx)
|
||||
@ -116,7 +117,7 @@ def _gumbel_sample_kernel(
|
||||
# Apply temperature.
|
||||
if APPLY_TEMPERATURE:
|
||||
# NOTE(woosuk): Use div_rn to match the behavior of torch.
|
||||
logits = tl.div_rn(logits, temp.to(tl.float32))
|
||||
logits = tl.div_rn(logits, temp)
|
||||
|
||||
# Apply gumbel noise.
|
||||
logits = tl.where(mask, logits + gumbel_noise, float("-inf"))
|
||||
|
||||
@ -0,0 +1,18 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import torch
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
|
||||
|
||||
def init_speculator(
    vllm_config: VllmConfig,
    device: torch.device,
):
    """Create the speculator selected by ``vllm_config.speculative_config``.

    Currently only EAGLE-style speculators are supported; any other method
    raises ``NotImplementedError``.
    """
    speculative_config = vllm_config.speculative_config
    assert speculative_config is not None
    if not speculative_config.use_eagle():
        raise NotImplementedError(f"{speculative_config.method} is not supported yet.")
    # Imported lazily to avoid a circular import with the worker modules.
    from vllm.v1.worker.gpu.spec_decode.eagle import EagleSpeculator

    return EagleSpeculator(vllm_config, device)
|
||||
209
vllm/v1/worker/gpu/spec_decode/eagle.py
Normal file
209
vllm/v1/worker/gpu/spec_decode/eagle.py
Normal file
@ -0,0 +1,209 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.compilation import CUDAGraphMode
|
||||
from vllm.forward_context import set_forward_context
|
||||
from vllm.model_executor.model_loader import get_model
|
||||
from vllm.triton_utils import tl, triton
|
||||
from vllm.v1.worker.gpu.input_batch import InputBatch
|
||||
from vllm.v1.worker.gpu.sampler import gumbel_sample
|
||||
from vllm.v1.worker.gpu.states import SamplingMetadata
|
||||
|
||||
|
||||
class EagleSpeculator:
    """EAGLE-style draft-token speculator running alongside the target model.

    Owns a small draft model (loaded via ``load_model``) plus persistent GPU
    buffers for its input ids and positions, and proposes draft tokens from
    the target model's hidden states in ``propose``.
    """

    def __init__(self, vllm_config: VllmConfig, device: torch.device):
        self.vllm_config = vllm_config
        self.device = device

        self.speculative_config = vllm_config.speculative_config
        assert self.speculative_config is not None
        # e.g. "eagle3" or "mtp" — branches on this in propose().
        self.method = self.speculative_config.method
        self.num_speculative_steps = self.speculative_config.num_speculative_tokens
        self.draft_model_config = self.speculative_config.draft_model_config

        self.scheduler_config = vllm_config.scheduler_config
        self.max_num_reqs = self.scheduler_config.max_num_seqs
        self.max_num_tokens = self.scheduler_config.max_num_batched_tokens

        # Persistent buffers sized to the largest possible batch; reused
        # across steps to avoid per-step allocation.
        self.input_ids = torch.zeros(
            self.max_num_tokens, dtype=torch.int32, device=device
        )
        self.positions = torch.zeros(
            self.max_num_tokens, dtype=torch.int64, device=device
        )

    def load_model(self, target_model: nn.Module) -> None:
        """Load the draft model and share the target model's lm_head with it."""
        from vllm.compilation.backends import set_model_tag

        # Tag so the compiled draft head is cached separately from the target.
        with set_model_tag("eagle_head"):
            self.model = get_model(
                vllm_config=self.vllm_config, model_config=self.draft_model_config
            )

        # Reuse the target model's lm_head to save memory; drop any lm_head
        # the draft checkpoint may have loaded.
        share_lm_head = True
        if share_lm_head and hasattr(target_model, "lm_head"):
            if hasattr(self.model, "lm_head"):
                del self.model.lm_head
            self.model.lm_head = target_model.lm_head

    @torch.inference_mode()
    def propose(
        self,
        input_batch: InputBatch,
        sampling_metadata: SamplingMetadata,
        # [num_tokens, hidden_size]
        last_hidden_states: torch.Tensor,
        # num_layers x [num_tokens, hidden_size]
        aux_hidden_states: list[torch.Tensor] | None,
        # [num_reqs]
        num_sampled: torch.Tensor,
        # [num_reqs]
        num_rejected: torch.Tensor,
        # [max_num_reqs, 1]
        last_sampled: torch.Tensor,
        # [num_reqs]
        next_prefill_tokens: torch.Tensor,
    ) -> torch.Tensor:
        """Propose draft tokens for every request in the batch.

        Returns a tensor of draft tokens shaped [num_reqs, 1] (only a single
        speculative step is implemented so far).
        """
        # NOTE(woosuk): To avoid CPU-GPU synchronization without CPU knowing the
        # number of rejected tokens, we maintain the size of eagle's input_ids and
        # hidden_states the same as the target model's. This means, we pad each
        # request's query length to include any rejected positions. By doing so,
        # we can also reuse the attention metadata (e.g., query_start_loc,
        # seq_lens) of the target model.
        if aux_hidden_states:
            # eagle3 consumes multiple target layers fused into one tensor.
            assert self.method == "eagle3"
            hidden_states = self.model.combine_hidden_states(
                torch.cat(aux_hidden_states, dim=-1)
            )
        else:
            hidden_states = last_hidden_states

        # Get the input ids and last token indices for the speculator.
        last_token_indices = prepare_eagle_inputs(
            self.input_ids,
            input_batch,
            num_sampled,
            num_rejected,
            last_sampled,
            next_prefill_tokens,
        )
        input_ids = self.input_ids[: input_batch.num_tokens_after_padding]

        # Prefill: Run the eagle speculator with eager mode.
        with set_forward_context(
            input_batch.attn_metadata,
            self.vllm_config,
            num_tokens=input_batch.num_tokens_after_padding,
            cudagraph_runtime_mode=CUDAGraphMode.NONE,
        ):
            ret_hidden_states = self.model(
                input_ids=input_ids,
                positions=input_batch.positions,
                hidden_states=hidden_states,
            )
        if self.method == "mtp":
            # MTP returns a single tensor used for both sampling and chaining.
            last_hidden_states = ret_hidden_states
            hidden_states = ret_hidden_states
        else:
            # Other methods return (sampling states, chaining states).
            last_hidden_states, hidden_states = ret_hidden_states
        sample_hidden_states = last_hidden_states[last_token_indices]
        logits = self.model.compute_logits(sample_hidden_states)

        num_reqs = input_batch.num_reqs
        # Index per-request sampling params at each request's first logit slot.
        cu_num_logits = input_batch.cu_num_logits[:num_reqs]
        temperature = sampling_metadata.temperature[cu_num_logits]
        seed = sampling_metadata.seeds[cu_num_logits]
        # NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise
        # used for draft and target sampling.
        pos = input_batch.positions[last_token_indices] + 1
        # NOTE(woosuk): For draft sampling, we only consider the temperature
        # and ignore the other sampling parameters such as top_k and top_p,
        # for simplicity and performance.
        # While this may slightly degrade the acceptance rate, it does not
        # affect the output distribution after rejection sampling.
        draft_tokens = gumbel_sample(
            logits, temperature, seed, pos, apply_temperature=True
        )
        if self.num_speculative_steps == 1:
            # Early exit.
            return draft_tokens.view(-1, 1)
        raise NotImplementedError("num_speculative_steps > 1 is not supported yet.")
|
||||
|
||||
|
||||
@triton.jit
def _prepare_eagle_inputs_kernel(
    last_token_indices_ptr,   # out: [num_reqs] index of each request's last token
    eagle_input_ids_ptr,      # out: speculator input ids (target ids shifted by one)
    target_input_ids_ptr,     # in:  target model's flattened input ids
    idx_mapping_ptr,          # in:  [num_reqs] batch index -> request-state index
    last_sampled_ptr,         # in:  last sampled token per request state
    next_prefill_tokens_ptr,  # in:  [num_reqs] next prefill token (chunked prefill)
    num_sampled_ptr,          # in:  [num_reqs] accepted-token counts
    num_rejected_ptr,         # in:  [num_reqs] rejected-token counts
    query_start_loc_ptr,      # in:  [num_reqs + 1] cumulative query offsets
    BLOCK_SIZE: tl.constexpr,
):
    # One program per request (grid = (num_reqs,)).
    batch_idx = tl.program_id(0)
    query_start = tl.load(query_start_loc_ptr + batch_idx)
    query_end = tl.load(query_start_loc_ptr + batch_idx + 1)
    query_len = query_end - query_start

    # Get the true query length and next token after accounting for rejected tokens.
    num_rejected = tl.load(num_rejected_ptr + batch_idx)
    query_len -= num_rejected

    num_sampled = tl.load(num_sampled_ptr + batch_idx)
    if num_sampled > 0:
        # Decode / finished prefill: the speculator's last input token is the
        # token the target model just sampled for this request.
        req_state_idx = tl.load(idx_mapping_ptr + batch_idx)
        next_token = tl.load(last_sampled_ptr + req_state_idx).to(tl.int32)
    else:
        # Chunked prefilling.
        # Get the next prefill token.
        next_token = tl.load(next_prefill_tokens_ptr + batch_idx)

    # Shift target_input_ids by one.
    # Copies positions [1, query_len) of this request's target ids into
    # positions [0, query_len - 1) of the eagle ids, BLOCK_SIZE at a time.
    for i in range(1, query_len, BLOCK_SIZE):
        block = i + tl.arange(0, BLOCK_SIZE)
        mask = block < query_len
        input_ids = tl.load(target_input_ids_ptr + query_start + block, mask=mask)
        tl.store(eagle_input_ids_ptr + query_start + block - 1, input_ids, mask=mask)

    # The final slot holds the freshly determined next token, and its index is
    # recorded so the caller can gather the matching hidden state.
    last_token_index = query_start + query_len - 1
    tl.store(last_token_indices_ptr + batch_idx, last_token_index)
    tl.store(eagle_input_ids_ptr + last_token_index, next_token)
|
||||
|
||||
|
||||
def prepare_eagle_inputs(
    eagle_input_ids: torch.Tensor,
    input_batch: InputBatch,
    # [num_reqs]
    num_sampled: torch.Tensor,
    # [num_reqs]
    num_rejected: torch.Tensor,
    # [max_num_reqs, 1]
    last_sampled: torch.Tensor,
    # [max_num_reqs]
    next_prefill_tokens: torch.Tensor,
) -> torch.Tensor:
    """Fill ``eagle_input_ids`` for the speculator and return, per request,
    the flattened index of its last (true, rejection-adjusted) token.

    Launches one kernel program per request; ``eagle_input_ids`` is written
    in place with the target input ids shifted by one plus the next token.
    """
    batch_size = input_batch.num_reqs
    out_indices = torch.empty(
        batch_size, dtype=torch.int64, device=eagle_input_ids.device
    )
    grid = (batch_size,)
    _prepare_eagle_inputs_kernel[grid](
        out_indices,
        eagle_input_ids,
        input_batch.input_ids,
        input_batch.idx_mapping,
        last_sampled,
        next_prefill_tokens,
        num_sampled,
        num_rejected,
        input_batch.query_start_loc,
        BLOCK_SIZE=1024,
    )
    return out_indices
|
||||
@ -69,3 +69,15 @@ def rejection_sample(
|
||||
num_warps=1,
|
||||
)
|
||||
return sampled, num_sampled
|
||||
|
||||
|
||||
@torch.compile(dynamic=True)
|
||||
def get_num_rejected(
|
||||
cu_num_logits: torch.Tensor,
|
||||
num_sampled: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
num_logits = cu_num_logits[1:] - cu_num_logits[:-1]
|
||||
num_rejected = num_logits - num_sampled
|
||||
# No token is rejected for chunked prefills.
|
||||
num_rejected *= num_sampled > 0
|
||||
return num_rejected
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user