Merge branch 'main' into imarkov/eplb_optimizations

This commit is contained in:
ilmarkov 2025-11-25 16:18:25 +00:00
commit 691f09036c
102 changed files with 4369 additions and 1380 deletions

View File

@ -192,6 +192,7 @@ steps:
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@ -970,6 +971,7 @@ steps:
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- vllm/model_executor/layers/fused_moe/layer.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
@ -1116,6 +1118,7 @@ steps:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py

View File

@ -136,7 +136,7 @@ elseif(HIP_FOUND)
# ROCm 5.X and 6.X
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
"expected for ROCm build, saw ${Torch_VERSION} instead.")
endif()

View File

@ -41,11 +41,12 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
const double scale, const std::string& kv_cache_dtype);
void gather_and_maybe_dequant_cache(
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
int64_t batch_size, const std::string& kv_cache_dtype,
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS]
int64_t num_tokens, const std::string& kv_cache_dtype,
torch::Tensor const& scale,
std::optional<torch::Tensor> seq_starts = std::nullopt);

View File

@ -905,91 +905,79 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
namespace vllm {
// grid is launched with dimensions (batch, num_splits)
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt,
int ENTRY_SIZE, int CTA_SIZE>
__global__ void gather_and_maybe_dequant_cache(
const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
// ENTRIES...]
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
const int32_t block_size, const int32_t entry_size,
const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
// ENTRIES...]
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
const int32_t* __restrict__ token_to_seq, // [MAX_TOKEN_ACROSS_CHUNK]
const int32_t num_tokens, const int32_t block_size,
const int64_t block_table_stride, const int64_t cache_block_stride,
const int64_t cache_entry_stride, const int64_t dst_entry_stride,
const float* __restrict__ scale,
const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per
// batch
constexpr int vec_size = sizeof(float4) / sizeof(scalar_t);
using ltype = vllm::vec_n_t<cache_t, vec_size>;
using stype = vllm::vec_n_t<scalar_t, vec_size>;
// We are adding this for code readability which will be optimized out when
// build in release.
assert(CTA_SIZE == blockDim.x);
const int64_t bid = blockIdx.x; // Batch ID
const int32_t num_splits = gridDim.y;
const int32_t split = blockIdx.y;
const int32_t seq_start = cu_seq_lens[bid];
const int32_t seq_end = cu_seq_lens[bid + 1];
const int32_t seq_len = seq_end - seq_start;
const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size);
const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits);
#pragma unroll
for (int token_id = blockIdx.x; token_id < num_tokens;
token_id += gridDim.x) {
int64_t batch_id = token_to_seq[token_id];
int64_t batch_start = cu_seq_lens[batch_id];
int64_t batch_end = cu_seq_lens[batch_id + 1];
int32_t batch_offset = token_id - batch_start;
const int32_t split_start = split * split_blocks;
const int32_t split_end = min((split + 1) * split_blocks, tot_blocks);
if (token_id >= batch_end) return;
int32_t offset = 0;
if (seq_starts != nullptr) {
offset = seq_starts[batch_id];
}
batch_offset += offset;
int32_t block_table_id = batch_offset / block_size;
int32_t slot_id = batch_offset % block_size;
int32_t block_table_offset = batch_id * block_table_stride + block_table_id;
int32_t block_id = block_table[block_table_offset];
int64_t cache_offset =
block_id * cache_block_stride + slot_id * cache_entry_stride;
constexpr int32_t vec_iter_cnt = ENTRY_SIZE / vec_size;
scalar_t* dst_ = dst + token_id * dst_entry_stride;
cache_t* src_ = const_cast<cache_t*>(src_cache) + cache_offset;
const bool is_active_split = (split_start < tot_blocks);
const bool is_last_split = (split_end == tot_blocks);
if (!is_active_split) return;
int32_t full_blocks_end = split_end;
int32_t partial_block_size = 0;
// Adjust the pointer for the block_table for this batch.
// If seq_starts is provided, compute an offset based on (seq_starts[bid] /
// page_size)
const int32_t batch_offset = bid * block_table_stride;
int32_t offset = 0;
if (seq_starts != nullptr) {
offset = seq_starts[bid] / block_size;
}
const int32_t* batch_block_table = block_table + batch_offset + offset;
// Adjust dst pointer based on the cumulative sequence lengths.
dst += seq_start * dst_entry_stride;
if (is_last_split) {
partial_block_size = seq_len % block_size;
if (partial_block_size) full_blocks_end -= 1;
}
auto copy_entry = [&](const cache_t* __restrict__ _src,
scalar_t* __restrict__ _dst) {
for (int i = threadIdx.x; i < entry_size; i += blockDim.x) {
#pragma unroll
for (int idx = threadIdx.x; idx < vec_iter_cnt; idx += CTA_SIZE) {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
_dst[i] = static_cast<scalar_t>(_src[i]);
reinterpret_cast<stype*>(dst_)[idx] =
static_cast<stype>(reinterpret_cast<ltype*>(src_)[idx]);
} else {
_dst[i] =
fp8::scaled_convert<scalar_t, cache_t, kv_dt>(_src[i], *scale);
ltype loaded_val = reinterpret_cast<ltype*>(src_)[idx];
stype store_val;
#pragma unroll
for (int j = 0; j < vec_size; ++j) {
store_val.val[j] = fp8::scaled_convert<scalar_t, cache_t, kv_dt>(
loaded_val.val[j], *scale);
}
reinterpret_cast<stype*>(dst_)[idx] = store_val;
}
}
};
const auto loop_end =
std::min((int64_t)full_blocks_end, block_table_stride - offset);
for (int pid = split_start; pid < loop_end; ++pid) {
auto block_id = batch_block_table[pid];
auto block_start_ptr = src_cache + block_id * cache_block_stride;
auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
for (int eid = 0; eid < block_size; ++eid) {
copy_entry(block_start_ptr + eid * cache_entry_stride,
block_dst_ptr + eid * dst_entry_stride);
}
}
if (partial_block_size) {
if (offset + full_blocks_end < block_table_stride) {
auto block_id = batch_block_table[full_blocks_end];
auto block_start_ptr = src_cache + block_id * cache_block_stride;
auto block_dst_ptr =
dst + full_blocks_end * block_size * dst_entry_stride;
for (int eid = 0; eid < partial_block_size; ++eid) {
copy_entry(block_start_ptr + eid * cache_entry_stride,
block_dst_ptr + eid * dst_entry_stride);
// process tail
constexpr int32_t tail_cnt = ENTRY_SIZE % vec_size;
dst_ = dst_ + ENTRY_SIZE - tail_cnt;
src_ = src_ + ENTRY_SIZE - tail_cnt;
#pragma unroll
for (int idx = threadIdx.x; idx < tail_cnt; idx += CTA_SIZE) {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
dst_[idx] = static_cast<scalar_t>(src_[idx]);
} else {
dst_[idx] =
fp8::scaled_convert<scalar_t, cache_t, kv_dt>(src_[idx], *scale);
}
}
}
@ -1001,34 +989,38 @@ __global__ void gather_and_maybe_dequant_cache(
// SCALAR_T is the data type of the destination tensor.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
reinterpret_cast<SCALAR_T*>(dst.data_ptr()), \
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
block_size, entry_size, block_table_stride, cache_block_stride, \
cache_entry_stride, dst_entry_stride, \
reinterpret_cast<const float*>(scale.data_ptr()), seq_starts_ptr);
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576, \
thread_block_size> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
reinterpret_cast<SCALAR_T*>(dst.data_ptr()), \
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
token_to_seq.data_ptr<int32_t>(), num_tokens, block_size, \
block_table_stride, cache_block_stride, cache_entry_stride, \
dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
seq_starts_ptr);
// Gather sequences from the cache into the destination tensor.
// - cu_seq_lens contains the cumulative sequence lengths for each batch
// - block_table contains the cache block indices for each sequence
// - token_to_seq contains the back mapping from token_id to batch_id
// - Optionally, seq_starts (if provided) offsets the starting block index by
// (seq_starts[bid] / page_size)
void gather_and_maybe_dequant_cache(
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
int64_t batch_size, const std::string& kv_cache_dtype,
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS]
int64_t num_tokens, const std::string& kv_cache_dtype,
torch::Tensor const& scale,
std::optional<torch::Tensor> seq_starts = std::nullopt) {
at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int32_t block_size = src_cache.size(1);
int32_t entry_size = src_cache.flatten(2, -1).size(2);
int32_t head_dim = dst.size(-1);
TORCH_CHECK(block_table.dtype() == torch::kInt32,
"block_table must be int32");
@ -1038,6 +1030,9 @@ void gather_and_maybe_dequant_cache(
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
"seq_starts must be int32");
}
TORCH_CHECK(head_dim == 576,
"gather_and_maybe_dequant_cache only support the head_dim to 576 "
"for better performance")
TORCH_CHECK(src_cache.device() == dst.device(),
"src_cache and dst must be on the same device");
@ -1055,10 +1050,9 @@ void gather_and_maybe_dequant_cache(
int64_t cache_entry_stride = src_cache.stride(1);
int64_t dst_entry_stride = dst.stride(0);
// Decide on the number of splits based on the batch size.
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
dim3 grid(batch_size, num_splits);
dim3 block(1024);
constexpr int32_t thread_block_size = 64;
dim3 grid(num_tokens);
dim3 block(thread_block_size);
const int32_t* seq_starts_ptr =
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;

View File

@ -695,7 +695,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
cache_ops.def(
"gather_and_maybe_dequant_cache(Tensor src_cache, Tensor! dst, "
" Tensor block_table, Tensor cu_seq_lens, "
" int batch_size, "
" Tensor token_to_seq, "
" int num_tokens, "
" str kv_cache_dtype, "
" Tensor scale, Tensor? seq_starts) -> ()");
cache_ops.impl("gather_and_maybe_dequant_cache", torch::kCUDA,

View File

@ -20,8 +20,8 @@ ARG PYTHON_VERSION=3.12
# glibc version is baked into the distro, and binaries built with one glibc
# version are not backwards compatible with OSes that use an earlier version.
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# TODO: Restore to base image after FlashInfer AOT wheel fixed
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
# Using cuda base image with minimal dependencies necessary for JIT compilation (FlashInfer, DeepGEMM, EP kernels)
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04
# By parameterizing the Deadsnakes repository URL, we allow third-party to use
# their own mirror. When doing so, we don't benefit from the transparent
@ -85,7 +85,20 @@ ARG GET_PIP_URL
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo python3-pip libibverbs-dev \
&& apt-get install -y --no-install-recommends \
ccache \
software-properties-common \
git \
curl \
sudo \
python3-pip \
libibverbs-dev \
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
gcc-10 \
g++-10 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 \
&& rm -rf /var/lib/apt/lists/* \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
@ -110,10 +123,6 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF
@ -268,7 +277,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
ENV UV_LINK_MODE=copy
# Install libnuma-dev, required by fastsafetensors (fixes #20384)
RUN apt-get update && apt-get install -y libnuma-dev && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y --no-install-recommends libnuma-dev && rm -rf /var/lib/apt/lists/*
COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
@ -305,8 +314,15 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y software-properties-common curl sudo python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& apt-get install -y --no-install-recommends \
software-properties-common \
curl \
sudo \
python3-pip \
ffmpeg \
libsm6 \
libxext6 \
libgl1 \
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
mkdir -p -m 0755 /etc/apt/keyrings ; \
@ -321,13 +337,30 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
done ; \
fi \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& apt-get install -y --no-install-recommends \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-dev \
python${PYTHON_VERSION}-venv \
libibverbs-dev \
&& rm -rf /var/lib/apt/lists/* \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install CUDA development tools and build essentials for runtime JIT compilation
# (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime)
RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
apt-get update -y && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_VERSION_DASH} \
cuda-cudart-${CUDA_VERSION_DASH} \
cuda-nvrtc-${CUDA_VERSION_DASH} \
cuda-cuobjdump-${CUDA_VERSION_DASH} \
libcublas-${CUDA_VERSION_DASH} && \
rm -rf /var/lib/apt/lists/*
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL

View File

@ -1,4 +1,4 @@
FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base
FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
@ -25,10 +25,14 @@ RUN apt clean && apt-get update -y && \
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing
RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc
# This oneccl contains the BMG support which is not the case for default version of oneapi 2025.2.
RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.6/intel-oneccl-2021.15.6.9_offline.sh
RUN bash intel-oneccl-2021.15.6.9_offline.sh -a --silent --eula accept && \
echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc && \
echo "source /opt/intel/oneapi/ccl/2021.15/env/vars.sh --force" >> /root/.bashrc
RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.4/intel-oneccl-2021.15.4.11_offline.sh
RUN bash intel-oneccl-2021.15.4.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc
SHELL ["bash", "-c"]
CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
@ -72,6 +76,7 @@ RUN python3 -m pip install -e tests/vllm_test_utils
ENV NIXL_VERSION=0.7.0
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
# remove torch bundled oneccl to avoid conflicts
RUN --mount=type=cache,target=/root/.cache/pip \
pip uninstall oneccl oneccl-devel -y

Binary file not shown.

Before

Width:  |  Height:  |  Size: 131 KiB

After

Width:  |  Height:  |  Size: 146 KiB

View File

@ -22,9 +22,6 @@ export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxab
vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --tensor_paralel_size 8
```
!!! tip
If you are just prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which can take a long time. However, we do not recommend disabling this feature in production environments as it causes a significant performance drop.
!!! tip
When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this problem, you can use the below environment variables:
`VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.

View File

@ -7,7 +7,7 @@ This document shows you some examples of the different options that are
available to generate structured outputs.
!!! warning
If you are still using the following deprecated API fields, please update your code to use `structured_outputs` as demonstrated in the rest of this document:
If you are still using the following deprecated API fields which were removed in v0.12.0, please update your code to use `structured_outputs` as demonstrated in the rest of this document:
- `guided_json` -> `{"structured_outputs": {"json": ...}}` or `StructuredOutputsParams(json=...)`
- `guided_regex` -> `{"structured_outputs": {"regex": ...}}` or `StructuredOutputsParams(regex=...)`

View File

@ -680,6 +680,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
| `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ |
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ |

15
examples/offline_inference/audio_language.py Normal file → Executable file
View File

@ -425,6 +425,13 @@ def parse_args():
default=None,
help="Set the seed when initializing `vllm.LLM`.",
)
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args()
@ -434,6 +441,12 @@ def main(args):
if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.")
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {args.tensor_parallel_size}"
)
audio_count = args.num_audios
req_data = model_example_map[model](
question_per_audio_count[audio_count], audio_count
@ -446,6 +459,8 @@ def main(args):
)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
llm = LLM(**engine_args)
# We set temperature to 0.2 so that outputs can be different

View File

@ -0,0 +1,170 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference
with the correct prompt format on Qwen2.5-Omni (thinker only).
"""
from typing import NamedTuple
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.multimodal.image import convert_image_mode
from vllm.utils.argparse_utils import FlexibleArgumentParser
class QueryResult(NamedTuple):
inputs: dict
limit_mm_per_prompt: dict[str, int]
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
default_system = (
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
"Group, capable of perceiving auditory and visual inputs, as well as "
"generating text and speech."
)
def get_mixed_modalities_query() -> QueryResult:
question = (
"What is recited in the audio? "
"What is the content of this image? Why is this video funny?"
)
prompt = (
f"<|im_start|>system\n{default_system}<|im_end|>\n"
"<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>"
"<|vision_start|><|image_pad|><|vision_end|>"
"<|vision_start|><|video_pad|><|vision_end|>"
f"{question}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
return QueryResult(
inputs={
"prompt": prompt,
"multi_modal_data": {
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
"image": convert_image_mode(
ImageAsset("cherry_blossom").pil_image, "RGB"
),
"video": VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
},
},
limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1},
)
def get_use_audio_in_video_query() -> QueryResult:
question = (
"Describe the content of the video in details, then convert what the "
"baby say into text."
)
prompt = (
f"<|im_start|>system\n{default_system}<|im_end|>\n"
"<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
f"{question}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
asset = VideoAsset(name="baby_reading", num_frames=16)
audio = asset.get_audio(sampling_rate=16000)
return QueryResult(
inputs={
"prompt": prompt,
"multi_modal_data": {
"video": asset.np_ndarrays,
"audio": audio,
},
"mm_processor_kwargs": {
"use_audio_in_video": True,
},
},
limit_mm_per_prompt={"audio": 1, "video": 1},
)
def get_multi_audios_query() -> QueryResult:
question = "Are these two audio clips the same?"
prompt = (
f"<|im_start|>system\n{default_system}<|im_end|>\n"
"<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>"
"<|audio_start|><|audio_pad|><|audio_end|>"
f"{question}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
return QueryResult(
inputs={
"prompt": prompt,
"multi_modal_data": {
"audio": [
AudioAsset("winning_call").audio_and_sample_rate,
AudioAsset("mary_had_lamb").audio_and_sample_rate,
],
},
},
limit_mm_per_prompt={
"audio": 2,
},
)
query_map = {
"mixed_modalities": get_mixed_modalities_query,
"use_audio_in_video": get_use_audio_in_video_query,
"multi_audios": get_multi_audios_query,
}
def main(args):
model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
query_result = query_map[args.query_type]()
llm = LLM(
model=model_name,
max_model_len=12800,
max_num_seqs=5,
limit_mm_per_prompt=query_result.limit_mm_per_prompt,
seed=args.seed,
)
# We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference.
sampling_params = SamplingParams(temperature=0.2, max_tokens=256)
outputs = llm.generate(query_result.inputs, sampling_params=sampling_params)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
def parse_args():
parser = FlexibleArgumentParser(
description="Demo on using vLLM for offline inference with "
"audio language models"
)
parser.add_argument(
"--query-type",
"-q",
type=str,
default="mixed_modalities",
choices=query_map.keys(),
help="Query type.",
)
parser.add_argument(
"--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
main(args)

41
examples/offline_inference/vision_language.py Normal file → Executable file
View File

@ -538,6 +538,31 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
)
# HunyuanOCR
def run_hunyuan_vl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "tencent/HunyuanOCR"
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
limit_mm_per_prompt={modality: 1},
)
placeholder = "<hy_place▁holder▁no▁100><hy_place▁holder▁no▁102><hy_place▁holder▁no▁101>" # noqa: E501
prompts = [
f"<hy_begin▁of▁sentence>{placeholder}{question}<hy_User>"
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=None,
)
# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
def run_hyperclovax_seed_vision(
questions: list[str], modality: str
@ -1820,6 +1845,7 @@ model_example_map = {
"glm4_5v": run_glm4_5v,
"glm4_5v_fp8": run_glm4_5v_fp8,
"h2ovl_chat": run_h2ovl,
"hunyuan_vl": run_hunyuan_vl,
"hyperclovax_seed_vision": run_hyperclovax_seed_vision,
"idefics3": run_idefics3,
"interns1": run_interns1,
@ -2038,6 +2064,13 @@ def parse_args():
help="If True, will send all requests in a second batch with empty mm "
"data to verify cache hits with UUIDs.",
)
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args()
@ -2046,6 +2079,12 @@ def main(args):
if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.")
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {args.tensor_parallel_size}"
)
modality = args.modality
mm_input = get_multi_modal_input(args)
data = mm_input["data"]
@ -2063,6 +2102,8 @@ def main(args):
"seed": args.seed,
"mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
}
if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
llm = LLM(**engine_args)
# Don't want to check the flag multiple times, so just hijack `prompts`.

View File

@ -1352,10 +1352,18 @@ model_example_map = {
}
def run_generate(model, question: str, image_urls: list[str], seed: int | None):
def run_generate(
model,
question: str,
image_urls: list[str],
seed: int | None,
tensor_parallel_size: int | None,
):
req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
engine_args = asdict(req_data.engine_args) | {"seed": seed}
if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size
llm = LLM(**engine_args)
sampling_params = SamplingParams(
@ -1378,7 +1386,13 @@ def run_generate(model, question: str, image_urls: list[str], seed: int | None):
print("-" * 50)
def run_chat(model: str, question: str, image_urls: list[str], seed: int | None):
def run_chat(
model: str,
question: str,
image_urls: list[str],
seed: int | None,
tensor_parallel_size: int | None,
):
req_data = model_example_map[model](question, image_urls)
# Disable other modalities to save memory
@ -1388,6 +1402,8 @@ def run_chat(model: str, question: str, image_urls: list[str], seed: int | None)
)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size
llm = LLM(**engine_args)
sampling_params = (
@ -1463,6 +1479,13 @@ def parse_args():
default=2,
help="Number of images to use for the demo.",
)
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args()
@ -1470,13 +1493,20 @@ def main(args: Namespace):
model = args.model_type
method = args.method
seed = args.seed
tensor_parallel_size = args.tensor_parallel_size
if tensor_parallel_size is not None and tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {tensor_parallel_size}"
)
image_urls = IMAGE_URLS[: args.num_images]
if method == "generate":
run_generate(model, QUESTION, image_urls, seed)
run_generate(model, QUESTION, image_urls, seed, tensor_parallel_size)
elif method == "chat":
run_chat(model, QUESTION, image_urls, seed)
run_chat(model, QUESTION, image_urls, seed, tensor_parallel_size)
else:
raise ValueError(f"Invalid method: {method}")

View File

@ -1,2 +1,2 @@
lmcache
nixl >= 0.6.0 # Required for disaggregated prefill
nixl >= 0.7.1 # Required for disaggregated prefill

View File

@ -45,3 +45,7 @@ multiprocess==0.70.16
# Plugins test
terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
torchgeo==0.7.0
# Required for suffix decoding test
arctic-inference == 0.1.1

View File

@ -10,9 +10,9 @@ wheel
jinja2>=3.1.6
datasets # for benchmark scripts
numba == 0.61.2 # Required for N-gram speculative decoding
torch==2.8.0+xpu
--extra-index-url=https://download.pytorch.org/whl/xpu
torch==2.9.0+xpu
torchaudio
torchvision
--extra-index-url=https://download.pytorch.org/whl/xpu
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post1%2Bxpu-cp312-cp312-linux_x86_64.whl
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.9.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl

View File

@ -111,6 +111,17 @@ if current_platform.is_cuda():
async_tp=96, # MLP is MoE, half the fusions of dense
),
),
ModelBackendTestCase(
model_name="openai/gpt-oss-20b",
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
backend=AttentionBackendEnum.FLASHINFER,
matches=Matches(
attention_fusion=0,
allreduce_fusion=49,
sequence_parallel=49,
async_tp=48,
),
),
]
elif current_platform.is_rocm():

View File

@ -183,9 +183,6 @@ async def test_metrics_counts(
EXPECTED_METRICS_V1 = [
"vllm:num_requests_running",
"vllm:num_requests_waiting",
"vllm:gpu_cache_usage_perc",
"vllm:gpu_prefix_cache_queries",
"vllm:gpu_prefix_cache_hits",
"vllm:kv_cache_usage_perc",
"vllm:prefix_cache_queries",
"vllm:prefix_cache_hits",

View File

@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import json
import time
@ -35,6 +35,10 @@ GET_WEATHER_SCHEMA = {
@pytest.fixture(scope="module")
def server():
assert importlib.util.find_spec("gpt_oss") is not None, (
"Harmony tests require gpt_oss package to be installed"
)
args = ["--enforce-eager", "--tool-server", "demo", "--max_model_len", "5000"]
env_dict = dict(
VLLM_ENABLE_RESPONSES_API_STORE="1",

View File

@ -921,12 +921,16 @@ def test_gather_and_maybe_dequant_cache_mla(
)
_fill_mla_cache(src_cache, kv_cache_dtype=kv_cache_dtype)
seq_len_tensor = torch.randint(0, max_seq_len + 1, (batch_size,), device=device)
seq_len_tensor = torch.randint(
max_seq_len, max_seq_len + 1, (batch_size,), device=device
)
total_tokens = seq_len_tensor.sum()
cu_seq_lens = torch.empty((batch_size + 1), dtype=torch.int32, device=device)
cu_seq_lens[0] = 0
cu_seq_lens[1:] = seq_len_tensor.cumsum(dim=0).to(dtype=torch.int32)
token_to_seq = torch.arange(0, batch_size, dtype=torch.int32, device=device)
token_to_seq = torch.repeat_interleave(token_to_seq, seq_len_tensor)
print("seq_len_tensor", seq_len_tensor)
tot_blocks_tensor = (seq_len_tensor + block_size - 1) // block_size
@ -977,7 +981,8 @@ def test_gather_and_maybe_dequant_cache_mla(
dst,
block_table,
cu_seq_lens,
batch_size,
token_to_seq,
total_tokens,
kv_cache_dtype,
scale,
None,
@ -990,7 +995,8 @@ def test_gather_and_maybe_dequant_cache_mla(
dst,
block_table,
cu_seq_lens,
batch_size,
token_to_seq,
total_tokens,
kv_cache_dtype,
scale,
None,

View File

@ -11,7 +11,6 @@ from vllm.model_executor.layers.fused_moe.config import (
fp8_w8a8_moe_quant_config,
)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
apply_flashinfer_per_tensor_scale_fp8,
flashinfer_cutlass_moe_fp8,
@ -151,14 +150,11 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=True)
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids = Llama4MoE.custom_routing_function(
hidden_states=td.hidden_states,
router_logits=score,
use_grouped_topk=False,
top_k=topk,
gating_output=score,
topk=topk,
renormalize=False,
custom_routing_function=Llama4MoE.custom_routing_function,
scoring_func="softmax",
)
quant_config = fp8_w8a8_moe_quant_config(
@ -219,14 +215,11 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
)
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids = Llama4MoE.custom_routing_function(
hidden_states=td.hidden_states,
router_logits=score,
use_grouped_topk=False,
top_k=topk,
gating_output=score,
topk=topk,
renormalize=False,
custom_routing_function=Llama4MoE.custom_routing_function,
scoring_func="softmax",
)
quant_config = fp8_w8a8_moe_quant_config(

View File

@ -0,0 +1,221 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import Mock
import pytest
from transformers import PretrainedConfig
from vllm.multimodal.processing import InputProcessingContext
# Helper function to print input IDs with coalesced audio/video tokens.
def print_input_ids(input_ids):
"""
Print input IDs, compressing consecutive special tokens.
- 151675: <|audio_pad|>
- 151656: <|video_pad|>
"""
if not input_ids:
print("[]")
return
result = []
i = 0
while i < len(input_ids):
current_id = input_ids[i]
# Check if it's a special token that should be compressed
if current_id in [151675, 151656]:
# Count consecutive occurrences
count = 1
while i + count < len(input_ids) and input_ids[i + count] == current_id:
count += 1
# Add compressed representation
token_name = "<|audio_pad|>" if current_id == 151675 else "<|video_pad|>"
result.append(f"{token_name} * {count}")
i += count
else:
# Regular token, just add it
result.append(str(current_id))
i += 1
print(", ".join(result))
@pytest.fixture
def mock_qwen3_omni_config():
"""Create a mock Qwen3OmniMoeThinker config."""
config = Mock(spec=PretrainedConfig)
# Token IDs from https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct/blob/main/tokenizer_config.json
config.audio_token_id = 151675 # <|audio_pad|>
config.video_token_id = 151656 # <|video_pad|>
config.image_token_id = 151655 # <|image_pad|>
config.audio_start_token_id = 151669 # <|audio_start|>
config.audio_end_token_id = 151670 # <|audio_end|>
config.vision_start_token_id = 151652 # <|vision_start|>
config.position_id_per_seconds = 12.5
# Vision config
vision_config = Mock()
vision_config.spatial_merge_size = 2
config.vision_config = vision_config
return config
@pytest.fixture
def mock_processor():
"""Create a mock HF processor."""
from transformers.models.whisper import WhisperFeatureExtractor
processor = Mock()
processor.audio_token = "<|audio_pad|>"
processor.image_token = "<|image_pad|>"
processor.video_token = "<|video_pad|>"
# Create a real WhisperFeatureExtractor instance for the feature_extractor attribute
feature_extractor = WhisperFeatureExtractor()
processor.feature_extractor = feature_extractor
return processor
@pytest.fixture
def mock_tokenizer():
"""Create a mock tokenizer."""
tokenizer = Mock()
# Token IDs from https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct/blob/main/tokenizer_config.json
tokenizer.get_vocab = Mock(
return_value={
"<|audio_pad|>": 151675,
"<|video_pad|>": 151656,
"<|image_pad|>": 151655,
"<|audio_start|>": 151669,
"<|audio_end|>": 151670,
"<|vision_start|>": 151652,
"<|vision_end|>": 151653,
}
)
tokenizer.encode = Mock(
side_effect=lambda x: {
"<|vision_start|>": [151652],
"<|vision_end|>": [151653],
"<|audio_start|>": [151669],
"<|audio_end|>": [151670],
"<|audio_pad|>": [151675],
"<|image_pad|>": [151655],
"<|video_pad|>": [151656],
}.get(x, [0])
)
tokenizer.vision_bos_token = "<|vision_start|>"
tokenizer.vision_eos_token = "<|vision_end|>"
tokenizer.audio_bos_token = "<|audio_start|>"
tokenizer.audio_eos_token = "<|audio_end|>"
return tokenizer
@pytest.fixture
def mock_image_processor():
"""Create a mock image processor."""
image_processor = Mock()
image_processor.merge_size = 2
return image_processor
def test_qwen3_omni_get_updates_use_audio_in_video(
mock_qwen3_omni_config,
mock_processor,
mock_tokenizer,
mock_image_processor,
):
"""Test the get_updates_use_audio_in_video method directly."""
from vllm.model_executor.models.qwen3_omni_moe_thinker import (
Qwen3OmniMoeThinkerMultiModalProcessor,
Qwen3OmniMoeThinkerProcessingInfo,
)
# Create a mock context
mock_ctx = Mock(spec=InputProcessingContext)
# Create processing info
info = Qwen3OmniMoeThinkerProcessingInfo(mock_ctx)
info.get_hf_config = Mock(return_value=mock_qwen3_omni_config)
info.get_hf_processor = Mock(return_value=mock_processor)
info.get_tokenizer = Mock(return_value=mock_tokenizer)
info.get_image_processor = Mock(return_value=mock_image_processor)
# Create a mock dummy_inputs builder
mock_dummy_inputs = Mock()
# Create the processor
processor = Qwen3OmniMoeThinkerMultiModalProcessor(info, mock_dummy_inputs)
# Test parameters from reference video
# https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4
audio_len = 85
video_grid_thw = [6, 36, 64]
video_second_per_grid_t = 2.0
# Call the method
updates = processor.get_updates_use_audio_in_video(
thinker_config=mock_qwen3_omni_config,
audio_len=audio_len,
video_grid_thw=video_grid_thw,
video_second_per_grid_t=video_second_per_grid_t,
)
# Updated input ids should align with HF implementation.
# 151669,
# <|video_pad|> * 576, <|audio_pad|> * 25,
# <|video_pad|> * 576, <|audio_pad|> * 25,
# <|video_pad|> * 576, <|audio_pad|> * 25,
# <|video_pad|> * 576, <|audio_pad|> * 10,
# <|video_pad|> * 1152,
# 151670
print_input_ids(updates)
# Verify structure
assert isinstance(updates, list)
assert len(updates) > 0
# Verify start and end tokens
audio_start_token_id = mock_qwen3_omni_config.audio_start_token_id
audio_end_token_id = mock_qwen3_omni_config.audio_end_token_id
assert updates[0] == audio_start_token_id
assert updates[-1] == audio_end_token_id
# Verify both audio and video tokens are present
audio_token_id = mock_qwen3_omni_config.audio_token_id
video_token_id = mock_qwen3_omni_config.video_token_id
audio_count = updates.count(audio_token_id)
video_count = updates.count(video_token_id)
assert audio_count == audio_len, (
f"Expected {audio_len} audio tokens, got {audio_count}"
)
# Calculate expected video token count
spatial_merge_size = mock_qwen3_omni_config.vision_config.spatial_merge_size
height = video_grid_thw[1] // spatial_merge_size
width = video_grid_thw[2] // spatial_merge_size
expected_video_count = video_grid_thw[0] * height * width
assert video_count == expected_video_count, (
f"Expected {expected_video_count} video tokens, got {video_count}"
)
# Total tokens should be: 1 (start) + audio_len + video_count + 1 (end)
expected_total = 1 + audio_len + expected_video_count + 1
assert len(updates) == expected_total, (
f"Expected {expected_total} total tokens, got {len(updates)}"
)
if __name__ == "__main__":
pytest.main([__file__, "-v"])

View File

@ -626,6 +626,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
trust_remote_code=True,
),
"HunYuanVLForConditionalGeneration": _HfExamplesInfo(
"tencent/HunyuanOCR",
is_available_online=False,
),
"Idefics3ForConditionalGeneration": _HfExamplesInfo(
"HuggingFaceM4/Idefics3-8B-Llama3",
extras={"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"},

View File

@ -9,9 +9,16 @@ different routing strategies and analyze their performance, including
integration tests with FusedMoE layer.
"""
import tempfile
import pytest
import torch
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.distributed import (
init_distributed_environment,
initialize_model_parallel,
)
from vllm.model_executor.layers.fused_moe.routing_simulator import (
DistributionBasedRouting,
RoutingSimulator,
@ -89,6 +96,28 @@ def test_routing_strategy_integration(monkeypatch, device):
# Test different routing strategies
strategies = RoutingSimulator.get_available_strategies()
vllm_config = VllmConfig()
with set_current_vllm_config(vllm_config):
temp_file = tempfile.mkstemp()[1]
init_distributed_environment(
world_size=1,
rank=0,
local_rank=0,
distributed_init_method=f"file://{temp_file}",
)
initialize_model_parallel(
tensor_model_parallel_size=1,
pipeline_model_parallel_size=1,
)
fused_moe = FusedMoE(
num_experts=num_experts,
top_k=top_k,
hidden_size=hidden_size,
intermediate_size=0,
use_grouped_topk=False,
renormalize=True,
)
for strategy in strategies:
# Set environment variable
env_name = "VLLM_MOE_ROUTING_SIMULATION_STRATEGY"
@ -98,13 +127,9 @@ def test_routing_strategy_integration(monkeypatch, device):
envs.environment_variables[env_name] = lambda s=strategy: s
# Test the select_experts method
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = fused_moe.select_experts(
hidden_states=hidden_states,
router_logits=router_logits,
top_k=top_k,
use_grouped_topk=False,
renormalize=True,
indices_type=torch.long,
)
# Verify output shapes

View File

@ -36,6 +36,12 @@ def mock_on_gfx9():
@pytest.mark.parametrize(
"env_vars, selected_backend, expected_backend_path",
[
# Test Case: Explicit FLEX_ATTENTION backend
(
{},
"FLEX_ATTENTION",
AttentionBackendEnum.FLEX_ATTENTION.get_path(),
),
# Test Case 1: Default (no env vars, no explicit backend)
(
{},

View File

@ -340,4 +340,11 @@ full_cg_backend_configs = {
"cudagraph_mode": "FULL_AND_PIECEWISE",
},
),
"RocmAttn": BackendConfig(
name="RocmAttn",
env_vars={"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
comp_config={
"cudagraph_mode": "FULL",
},
),
}

View File

@ -1436,6 +1436,65 @@ def test_get_kv_cache_config_one_worker():
],
)
# 6 full + 5 sliding, pad to 6 full + 6 sliding. This is a typical case for gpt-oss
# eagle where there is only one more full attention layer than sliding window layers
kv_cache_specs_hybrid = {
"layer_1": new_kv_cache_spec(),
"layer_2": new_kv_cache_spec(),
"layer_3": new_kv_cache_spec(),
"layer_4": new_kv_cache_spec(),
"layer_5": new_kv_cache_spec(),
"layer_6": new_kv_cache_spec(),
"layer_7": new_sliding_window_spec(),
"layer_8": new_sliding_window_spec(),
"layer_9": new_sliding_window_spec(),
"layer_10": new_sliding_window_spec(),
"layer_11": new_sliding_window_spec(),
}
kv_cache_config_hybrid = get_kv_cache_configs(
vllm_config, [kv_cache_specs_hybrid], [mem_per_block_per_layer * 6 * 32]
)[0]
print(kv_cache_config_hybrid)
assert kv_cache_config_hybrid == KVCacheConfig(
num_blocks=32,
kv_cache_tensors=[
KVCacheTensor(
size=mem_per_block_per_layer * 32,
shared_by=["layer_1", "layer_7"],
),
KVCacheTensor(
size=mem_per_block_per_layer * 32,
shared_by=["layer_2", "layer_8"],
),
KVCacheTensor(
size=mem_per_block_per_layer * 32,
shared_by=["layer_3", "layer_9"],
),
KVCacheTensor(
size=mem_per_block_per_layer * 32,
shared_by=["layer_4", "layer_10"],
),
KVCacheTensor(
size=mem_per_block_per_layer * 32,
shared_by=["layer_5", "layer_11"],
),
KVCacheTensor(
size=mem_per_block_per_layer * 32,
shared_by=["layer_6"],
),
],
kv_cache_groups=[
KVCacheGroupSpec(
["layer_1", "layer_2", "layer_3", "layer_4", "layer_5", "layer_6"],
new_kv_cache_spec(),
),
KVCacheGroupSpec(
["layer_7", "layer_8", "layer_9", "layer_10", "layer_11"],
new_sliding_window_spec(),
),
],
)
# different hidden size
kv_cache_specs_hybrid = {
"layer_1": new_kv_cache_spec(head_size=128),

View File

@ -641,6 +641,34 @@ def test_schedule_concurrent_batches(
scheduler.update_from_output(scheduler_output1, model_runner_output)
@pytest.mark.parametrize("enable_chunked_prefill", [True, False])
def test_schedule_order(enable_chunked_prefill: bool):
scheduler = create_scheduler(
max_num_batched_tokens=1024,
max_num_seqs=3,
enable_chunked_prefill=enable_chunked_prefill,
)
# long requests
requests = create_requests(num_requests=2, num_tokens=800)
# short requests
requests += create_requests(num_requests=2, num_tokens=10)
for request in requests:
scheduler.add_request(request)
scheduler_output1 = scheduler.schedule()
if enable_chunked_prefill:
# When enable chunked prefill, long requests will be chunked.
assert len(scheduler_output1.scheduled_new_reqs) == 2
else:
# When disable chunked prefill, should not skip the long requests,
# and scheduling subsequent short requests in advance,
# even though there is still token budgets remaining.
assert len(scheduler_output1.scheduled_new_reqs) == 1
def test_preempt_during_execution():
# NOTE(woosuk): The actual number of available blocks is 10 instead of 11
# because block 0 is reserved as the null block.

View File

@ -42,6 +42,7 @@ def create_scheduler(
model: str = "facebook/opt-125m",
max_num_seqs: int = 16,
max_num_batched_tokens: int = 8192,
enable_chunked_prefill: bool = True,
enable_prefix_caching: bool = False,
long_prefill_token_threshold: int = 0,
disable_chunked_mm_input: bool = False,
@ -76,7 +77,7 @@ def create_scheduler(
max_model_len=max_model_len,
long_prefill_token_threshold=long_prefill_token_threshold,
disable_chunked_mm_input=disable_chunked_mm_input,
enable_chunked_prefill=True,
enable_chunked_prefill=enable_chunked_prefill,
async_scheduling=async_scheduling,
)
model_config = ModelConfig(

View File

@ -35,14 +35,22 @@ def temporary_environ(env_vars):
# test attention backend and cudagraph_mode combo
# (backend_name, cudagraph_mode, supported)
combo_cases_1 = [
("FA3", "FULL", True),
("FA3", "FULL_AND_PIECEWISE", True),
("FA2", "FULL", True), # Should fallback to FULL_AND_PIECEWISE
("FA2", "FULL_AND_PIECEWISE", True),
("FlashInfer", "FULL", True), # Should fallback to FULL_AND_PIECEWISE
("FlashInfer", "FULL_AND_PIECEWISE", True),
]
if current_platform.is_rocm():
combo_cases_1 = [
("RocmAttn", "FULL", True),
("RocmAttn", "FULL_AND_PIECEWISE", True),
("TritonAttn", "FULL", True),
("TritonAttn", "FULL_AND_PIECEWISE", True),
]
else:
combo_cases_1 = [
("FA3", "FULL", True),
("FA3", "FULL_AND_PIECEWISE", True),
("FA2", "FULL", True), # Should fallback to FULL_AND_PIECEWISE
("FA2", "FULL_AND_PIECEWISE", True),
("FlashInfer", "FULL", True), # Should fallback to FULL_AND_PIECEWISE
("FlashInfer", "FULL_AND_PIECEWISE", True),
]
@pytest.mark.parametrize("backend_name, cudagraph_mode, supported", combo_cases_1)
@ -92,18 +100,32 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
# test cudagraph_mode with different compilation mode.
# (backend_name, cudagraph_mode, compilation_mode, supported)
combo_cases_2 = [
("FA2", "FULL", CompilationMode.NONE, True),
("FA2", "FULL", CompilationMode.VLLM_COMPILE, True),
("FA2", "PIECEWISE", CompilationMode.NONE, False),
("FA2", "PIECEWISE", CompilationMode.VLLM_COMPILE, True),
("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, False),
("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True),
("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
("FA2", "NONE", CompilationMode.NONE, True),
("FA2", "NONE", CompilationMode.VLLM_COMPILE, True),
]
if current_platform.is_rocm():
combo_cases_2 = [
("RocmAttn", "FULL", CompilationMode.NONE, True),
("RocmAttn", "FULL", CompilationMode.VLLM_COMPILE, True),
("RocmAttn", "PIECEWISE", CompilationMode.NONE, False),
("RocmAttn", "PIECEWISE", CompilationMode.VLLM_COMPILE, True),
("RocmAttn", "FULL_AND_PIECEWISE", CompilationMode.NONE, False),
("RocmAttn", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
("RocmAttn", "FULL_DECODE_ONLY", CompilationMode.NONE, True),
("RocmAttn", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
("RocmAttn", "NONE", CompilationMode.NONE, True),
("RocmAttn", "NONE", CompilationMode.VLLM_COMPILE, True),
]
else:
combo_cases_2 = [
("FA2", "FULL", CompilationMode.NONE, True),
("FA2", "FULL", CompilationMode.VLLM_COMPILE, True),
("FA2", "PIECEWISE", CompilationMode.NONE, False),
("FA2", "PIECEWISE", CompilationMode.VLLM_COMPILE, True),
("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, False),
("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True),
("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
("FA2", "NONE", CompilationMode.NONE, True),
("FA2", "NONE", CompilationMode.VLLM_COMPILE, True),
]
@pytest.mark.parametrize(

View File

@ -0,0 +1,77 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import os
from contextlib import AsyncExitStack
from dataclasses import replace
import pytest
from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM
DP_SIZE = int(os.getenv("DP_SIZE", 2))
@pytest.mark.asyncio
async def test_run_eagle_dp():
target_model = "meta-llama/Llama-3.1-8B-Instruct"
draft_model = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
engine_args = AsyncEngineArgs(
model=target_model,
tokenizer_mode="auto",
enforce_eager=False,
tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
data_parallel_size=DP_SIZE,
data_parallel_backend="mp", # ray takes more time
trust_remote_code=True,
max_model_len=16384,
)
eagle_engine_args = replace(
engine_args,
speculative_config={
"model": draft_model,
"method": "eagle",
"num_speculative_tokens": 3,
},
)
prompt = "This is a test of data parallel with eagle"
num_expected_tokens = 100
sampling_params = SamplingParams(
min_tokens=num_expected_tokens,
max_tokens=num_expected_tokens,
ignore_eos=True,
output_kind=RequestOutputKind.FINAL_ONLY,
temperature=0,
)
async def generate_with_timeout(given_engine: AsyncLLM):
async for out in given_engine.generate(
request_id="test-eagle-dp", prompt=prompt, sampling_params=sampling_params
):
token_ids = out.outputs[0].token_ids
assert len(token_ids) == num_expected_tokens
return token_ids
async def engine_create_and_generate(engine_args: AsyncEngineArgs):
async with AsyncExitStack() as after:
engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown)
token_ids = await asyncio.wait_for(
generate_with_timeout(engine), timeout=30
)
assert not engine.output_processor.has_unfinished_requests()
return token_ids
token_ids_with_eagle = await engine_create_and_generate(eagle_engine_args)
token_ids_no_eagle = await engine_create_and_generate(engine_args)
# Test for correctness
assert token_ids_with_eagle == token_ids_no_eagle

View File

@ -3,7 +3,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from dataclasses import fields
from enum import Enum
from typing import TYPE_CHECKING, Any
@ -21,7 +20,6 @@ from vllm.outputs import RequestOutput
from vllm.platforms import current_platform
from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
from vllm.sampling_params import (
GuidedDecodingParams,
SamplingParams,
StructuredOutputsParams,
)
@ -108,23 +106,6 @@ class CarDescription(BaseModel):
car_type: CarType
def test_guided_decoding_deprecated():
with pytest.warns(DeprecationWarning, match="GuidedDecodingParams is deprecated.*"):
guided_decoding = GuidedDecodingParams(json_object=True)
structured_outputs = StructuredOutputsParams(json_object=True)
assert fields(guided_decoding) == fields(structured_outputs)
with pytest.warns(DeprecationWarning, match="guided_decoding is deprecated.*"):
sp1 = SamplingParams(guided_decoding=guided_decoding)
with pytest.warns(DeprecationWarning, match="guided_decoding is deprecated.*"):
sp2 = SamplingParams.from_optional(guided_decoding=guided_decoding)
assert sp1 == sp2
assert sp1.structured_outputs == guided_decoding
@pytest.mark.parametrize(
"model_name, backend, tokenizer_mode, speculative_config",
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
@ -899,13 +880,11 @@ def test_structured_output_batched_with_non_structured_outputs_requests(
output_json = json.loads(generated_text)
@pytest.mark.parametrize("guided_decoding_backend", ["xgrammar"])
def test_structured_output_with_structural_tag(
guided_decoding_backend: str,
):
@pytest.mark.parametrize("backend", ["xgrammar"])
def test_structured_output_with_structural_tag(backend: str):
llm = LLM(
model="Qwen/Qwen2.5-1.5B-Instruct",
guided_decoding_backend=guided_decoding_backend,
structured_outputs_config=StructuredOutputsConfig(backend=backend),
)
structural_tag_config = {
@ -923,7 +902,7 @@ def test_structured_output_with_structural_tag(
sampling_params = SamplingParams(
temperature=0.0,
max_tokens=500,
guided_decoding=StructuredOutputsParams(
structured_outputs=StructuredOutputsParams(
structural_tag=json.dumps(structural_tag_config)
),
)

View File

@ -294,6 +294,8 @@ def _rocm_aiter_mla_decode_fwd_impl(
kv_last_page_lens: torch.Tensor | None = None,
sm_scale: float = 1.0,
logit_cap: float = 0.0,
q_scale: torch.Tensor | None = None,
kv_scale: torch.Tensor | None = None,
) -> None:
from aiter.mla import mla_decode_fwd
@ -308,6 +310,8 @@ def _rocm_aiter_mla_decode_fwd_impl(
max_seqlen_qo,
sm_scale=sm_scale,
logit_cap=logit_cap,
q_scale=q_scale,
kv_scale=kv_scale,
)
@ -322,6 +326,8 @@ def _rocm_aiter_mla_decode_fwd_fake(
kv_last_page_lens: torch.Tensor | None = None,
sm_scale: float = 1.0,
logit_cap: float = 0.0,
q_scale: torch.Tensor | None = None,
kv_scale: torch.Tensor | None = None,
) -> None:
pass
@ -806,6 +812,8 @@ class rocm_aiter_ops:
kv_indices: torch.Tensor | None = None,
kv_last_page_lens: torch.Tensor | None = None,
logit_cap: float = 0.0,
q_scale: torch.Tensor | None = None,
kv_scale: torch.Tensor | None = None,
):
torch.ops.vllm.rocm_aiter_mla_decode_fwd(
q,
@ -818,6 +826,8 @@ class rocm_aiter_ops:
kv_last_page_lens,
sm_scale=sm_scale,
logit_cap=logit_cap,
q_scale=q_scale,
kv_scale=kv_scale,
)
@staticmethod

View File

@ -2201,7 +2201,8 @@ def gather_and_maybe_dequant_cache(
dst: torch.Tensor,
block_table: torch.Tensor,
cu_seq_lens: torch.Tensor,
batch_size: int,
token_to_seq: torch.Tensor,
num_tokens: int,
kv_cache_dtype: str,
scale: torch.Tensor,
seq_starts: torch.Tensor | None = None,
@ -2211,7 +2212,8 @@ def gather_and_maybe_dequant_cache(
dst,
block_table,
cu_seq_lens,
batch_size,
token_to_seq,
num_tokens,
kv_cache_dtype,
scale,
seq_starts,

View File

@ -194,7 +194,6 @@ def _cp_lse_common(
cp_attn_lse = cp_attn_lse.contiguous()
lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses)
out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx)
assert out.is_contiguous()
return out, lse

View File

@ -63,13 +63,14 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
else:
logger.debug("Using InductorAdaptor")
return InductorAdaptor()
else:
assert compilation_config.backend == "eager", (
"Custom backends not supported with CompilationMode.VLLM_COMPILE"
)
elif compilation_config.backend == "eager":
logger.debug("Using EagerAdaptor")
return EagerAdaptor()
else:
logger.debug("Using custom backend: %s", compilation_config.backend)
compiler = resolve_obj_by_qualname(current_platform.get_compile_backend())()
assert isinstance(compiler, CompilerInterface)
return compiler
class CompilerManager:
@ -545,7 +546,10 @@ class VllmBackend:
self.prefix = prefix or model_tag
# Passes to run on the graph post-grad.
self.post_grad_pass_manager = PostGradPassManager()
self.pass_manager = resolve_obj_by_qualname(
current_platform.get_pass_manager_cls()
)()
self.pass_key = current_platform.pass_key
self.sym_tensor_indices = []
self.input_buffers = []
@ -562,24 +566,20 @@ class VllmBackend:
def configure_post_pass(self):
config = self.compilation_config
self.post_grad_pass_manager.configure(self.vllm_config)
self.pass_manager.configure(self.vllm_config)
# Post-grad custom passes are run using the post_grad_custom_post_pass
# hook. If a pass for that hook exists, add it to the pass manager.
inductor_config = config.inductor_compile_config
PASS_KEY = "post_grad_custom_post_pass"
if PASS_KEY in inductor_config:
if isinstance(inductor_config[PASS_KEY], PostGradPassManager):
if self.pass_key in inductor_config:
if isinstance(inductor_config[self.pass_key], PostGradPassManager):
# PassManager already added to config, make sure it's correct
assert (
inductor_config[PASS_KEY].uuid()
== self.post_grad_pass_manager.uuid()
)
assert inductor_config[self.pass_key].uuid() == self.pass_manager.uuid()
else:
# Config should automatically wrap all inductor passes
assert isinstance(inductor_config[PASS_KEY], InductorPass)
self.post_grad_pass_manager.add(inductor_config[PASS_KEY])
inductor_config[PASS_KEY] = self.post_grad_pass_manager
assert isinstance(inductor_config[self.pass_key], InductorPass)
self.pass_manager.add(inductor_config[self.pass_key])
inductor_config[self.pass_key] = self.pass_manager
def __call__(
self, graph: fx.GraphModule, example_inputs

View File

@ -331,9 +331,9 @@ class CompilationConfig:
We use string to avoid serialization issues when using compilation in a
distributed setting. When the compilation mode is 1 or 2, the backend is
used for the compilation directly (it sees the whole graph). When the
compilation mode is 3, the backend is used for the piecewise compilation
(it sees a part of the graph). The backend can not be custom for compilation
mode 3, i.e. the backend must be either eager or inductor. Furthermore,
compilation mode is 3, the backend supports both whole graph and piecewise
compilation, available backends include eager, inductor, and custom backends,
the latter of which can be defined via `get_compile_backend`. Furthermore,
compilation is only piecewise if splitting ops is set accordingly and
use_inductor_graph_partition is off. Note that the default options for
splitting ops are sufficient for piecewise compilation.
@ -768,7 +768,7 @@ class CompilationConfig:
self.backend = "inductor" if self.use_inductor else "eager"
if self.backend == "":
self.backend = current_platform.simple_compile_backend
self.backend = current_platform.get_compile_backend()
def init_backend(self, vllm_config: "VllmConfig") -> str | Callable:
"""
@ -800,9 +800,7 @@ class CompilationConfig:
assert self.mode == CompilationMode.VLLM_COMPILE
if self.backend not in ["eager", "inductor"]:
raise ValueError(
f"Invalid backend for piecewise compilation: {self.backend}"
)
logger.info("Using OOT custom backend for compilation.")
from vllm.compilation.backends import VllmBackend

View File

@ -33,6 +33,7 @@ from vllm.transformers_utils.config import (
try_get_safetensors_metadata,
try_get_tokenizer_config,
uses_mrope,
uses_xdrope_dim,
)
from vllm.transformers_utils.gguf_utils import (
maybe_patch_hf_config_from_gguf,
@ -585,16 +586,26 @@ class ModelConfig:
else: # task == "auto"
pass
else:
debug_info = {
"architectures": architectures,
"is_generative_model": is_generative_model,
"is_pooling_model": is_pooling_model,
}
raise AssertionError(
"The model should be a generative or "
"pooling model when task is set to "
f"{self.task!r}. Found: {debug_info}"
)
# Neither generative nor pooling model - try to convert if possible
if is_pooling_task:
runner = "pooling"
convert = _task_to_convert(self.task)
msg_hint = (
"Please replace this option with `--runner pooling "
f"--convert {convert}` to continue using this model "
"as a pooling model."
)
else:
debug_info = {
"architectures": architectures,
"is_generative_model": is_generative_model,
"is_pooling_model": is_pooling_model,
}
raise AssertionError(
"The model should be a generative or "
"pooling model when task is set to "
f"{self.task!r}. Found: {debug_info}"
)
self.runner = runner
self.convert = convert
@ -1605,6 +1616,10 @@ class ModelConfig:
def uses_mrope(self) -> bool:
return uses_mrope(self.hf_config)
@property
def uses_xdrope_dim(self) -> int:
return uses_xdrope_dim(self.hf_config)
@property
def is_multimodal_model(self) -> bool:
return self.multimodal_config is not None

View File

@ -141,22 +141,6 @@ class ParallelConfig:
- "deepep_high_throughput": Use deepep high-throughput kernels
- "deepep_low_latency": Use deepep low-latency kernels
- "flashinfer_all2allv": Use flashinfer alltoallv kernels for mnnvl"""
num_redundant_experts: int | None = None
"""`num_redundant_experts` is deprecated and has been replaced with
`eplb_config.num_redundant_experts`. This will be removed in v0.12.0.
Please use `eplb_config.num_redundant_experts` instead."""
eplb_window_size: int | None = None
"""`eplb_window_size` is deprecated and has been replaced with
`eplb_config.window_size`. This will be removed in v0.12.0.
Please use `eplb_config.window_size` instead."""
eplb_step_interval: int | None = None
"""`eplb_step_interval` is deprecated and has been replaced with
`eplb_config.step_interval`. This will be removed in v0.12.0.
Please use `eplb_config.step_interval` instead."""
eplb_log_balancedness: bool | None = None
"""`eplb_log_balancedness` is deprecated and has been replaced with
`eplb_config.log_balancedness`. This will be removed in v0.12.0.
Please use `eplb_config.log_balancedness` instead."""
max_parallel_loading_workers: int | None = None
"""Maximum number of parallel loading workers when loading model
@ -516,40 +500,6 @@ class ParallelConfig:
"--all2all-backend command-line argument instead."
)
# Forward deprecated fields to their new location
if self.num_redundant_experts is not None:
self.eplb_config.num_redundant_experts = self.num_redundant_experts
logger.warning_once(
"num_redundant_experts is deprecated and has been replaced "
"with eplb_config.num_redundant_experts. This will be removed "
"in v0.12.0. Changing this field after initialization will "
"have no effect."
)
if self.eplb_window_size is not None:
self.eplb_config.window_size = self.eplb_window_size
logger.warning_once(
"eplb_window_size is deprecated and has been replaced "
"with eplb_config.window_size. This will be removed "
"in v0.12.0. Changing this field after initialization will "
"have no effect."
)
if self.eplb_step_interval is not None:
self.eplb_config.step_interval = self.eplb_step_interval
logger.warning_once(
"eplb_step_interval is deprecated and has been replaced "
"with eplb_config.step_interval. This will be removed "
"in v0.12.0. Changing this field after initialization will "
"have no effect."
)
if self.eplb_log_balancedness is not None:
self.eplb_config.log_balancedness = self.eplb_log_balancedness
logger.warning_once(
"eplb_log_balancedness is deprecated and has been replaced "
"with eplb_config.log_balancedness. This will be removed "
"in v0.12.0. Changing this field after initialization will "
"have no effect."
)
# Continue with the rest of the initialization
self.world_size = (
self.pipeline_parallel_size

View File

@ -131,7 +131,7 @@ class SymmMemCommunicator:
return None
if out is None:
out = torch.empty_like(inp)
self.buffer[: inp.numel()].copy_(inp.view(-1))
self.buffer[: inp.numel()].copy_(inp.reshape(-1))
# Determine which algorithm to use
use_multimem = False

View File

@ -4,7 +4,6 @@ import contextlib
import copy
import logging
import math
import os
import queue
import threading
import time
@ -810,9 +809,6 @@ class NixlConnectorWorker:
self.nixl_backends = vllm_config.kv_transfer_config.get_from_extra_config(
"backends", ["UCX"]
)
# TODO temporary, once nixl allows for telemetry flag in config
# (next release), we can remove this env var.
os.environ["NIXL_TELEMETRY_ENABLE"] = "1"
# Agent.
non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"]
@ -828,10 +824,11 @@ class NixlConnectorWorker:
if nixl_agent_config is None:
config = None
else:
# Enable telemetry by default for NIXL 0.7.1 and above.
config = (
nixl_agent_config(backends=self.nixl_backends)
nixl_agent_config(backends=self.nixl_backends, capture_telemetry=True)
if len(non_ucx_backends) > 0
else nixl_agent_config(num_threads=num_threads)
else nixl_agent_config(num_threads=num_threads, capture_telemetry=True)
)
self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), config)

View File

@ -502,11 +502,6 @@ class EngineArgs:
)
reasoning_parser: str = StructuredOutputsConfig.reasoning_parser
reasoning_parser_plugin: str | None = None
# Deprecated guided decoding fields
guided_decoding_backend: str | None = None
guided_decoding_disable_fallback: bool | None = None
guided_decoding_disable_any_whitespace: bool | None = None
guided_decoding_disable_additional_properties: bool | None = None
logits_processor_pattern: str | None = ModelConfig.logits_processor_pattern
@ -725,19 +720,6 @@ class EngineArgs:
"--reasoning-parser-plugin",
**structured_outputs_kwargs["reasoning_parser_plugin"],
)
# Deprecated guided decoding arguments
for arg, type in [
("--guided-decoding-backend", str),
("--guided-decoding-disable-fallback", bool),
("--guided-decoding-disable-any-whitespace", bool),
("--guided-decoding-disable-additional-properties", bool),
]:
structured_outputs_group.add_argument(
arg,
type=type,
help=(f"[DEPRECATED] {arg} will be removed in v0.12.0."),
deprecated=True,
)
# Parallel arguments
parallel_kwargs = get_kwargs(ParallelConfig)
@ -855,30 +837,6 @@ class EngineArgs:
"--expert-placement-strategy",
**parallel_kwargs["expert_placement_strategy"],
)
parallel_group.add_argument(
"--num-redundant-experts",
type=int,
help="[DEPRECATED] --num-redundant-experts will be removed in v0.12.0.",
deprecated=True,
)
parallel_group.add_argument(
"--eplb-window-size",
type=int,
help="[DEPRECATED] --eplb-window-size will be removed in v0.12.0.",
deprecated=True,
)
parallel_group.add_argument(
"--eplb-step-interval",
type=int,
help="[DEPRECATED] --eplb-step-interval will be removed in v0.12.0.",
deprecated=True,
)
parallel_group.add_argument(
"--eplb-log-balancedness",
action=argparse.BooleanOptionalAction,
help="[DEPRECATED] --eplb-log-balancedness will be removed in v0.12.0.",
deprecated=True,
)
parallel_group.add_argument(
"--max-parallel-loading-workers",
@ -1612,6 +1570,12 @@ class EngineArgs:
model_config.skip_tokenizer_init = True
logger.info("Skipping tokenizer initialization for tokens-only mode.")
if self.async_scheduling and not self.disable_nccl_for_dp_synchronization:
logger.info(
"Disabling NCCL for DP synchronization when using async scheduling."
)
self.disable_nccl_for_dp_synchronization = True
# Forward the deprecated CLI args to the EPLB config.
if self.num_redundant_experts is not None:
self.eplb_config.num_redundant_experts = self.num_redundant_experts
@ -1736,21 +1700,6 @@ class EngineArgs:
self.reasoning_parser_plugin
)
# Forward the deprecated CLI args to the StructuredOutputsConfig
so_config = self.structured_outputs_config
if self.guided_decoding_backend is not None:
so_config.guided_decoding_backend = self.guided_decoding_backend
if self.guided_decoding_disable_fallback is not None:
so_config.disable_fallback = self.guided_decoding_disable_fallback
if self.guided_decoding_disable_any_whitespace is not None:
so_config.disable_any_whitespace = (
self.guided_decoding_disable_any_whitespace
)
if self.guided_decoding_disable_additional_properties is not None:
so_config.disable_additional_properties = (
self.guided_decoding_disable_additional_properties
)
observability_config = ObservabilityConfig(
show_hidden_metrics_for_version=self.show_hidden_metrics_for_version,
otlp_traces_endpoint=self.otlp_traces_endpoint,

View File

@ -652,62 +652,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
default=None,
description="Additional kwargs for structured outputs",
)
guided_json: str | dict | BaseModel | None = Field(
default=None,
description=(
"`guided_json` is deprecated. "
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
"Please pass `json` to `structured_outputs` instead."
),
)
guided_regex: str | None = Field(
default=None,
description=(
"`guided_regex` is deprecated. "
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
"Please pass `regex` to `structured_outputs` instead."
),
)
guided_choice: list[str] | None = Field(
default=None,
description=(
"`guided_choice` is deprecated. "
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
"Please pass `choice` to `structured_outputs` instead."
),
)
guided_grammar: str | None = Field(
default=None,
description=(
"`guided_grammar` is deprecated. "
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
"Please pass `grammar` to `structured_outputs` instead."
),
)
structural_tag: str | None = Field(
default=None,
description=(
"`structural_tag` is deprecated. "
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
"Please pass `structural_tag` to `structured_outputs` instead."
),
)
guided_decoding_backend: str | None = Field(
default=None,
description=(
"`guided_decoding_backend` is deprecated. "
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
"Please remove it from your request."
),
)
guided_whitespace_pattern: str | None = Field(
default=None,
description=(
"`guided_whitespace_pattern` is deprecated. "
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
"Please pass `whitespace_pattern` to `structured_outputs` instead."
),
)
priority: int = Field(
default=0,
description=(
@ -717,7 +661,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
),
)
request_id: str = Field(
default_factory=lambda: f"{random_uuid()}",
default_factory=random_uuid,
description=(
"The request_id related to this request. If the caller does "
"not set it, a random_uuid will be generated. This id is used "
@ -841,20 +785,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
if prompt_logprobs is None and self.echo:
prompt_logprobs = self.top_logprobs
# Forward deprecated guided_* parameters to structured_outputs
if self.structured_outputs is None:
kwargs = dict[str, Any](
json=self.guided_json,
regex=self.guided_regex,
choice=self.guided_choice,
grammar=self.guided_grammar,
whitespace_pattern=self.guided_whitespace_pattern,
structural_tag=self.structural_tag,
)
kwargs = {k: v for k, v in kwargs.items() if v is not None}
if len(kwargs) > 0:
self.structured_outputs = StructuredOutputsParams(**kwargs)
response_format = self.response_format
if response_format is not None:
# If structured outputs wasn't already enabled,
@ -863,24 +793,23 @@ class ChatCompletionRequest(OpenAIBaseModel):
self.structured_outputs = StructuredOutputsParams()
# Set structured output params for response format
if response_format is not None:
if response_format.type == "json_object":
self.structured_outputs.json_object = True
elif response_format.type == "json_schema":
json_schema = response_format.json_schema
assert json_schema is not None
self.structured_outputs.json = json_schema.json_schema
elif response_format.type == "structural_tag":
structural_tag = response_format
assert structural_tag is not None and isinstance(
structural_tag,
(
LegacyStructuralTagResponseFormat,
StructuralTagResponseFormat,
),
)
s_tag_obj = structural_tag.model_dump(by_alias=True)
self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
if response_format.type == "json_object":
self.structured_outputs.json_object = True
elif response_format.type == "json_schema":
json_schema = response_format.json_schema
assert json_schema is not None
self.structured_outputs.json = json_schema.json_schema
elif response_format.type == "structural_tag":
structural_tag = response_format
assert structural_tag is not None and isinstance(
structural_tag,
(
LegacyStructuralTagResponseFormat,
StructuralTagResponseFormat,
),
)
s_tag_obj = structural_tag.model_dump(by_alias=True)
self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
if self.kv_transfer_params:
@ -1140,58 +1069,6 @@ class CompletionRequest(OpenAIBaseModel):
default=None,
description="Additional kwargs for structured outputs",
)
guided_json: str | dict | BaseModel | None = Field(
default=None,
description=(
"`guided_json` is deprecated. "
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
"Please pass `json` to `structured_outputs` instead."
),
)
guided_regex: str | None = Field(
default=None,
description=(
"`guided_regex` is deprecated. "
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
"Please pass `regex` to `structured_outputs` instead."
),
)
guided_choice: list[str] | None = Field(
default=None,
description=(
"`guided_choice` is deprecated. "
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
"Please pass `choice` to `structured_outputs` instead."
),
)
guided_grammar: str | None = Field(
default=None,
description=(
"`guided_grammar` is deprecated. "
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
"Please pass `grammar` to `structured_outputs` instead."
),
)
structural_tag: str | None = Field(
default=None,
description=("If specified, the output will follow the structural tag schema."),
)
guided_decoding_backend: str | None = Field(
default=None,
description=(
"`guided_decoding_backend` is deprecated. "
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
"Please remove it from your request."
),
)
guided_whitespace_pattern: str | None = Field(
default=None,
description=(
"`guided_whitespace_pattern` is deprecated. "
"This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
"Please pass `whitespace_pattern` to `structured_outputs` instead."
),
)
priority: int = Field(
default=0,
description=(
@ -1201,7 +1078,7 @@ class CompletionRequest(OpenAIBaseModel):
),
)
request_id: str = Field(
default_factory=lambda: f"{random_uuid()}",
default_factory=random_uuid,
description=(
"The request_id related to this request. If the caller does "
"not set it, a random_uuid will be generated. This id is used "
@ -1336,35 +1213,31 @@ class CompletionRequest(OpenAIBaseModel):
echo_without_generation = self.echo and self.max_tokens == 0
guided_json_object = None
if self.response_format is not None:
if self.response_format.type == "json_object":
guided_json_object = True
elif self.response_format.type == "json_schema":
json_schema = self.response_format.json_schema
response_format = self.response_format
if response_format is not None:
# If structured outputs wasn't already enabled,
# we must enable it for these features to work
if self.structured_outputs is None:
self.structured_outputs = StructuredOutputsParams()
# Set structured output params for response format
if response_format.type == "json_object":
self.structured_outputs.json_object = True
elif response_format.type == "json_schema":
json_schema = response_format.json_schema
assert json_schema is not None
self.guided_json = json_schema.json_schema
elif self.response_format.type == "structural_tag":
structural_tag = self.response_format
self.structured_outputs.json = json_schema.json_schema
elif response_format.type == "structural_tag":
structural_tag = response_format
assert structural_tag is not None and isinstance(
structural_tag, StructuralTagResponseFormat
structural_tag,
(
LegacyStructuralTagResponseFormat,
StructuralTagResponseFormat,
),
)
s_tag_obj = structural_tag.model_dump(by_alias=True)
self.structural_tag = json.dumps(s_tag_obj)
# Forward deprecated guided_* parameters to structured_outputs
if self.structured_outputs is None:
kwargs = dict[str, Any](
json=self.guided_json,
json_object=guided_json_object,
regex=self.guided_regex,
choice=self.guided_choice,
grammar=self.guided_grammar,
whitespace_pattern=self.guided_whitespace_pattern,
)
kwargs = {k: v for k, v in kwargs.items() if v is not None}
if len(kwargs) > 0:
self.structured_outputs = StructuredOutputsParams(**kwargs)
self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
if self.kv_transfer_params:
@ -1502,7 +1375,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
),
)
request_id: str = Field(
default_factory=lambda: f"{random_uuid()}",
default_factory=random_uuid,
description=(
"The request_id related to this request. If the caller does "
"not set it, a random_uuid will be generated. This id is used "
@ -1597,7 +1470,7 @@ class EmbeddingChatRequest(OpenAIBaseModel):
),
)
request_id: str = Field(
default_factory=lambda: f"{random_uuid()}",
default_factory=random_uuid,
description=(
"The request_id related to this request. If the caller does "
"not set it, a random_uuid will be generated. This id is used "
@ -2019,7 +1892,7 @@ class ClassificationCompletionRequest(OpenAIBaseModel):
),
)
request_id: str = Field(
default_factory=lambda: f"{random_uuid()}",
default_factory=random_uuid,
description=(
"The request_id related to this request. If the caller does "
"not set it, a random_uuid will be generated. This id is used "
@ -2110,7 +1983,7 @@ class ClassificationChatRequest(OpenAIBaseModel):
)
request_id: str = Field(
default_factory=lambda: f"{random_uuid()}",
default_factory=random_uuid,
description=(
"The request_id related to this request. If the caller does "
"not set it, a random_uuid will be generated. This id is used "
@ -3221,7 +3094,7 @@ class TranslationResponseVerbose(OpenAIBaseModel):
####### Tokens IN <> Tokens OUT #######
class GenerateRequest(BaseModel):
request_id: str = Field(
default_factory=lambda: f"{random_uuid()}",
default_factory=random_uuid,
description=(
"The request_id related to this request. If the caller does "
"not set it, a random_uuid will be generated. This id is used "
@ -3278,7 +3151,7 @@ class GenerateResponseChoice(BaseModel):
class GenerateResponse(BaseModel):
request_id: str = Field(
default_factory=lambda: f"{random_uuid()}",
default_factory=random_uuid,
description=(
"The request_id related to this request. If the caller does "
"not set it, a random_uuid will be generated. This id is used "

View File

@ -273,6 +273,11 @@ class OpenAIServingChat(OpenAIServing):
try:
for i, engine_prompt in enumerate(engine_prompts):
prompt_text, _, _ = self._get_prompt_components(request_prompts[i])
# If we are creating sub requests for multiple prompts, ensure that they
# have unique request ids.
sub_request_id = (
request_id if len(engine_prompts) == 1 else f"{request_id}_{i}"
)
if self.default_sampling_params is None:
self.default_sampling_params = {}
@ -301,7 +306,7 @@ class OpenAIServingChat(OpenAIServing):
)
self._log_inputs(
request_id,
sub_request_id,
request_prompts[i],
params=sampling_params,
lora_request=lora_request,
@ -316,14 +321,14 @@ class OpenAIServingChat(OpenAIServing):
if isinstance(sampling_params, BeamSearchParams):
generator = self.beam_search(
prompt=engine_prompt,
request_id=request_id,
request_id=sub_request_id,
params=sampling_params,
lora_request=lora_request,
trace_headers=trace_headers,
)
else:
engine_request, tokenization_kwargs = await self._process_inputs(
request_id,
sub_request_id,
engine_prompt,
sampling_params,
lora_request=lora_request,
@ -334,7 +339,7 @@ class OpenAIServingChat(OpenAIServing):
generator = self.engine_client.generate(
engine_request,
sampling_params,
request_id,
sub_request_id,
lora_request=lora_request,
trace_headers=trace_headers,
priority=request.priority,

View File

@ -1242,16 +1242,19 @@ class OpenAIServing:
):
prompt_text, _, _ = self._get_prompt_components(request_prompt)
orig_priority = priority
sub_request = 0
while True:
# Ensure that each sub-request has a unique request id.
sub_request_id = f"{request_id}_{sub_request}"
self._log_inputs(
request_id,
sub_request_id,
request_prompt,
params=sampling_params,
lora_request=lora_request,
)
trace_headers = kwargs.get("trace_headers")
engine_request, tokenization_kwargs = await self._process_inputs(
request_id,
sub_request_id,
engine_prompt,
sampling_params,
lora_request=lora_request,
@ -1262,7 +1265,7 @@ class OpenAIServing:
generator = self.engine_client.generate(
engine_request,
sampling_params,
request_id,
sub_request_id,
lora_request=lora_request,
priority=priority,
prompt_text=prompt_text,
@ -1295,6 +1298,7 @@ class OpenAIServing:
sampling_params.max_tokens = self.max_model_len - len(prompt_token_ids)
# OPTIMIZATION
priority = orig_priority - 1
sub_request += 1
def _get_prompt_components(
self,
@ -1345,11 +1349,12 @@ class OpenAIServing:
raw_request: Request | None, default: str | None = None
) -> str | None:
"""Pulls the request id to use from a header, if provided"""
default = default or random_uuid()
if raw_request is None:
return default
if raw_request is not None and (
(req_id := raw_request.headers.get("X-Request-Id")) is not None
):
return req_id
return raw_request.headers.get("X-Request-Id", default)
return random_uuid() if default is None else default
@staticmethod
def _get_data_parallel_rank(raw_request: Request | None) -> int | None:

View File

@ -201,10 +201,10 @@ class OpenAISpeechToText(OpenAIServing):
self.engine_client.generate(
prompt,
sampling_params,
request_id,
f"{request_id}_{i}",
lora_request=lora_request,
)
for prompt in prompts
for i, prompt in enumerate(prompts)
]
except ValueError as e:
# TODO: Use a vllm-specific Validation Error

View File

@ -812,19 +812,19 @@ def override_envs_for_invariance():
# "TRITON_MLA",
]
if curr_attn_backend not in supported_backends:
warning = (
"Forcibly updating attention backend to"
f" {supported_backends[0]} for batch_invariant. "
f" Supported backends: {supported_backends}."
error = (
"VLLM batch_invariant mode requires an attention backend in "
f"{supported_backends}, but got '{curr_attn_backend}'. "
"Please set the 'VLLM_ATTENTION_BACKEND' environment variable "
"to one of the supported backends before enabling batch_invariant."
)
logger.warning_once(warning)
os.environ["VLLM_ATTENTION_BACKEND"] = supported_backends[0]
raise RuntimeError(error)
if os.environ["VLLM_ATTENTION_BACKEND"] != supported_backends[0]:
warning = (
"You are using a decode-invariant form of batch invariance. "
"This will not be invariant between prefill and decode."
)
logger.warning_once(warning)
logger.warning_once(warning, scope="local")
os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

View File

@ -6,22 +6,7 @@ import torch
from torch.nn import functional as F
from vllm import _custom_ops as ops
def silu_and_mul(x: torch.Tensor) -> torch.Tensor:
d = x.shape[-1] // 2
return F.silu(x[..., :d]) * x[..., d:]
def swigluoai_and_mul(
x: torch.Tensor, alpha: float = 1.702, limit: float = 7.0
) -> torch.Tensor:
d = x.shape[-1] // 2
gate, up = x[..., :d], x[..., d:]
gate = gate.clamp(max=limit)
up = up.clamp(min=-limit, max=limit)
glu = gate * torch.sigmoid(alpha * gate)
return (up + 1) * glu
from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul
def grouped_topk(
@ -227,6 +212,11 @@ class CPUFusedMOE:
layer.w13_weight = torch.nn.Parameter(torch.empty(0), requires_grad=False)
layer.w2_weight = torch.nn.Parameter(torch.empty(0), requires_grad=False)
self.act_to_impl = {
"silu": SiluAndMul(),
"swigluoai": SwigluOAIAndMul(),
}
def __call__(
self,
layer: torch.nn.Module,
@ -246,7 +236,7 @@ class CPUFusedMOE:
apply_router_weight_on_input: bool = False,
activation: str = "silu",
) -> torch.Tensor:
assert activation in {"silu", "swigluoai"}, f"{activation} is not supported."
assert activation in self.act_to_impl, f"{activation} is not supported."
assert not apply_router_weight_on_input
topk_weights, topk_ids = select_experts(
hidden_states=x,
@ -283,10 +273,7 @@ class CPUFusedMOE:
tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
gate_up = layer.gate_up_linear[i](tokens_for_this_expert)
if activation == "swigluoai":
gate_up = swigluoai_and_mul(gate_up)
else:
gate_up = silu_and_mul(gate_up)
gate_up = self.act_to_impl[activation].forward_native(gate_up)
expert_out = layer.down_linear[i](gate_up)
outputs.append(expert_out)
start_idx = end_idx

View File

@ -90,10 +90,14 @@ class FusedMoEMethodBase(QuantizeMethodBase):
def allow_inplace(self) -> bool:
return False
@property
def method_name(self) -> str:
return self.__class__.__name__
@abstractmethod
def apply(
self,
layer: torch.nn.Module,
layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,

View File

@ -66,6 +66,10 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
def allow_inplace(self) -> bool:
return self.old_quant_method.allow_inplace
@property
def method_name(self) -> str:
return self.old_quant_method.method_name
def create_weights(
self,
layer: torch.nn.Module,
@ -84,7 +88,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
def apply(
self,
layer: torch.nn.Module,
layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -105,42 +109,9 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
# Is getattr needed?
zero_expert_num = getattr(layer, "zero_expert_num", 0)
zero_expert_type = getattr(layer, "zero_expert_type", None)
if enable_eplb:
if self.supports_eplb:
assert expert_load_view is not None
assert logical_to_physical_map is not None
assert logical_replica_count is not None
else:
raise NotImplementedError(
"EPLB is not supported for "
f"{self.old_quant_method.__class__.__name__}."
)
topk_weights, topk_ids, zero_expert_result = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
enable_eplb=enable_eplb,
expert_map=expert_map,
expert_load_view=expert_load_view,
logical_to_physical_map=logical_to_physical_map,
logical_replica_count=logical_replica_count,
global_num_experts=global_num_experts,
zero_expert_num=zero_expert_num,
zero_expert_type=zero_expert_type,
)
result = self.fused_experts(
@ -156,7 +127,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
expert_map=None if self.disable_expert_map else expert_map,
)
if zero_expert_num != 0 and zero_expert_type is not None:
if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
assert not isinstance(result, tuple), (
"Shared + zero experts are mutually exclusive not yet supported"
)

View File

@ -1510,30 +1510,11 @@ class FusedMoE(CustomOp):
logits_shape, dtype=moe.in_dtype, device=torch.cuda.current_device()
)
@staticmethod
def select_experts(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
use_grouped_topk: bool,
renormalize: bool,
topk_group: int | None = None,
num_expert_group: int | None = None,
custom_routing_function: Callable | None = None,
scoring_func: str = "softmax",
routed_scaling_factor: float = 1.0,
e_score_correction_bias: torch.Tensor | None = None,
indices_type: torch.dtype | None = None,
enable_eplb: bool = False,
expert_map: torch.Tensor | None = None,
expert_load_view: torch.Tensor | None = None,
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
global_num_experts: int | None = None,
zero_expert_num: int | None = None,
zero_expert_type: str | None = None,
num_fused_shared_experts: int = 0,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
"""
Route the input hidden states to the top-k experts based on the
router logits.
@ -1552,6 +1533,27 @@ class FusedMoE(CustomOp):
fused_topk_bias,
)
if self.enable_eplb:
if self.quant_method.supports_eplb:
if self.expert_load_view is None:
raise ValueError(
"enable_eplb=True requiere expert_load_view != None"
)
if self.logical_to_physical_map is None:
raise ValueError(
"enable_eplb=True requiere logical_to_physical_map != None"
)
if self.logical_replica_count is None:
raise ValueError(
"enable_eplb=True requiere logical_replica_count != None"
)
else:
raise NotImplementedError(
f"EPLB is not supported for {self.quant_method.method_name}."
)
indices_type = self.quant_method.topk_indices_dtype
# Check if we should use a routing simulation strategy
routing_strategy = envs.VLLM_MOE_ROUTING_SIMULATION_STRATEGY
if routing_strategy != "":
@ -1559,20 +1561,20 @@ class FusedMoE(CustomOp):
hidden_states=hidden_states,
router_logits=router_logits,
strategy_name=routing_strategy,
top_k=top_k,
top_k=self.top_k,
indices_type=indices_type,
)
# DeepSeekv2 uses grouped_top_k
elif use_grouped_topk:
assert topk_group is not None
assert num_expert_group is not None
elif self.use_grouped_topk:
assert self.topk_group is not None
assert self.num_expert_group is not None
if rocm_aiter_ops.is_fused_moe_enabled():
if not rocm_aiter_ops.is_fusion_moe_shared_experts_enabled():
assert num_fused_shared_experts == 0
assert self.num_fused_shared_experts == 0
grouped_topk_impl = partial(
rocm_aiter_grouped_topk,
num_fused_shared_experts=num_fused_shared_experts,
num_fused_shared_experts=self.num_fused_shared_experts,
)
else:
grouped_topk_impl = grouped_topk
@ -1580,50 +1582,46 @@ class FusedMoE(CustomOp):
topk_weights, topk_ids = grouped_topk_impl(
hidden_states=hidden_states,
gating_output=router_logits,
topk=top_k,
renormalize=renormalize,
num_expert_group=num_expert_group,
topk_group=topk_group,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
topk=self.top_k,
renormalize=self.renormalize,
num_expert_group=self.num_expert_group,
topk_group=self.topk_group,
scoring_func=self.scoring_func,
routed_scaling_factor=self.routed_scaling_factor,
e_score_correction_bias=self.e_score_correction_bias,
)
elif e_score_correction_bias is not None:
elif self.e_score_correction_bias is not None:
topk_weights, topk_ids = fused_topk_bias(
hidden_states=hidden_states,
gating_output=router_logits,
e_score_correction_bias=e_score_correction_bias.data,
topk=top_k,
renormalize=renormalize,
e_score_correction_bias=self.e_score_correction_bias.data,
topk=self.top_k,
renormalize=self.renormalize,
)
if routed_scaling_factor != 1.0:
topk_weights *= routed_scaling_factor
elif custom_routing_function is None:
if self.routed_scaling_factor != 1.0:
topk_weights *= self.routed_scaling_factor
elif self.custom_routing_function is None:
topk_weights, topk_ids, token_expert_indices = fused_topk(
hidden_states=hidden_states,
gating_output=router_logits,
topk=top_k,
renormalize=renormalize,
topk=self.top_k,
renormalize=self.renormalize,
indices_type=indices_type,
)
else:
topk_weights, topk_ids = custom_routing_function(
topk_weights, topk_ids = self.custom_routing_function(
hidden_states=hidden_states,
gating_output=router_logits,
topk=top_k,
renormalize=renormalize,
topk=self.top_k,
renormalize=self.renormalize,
)
if enable_eplb:
assert expert_load_view is not None
assert logical_to_physical_map is not None
assert logical_replica_count is not None
if self.enable_eplb:
topk_ids = eplb_map_to_physical_and_record(
topk_ids=topk_ids,
expert_load_view=expert_load_view,
logical_to_physical_map=logical_to_physical_map,
logical_replica_count=logical_replica_count,
expert_load_view=self.expert_load_view,
logical_to_physical_map=self.logical_to_physical_map,
logical_replica_count=self.logical_replica_count,
)
if (indices_type is not None) and topk_ids.dtype != indices_type:
@ -1633,16 +1631,16 @@ class FusedMoE(CustomOp):
# Compute zero expert result if needed
if (
zero_expert_num is not None
and zero_expert_num > 0
and zero_expert_type is not None
and global_num_experts is not None
self.zero_expert_num is not None
and self.zero_expert_num > 0
and self.zero_expert_type is not None
and self.global_num_experts is not None
):
zero_expert_result = zero_experts_compute_triton(
expert_indices=topk_ids,
expert_scales=topk_weights,
num_experts=global_num_experts,
zero_expert_type=zero_expert_type,
num_experts=self.global_num_experts,
zero_expert_type=self.zero_expert_type,
hidden_states=hidden_states,
)
else:
@ -1692,6 +1690,10 @@ class FusedMoE(CustomOp):
)
def reduce_output(states: torch.Tensor) -> torch.Tensor:
# Slice before all_reduce to enable possible fusion
if self.hidden_size != og_hidden_states:
states = states[..., :og_hidden_states]
if (
not self.is_sequence_parallel
and not self.use_dp_chunking
@ -1714,11 +1716,12 @@ class FusedMoE(CustomOp):
if self.zero_expert_num is not None and self.zero_expert_num > 0:
assert isinstance(fused_output, tuple)
fused_output, zero_expert_result = fused_output
return (reduce_output(fused_output) + zero_expert_result)[
..., :og_hidden_states
]
return (
reduce_output(fused_output)
+ zero_expert_result[..., :og_hidden_states]
)
else:
return reduce_output(fused_output)[..., :og_hidden_states]
return reduce_output(fused_output)
else:
if current_platform.is_tpu():
# TODO: Once the OOM issue for the TPU backend is resolved, we
@ -1731,8 +1734,8 @@ class FusedMoE(CustomOp):
hidden_states, router_logits, self.layer_name
)
return (
reduce_output(shared_output)[..., :og_hidden_states],
reduce_output(fused_output)[..., :og_hidden_states],
reduce_output(shared_output),
reduce_output(fused_output),
)
def forward_cuda(

View File

@ -331,7 +331,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
def forward_cuda(
self,
layer: torch.nn.Module,
layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
use_grouped_topk: bool,
top_k: int,
@ -352,31 +352,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
zero_expert_num = getattr(layer, "zero_expert_num", 0)
zero_expert_type = getattr(layer, "zero_expert_type", None)
topk_weights, topk_ids, zero_expert_result = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
enable_eplb=enable_eplb,
expert_map=expert_map,
expert_load_view=expert_load_view,
logical_to_physical_map=logical_to_physical_map,
logical_replica_count=logical_replica_count,
global_num_experts=global_num_experts,
zero_expert_num=zero_expert_num,
zero_expert_type=zero_expert_type,
num_fused_shared_experts=layer.num_fused_shared_experts,
)
if self.rocm_aiter_moe_enabled:
@ -415,7 +393,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
expert_map=expert_map,
)
if zero_expert_num != 0 and zero_expert_type is not None:
if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
assert not isinstance(result, tuple), (
"Shared + zero experts are mutually exclusive not yet supported"
)
@ -425,7 +403,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
def forward_cpu(
self,
layer: torch.nn.Module,
layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
use_grouped_topk: bool,
top_k: int,
@ -474,7 +452,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
def forward_xpu(
self,
layer: torch.nn.Module,
layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
use_grouped_topk: bool,
top_k: int,
@ -515,7 +493,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
def forward_tpu(
self,
layer: torch.nn.Module,
layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
use_grouped_topk: bool,
top_k: int,

View File

@ -597,7 +597,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -618,24 +618,11 @@ class AWQMoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if enable_eplb:
raise NotImplementedError("EPLB not supported for `AWQMoEMethod` yet.")
assert activation == "silu", "Only SiLU activation is supported."
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
)
return fused_marlin_moe(

View File

@ -495,7 +495,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -518,25 +518,11 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
from vllm.model_executor.layers.fused_moe import fused_experts
if enable_eplb:
raise NotImplementedError(
"EPLB not supported for `BitsAndBytesMoEMethod` yet."
)
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
)
# TODO(bnell): Do these need to be called on the hot path?
if self.quant_config.load_in_8bit:
w13, w2 = self._apply_8bit_dequant(layer)
else:

View File

@ -511,7 +511,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -532,16 +532,17 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if enable_eplb:
raise NotImplementedError(
"EPLB not supported for `CompressedTensorsW4A4MoeMethod` yet."
)
assert activation == "silu", "Only SiLU activation is supported."
if (
self.allow_flashinfer
and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
):
if enable_eplb:
raise NotImplementedError(
"EPLB not supported for `CompressedTensorsW4A4MoeMethod` yet."
)
return flashinfer_trtllm_fp4_moe(
layer=layer,
x=x,
@ -554,19 +555,9 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
e_score_correction_bias=e_score_correction_bias,
)
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
)
if self.use_marlin:
@ -1109,7 +1100,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -1130,31 +1121,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if enable_eplb:
assert expert_load_view is not None
assert logical_to_physical_map is not None
assert logical_replica_count is not None
assert isinstance(layer, FusedMoE)
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
num_fused_shared_experts=layer.num_fused_shared_experts,
enable_eplb=enable_eplb,
expert_map=expert_map,
expert_load_view=expert_load_view,
logical_to_physical_map=logical_to_physical_map,
logical_replica_count=logical_replica_count,
)
per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN
@ -1377,7 +1346,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -1398,26 +1367,11 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if enable_eplb:
raise NotImplementedError(
"EPLB not supported for `CompressedTensorsW8A8Int8MoEMethod` yet."
)
from vllm.model_executor.layers.fused_moe import fused_experts
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
)
return fused_experts(
@ -1738,7 +1692,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -1759,26 +1713,11 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if enable_eplb:
raise NotImplementedError(
"EPLB not supported for `CompressedTensorsWNA16MarlinMoEMethod` yet."
)
assert activation == "silu", f"{activation} not supported for Marlin MoE."
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
)
return fused_marlin_moe(
@ -2001,7 +1940,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -2022,43 +1961,11 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if enable_eplb:
if expert_load_view is None:
raise ValueError("enable_eplb=True requiere expert_load_view != None")
if logical_to_physical_map is None:
raise ValueError(
"enable_eplb=True requiere logical_to_physical_map != None"
)
if logical_replica_count is None:
raise ValueError(
"enable_eplb=True requiere logical_replica_count != None"
)
if not isinstance(layer, FusedMoE):
raise TypeError(
"EPLB is only supported when `layer` is a instance of FusedMoE."
)
from vllm.model_executor.layers.fused_moe import fused_experts
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
num_fused_shared_experts=getattr(layer, "num_fused_shared_experts", 0),
enable_eplb=enable_eplb,
expert_map=expert_map,
expert_load_view=expert_load_view,
logical_to_physical_map=logical_to_physical_map,
logical_replica_count=logical_replica_count,
)
return fused_experts(

View File

@ -137,7 +137,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -158,26 +158,11 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if enable_eplb:
raise NotImplementedError(
"EPLB not supported for `ExpertsInt8MoEMethod` yet."
)
from vllm.model_executor.layers.fused_moe import fused_experts
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
)
return fused_experts(

View File

@ -28,6 +28,7 @@ from vllm.model_executor.layers.fused_moe import (
FusedMoeWeightScaleSupported,
)
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEParallelConfig,
FusedMoEQuantConfig,
RoutingMethodType,
fp8_w8a8_moe_quant_config,
@ -118,7 +119,9 @@ class Fp8MoeBackend(Enum):
TRITON = 6
def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
def get_fp8_moe_backend(
block_quant: bool, moe_parallel_config: FusedMoEParallelConfig
) -> Fp8MoeBackend:
"""
Select the primary FP8 MoE backend
Note: Shape-specific fallbacks may still occur at runtime.
@ -159,8 +162,19 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
logger.info_once("Using Marlin backend for FP8 MoE")
return Fp8MoeBackend.MARLIN
# deepGEMM on supported platforms with block-quantized weights
if envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM and block_quant:
# Determine if we should use DeepGEMM with block-quantized weights:
# - If explicitly set by user, respect their choice
# - If not explicitly set (default), disable when TP size is >= 8
moe_use_deep_gemm = envs.VLLM_MOE_USE_DEEP_GEMM
if not envs.is_set("VLLM_MOE_USE_DEEP_GEMM") and moe_parallel_config.tp_size >= 8:
moe_use_deep_gemm = False
logger.info_once(
"DeepGEMM MoE is disabled by default when TP size is >= 8. "
"Set VLLM_MOE_USE_DEEP_GEMM=1 to enable it.",
scope="local",
)
if envs.VLLM_USE_DEEP_GEMM and moe_use_deep_gemm and block_quant:
if not has_deep_gemm():
logger.warning_once(
"DeepGEMM backend requested but not available.", scope="local"
@ -641,7 +655,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
self.quant_config = quant_config
self.weight_block_size = self.quant_config.weight_block_size
self.block_quant: bool = self.weight_block_size is not None
self.fp8_backend = get_fp8_moe_backend(self.block_quant)
self.fp8_backend = get_fp8_moe_backend(
self.block_quant, layer.moe_parallel_config
)
self.use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN
self.flashinfer_moe_backend: FlashinferMoeBackend | None = None
@ -1140,7 +1156,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -1216,31 +1232,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
apply_router_weight_on_input=apply_router_weight_on_input,
)
zero_expert_num = getattr(layer, "zero_expert_num", 0)
zero_expert_type = getattr(layer, "zero_expert_type", None)
select_result = FusedMoE.select_experts(
select_result = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
enable_eplb=enable_eplb,
expert_map=expert_map,
expert_load_view=expert_load_view,
logical_to_physical_map=logical_to_physical_map,
logical_replica_count=logical_replica_count,
global_num_experts=global_num_experts,
zero_expert_num=zero_expert_num,
zero_expert_type=zero_expert_type,
num_fused_shared_experts=layer.num_fused_shared_experts,
)
topk_weights, topk_ids, zero_expert_result = select_result
@ -1322,7 +1316,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
self.allow_cutlass_block_scaled_grouped_gemm
),
)
if zero_expert_num != 0 and zero_expert_type is not None:
if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
assert not isinstance(result, tuple), (
"Shared + zero experts are mutually exclusive not yet supported"
)

View File

@ -621,7 +621,7 @@ class GGUFMoEMethod(FusedMoEMethodBase):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -642,9 +642,6 @@ class GGUFMoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if enable_eplb:
raise NotImplementedError("EPLB not supported for `GGUFMoEMethod` yet.")
assert activation == "silu", "Only SiLU activation is supported."
if apply_router_weight_on_input:
raise NotImplementedError(
@ -652,19 +649,9 @@ class GGUFMoEMethod(FusedMoEMethodBase):
"fused GGUF MoE method."
)
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
)
return fused_moe_gguf(
x,

View File

@ -722,7 +722,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -743,26 +743,11 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if enable_eplb:
raise NotImplementedError(
"EPLB not supported for `GPTQMarlinMoEMethod` yet."
)
assert activation == "silu", "Only SiLU activation is supported."
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
)
return fused_marlin_moe(

View File

@ -696,7 +696,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -717,12 +717,11 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if enable_eplb:
raise NotImplementedError(
"EPLB not supported for `ModelOptFp8MoEMethod` yet."
)
if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
if layer.enable_eplb:
raise NotImplementedError(
"EPLB not supported for `ModelOptFp8MoEMethod` yet."
)
assert activation == "silu", (
f"Expected 'silu' activation but got {activation}"
)
@ -740,19 +739,9 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
)
# Expert selection
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
)
if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
@ -1459,7 +1448,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -1480,16 +1469,16 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if enable_eplb:
raise NotImplementedError(
"EPLB not supported for `ModelOptNvFp4FusedMoE` yet."
)
assert activation == "silu", "Only SiLU activation is supported."
if (
self.allow_flashinfer
and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
):
if enable_eplb:
raise NotImplementedError(
"EPLB not supported for `ModelOptNvFp4FusedMoE` yet."
)
return flashinfer_trtllm_fp4_moe(
layer=layer,
x=x,
@ -1502,19 +1491,9 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
e_score_correction_bias=e_score_correction_bias,
)
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
)
if self.use_marlin:

View File

@ -359,7 +359,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -380,25 +380,12 @@ class MoeWNA16Method(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if enable_eplb:
raise NotImplementedError("EPLB not supported for `MoeWNA16Method` yet.")
from vllm.model_executor.layers.fused_moe import fused_experts
assert activation == "silu", "Only SiLU activation is supported."
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
)
return fused_experts(

View File

@ -132,12 +132,15 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
)
# If FlashInfer is not available, try either Marlin or Triton
if (
envs.VLLM_MXFP4_USE_MARLIN
or current_platform.get_device_capability()[0] < 9
or not has_triton_kernels()
or not is_torch_equal_or_newer("2.8.0")
):
triton_kernels_supported = (
has_triton_kernels()
and is_torch_equal_or_newer("2.8.0")
# NOTE: triton_kernels are only confirmed to work on SM90 and SM100
# SM110 fails with this error: https://github.com/vllm-project/vllm/issues/29317
# SM120 needs this fix: https://github.com/triton-lang/triton/pull/8498
and (9, 0) <= current_platform.get_device_capability() < (11, 0)
)
if envs.VLLM_MXFP4_USE_MARLIN or not triton_kernels_supported:
logger.info_once("Using Marlin backend")
return Mxfp4Backend.MARLIN
else:
@ -862,7 +865,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -887,18 +890,9 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
raise NotImplementedError("EPLB is not supported for mxfp4")
if self.mxfp4_backend == Mxfp4Backend.MARLIN:
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
)
return fused_marlin_moe(
@ -989,17 +983,9 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
):
from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
e_score_correction_bias=e_score_correction_bias,
)
# Backend-specific preparation

View File

@ -334,7 +334,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -355,24 +355,9 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if enable_eplb:
raise NotImplementedError(
"EPLB not supported for `QuarkW8A8Fp8MoEMethod` yet."
)
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
)
if self.rocm_aiter_moe_enabled:
@ -609,7 +594,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -630,24 +615,9 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if enable_eplb:
raise NotImplementedError(
"EPLB not supported for `QuarkOCP_MX_MoEMethod` yet."
)
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
)
if not self.emulate:

View File

@ -356,7 +356,7 @@ class RTNMoEMethod(FusedMoEMethodBase):
def apply(
self,
layer: torch.nn.Module,
layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@ -377,22 +377,9 @@ class RTNMoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if enable_eplb:
raise NotImplementedError("EPLB not supported for `RTNMoEMethod` yet.")
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
e_score_correction_bias=e_score_correction_bias,
indices_type=self.topk_indices_dtype,
)
return fused_marlin_moe(

View File

@ -17,6 +17,7 @@ from .llama4_vision_rope import Llama4VisionRotaryEmbedding
from .mrope import MRotaryEmbedding
from .ntk_scaling_rope import NTKScalingRotaryEmbedding
from .phi3_long_rope_scaled_rope import Phi3LongRoPEScaledRotaryEmbedding
from .xdrope import XDRotaryEmbedding
from .yarn_scaling_rope import YaRNScalingRotaryEmbedding
_ROPE_DICT: dict[tuple, RotaryEmbedding] = {}
@ -184,6 +185,18 @@ def get_rope(
raise ValueError(
"Dynamic rope scaling must contain either 'alpha' or 'factor' field"
)
elif scaling_type == "xdrope":
scaling_alpha = rope_parameters["alpha"]
rotary_emb = XDRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
scaling_alpha,
dtype,
xdrope_section=rope_parameters["xdrope_section"],
)
elif scaling_type == "yarn":
scaling_factor = rope_parameters["factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]

View File

@ -0,0 +1,102 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import numpy as np
import torch
from .common import apply_rotary_emb_dispatch
from .dynamic_ntk_alpha_rope import DynamicNTKAlphaRotaryEmbedding
class XDRotaryEmbedding(DynamicNTKAlphaRotaryEmbedding):
"""DynamicNTKAlphaRotaryEmbedding extended with MultiModal(XD) Sections.
Based on the original DynamicNTKAlphaRotaryEmbedding implementation.
"""
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: float,
is_neox_style: bool,
scaling_alpha: float,
dtype: torch.dtype,
xdrope_section: list[int],
) -> None:
self.xdrope_section = xdrope_section
super().__init__(
head_size,
rotary_dim,
max_position_embeddings,
base,
is_neox_style,
scaling_alpha,
dtype,
)
def forward(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
offsets: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
"""PyTorch-native implementation equivalent to forward().
Args:
positions:
[4, num_tokens] (P/W/H/T positions with multimodal inputs)
query: [num_tokens, num_heads * head_size]
key: [num_tokens, num_kv_heads * head_size]
"""
assert positions.ndim == 2
assert key is not None
num_tokens = positions.shape[-1]
cos_sin = self.cos_sin_cache[positions]
cos, sin = cos_sin.chunk(2, dim=-1)
cos = torch.cat(
[m[i] for i, m in enumerate(cos.split(self.xdrope_section, dim=-1))], dim=-1
)
sin = torch.cat(
[m[i] for i, m in enumerate(sin.split(self.xdrope_section, dim=-1))], dim=-1
)
query_shape = query.shape
query = query.view(num_tokens, -1, self.head_size)
query_rot = query[..., : self.rotary_dim]
query_pass = query[..., self.rotary_dim :]
query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin, self.is_neox_style)
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
key_shape = key.shape
key = key.view(num_tokens, -1, self.head_size)
key_rot = key[..., : self.rotary_dim]
key_pass = key[..., self.rotary_dim :]
key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin, self.is_neox_style)
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
return query, key
@staticmethod
def get_next_input_positions(
context_len: int,
seq_len: int,
xd_sections: int = 4,
) -> list[list[int]]:
return [list(range(context_len, seq_len)) for _ in range(xd_sections)]
@staticmethod
def get_next_input_positions_tensor(
out: np.ndarray,
out_offset: int,
context_len: int,
num_new_tokens: int,
):
values = np.arange(
context_len,
context_len + num_new_tokens,
dtype=out.dtype,
)
out[:, out_offset : out_offset + num_new_tokens] = values

View File

@ -576,7 +576,16 @@ class HunYuanDecoderLayer(nn.Module):
return hidden_states, residual, ori_kv_states
@support_torch_compile
@support_torch_compile(
dynamic_arg_dims={
"input_ids": 0,
# positions is of shape (xd, seq_len) if xdrope is enabled for hunyuan-vl,
# otherwise (seq_len, ).
"positions": -1,
"intermediate_tensors": 0,
"inputs_embeds": 0,
}
)
class HunYuanModel(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()

File diff suppressed because it is too large Load Diff

View File

@ -1047,7 +1047,7 @@ class SupportsMRoPE(Protocol):
supports_mrope: ClassVar[Literal[True]] = True
"""
A flag that indicates this model supports M-RoPE.
Note:
There is no need to redefine this flag if this class is in the
MRO of your model class.
@ -1088,3 +1088,52 @@ def supports_mrope(
model: type[object] | object,
) -> TypeIs[type[SupportsMRoPE]] | TypeIs[SupportsMRoPE]:
return isinstance(model, SupportsMRoPE)
@runtime_checkable
class SupportsXDRoPE(Protocol):
"""The interface required for all models that support XD-RoPE."""
supports_xdrope: ClassVar[Literal[True]] = True
"""
A flag that indicates this model supports XD-RoPE.
Note:
There is no need to redefine this flag if this class is in the
XDRope of your model class.
"""
def get_xdrope_input_positions(
self,
input_tokens: list[int],
mm_features: list["MultiModalFeatureSpec"],
) -> torch.Tensor:
"""
Get XD-RoPE input positions and delta value for this specific model.
This method should be implemented by each model that supports XD-RoPE
to provide model-specific logic for computing input positions.
Args:
input_tokens: List of input token IDs
mm_features: Information about each multi-modal data item
Returns:
llm_positions: Tensor of shape `[xdrope_dim, num_tokens]` with
4D(P/W/H/T) or 3D(W/H/T) positions.
"""
...
@overload
def supports_xdrope(model: type[object]) -> TypeIs[type[SupportsXDRoPE]]: ...
@overload
def supports_xdrope(model: object) -> TypeIs[SupportsXDRoPE]: ...
def supports_xdrope(
model: type[object] | object,
) -> TypeIs[type[SupportsXDRoPE]] | TypeIs[SupportsXDRoPE]:
return isinstance(model, SupportsXDRoPE)

View File

@ -142,6 +142,12 @@ class LlamaModel(nn.Module):
# Get drafter's quantization config
self.quant_config = get_draft_quant_config(vllm_config)
eagle_config = getattr(self.config, "eagle_config", None)
if eagle_config is not None and "use_aux_hidden_state" in eagle_config:
self.use_aux_hidden_state = eagle_config["use_aux_hidden_state"]
else:
self.use_aux_hidden_state = True
current_vllm_config = get_current_vllm_config()
self.embed_tokens = VocabParallelEmbedding(
@ -161,20 +167,20 @@ class LlamaModel(nn.Module):
for layer_idx in range(self.config.num_hidden_layers)
]
)
if hasattr(self.config, "target_hidden_size"):
fc_input_size = self.config.target_hidden_size * 3
else:
fc_input_size = self.config.hidden_size * 3
self.fc = ReplicatedLinear(
input_size=fc_input_size,
output_size=self.config.hidden_size,
bias=False,
params_dtype=vllm_config.model_config.dtype,
quant_config=self.quant_config,
prefix=maybe_prefix(prefix, "fc"),
return_bias=False,
)
if self.use_aux_hidden_state:
if hasattr(self.config, "target_hidden_size"):
fc_input_size = self.config.target_hidden_size * 3
else:
fc_input_size = self.config.hidden_size * 3
self.fc = ReplicatedLinear(
input_size=fc_input_size,
output_size=self.config.hidden_size,
bias=False,
params_dtype=vllm_config.model_config.dtype,
quant_config=self.quant_config,
prefix=maybe_prefix(prefix, "fc"),
return_bias=False,
)
self.norm = RMSNorm(
self.config.hidden_size,
eps=self.config.rms_norm_eps,
@ -332,6 +338,8 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
self,
hidden_states: torch.Tensor,
) -> torch.Tensor:
if not self.model.use_aux_hidden_state:
return hidden_states
# combine multiple auxiliary hidden states returned by eagle3
return self.model.fc(hidden_states)
@ -357,6 +365,8 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
skip_substrs.append("draft_id_to_target_id")
if not includes_embed_tokens:
skip_substrs.append("embed_tokens")
if not self.model.use_aux_hidden_state:
skip_substrs.append("fc.")
loader = AutoWeightsLoader(
self,
skip_prefixes=None,

View File

@ -56,10 +56,13 @@ from transformers.utils import is_flash_attn_2_available
from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm.model_executor.models.utils import maybe_prefix
from vllm.platforms import current_platform
from vllm.transformers_utils.configs.moonvit import MoonViTConfig
if is_flash_attn_2_available():
from flash_attn import flash_attn_varlen_func
elif current_platform.is_xpu():
from vllm.attention.utils.fa_utils import flash_attn_varlen_func
else:
flash_attn_varlen_func = None
@ -106,10 +109,10 @@ def multihead_attention(
q,
k,
v,
q_cu_seqlens,
k_cu_seqlens,
max_seqlen_q,
max_seqlen_k,
cu_seqlens_q=q_cu_seqlens,
cu_seqlens_k=k_cu_seqlens,
max_seqlen_q=max_seqlen_q,
max_seqlen_k=max_seqlen_k,
causal=False,
)
attn_out = attn_out.flatten(start_dim=-2)
@ -291,7 +294,12 @@ class Rope2DPosEmb(nn.Module):
"""
def __init__(
self, dim: int, max_height: int, max_width: int, theta_base=10000, device="cuda"
self,
dim: int,
max_height: int,
max_width: int,
theta_base=10000,
device=current_platform.device_type,
):
super().__init__()
self.dim = dim
@ -437,7 +445,7 @@ class MoonVitEncoderLayer(nn.Module):
self.hidden_size_per_attention_head = self.hidden_dim // self.num_heads
self.attn_implementation = attn_implementation
# use fa2 in vllm by default
if is_flash_attn_2_available():
if is_flash_attn_2_available() or current_platform.is_xpu():
self.attn_implementation = "flash_attention_2"
self.norm0 = nn.LayerNorm(hidden_dim)

View File

@ -23,7 +23,6 @@
"""Inference-only Qwen2.5-Omni model (thinker part)."""
from collections.abc import Callable, Iterable, Mapping, Sequence
from copy import copy
from functools import partial
from typing import Annotated, Any, Literal
@ -387,15 +386,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
self._validate_mm_kwargs(mm_kwargs, mm_item_counts)
self._validate_mm_updates(mm_prompt_updates, mm_item_counts)
use_audio_in_video = False
if "video" in mm_kwargs:
video_items = [item for item in mm_kwargs["video"] if item is not None]
# only check video items (if there are any)
if video_items:
use_audio_in_video = all(
item["use_audio_in_video"].data for item in video_items
)
if is_update_applied:
mm_placeholders = self._find_mm_placeholders(
prompt_ids,
@ -404,7 +394,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
self._validate_mm_placeholders(
mm_placeholders,
mm_item_counts,
use_audio_in_video=use_audio_in_video,
)
else:
prompt_ids, mm_placeholders = self._apply_prompt_updates(
@ -414,7 +403,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
self._validate_mm_placeholders(
mm_placeholders,
mm_item_counts,
use_audio_in_video=use_audio_in_video,
)
return prompt_ids, mm_placeholders
@ -640,19 +628,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
return mm_processed_data
def _validate_mm_placeholders(
self,
mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
mm_item_counts: Mapping[str, int],
use_audio_in_video: bool = False,
) -> None:
if use_audio_in_video:
mm_item_counts = copy(mm_item_counts)
if "video" in mm_item_counts:
assert "audio" in mm_item_counts
mm_item_counts["audio"] -= mm_item_counts["video"]
super()._validate_mm_placeholders(mm_placeholders, mm_item_counts)
class Qwen2_5OmniConditionalGenerationMixin:
def _parse_and_validate_audio_input(

View File

@ -68,11 +68,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems
from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataItems
from vllm.multimodal.processing import (
BaseMultiModalProcessor,
MultiModalPromptUpdates,
PlaceholderFeaturesInfo,
PromptReplacement,
PromptUpdate,
PromptUpdateDetails,
)
from vllm.sequence import IntermediateTensors
@ -87,7 +87,6 @@ from .qwen2_5_omni_thinker import (
Qwen2_5OmniConditionalGenerationMixin,
Qwen2_5OmniThinkerDummyInputsBuilder,
Qwen2_5OmniThinkerMultiModalProcessor,
Qwen2_5OmniThinkerProcessingInfo,
)
from .qwen2_5_vl import (
Qwen2_5_VisionAttention,
@ -807,24 +806,8 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
else:
use_audio_in_video = False
if use_audio_in_video and "video" in mm_item_counts:
assert "audio" in mm_item_counts
mm_item_counts["audio"] -= mm_item_counts["video"]
# Special case with `use_audio_in_video=True`
if use_audio_in_video:
if is_update_applied:
prompt_ids = self._get_raw_input_ids(prompt_ids, use_audio_in_video)
(
prompt_ids,
mm_placeholders,
) = self._apply_prompt_updates(
prompt_ids,
mm_prompt_updates,
)
self._validate_mm_placeholders(mm_placeholders, mm_item_counts)
# normal case with `use_audio_in_video=False`
elif is_update_applied:
if is_update_applied:
mm_placeholders = self._find_mm_placeholders(
prompt_ids,
mm_prompt_updates,
@ -834,10 +817,24 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
mm_item_counts,
)
else:
prompt_ids, mm_placeholders = self._apply_prompt_updates(
prompt_ids,
mm_prompt_updates,
)
if use_audio_in_video and "audio" in mm_prompt_updates:
filtered_updates = {
k: v for k, v in mm_prompt_updates.items() if k != "audio"
}
prompt_ids, mm_placeholders = self._apply_prompt_updates(
prompt_ids,
filtered_updates,
)
# Derive audio placeholders from video placeholders
mm_placeholders = self._derive_audio_from_video_placeholders(
mm_placeholders, mm_prompt_updates
)
else:
prompt_ids, mm_placeholders = self._apply_prompt_updates(
prompt_ids,
mm_prompt_updates,
)
self._validate_mm_placeholders(
mm_placeholders,
mm_item_counts,
@ -962,7 +959,9 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
def get_replacement_qwen2_use_audio_in_video(item_idx: int):
nonlocal audio_in_video_item_idx
audio_num_features = audio_output_lengths[audio_item_idx + item_idx]
audio_num_features = audio_output_lengths[
audio_in_video_item_idx + item_idx
]
video_grid_thw = out_mm_data["video_grid_thw"][item_idx]
audio_in_video_item_idx += 1
@ -971,14 +970,17 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
if second_per_grid_ts:
video_second_per_grid_t = second_per_grid_ts[item_idx]
else:
video_second_per_grid_t = 1.0
video_second_per_grid_t = 2.0
return self.get_updates_use_audio_in_video(
placeholder = self.get_updates_use_audio_in_video(
thinker_config=thinker_config,
audio_len=audio_num_features,
video_grid_thw=video_grid_thw,
video_second_per_grid_t=video_second_per_grid_t,
)
return PromptUpdateDetails.select_token_id(
placeholder, embed_token_id=video_token_id
)
video_replacement_fn = (
get_replacement_qwen2_use_audio_in_video
@ -1004,14 +1006,50 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
),
]
def _validate_mm_placeholders(
def _derive_audio_from_video_placeholders(
self,
mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
mm_item_counts: Mapping[str, int],
) -> None:
BaseMultiModalProcessor[
Qwen2_5OmniThinkerProcessingInfo
]._validate_mm_placeholders(self, mm_placeholders, mm_item_counts)
placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
mm_prompt_updates: MultiModalPromptUpdates,
) -> Mapping[str, list[PlaceholderFeaturesInfo]]:
"""
Helper to derive audio placeholders from video placeholders when
use_audio_in_video=True.
"""
if "video" not in placeholders:
return placeholders
# Validate audio and video counts match
num_videos = len(placeholders["video"])
num_audios = len(mm_prompt_updates.get("audio", []))
if num_audios != num_videos:
raise ValueError(
f"use_audio_in_video requires equal number of audio and video items, "
f"got {num_audios=}, {num_videos=}"
)
tokenizer = self.info.get_tokenizer()
processor = self.info.get_hf_processor()
audio_token_id = tokenizer.get_vocab()[processor.audio_token]
result_placeholders = dict(placeholders)
audio_placeholders = []
# Each video is paired with one audio
for video_idx, video_placeholder in enumerate(placeholders["video"]):
# Create is_embed mask selecting only audio tokens
audio_is_embed = torch.tensor(video_placeholder.tokens) == audio_token_id
audio_placeholder = PlaceholderFeaturesInfo(
modality="audio",
item_idx=video_idx,
start_idx=video_placeholder.start_idx,
tokens=video_placeholder.tokens,
is_embed=audio_is_embed,
)
audio_placeholders.append(audio_placeholder)
result_placeholders["audio"] = audio_placeholders
return result_placeholders
def _get_raw_input_ids(
self,
@ -1454,7 +1492,11 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
)
if not len(second_per_grid_ts) and len(video_grid_thw):
second_per_grids = torch.ones(len(video_grid_thw), dtype=torch.float32)
second_per_grid_ts = 2.0
second_per_grids = (
torch.ones(len(video_grid_thw), dtype=torch.float32)
* second_per_grid_ts
)
else:
second_per_grids = torch.tensor(second_per_grid_ts, dtype=torch.float32)

View File

@ -287,6 +287,10 @@ _MULTIMODAL_MODELS = {
"GraniteSpeechForConditionalGeneration",
),
"H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
"HunYuanVLForConditionalGeneration": (
"hunyuan_vision",
"HunYuanVLForConditionalGeneration",
),
"InternVLChatModel": ("internvl", "InternVLChatModel"),
"NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
"OpenCUAForConditionalGeneration": (

View File

@ -134,6 +134,11 @@ class Platform:
_global_graph_pool: Any | None = None
@property
def pass_key(self) -> str:
"""Inductor config key for the PassManager custom pass"""
return "post_grad_custom_post_pass"
@property
def supported_dtypes(self) -> list[torch.dtype]:
"""Returns the supported dtypes for the current platform."""
@ -177,6 +182,21 @@ class Platform:
# all ROCm platforms for now.
return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
@classmethod
def get_pass_manager_cls(cls) -> str:
"""
Get the pass manager class for this platform.
It will be registered as a custom pass under the current_platform.pass_key.
"""
return "vllm.compilation.pass_manager.PostGradPassManager"
@classmethod
def get_compile_backend(cls) -> str:
"""
Get the custom compile backend for current platform.
"""
return cls.simple_compile_backend
@classmethod
def device_id_to_physical_device_id(cls, device_id: int):
# Treat empty device control env var as unset. This is a valid

View File

@ -262,6 +262,10 @@ class RocmPlatform(Platform):
f"is not MLA type while requested for MLA backend."
)
if selected_backend == AttentionBackendEnum.FLEX_ATTENTION:
logger.info("Using FlexAttention backend.")
return AttentionBackendEnum.FLEX_ATTENTION.get_path()
if selected_backend == AttentionBackendEnum.TRITON_ATTN:
logger.info("Using Triton Attention backend on V1 engine.")
return AttentionBackendEnum.TRITON_ATTN.get_path()
@ -317,8 +321,8 @@ class RocmPlatform(Platform):
return AttentionBackendEnum.TRITON_ATTN.get_path()
raise RuntimeError(
"V0 attention backends have been removed. Set VLLM_USE_V1=1 "
"to select a supported backend."
f"Attention backend {selected_backend.name} is not supported on "
"ROCm. Note that V0 attention backends have been removed."
)
@classmethod

View File

@ -57,7 +57,7 @@ class PoolingParams(
## Internal use only
task: PoolingTask | None = None
requires_token_ids: bool = False
skip_reading_prefix_cache: bool = None
skip_reading_prefix_cache: bool | None = None
extra_kwargs: dict[str, Any] | None = None
output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY

View File

@ -3,7 +3,6 @@
"""Sampling parameters for text generation."""
import copy
import warnings
from dataclasses import field
from enum import Enum, IntEnum
from functools import cached_property
@ -100,19 +99,6 @@ class StructuredOutputsParams:
)
@dataclass
class GuidedDecodingParams(StructuredOutputsParams):
def __post_init__(self):
warnings.warn(
"GuidedDecodingParams is deprecated. This will be removed in "
"v0.12.0 or v1.0.0, which ever is soonest. Please use "
"StructuredOutputsParams instead.",
DeprecationWarning,
stacklevel=2,
)
return super().__post_init__()
class RequestOutputKind(Enum):
# Return entire output so far in every RequestOutput
CUMULATIVE = 0
@ -234,8 +220,6 @@ class SamplingParams(
# Fields used to construct logits processors
structured_outputs: StructuredOutputsParams | None = None
"""Parameters for configuring structured outputs."""
guided_decoding: GuidedDecodingParams | None = None
"""Deprecated alias for structured_outputs."""
logit_bias: dict[int, float] | None = None
"""If provided, the engine will construct a logits processor that applies
these logit biases."""
@ -254,7 +238,7 @@ class SamplingParams(
generated token can complete the sequence."""
_bad_words_token_ids: list[list[int]] | None = None
skip_reading_prefix_cache: bool = None
skip_reading_prefix_cache: bool | None = None
@staticmethod
def from_optional(
@ -283,7 +267,6 @@ class SamplingParams(
truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None,
output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE,
structured_outputs: StructuredOutputsParams | None = None,
guided_decoding: GuidedDecodingParams | None = None,
logit_bias: dict[int, float] | dict[str, float] | None = None,
allowed_token_ids: list[int] | None = None,
extra_args: dict[str, Any] | None = None,
@ -295,16 +278,6 @@ class SamplingParams(
int(token): min(100.0, max(-100.0, bias))
for token, bias in logit_bias.items()
}
if guided_decoding is not None:
warnings.warn(
"guided_decoding is deprecated. This will be removed in "
"v0.12.0 or v1.0.0, which ever is soonest. Please use "
"structured_outputs instead.",
DeprecationWarning,
stacklevel=2,
)
structured_outputs = guided_decoding
guided_decoding = None
return SamplingParams(
n=1 if n is None else n,
@ -387,17 +360,6 @@ class SamplingParams(
# eos_token_id is added to this by the engine
self._all_stop_token_ids.update(self.stop_token_ids)
if self.guided_decoding is not None:
warnings.warn(
"guided_decoding is deprecated. This will be removed in "
"v0.12.0 or v1.0.0, which ever is soonest. Please use "
"structured_outputs instead.",
DeprecationWarning,
stacklevel=2,
)
self.structured_outputs = self.guided_decoding
self.guided_decoding = None
if self.skip_reading_prefix_cache is None:
# If prefix caching is enabled,
# the output of prompt logprobs may less than n_prompt_tokens,

View File

@ -86,6 +86,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
deepseek_vl_v2="DeepseekVLV2Config",
deepseek_v32="DeepseekV3Config",
flex_olmo="FlexOlmoConfig",
hunyuan_vl="HunYuanVLConfig",
kimi_linear="KimiLinearConfig",
kimi_vl="KimiVLConfig",
RefinedWeb="RWConfig", # For tiiuae/falcon-40b(-instruct)
@ -549,6 +550,23 @@ def thinker_uses_mrope(config: PretrainedConfig) -> bool:
return uses_mrope(thinker_text_config)
def uses_xdrope_dim(config: PretrainedConfig) -> int:
"""Detect if the model with this config uses XD-ROPE."""
xdrope_section = getattr(config, "xdrope_section", None)
if xdrope_section is not None and isinstance(xdrope_section, list):
return len(xdrope_section)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is None:
return 0
if isinstance(rope_scaling, dict) and "xdrope_section" in rope_scaling:
xdrope_section = rope_scaling["xdrope_section"]
if xdrope_section is not None and isinstance(xdrope_section, list):
return len(xdrope_section)
return 0
def is_encoder_decoder(config: PretrainedConfig) -> bool:
"""Detect if the model with this config is used as an encoder/decoder."""

View File

@ -23,6 +23,11 @@ from vllm.transformers_utils.configs.eagle import EAGLEConfig
# `FalconConfig` class from the official HuggingFace transformers library.
from vllm.transformers_utils.configs.falcon import RWConfig
from vllm.transformers_utils.configs.flex_olmo import FlexOlmoConfig
from vllm.transformers_utils.configs.hunyuan_vl import (
HunYuanVLConfig,
HunYuanVLTextConfig,
HunYuanVLVisionConfig,
)
from vllm.transformers_utils.configs.jais import JAISConfig
from vllm.transformers_utils.configs.kimi_linear import KimiLinearConfig
from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig
@ -53,6 +58,9 @@ __all__ = [
"DotsOCRConfig",
"EAGLEConfig",
"FlexOlmoConfig",
"HunYuanVLConfig",
"HunYuanVLTextConfig",
"HunYuanVLVisionConfig",
"RWConfig",
"JAISConfig",
"Lfm2MoeConfig",

View File

@ -0,0 +1,322 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://github.com/ManaEstras/transformers/blob/v4.57.1.hyvl/src/transformers/models/hunyuan_vl/configuration_hunyuan_vl.py
from transformers import PretrainedConfig
class HunYuanVLVisionConfig(PretrainedConfig):
model_type = "hunyuan_vl"
base_config_key = "vision_config"
def __init__(
self,
hidden_act="gelu",
hidden_size=1152,
intermediate_size=4304,
interpolate_mode="bilinear",
rms_norm_eps=1e-05,
learnable_mlp_pooling_size=0,
num_attention_heads=16,
num_key_value_heads=None,
num_channels=3,
num_hidden_layers=27,
out_hidden_size=4096,
patch_size=16,
remove_prenorm=True,
spatial_merge_size=2,
temporal_patch_size=1,
resize_resolution=2048,
img_max_token_num=4096,
max_image_size=2048,
video_max_image_size=768,
video_min_image_size=256,
min_image_size=512,
anyres_vit_max_image_size=2048,
max_vit_seq_len=16384,
text_hidden_size=3072,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_act = hidden_act
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.interpolate_mode = interpolate_mode
self.learnable_mlp_pooling_size = learnable_mlp_pooling_size
self.num_attention_heads = num_attention_heads
if not num_key_value_heads:
self.num_key_value_heads = num_attention_heads
else:
self.num_key_value_heads = num_key_value_heads
self.num_channels = num_channels
self.num_hidden_layers = num_hidden_layers
self.out_hidden_size = out_hidden_size
self.patch_size = patch_size
self.remove_prenorm = remove_prenorm
self.spatial_merge_size = spatial_merge_size
self.temporal_patch_size = temporal_patch_size
self.rms_norm_eps = rms_norm_eps
self.resize_resolution = resize_resolution
self.img_max_token_num = img_max_token_num
self.max_image_size = max_image_size
self.min_image_size = min_image_size
self.video_max_image_size = video_max_image_size
self.video_min_image_size = video_min_image_size
self.anyres_vit_max_image_size = anyres_vit_max_image_size
self.max_vit_seq_len = max_vit_seq_len
self.text_hidden_size = text_hidden_size
class HunYuanVLTextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`HunYuanVLTextConfig`]. It is used to instantiate an
HunYuan model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the HunYuan-7B.
Hunyuan-7B-Instruct [tencent/Hunyuan-7B-Instruct](https://huggingface.co/tencent/Hunyuan-7B-Instruct).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 290943):
Vocabulary size of the HunYuan model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`HunYuanVLTextConfig`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations or shared MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details checkout [this
paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*, defaults to 0):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
eod_token_id (int, *optional*, defaults to 3):
Token ID representing the end-of-document marker. Used to indicate the termination of a text sequence.
Example: In multi-document processing, this token helps the model distinguish between separate documents.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
`max_position_embeddings` to the expected new maximum. See the following thread for more information on how
these scaling strategies behave:
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
experimental feature, subject to breaking API changes in future versions.
attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
head_dim (`int`, *optional*, defaults to 128):
The attention head dimension.
""" # noqa: E501
model_type = "hunyuan_vl_text"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=290943,
hidden_size=4096,
intermediate_size: int = 11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-5,
use_cache=True,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
eod_token_id=3,
pretraining_tp=1,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
head_dim=None,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.head_dim = head_dim
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
# self._rope_scaling_validation() # TODO: Need validation?
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
def _rope_scaling_validation(self):
"""
Validate the `rope_scaling` configuration.
"""
if self.rope_scaling is None:
return
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with with two fields, `type` and "
f"`factor` or `type` and `alpha`, got {self.rope_scaling}"
)
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
rope_scaling_alpha = self.rope_scaling.get("alpha", None)
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
raise ValueError(
"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], "
f"got {rope_scaling_type}"
)
if rope_scaling_factor is None and rope_scaling_alpha is None:
raise ValueError(
"`rope_scaling`'s factor or alpha field must be have one, "
"got both of none"
)
if rope_scaling_factor is not None and (
not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0
):
raise ValueError(
"`rope_scaling`'s factor field must be a float > 1.0, "
f"got {rope_scaling_factor}"
)
if rope_scaling_alpha is not None and (
not isinstance(rope_scaling_alpha, float) or rope_scaling_alpha <= 1.0
):
raise ValueError(
"`rope_scaling`'s alpha field must be a float > 1.0, "
f"got {rope_scaling_alpha}"
)
class HunYuanVLConfig(PretrainedConfig):
model_type = "hunyuan_vl"
sub_configs = {
"vision_config": HunYuanVLVisionConfig,
"text_config": HunYuanVLTextConfig,
}
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
text_config=None,
vision_config=None,
im_start_id=120118,
im_end_id=120119,
image_token_id=120120,
im_newline_id=120121,
video_start_id=120122,
video_end_id=120123,
**kwargs,
):
# We need to init super() here so that it does not reset values
# that are in text config to the BaseClass defaults. The Base
# config has many text related defaults and not all defaults are
# same as for `HunYuanVLTextConfig`.
super().__init__(**kwargs)
if isinstance(vision_config, dict):
self.vision_config = self.sub_configs["vision_config"](**vision_config)
elif vision_config is None:
self.vision_config = self.sub_configs["vision_config"]()
if isinstance(text_config, dict):
self.text_config = self.sub_configs["text_config"](**text_config)
elif text_config is None:
# For BC use all kwargs to init `TextConfig`
self.text_config = self.sub_configs["text_config"](**kwargs)
self.image_token_id = image_token_id
self.im_start_id = im_start_id
self.im_end_id = im_end_id
self.im_newline_id = im_newline_id
self.video_start_id = video_start_id
self.video_end_id = video_end_id
self.vision_config.text_hidden_size = self.text_config.hidden_size
# Attention implementation to use. It sets it recursively on sub-configs
# so we call it again in the end.
self._attn_implementation = kwargs.pop("attn_implementation", None)
def __setattr__(self, key, value):
if (
(text_config := super().__getattribute__("__dict__").get("text_config"))
is not None
and key not in ["dtype", "_attn_implementation_internal"]
and key in text_config.__dict__
):
setattr(text_config, key, value)
else:
super().__setattr__(key, value)
def __getattribute__(self, key):
if "text_config" in super().__getattribute__("__dict__") and key not in [
"_name_or_path",
"model_type",
"dtype",
"_attn_implementation_internal",
]:
text_config = super().__getattribute__("text_config")
if key in text_config.__dict__:
return getattr(text_config, key)
return super().__getattribute__(key)

View File

@ -9,7 +9,15 @@ reasons:
"""
from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor
from vllm.transformers_utils.processors.hunyuan_vl_image import HunYuanVLImageProcessor
from vllm.transformers_utils.processors.ovis import OvisProcessor
from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor
__all__ = ["DeepseekVLV2Processor", "OvisProcessor", "Ovis2_5Processor"]
__all__ = [
"DeepseekVLV2Processor",
"HunYuanVLProcessor",
"HunYuanVLImageProcessor",
"OvisProcessor",
"Ovis2_5Processor",
]

View File

@ -0,0 +1,233 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://github.com/ManaEstras/transformers/blob/v4.57.1.hyvl/src/transformers/models/hunyuan_vl/processing_hunyuan_vl.py
import numpy as np
import torch
from transformers import AutoProcessor
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from transformers.video_utils import VideoInput
class HunYuanVLProcessor(ProcessorMixin):
attributes = ["image_processor", "tokenizer"]
valid_kwargs = ["chat_template"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer" # ("AutoTokenizer", None)
def __init__(
self,
image_processor=None,
tokenizer=None,
video_processor=None,
chat_template=None,
**kwargs,
):
# TODO Fix the init
self.tokenizer = tokenizer
self.image_token_id = 120120 # self.tokenizer.image_token_id
self.image_token = self.tokenizer.convert_ids_to_tokens(self.image_token_id)
self.im_start_token_id = 120118 # self.tokenizer.im_start_id
self.im_start_token = self.tokenizer.convert_ids_to_tokens(
self.im_start_token_id
)
self.im_end_token_id = 120119 # self.tokenizer.im_end_id
self.im_end_token = self.tokenizer.convert_ids_to_tokens(self.im_end_token_id)
self.placeholder_token = self.tokenizer.convert_ids_to_tokens(
self.tokenizer.vocab_size - 1
)
self.pad_id = 120002 # self.tokenizer.pad_token_id
super().__init__(
image_processor, tokenizer, video_processor, chat_template=chat_template
)
def __call__(
self,
images: ImageInput = None,
text: TextInput
| PreTokenizedInput
| list[TextInput]
| list[PreTokenizedInput] = None,
videos: VideoInput = None,
**kwargs,
) -> BatchFeature:
image_inputs = {}
if images is not None:
image_inputs = self.image_processor(images=images)
image_grid_thw = image_inputs["image_grid_thw"]
if not isinstance(text, list):
text = [text]
text = text.copy() # below lines change text in-place
image_tokens_cumsum = [0]
if images is not None:
index = 0
for i in range(len(text)):
while self.image_token in text[i]:
grid_h, grid_w = image_grid_thw[index][-2:]
patch_h = grid_h // self.image_processor.merge_size
patch_w = grid_w // self.image_processor.merge_size
num_image_tokens = patch_h * (patch_w + 1) + 2
image_tokens_cumsum.append(
image_tokens_cumsum[-1] + num_image_tokens
)
# text[i] = text[i].replace(self.image_token, self.im_start_token + self.placeholder_token * num_image_tokens + self.im_end_token, 1) # noqa: E501
text[i] = text[i].replace(
self.image_token, self.placeholder_token * num_image_tokens, 1
)
index += 1
text[i] = text[i].replace(self.placeholder_token, self.image_token)
# text[i] = self.tokenizer.bos_token + text[i]
text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)
self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
input_ids = text_inputs["input_ids"]
position_ids = torch.arange(len(input_ids[0]))
position_ids_w = torch.arange(len(input_ids[0]))
position_ids_h = torch.arange(len(input_ids[0]))
position_ids_t = torch.arange(len(input_ids[0]))
if images is not None:
image_token_pos_indices = torch.where(input_ids[0] == self.image_token_id)[
0
]
for i in range(len(image_grid_thw)):
grid_h, grid_w = image_grid_thw[i][-2:]
patch_h = grid_h // self.image_processor.merge_size
patch_w = grid_w // self.image_processor.merge_size
start_pos = image_token_pos_indices[image_tokens_cumsum[i]].item() + 1
replace_num = (patch_w + 1) * patch_h
position_ids_w[start_pos : start_pos + replace_num] = torch.tensor(
list(range(patch_w + 1)) * patch_h, dtype=torch.int64
)
patch_h_list = []
for h in range(patch_h):
patch_h_list += [h] * (patch_w + 1)
position_ids_h[start_pos : start_pos + replace_num] = torch.tensor(
patch_h_list, dtype=torch.int64
)
position_ids_t[start_pos : start_pos + replace_num] = 0
position_ids = torch.stack(
[position_ids, position_ids_w, position_ids_h, position_ids_t]
).unsqueeze(0)
text_inputs["position_ids"] = position_ids
attention_mask = input_ids.ne(self.pad_id)
text_inputs["attention_mask"] = attention_mask
text_inputs["imgs_pos"] = [self.get_imgs_pos(input_ids)]
# image_inputs["imgs"] = [[image_inputs["pixel_values"]]]
return_tensors = kwargs.pop("return_tensors", None)
return BatchFeature(
data={**text_inputs, **image_inputs},
tensor_type=return_tensors,
)
def batch_decode(self, *args, **kwargs):
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
return self.tokenizer.decode(*args, **kwargs)
def post_process_image_text_to_text(
self,
generated_outputs,
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
**kwargs,
):
assert 0
def apply_chat_template(self, *args, **kwargs):
token_ids = self.tokenizer.apply_chat_template(*args, **kwargs)
return token_ids
def get_imgs_pos(self, doc_ids):
doc_ids = np.array(doc_ids, dtype=np.int64)
img_begin_index = np.where(doc_ids == self.im_start_token_id)[0]
img_end_index = np.where(doc_ids == self.im_end_token_id)[0]
imgs_pos = np.concatenate(
(
np.reshape(img_begin_index + 1, (-1, 1)),
np.reshape(img_end_index, (-1, 1)),
),
axis=-1,
).tolist()
return imgs_pos
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
def split_image_into_patch_blocks(
pixel_values: torch.Tensor, # shape: [batch_size, 3, H, W]
patch_size: int = 16, # e.g. 16
adaptor_patch_div: int = 4, # e.g. 4 --> each patch_size is cut into 4x4 small regions, i.e. patch_size // 4 # noqa: E501
) -> torch.Tensor:
"""
Split the input image tensor (supporting batch) into large patches of size `patch_size`,
and then further divide each large patch into smaller regions of size
(patch_size // adaptor_patch_div) x (patch_size // adaptor_patch_div).
Each small region is extracted as a tensor of shape [3, patch_size, patch_size].
The final output contains all such small region tensors.
Args:
pixel_values: Input image tensor of shape [batch_size, 3, H, W].
patch_size: Size of the large patch, e.g., 16.
adaptor_patch_div: Each large patch is divided into
(patch_size // adaptor_patch_div) x (patch_size // adaptor_patch_div)
smaller regions.
Returns:
patches: A tensor of shape [N, 3, patch_size, patch_size],
where N = batch_size * (H // patch_size) * (W // patch_size) * (patch_size // adaptor_patch_div)^2.
Each element in the batch corresponds to one small image region.
""" # noqa: E501
batch_size, channels, height, width = pixel_values.shape
assert channels == 3, "Pixel values must have 3 channels in dim=1"
assert height % patch_size == 0 and width % patch_size == 0, (
"H and W must be divisible by patch_size"
)
patch_height_num = height // patch_size
patch_width_num = width // patch_size
# Reshape to [B, 3, ph, ps, pw, ps]
img = pixel_values.reshape(
batch_size, 3, patch_height_num, patch_size, patch_width_num, patch_size
)
# Further split each psxps patch into (ps//aps)x(ps//aps) small regions
img = img.reshape(
batch_size,
3,
patch_height_num,
patch_size // adaptor_patch_div, # ps // aps
adaptor_patch_div,
patch_width_num,
patch_size // adaptor_patch_div, # ps // aps
adaptor_patch_div,
)
# Permute to group the small regions: [B, ph, pw, ps//aps, ps//aps, 3, aps, aps]
img = img.permute(0, 2, 5, 3, 6, 1, 4, 7)
# Reshape into [B * ph * pw * (ps//aps)^2, 3, patch_size, patch_size]
patches = img.reshape(-1, 3, patch_size, patch_size)
return patches
AutoProcessor.register("HunYuanVLProcessor", HunYuanVLProcessor)

View File

@ -0,0 +1,477 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://github.com/ManaEstras/transformers/blob/v4.57.1.hyvl/src/transformers/models/hunyuan_vl/image_processing_hunyuan_vl.py
"""Image processor class for HunYuanVL."""
# isort conflicts with ruff for transformers imports
# isort: skip_file
import math
import numpy as np
import torchvision.transforms as transforms
from transformers import AutoImageProcessor
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.image_transforms import (
convert_to_rgb,
)
from transformers.image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
make_flat_list_of_images,
make_list_of_images,
valid_images,
validate_preprocess_arguments,
)
from transformers.utils import TensorType, logging
from transformers.video_utils import VideoInput, make_batched_videos
logger = logging.get_logger(__name__)
def smart_resize(
height: int,
width: int,
factor: int = 16,
min_pixels: int = 512 * 512,
max_pixels: int = 2048 * 2048,
):
"""Rescales the image so that the following conditions are met:
1. Both dimensions (height and width) are divisible by 'factor'.
2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
3. The aspect ratio of the image is maintained as closely as possible.
"""
if max(height, width) / min(height, width) > 200:
raise ValueError(
"absolute aspect ratio must be smaller than 200, got "
f"{max(height, width) / min(height, width)}"
)
h_bar = round(height / factor) * factor
w_bar = round(width / factor) * factor
if h_bar * w_bar > max_pixels:
beta = math.sqrt((height * width) / max_pixels)
h_bar = max(factor, math.floor(height / beta / factor) * factor)
w_bar = max(factor, math.floor(width / beta / factor) * factor)
elif h_bar * w_bar < min_pixels:
beta = math.sqrt(min_pixels / (height * width))
h_bar = math.ceil(height * beta / factor) * factor
w_bar = math.ceil(width * beta / factor) * factor
return h_bar, w_bar
class HunYuanVLImageProcessor(BaseImageProcessor):
model_input_names = [
"pixel_values",
"image_grid_thw",
"pixel_values_videos",
"video_grid_thw",
]
def __init__(
self,
do_resize: bool = True,
size: dict[str, int] | None = None,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_rescale: bool = True,
rescale_factor: int | float = 1 / 255,
do_normalize: bool = True,
image_mean: float | list[float] | None = None,
image_std: float | list[float] | None = None,
do_convert_rgb: bool = True,
min_pixels: int | None = None,
max_pixels: int | None = None,
patch_size: int = 16,
temporal_patch_size: int = 2,
merge_size: int = 2,
**kwargs,
) -> None:
super().__init__(**kwargs)
if size is not None and (
"shortest_edge" not in size or "longest_edge" not in size
):
raise ValueError(
"size must contain 'shortest_edge' and 'longest_edge' keys."
)
else:
size = {"shortest_edge": 512 * 512, "longest_edge": 2048 * 2048}
# backward compatibility: override size with min_pixels and max_pixels
# if they are provided.
if min_pixels is not None:
size["shortest_edge"] = min_pixels
if max_pixels is not None:
size["longest_edge"] = max_pixels
self.min_pixels = size["shortest_edge"]
self.max_pixels = size["longest_edge"]
self.size = size
self.do_resize = do_resize
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
self.patch_size = patch_size
self.temporal_patch_size = temporal_patch_size
self.merge_size = merge_size
self.do_convert_rgb = do_convert_rgb
# hard-code
def _preprocess(
self,
images: ImageInput | VideoInput,
do_resize: bool | None = None,
size: dict[str, int] | None = None,
resample: PILImageResampling = None,
do_rescale: bool | None = None,
rescale_factor: float | None = None,
do_normalize: bool | None = None,
image_mean: float | list[float] | None = None,
image_std: float | list[float] | None = None,
patch_size: int = 16,
temporal_patch_size: int = 2,
merge_size: int = 2,
do_convert_rgb: bool | None = None,
data_format: ChannelDimension | None = ChannelDimension.FIRST,
input_data_format: str | ChannelDimension | None = None,
):
"""
Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
Args:
images (`ImageInput`):
Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`dict[str, int]`, *optional*, defaults to `self.size`):
Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Scale factor to use if rescaling the image.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
patch_size (`int`, *optional*, defaults to `self.patch_size`):
The spatial patch size of the vision encoder.
temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
The temporal patch size of the vision encoder.
merge_size (`int`, *optional*, defaults to `self.merge_size`):
The merge size of the vision encoder to llm encoder.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
Whether to convert the image to RGB.
data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
""" # noqa: E501
images = make_list_of_images(images)
if do_convert_rgb:
images = [convert_to_rgb(image) for image in images]
width, height = images[0].width, images[0].height
resized_width, resized_height = width, height
processed_images = []
for image in images:
if do_resize:
resized_width, resized_height = smart_resize(
width,
height,
factor=patch_size * merge_size,
min_pixels=self.min_pixels,
max_pixels=self.max_pixels,
)
image = image.resize((resized_width, resized_height))
if do_normalize:
image = transforms.Compose(
[
transforms.ToTensor(),
transforms.Normalize(self.image_mean, self.image_std),
]
)(image)
processed_images.append(image)
patches = np.array(processed_images)
channel = patches.shape[1]
grid_t = patches.shape[0] // temporal_patch_size
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
patches = patches.reshape(
1,
channel,
grid_h // merge_size,
merge_size,
patch_size,
grid_w // merge_size,
merge_size,
patch_size,
)
patches = patches.transpose(0, 2, 3, 5, 6, 1, 4, 7)
flatten_patches = patches.reshape(
1 * grid_h * grid_w, channel * patch_size * patch_size
)
return flatten_patches, (grid_t, grid_h, grid_w)
def preprocess(
self,
images: ImageInput,
videos: VideoInput = None,
do_resize: bool | None = None,
size: dict[str, int] | None = None,
min_pixels: int | None = None,
max_pixels: int | None = None,
resample: PILImageResampling = None,
do_rescale: bool | None = None,
rescale_factor: float | None = None,
do_normalize: bool | None = None,
image_mean: float | list[float] | None = None,
image_std: float | list[float] | None = None,
patch_size: int | None = None,
temporal_patch_size: int | None = None,
merge_size: int | None = None,
do_convert_rgb: bool | None = None,
return_tensors: str | TensorType | None = None,
data_format: ChannelDimension | None = ChannelDimension.FIRST,
input_data_format: str | ChannelDimension | None = None,
):
"""
Args:
images (`ImageInput`):
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
videos (`VideoInput`):
Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`dict[str, int]`, *optional*, defaults to `self.size`):
Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
the longest edge resized to keep the input aspect ratio.
resample (`int`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
has an effect if `do_resize` is set to `True`.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
`True`.
min_pixels (`int`, *optional*, defaults to `self.min_pixels`):
The min pixels of the image to resize the image.
max_pixels (`int`, *optional*, defaults to `self.max_pixels`):
The max pixels of the image to resize the image.
patch_size (`int`, *optional*, defaults to `self.patch_size`):
The spatial patch size of the vision encoder.
temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
The temporal patch size of the vision encoder.
merge_size (`int`, *optional*, defaults to `self.merge_size`):
The merge size of the vision encoder to llm encoder.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
Whether to convert the image to RGB.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
""" # noqa: E501
min_pixels = min_pixels if min_pixels is not None else self.min_pixels
max_pixels = max_pixels if max_pixels is not None else self.max_pixels
if size is not None:
if "shortest_edge" not in size or "longest_edge" not in size:
raise ValueError(
"size must contain 'shortest_edge' and 'longest_edge' keys."
)
min_pixels = size["shortest_edge"]
elif min_pixels is not None and max_pixels is not None:
# backward compatibility: override size with min_pixels and max_pixels
# if they are provided.
size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
else:
size = {**self.size}
do_resize = do_resize if do_resize is not None else self.do_resize
resample = resample if resample is not None else self.resample
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = (
rescale_factor if rescale_factor is not None else self.rescale_factor
)
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
patch_size = patch_size if patch_size is not None else self.patch_size
temporal_patch_size = (
temporal_patch_size
if temporal_patch_size is not None
else self.temporal_patch_size
)
merge_size = merge_size if merge_size is not None else self.merge_size
do_convert_rgb = (
do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
)
if images is not None:
images = make_flat_list_of_images(images)
if images is not None and not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
validate_preprocess_arguments(
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
do_resize=do_resize,
size=size,
resample=resample,
)
data = {}
if images is not None:
pixel_values, vision_grid_thws = [], []
for image in images:
patches, image_grid_thw = self._preprocess(
image,
do_resize=do_resize,
size=size,
resample=resample,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
patch_size=patch_size,
temporal_patch_size=temporal_patch_size,
merge_size=merge_size,
data_format=data_format,
do_convert_rgb=do_convert_rgb,
input_data_format=input_data_format,
)
pixel_values.extend(patches)
vision_grid_thws.append(image_grid_thw)
pixel_values = np.array(pixel_values)
vision_grid_thws = np.array(vision_grid_thws)
data.update(
{"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
)
# kept for BC only and should be removed after v5.0
if videos is not None:
logger.warning(
"`HunYuanVLV1ImageProcessor` works only with image inputs "
"and doesn't process videos anymore. "
"This is a deprecated behavior and will be removed in v5.0. "
"Your videos should be forwarded to `HunYuanVLV1VideoProcessor`. "
)
videos = make_batched_videos(videos)
pixel_values_videos, vision_grid_thws_videos = [], []
for images in videos:
patches, video_grid_thw = self._preprocess(
images,
do_resize=do_resize,
size=size,
resample=resample,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
patch_size=patch_size,
temporal_patch_size=temporal_patch_size,
merge_size=merge_size,
data_format=data_format,
do_convert_rgb=do_convert_rgb,
input_data_format=input_data_format,
)
pixel_values_videos.extend(patches)
vision_grid_thws_videos.append(video_grid_thw)
data.update(
{
"pixel_values_videos": np.array(pixel_values_videos),
"video_grid_thw": np.array(vision_grid_thws_videos),
}
)
return BatchFeature(data=data, tensor_type=return_tensors)
def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
"""
A utility that returns number of image patches for a given image size.
Args:
height (`int`):
Height of the input image.
width (`int`):
Width of the input image.
images_kwargs (`dict`, *optional*):
Any kwargs to override defaults of the image processor.
Returns:
`int`: Number of image patches per image.
"""
min_pixels = (
images_kwargs["min_pixels"]
if "min_pixels" in images_kwargs
else self.size["shortest_edge"]
)
max_pixels = (
images_kwargs["max_pixels"]
if "max_pixels" in images_kwargs
else self.size["longest_edge"]
)
patch_size = images_kwargs.get("patch_size", self.patch_size)
merge_size = images_kwargs.get("merge_size", self.merge_size)
factor = patch_size * merge_size
resized_height, resized_width = smart_resize(
height, width, factor, min_pixels=min_pixels, max_pixels=max_pixels
)
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
return grid_h * (grid_w + 1) + 2
AutoImageProcessor.register("HunYuanVLImageProcessor", HunYuanVLImageProcessor)

View File

@ -52,9 +52,11 @@ STR_FLASHINFER_ATTN_VAL: str = "FLASHINFER"
STR_FLASH_ATTN_VAL: str = "FLASH_ATTN"
STR_INVALID_VAL: str = "INVALID"
MASK_64_BITS = (1 << 64) - 1
def random_uuid() -> str:
return str(uuid.uuid4().hex)
return f"{uuid.uuid4().int & MASK_64_BITS:016x}" # 16 hex chars
def length_from_prompt_token_ids_or_embeds(

View File

@ -7,6 +7,7 @@ import torch
from vllm.attention.backends.abstract import AttentionBackend
from vllm.config import VllmConfig
from vllm.v1.attention.backends.utils import (
AttentionCGSupport,
AttentionMetadataBuilder,
CommonAttentionMetadata,
split_decodes_and_prefills,
@ -35,6 +36,8 @@ class LinearAttentionMetadata:
class LinearAttentionMetadataBuilder(AttentionMetadataBuilder[LinearAttentionMetadata]):
reorder_batch_threshold: int = 1
_cudagraph_support = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
def __init__(
self,
kv_cache_spec: AttentionSpec,

View File

@ -340,6 +340,8 @@ class MLACommonPrefillMetadata:
max_seq_lens: list[int]
seq_lens: torch.Tensor
workspace: torch.Tensor
token_to_seq: torch.Tensor
chunk_total_token: list[int]
# for mla DCP
padded_local_chunk_seq_lens: list[list[int]] | None = None
@ -839,6 +841,19 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
torch.cumsum(
chunk_seq_lens, dim=1, out=cu_seq_lens_cpu[:, 1:], dtype=torch.int32
)
chunk_total_token = cu_seq_lens_cpu[:, -1]
max_token_num_over_chunk = chunk_total_token.max().item()
token_to_seq_tensor_cpu = torch.zeros(
[num_chunks, max_token_num_over_chunk], dtype=torch.int32
)
range_idx = torch.arange(num_prefills, dtype=torch.int32)
for i in range(num_chunks):
chunk_token_to_seq_tensor = torch.repeat_interleave(
range_idx, chunk_seq_lens[i]
)
chunk_len = chunk_token_to_seq_tensor.shape[0]
token_to_seq_tensor_cpu[i, :chunk_len] = chunk_token_to_seq_tensor
if self.dcp_world_size > 1:
local_context_lens_allranks = get_dcp_local_seq_lens(
@ -906,6 +921,10 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
seq_tot=padded_local_chunk_seq_lens.sum(dim=1).tolist(),
max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
seq_lens=chunk_seq_lens,
token_to_seq=token_to_seq_tensor_cpu.to(
device, non_blocking=True
),
chunk_total_token=chunk_total_token.tolist(),
workspace=self.chunked_prefill_workspace,
padded_local_chunk_seq_lens=padded_local_chunk_seq_lens.tolist(),
local_context_lens_allranks=local_context_lens_allranks.tolist(),
@ -922,6 +941,10 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
seq_lens=chunk_seq_lens,
token_to_seq=token_to_seq_tensor_cpu.to(
device, non_blocking=True
),
chunk_total_token=chunk_total_token,
workspace=self.chunked_prefill_workspace,
)
@ -1638,16 +1661,15 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
output = None
iters = len(prefill_metadata.chunked_context.seq_tot)
workspace = prefill_metadata.chunked_context.workspace
for i in range(iters):
toks = prefill_metadata.chunked_context.seq_tot[i]
ops.gather_and_maybe_dequant_cache(
src_cache=kv_c_and_k_pe_cache,
dst=workspace,
block_table=prefill_metadata.block_table,
cu_seq_lens=prefill_metadata.chunked_context.cu_seq_lens[i],
batch_size=attn_metadata.num_prefills,
token_to_seq=prefill_metadata.chunked_context.token_to_seq[i],
num_tokens=prefill_metadata.chunked_context.chunk_total_token[i],
kv_cache_dtype=self.kv_cache_dtype,
scale=k_scale,
seq_starts=prefill_metadata.chunked_context.starts[i],

View File

@ -49,6 +49,8 @@ class AiterMLADecodeMetadata(MLACommonDecodeMetadata):
paged_kv_last_page_len: torch.Tensor | None = None
# The query indptr, shape : [num_decode + 1]
qo_indptr: torch.Tensor | None = None
# The dtype of MLA out tensor
attn_out_dtype: torch.dtype = torch.bfloat16
class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]):
@ -74,6 +76,7 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
)
self.compilation_config = vllm_config.compilation_config
self.decode_attn_out_dtype = vllm_config.model_config.dtype
# kernel block size is always 1.
max_num_pages_per_req = vllm_config.model_config.max_model_len
max_num_reqs = vllm_config.scheduler_config.max_num_seqs
@ -162,6 +165,7 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
paged_kv_last_page_len=paged_kv_last_page_len,
qo_indptr=qo_indptr,
dcp_tot_seq_lens=dcp_tot_seq_lens_device,
attn_out_dtype=self.decode_attn_out_dtype,
)
return attn_metadata
@ -242,7 +246,11 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
assert isinstance(q, torch.Tensor)
B = q.shape[0]
o = torch.zeros(
B, self.num_heads, self.kv_lora_rank, dtype=q.dtype, device=q.device
B,
self.num_heads,
self.kv_lora_rank,
dtype=attn_metadata.decode.attn_out_dtype,
device=q.device,
)
kv_buffer = kv_c_and_k_pe_cache.unsqueeze(2)
@ -260,6 +268,8 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
attn_metadata.decode.paged_kv_indptr,
attn_metadata.decode.paged_kv_indices,
attn_metadata.decode.paged_kv_last_page_len,
q_scale=layer._q_scale,
kv_scale=layer._k_scale,
)
return o, None

View File

@ -971,7 +971,16 @@ def _get_kv_cache_groups_uniform_page_size(
# is the minimum number of layers among all attention types. Need a better
# strategy if we want to support more complex patterns (e.g., 20 full + 30
# sw, where the group size should be 10).
group_size = min([len(layers) for layers in same_type_layers.values()])
min_num_layers = min([len(layers) for layers in same_type_layers.values()])
group_size = min_num_layers
max_num_layers = max([len(layers) for layers in same_type_layers.values()])
if max_num_layers < min_num_layers * 1.25:
# If the number of layers is not much larger than the minimum number of layers,
# use the maximum number of layers as the group size to avoid too many padding
# layers. A typical example is gpt-oss-20b + eagle, with 12 sw + 13 full. We
# pad it to (13 sw, 13 full) instead of (12 sw, 24 full). 1.25 is just a
# magic number to avoid too many padding layers.
group_size = max_num_layers
grouped_layers = []
for layers in same_type_layers.values():
num_padding_layers = group_size - len(layers) % group_size

View File

@ -508,9 +508,9 @@ class Scheduler(SchedulerInterface):
not self.scheduler_config.enable_chunked_prefill
and num_new_tokens > token_budget
):
self.waiting.pop_request()
skipped_waiting_requests.prepend_request(request)
continue
# If chunked_prefill is disabled,
# we can stop the scheduling here.
break
num_new_tokens = min(num_new_tokens, token_budget)
assert num_new_tokens > 0

View File

@ -440,57 +440,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
# Setting default values
self.record_sleep_state()
# GPU cache
#
# Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
# TODO: remove in 0.12.0
if self.show_hidden_metrics:
gauge_gpu_cache_usage = self._gauge_cls(
name="vllm:gpu_cache_usage_perc",
documentation=(
"GPU KV-cache usage. 1 means 100 percent usage."
"DEPRECATED: Use vllm:kv_cache_usage_perc instead."
),
multiprocess_mode="mostrecent",
labelnames=labelnames,
)
self.gauge_gpu_cache_usage = make_per_engine(
gauge_gpu_cache_usage, engine_indexes, model_name
)
# Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
# TODO: remove in 0.12.0
if self.show_hidden_metrics:
counter_gpu_prefix_cache_queries = self._counter_cls(
name="vllm:gpu_prefix_cache_queries",
documentation=(
"GPU prefix cache queries, in terms of number of queried"
"tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
),
labelnames=labelnames,
)
self.counter_gpu_prefix_cache_queries = make_per_engine(
counter_gpu_prefix_cache_queries, engine_indexes, model_name
)
# Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
# TODO: remove in 0.12.0
if self.show_hidden_metrics:
counter_gpu_prefix_cache_hits = self._counter_cls(
name="vllm:gpu_prefix_cache_hits",
documentation=(
"GPU prefix cache hits, in terms of number of cached "
"tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."
),
labelnames=labelnames,
)
self.counter_gpu_prefix_cache_hits = make_per_engine(
counter_gpu_prefix_cache_hits, engine_indexes, model_name
)
gauge_kv_cache_usage = self._gauge_cls(
name="vllm:kv_cache_usage_perc",
documentation="KV-cache usage. 1 means 100 percent usage.",
@ -735,39 +684,41 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
)
# Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
# TODO: in 0.12, only enable if show_hidden_metrics=True
histogram_time_per_output_token = self._histogram_cls(
name="vllm:time_per_output_token_seconds",
documentation=(
"Histogram of time per output token in seconds."
"DEPRECATED: Use vllm:inter_token_latency_seconds instead."
),
buckets=[
0.01,
0.025,
0.05,
0.075,
0.1,
0.15,
0.2,
0.3,
0.4,
0.5,
0.75,
1.0,
2.5,
5.0,
7.5,
10.0,
20.0,
40.0,
80.0,
],
labelnames=labelnames,
)
self.histogram_time_per_output_token = make_per_engine(
histogram_time_per_output_token, engine_indexes, model_name
)
# With 0.12.x you can enable with --show-hidden-metrics-for-version=0.11
# TODO: remove in 0.13.0
if self.show_hidden_metrics:
histogram_time_per_output_token = self._histogram_cls(
name="vllm:time_per_output_token_seconds",
documentation=(
"Histogram of time per output token in seconds."
"DEPRECATED: Use vllm:inter_token_latency_seconds instead."
),
buckets=[
0.01,
0.025,
0.05,
0.075,
0.1,
0.15,
0.2,
0.3,
0.4,
0.5,
0.75,
1.0,
2.5,
5.0,
7.5,
10.0,
20.0,
40.0,
80.0,
],
labelnames=labelnames,
)
self.histogram_time_per_output_token = make_per_engine(
histogram_time_per_output_token, engine_indexes, model_name
)
histogram_inter_token_latency = self._histogram_cls(
name="vllm:inter_token_latency_seconds",
@ -966,20 +917,8 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
self.gauge_scheduler_waiting[engine_idx].set(
scheduler_stats.num_waiting_reqs
)
if self.show_hidden_metrics:
self.gauge_gpu_cache_usage[engine_idx].set(
scheduler_stats.kv_cache_usage
)
self.gauge_kv_cache_usage[engine_idx].set(scheduler_stats.kv_cache_usage)
if self.show_hidden_metrics:
self.counter_gpu_prefix_cache_queries[engine_idx].inc(
scheduler_stats.prefix_cache_stats.queries
)
self.counter_gpu_prefix_cache_hits[engine_idx].inc(
scheduler_stats.prefix_cache_stats.hits
)
self.counter_prefix_cache_queries[engine_idx].inc(
scheduler_stats.prefix_cache_stats.queries
)
@ -1050,7 +989,8 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
self.histogram_time_to_first_token[engine_idx].observe(ttft)
for itl in iteration_stats.inter_token_latencies_iter:
self.histogram_inter_token_latency[engine_idx].observe(itl)
self.histogram_time_per_output_token[engine_idx].observe(itl)
if self.show_hidden_metrics:
self.histogram_time_per_output_token[engine_idx].observe(itl)
for finished_request in iteration_stats.finished_requests:
self.counter_request_success[finished_request.finish_reason][

View File

@ -40,6 +40,7 @@ from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.sampler import _SAMPLING_EPS
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
from vllm.v1.utils import CpuGpuBuffer
from vllm.v1.worker.dp_utils import coordinate_batch_across_dp
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
logger = init_logger(__name__)
@ -65,6 +66,7 @@ class EagleProposer:
self.dtype = vllm_config.model_config.dtype
self.max_model_len = vllm_config.model_config.max_model_len
self.block_size = vllm_config.cache_config.block_size
self.dp_rank = vllm_config.parallel_config.data_parallel_rank
self.num_speculative_tokens = self.speculative_config.num_speculative_tokens
self.max_num_tokens = vllm_config.scheduler_config.max_num_batched_tokens
self.token_arange_np = np.arange(self.max_num_tokens)
@ -83,6 +85,9 @@ class EagleProposer:
self.draft_indexer_metadata_builder: AttentionMetadataBuilder | None = None
self.attn_layer_names: list[str] = []
self.indexer_layer_names: list[str] = []
self.eagle3_use_aux_hidden_state: bool = (
self._get_eagle3_use_aux_hidden_state_from_config()
)
self.use_cuda_graph = False
@ -268,15 +273,24 @@ class EagleProposer:
assert draft_indexer_metadata is not None
per_layer_attn_metadata[layer_name] = draft_indexer_metadata
num_tokens_dp_padded, num_tokens_across_dp = self._pad_batch_across_dp(
num_tokens_unpadded=num_tokens,
num_tokens_padded=num_tokens,
)
cudagraph_runtime_mode = CUDAGraphMode.NONE
if (
self.use_cuda_graph
and num_tokens <= self.compilation_config.max_cudagraph_capture_size
and num_tokens_dp_padded
<= self.compilation_config.max_cudagraph_capture_size
):
num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens_dp_padded)
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
else:
num_input_tokens = num_tokens
num_input_tokens = num_tokens_dp_padded
if num_tokens_across_dp is not None:
num_tokens_across_dp[self.dp_rank] = num_input_tokens
# copy inputs to buffer for cudagraph
self._set_positions(num_tokens, target_positions)
self.hidden_states[:num_tokens] = target_hidden_states
@ -300,6 +314,7 @@ class EagleProposer:
per_layer_attn_metadata,
self.vllm_config,
num_tokens=num_input_tokens,
num_tokens_across_dp=num_tokens_across_dp,
cudagraph_runtime_mode=cudagraph_runtime_mode,
):
ret_hidden_states = self.model(
@ -362,15 +377,23 @@ class EagleProposer:
# Generate the remaining draft tokens.
draft_token_ids_list = [draft_token_ids]
batch_size_dp_padded, batch_size_across_dp = self._pad_batch_across_dp(
num_tokens_unpadded=batch_size,
num_tokens_padded=batch_size,
)
if (
self.use_cuda_graph
and batch_size <= self.compilation_config.max_cudagraph_capture_size
and batch_size_dp_padded
<= self.compilation_config.max_cudagraph_capture_size
):
input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size)
input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size_dp_padded)
cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
else:
input_batch_size = batch_size
input_batch_size = batch_size_dp_padded
cudagraph_runtime_mode = CUDAGraphMode.NONE
if batch_size_across_dp is not None:
batch_size_across_dp[self.dp_rank] = input_batch_size
common_attn_metadata.num_actual_tokens = batch_size
common_attn_metadata.max_query_len = 1
@ -471,6 +494,7 @@ class EagleProposer:
per_layer_attn_metadata,
self.vllm_config,
num_tokens=input_batch_size,
num_tokens_across_dp=batch_size_across_dp,
cudagraph_runtime_mode=cudagraph_runtime_mode,
):
ret_hidden_states = self.model(
@ -1113,36 +1137,56 @@ class EagleProposer:
self,
num_tokens: int,
use_cudagraphs=True,
is_graph_capturing=False,
) -> None:
# Determine if CUDA graphs should be used for this run.
cudagraphs_enabled = use_cudagraphs and self.use_cuda_graph
if (
cudagraphs_enabled
and num_tokens <= self.compilation_config.max_cudagraph_capture_size
):
num_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
with set_forward_context(
None,
self.vllm_config,
num_tokens=num_tokens,
cudagraph_runtime_mode=(
CUDAGraphMode.PIECEWISE if cudagraphs_enabled else CUDAGraphMode.NONE
),
# FIXME: when using tree-based specdec, adjust number of forward-passes
# according to the depth of the tree.
for fwd_idx in range(
self.num_speculative_tokens if not is_graph_capturing else 1
):
if self.supports_mm_inputs:
input_ids = None
inputs_embeds = self.inputs_embeds[:num_tokens]
else:
input_ids = self.input_ids[:num_tokens]
inputs_embeds = None
if fwd_idx <= 1:
num_tokens_dp_padded, num_tokens_across_dp = self._pad_batch_across_dp(
num_tokens_unpadded=num_tokens,
num_tokens_padded=num_tokens,
)
if (
cudagraphs_enabled
and num_tokens_dp_padded
<= self.compilation_config.max_cudagraph_capture_size
):
num_input_tokens = self.vllm_config.pad_for_cudagraph(
num_tokens_dp_padded
)
else:
num_input_tokens = num_tokens_dp_padded
if num_tokens_across_dp is not None:
num_tokens_across_dp[self.dp_rank] = num_input_tokens
self.model(
input_ids=input_ids,
positions=self._get_positions(num_tokens),
hidden_states=self.hidden_states[:num_tokens],
inputs_embeds=inputs_embeds,
)
with set_forward_context(
None,
self.vllm_config,
num_tokens=num_input_tokens,
num_tokens_across_dp=num_tokens_across_dp,
cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE
if cudagraphs_enabled
else CUDAGraphMode.NONE,
):
if self.supports_mm_inputs:
input_ids = None
inputs_embeds = self.inputs_embeds[:num_input_tokens]
else:
input_ids = self.input_ids[:num_input_tokens]
inputs_embeds = None
self.model(
input_ids=input_ids,
positions=self._get_positions(num_input_tokens),
hidden_states=self.hidden_states[:num_input_tokens],
inputs_embeds=inputs_embeds,
)
def _get_attention_metadata_builder(self) -> AttentionMetadataBuilder:
"""Find and return the attention metadata builders for EAGLE layers.
@ -1169,6 +1213,22 @@ class EagleProposer:
)
return builder
def _get_eagle3_use_aux_hidden_state_from_config(self) -> bool:
"""
Some eagle3 heads (e.g., nvidia/gpt-oss-120b-Eagle3-v2) do not use auxiliary
hidden states and directly uses the last layer output just like eagle1.
They might indicate this by setting "use_aux_hidden_state" to False
inside the "eagle_config" dict of their hf_config.
"""
if self.method != "eagle3":
return False
# Assume that eagle3 heads use aux hidden states by default
use_aux_hidden_state = True
eagle_config = getattr(self.draft_model_config.hf_config, "eagle_config", None)
if eagle_config is not None:
use_aux_hidden_state = eagle_config.get("use_aux_hidden_state", True)
return use_aux_hidden_state
def validate_same_kv_cache_group(self, kv_cache_config: KVCacheConfig) -> None:
"""
Validate that all eagle layers belong to the same KVCacheGroup.
@ -1192,6 +1252,28 @@ class EagleProposer:
== 1
), "All eagle layers should belong to the same kv cache group"
def _pad_batch_across_dp(
self,
num_tokens_unpadded: int,
num_tokens_padded: int,
) -> tuple[int, torch.Tensor]:
# TODO(Flechman): support DBO ubatching
ubatch_slices, num_toks_across_dp = coordinate_batch_across_dp(
num_tokens_unpadded=num_tokens_unpadded,
parallel_config=self.vllm_config.parallel_config,
allow_microbatching=False,
allow_dp_padding=self.use_cuda_graph,
num_tokens_padded=num_tokens_padded,
uniform_decode=None,
num_scheduled_tokens_per_request=None,
)
assert ubatch_slices is None, "DBO ubatching not implemented for EAGLE"
num_tokens_dp_padded = num_tokens_padded
if num_toks_across_dp is not None:
num_tokens_dp_padded = int(num_toks_across_dp[self.dp_rank].item())
return num_tokens_dp_padded, num_toks_across_dp
# NOTE(woosuk): Currently, the below code is not used and we always use argmax
# to sample the draft tokens. We will use this after we find a way to manage

View File

@ -4,7 +4,6 @@ from dataclasses import dataclass
from typing import Any
import numba
import numba.types as types
import numpy as np
import torch
@ -37,6 +36,9 @@ class InputBuffers:
self.seq_lens = torch.zeros(max_num_reqs, dtype=torch.int32, device=device)
self.cu_num_logits = self._make_buffer(max_num_reqs + 1, dtype=torch.int32)
# Spec decoding.
self.next_prefill_tokens = self._make_buffer(max_num_reqs, dtype=torch.int32)
# Structured outputs.
self.bitmask_indices = self._make_buffer(max_num_reqs, dtype=torch.int32)
self.grammar_bitmask = self._make_buffer(
@ -144,80 +146,42 @@ class InputBatch:
)
# NOTE: With the type annotations, this function is pre-compiled
# before the first call.
@numba.jit(
[
types.none(
types.int32[:], # idx_mapping
types.int32[:], # num_scheduled_tokens
types.int32[:, :], # prefill_token_ids
types.int32[:], # num_computed_prefill_tokens
types.int32[:], # prefill_len
types.int32[:], # input_ids
types.int32[:], # query_start_loc
)
],
nopython=True,
cache=True,
)
@numba.njit(cache=True)
def _prepare_prefill_inputs(
idx_mapping: np.ndarray, # batch_idx -> req_idx
num_scheduled_tokens: np.ndarray, # [B]
idx_mapping: np.ndarray, # [B]
query_lens: np.ndarray, # [B]
query_start_loc: np.ndarray, # [B + 1]
prefill_token_ids: np.ndarray, # [N, max_model_len]
num_computed_prefill_tokens: np.ndarray, # [N]
prefill_len: np.ndarray, # [N]
input_ids: np.ndarray, # [num_input_tokens]
query_start_loc: np.ndarray, # [B + 1]
) -> None:
num_reqs = num_scheduled_tokens.shape[0]
query_start_loc[0] = 0
cu_num_tokens = 0
num_reqs = idx_mapping.shape[0]
query_starts = query_start_loc[:num_reqs]
query_ends = query_start_loc[1 : num_reqs + 1]
starts = num_computed_prefill_tokens[idx_mapping]
ends = starts + query_lens
for i in range(num_reqs):
req_idx = idx_mapping[i]
query_len = num_scheduled_tokens[i]
start = num_computed_prefill_tokens[req_idx]
end = min(start + query_len, prefill_len[req_idx])
n = end - start
start_idx = cu_num_tokens
input_ids[start_idx : start_idx + n] = prefill_token_ids[req_idx, start:end]
cu_num_tokens = start_idx + query_len
query_start_loc[i + 1] = cu_num_tokens
# Pad the inputs for CUDA graphs.
# Note: pad query_start_loc to be non-decreasing, as kernels
# like FlashAttention requires that
query_start_loc[num_reqs + 1 :].fill(cu_num_tokens)
input_ids[query_starts[i] : query_ends[i]] = prefill_token_ids[
idx_mapping[i], starts[i] : ends[i]
]
def prepare_prefill_inputs(
idx_mapping: np.ndarray,
num_scheduled_tokens: np.ndarray,
total_num_tokens: int,
query_start_loc: np.ndarray,
prefill_token_ids: np.ndarray,
num_computed_prefill_tokens: np.ndarray,
prefill_len: np.ndarray,
input_ids: CpuGpuBuffer,
query_start_loc: CpuGpuBuffer,
input_ids: np.ndarray,
) -> None:
_prepare_prefill_inputs(
idx_mapping,
num_scheduled_tokens,
query_start_loc,
prefill_token_ids,
num_computed_prefill_tokens,
prefill_len,
input_ids.np,
query_start_loc.np,
input_ids,
)
input_ids.copy_to_gpu(total_num_tokens)
# NOTE(woosuk): We should copy the whole query_start_loc and seq_lens
# tensors from CPU to GPU, because they may include paddings needed
# for full CUDA graph mode.
query_start_loc.copy_to_gpu()
@triton.jit
@ -380,8 +344,8 @@ def _post_update_kernel(
sampled_tokens_ptr,
sampled_tokens_stride,
num_sampled_ptr,
num_rejected_ptr,
query_start_loc_ptr,
cu_num_logits_ptr,
):
req_id = tl.program_id(0)
req_state_idx = tl.load(idx_mapping_ptr + req_id)
@ -396,17 +360,10 @@ def _post_update_kernel(
query_start = tl.load(query_start_loc_ptr + req_id)
query_end = tl.load(query_start_loc_ptr + req_id + 1)
query_len = query_end - query_start
num_rejected = tl.load(num_rejected_ptr + req_id)
num_computed = tl.load(num_computed_tokens_ptr + req_state_idx)
num_computed += query_len
# Consider the rejected tokens in spec decoding.
if num_sampled > 0:
# NOTE(woosuk): We must skip num_sampled == 0 to account for chunked prefills.
logits_start = tl.load(cu_num_logits_ptr + req_id)
logits_end = tl.load(cu_num_logits_ptr + req_id + 1)
num_logits = logits_end - logits_start
num_rejected = num_logits - num_sampled
num_computed -= num_rejected
num_computed += query_len - num_rejected
tl.store(num_computed_tokens_ptr + req_state_idx, num_computed)
@ -421,10 +378,10 @@ def post_update(
sampled_tokens: torch.Tensor,
# [num_reqs]
num_sampled: torch.Tensor,
# [num_reqs]
num_rejected: torch.Tensor,
# [num_reqs + 1]
query_start_loc: torch.Tensor,
# [num_reqs + 1]
cu_num_logits: torch.Tensor,
) -> None:
num_reqs = idx_mapping.shape[0]
_post_update_kernel[(num_reqs,)](
@ -434,7 +391,7 @@ def post_update(
sampled_tokens,
sampled_tokens.stride(0),
num_sampled,
num_rejected,
query_start_loc,
cu_num_logits,
num_warps=1,
)

View File

@ -45,7 +45,11 @@ from vllm.v1.worker.gpu.input_batch import (
prepare_prefill_inputs,
)
from vllm.v1.worker.gpu.sampler import Sampler, compute_prompt_logprobs
from vllm.v1.worker.gpu.spec_decode.rejection_sample import rejection_sample
from vllm.v1.worker.gpu.spec_decode import init_speculator
from vllm.v1.worker.gpu.spec_decode.rejection_sample import (
get_num_rejected,
rejection_sample,
)
from vllm.v1.worker.gpu.states import RequestState, SamplingMetadata
from vllm.v1.worker.gpu.structured_outputs import apply_grammar_bitmask
from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin
@ -97,16 +101,20 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
if self.use_async_scheduling:
self.input_prep_event = torch.cuda.Event()
self.structured_outputs_event = torch.cuda.Event()
self.spec_decode_event = torch.cuda.Event()
else:
self.input_prep_event = None
self.structured_outputs_event = None
self.spec_decode_event = None
if self.speculative_config is not None:
self.do_spec_decode = True
self.num_speculative_steps = self.speculative_config.num_speculative_tokens
self.speculator = init_speculator(self.vllm_config, self.device)
else:
self.do_spec_decode = False
self.num_speculative_steps = 0
self.speculator = None
self.req_states = RequestState(
max_num_reqs=self.max_num_reqs,
@ -153,6 +161,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.vllm_config,
self.device,
)
if self.do_spec_decode:
self.speculator.load_model(self.model)
time_after_load = time.perf_counter()
self.model_memory_usage = m.consumed_memory
@ -285,6 +295,35 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
logits = self.model.compute_logits(hidden_states)
self.sampler(logits, sampling_metadata)
@torch.inference_mode()
def _dummy_speculator_run(
self,
hidden_states: torch.Tensor,
aux_hidden_states: list[torch.Tensor] | None,
) -> None:
num_tokens = hidden_states.shape[0]
num_reqs = min(num_tokens, self.max_num_reqs)
input_batch = InputBatch.make_dummy(
num_reqs=num_reqs,
num_tokens=num_tokens,
input_buffers=self.input_buffers,
device=self.device,
)
sampling_metadata = SamplingMetadata.make_dummy(
num_reqs=num_reqs,
device=self.device,
)
num_sampled = torch.ones(num_reqs, dtype=torch.int32, device=self.device)
num_rejected = torch.zeros(num_reqs, dtype=torch.int32, device=self.device)
self.propose_draft(
input_batch=input_batch,
sampling_metadata=sampling_metadata,
last_hidden_states=hidden_states,
aux_hidden_states=aux_hidden_states,
num_sampled=num_sampled,
num_rejected=num_rejected,
)
@torch.inference_mode()
def profile_run(self) -> None:
hidden_states, sample_hidden_states = self._dummy_run(
@ -292,6 +331,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
skip_attn=True,
)
self._dummy_sampler_run(sample_hidden_states)
if self.do_spec_decode:
self._dummy_speculator_run(hidden_states, None)
torch.cuda.synchronize()
del hidden_states, sample_hidden_states
gc.collect()
@ -466,20 +507,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# Block tables: num_kv_cache_groups x [num_reqs, max_num_blocks]
block_tables = self.block_tables.gather_block_tables(idx_mapping)
# Copy prefill tokens from CPU to GPU and get query_start_loc.
# Get query_start_loc.
np.cumsum(
num_scheduled_tokens,
out=self.input_buffers.query_start_loc.np[1 : num_reqs + 1],
)
# Pad for full CUDA graph mode.
# Some attention backends like FA3 require query_start_loc to be non-decreasing.
self.input_buffers.query_start_loc.np[num_reqs + 1 :] = num_tokens
self.input_buffers.query_start_loc.copy_to_gpu()
query_start_loc_gpu = self.input_buffers.query_start_loc.gpu[: num_reqs + 1]
query_start_loc_np = self.input_buffers.query_start_loc.np[: num_reqs + 1]
# Copy prefill tokens from CPU to GPU.
prepare_prefill_inputs(
idx_mapping_np,
num_scheduled_tokens,
num_tokens,
query_start_loc_np,
self.req_states.prefill_token_ids,
self.req_states.num_computed_prefill_tokens,
self.req_states.prefill_len.np,
self.input_buffers.input_ids,
self.input_buffers.query_start_loc,
self.input_buffers.input_ids.np,
)
query_start_loc = self.input_buffers.query_start_loc
query_start_loc_gpu = query_start_loc.gpu[: num_reqs + 1]
query_start_loc_np = query_start_loc.np[: num_reqs + 1]
self.input_buffers.input_ids.copy_to_gpu(num_tokens)
# Prepare positions and seq_lens.
prepare_pos_seq_lens(
@ -562,7 +611,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
input_batch: InputBatch,
sampling_metadata: SamplingMetadata,
grammar_output: GrammarOutput | None,
) -> tuple[SamplerOutput, torch.Tensor]:
) -> tuple[SamplerOutput, torch.Tensor, torch.Tensor]:
sample_hidden_states = hidden_states[input_batch.logits_indices]
logits = self.model.compute_logits(sample_hidden_states)
if grammar_output is not None:
@ -588,6 +637,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# No draft tokens (common case).
# 0 if chunked-prefilling, 1 if not.
num_sampled = (~is_chunked_prefilling).int()
num_rejected = torch.zeros_like(num_sampled)
else:
# Draft tokens for spec decoding.
input_ids = input_batch.input_ids[input_batch.logits_indices]
@ -598,9 +648,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.num_speculative_steps,
)
num_sampled *= ~is_chunked_prefilling
num_rejected = get_num_rejected(
input_batch.cu_num_logits,
num_sampled,
)
sampler_output.sampled_token_ids = sampled_tokens
# TODO(woosuk): Support logprobs with spec decoding.
return sampler_output, num_sampled
return sampler_output, num_sampled, num_rejected
def compute_prompt_logprobs(
self,
@ -706,6 +760,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
input_batch: InputBatch,
sampled_tokens: torch.Tensor,
num_sampled: torch.Tensor,
num_rejected: torch.Tensor,
) -> None:
# Update the number of computed tokens.
post_update(
@ -714,8 +769,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.req_states.last_sampled_tokens,
sampled_tokens,
num_sampled,
num_rejected,
input_batch.query_start_loc,
input_batch.cu_num_logits,
)
# Update the number of computed prefill tokens.
@ -727,6 +782,43 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.req_states.prefill_len.np[idx_mapping_np],
)
@torch.inference_mode()
def propose_draft(
self,
input_batch: InputBatch,
sampling_metadata: SamplingMetadata,
last_hidden_states: torch.Tensor,
aux_hidden_states: list[torch.Tensor] | None,
num_sampled: torch.Tensor,
num_rejected: torch.Tensor,
) -> torch.Tensor:
num_reqs = input_batch.num_reqs
idx_mapping_np = input_batch.idx_mapping_np
with async_barrier(self.spec_decode_event):
self.input_buffers.next_prefill_tokens.np[:num_reqs] = (
self.req_states.prefill_token_ids[
idx_mapping_np,
self.req_states.num_computed_prefill_tokens[idx_mapping_np],
]
)
next_prefill_tokens = self.input_buffers.next_prefill_tokens.copy_to_gpu(
num_reqs
)
assert self.speculator is not None
draft_tokens = self.speculator.propose(
input_batch,
sampling_metadata,
last_hidden_states,
aux_hidden_states,
num_sampled,
num_rejected,
self.req_states.last_sampled_tokens,
next_prefill_tokens,
)
self.req_states.draft_tokens[input_batch.idx_mapping] = draft_tokens
return draft_tokens
def get_cudagraph_and_dp_padding(
self,
scheduler_output: SchedulerOutput,
@ -879,7 +971,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.execute_model_state = None # type: ignore
assert sampling_metadata is not None
sampler_output, num_sampled_tokens = self.sample(
sampler_output, num_sampled, num_rejected = self.sample(
hidden_states, input_batch, sampling_metadata, grammar_output
)
prompt_logprobs_dict = self.compute_prompt_logprobs(hidden_states, input_batch)
@ -900,7 +992,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
async_output = AsyncOutput(
model_runner_output=model_runner_output,
sampler_output=sampler_output,
num_sampled_tokens=num_sampled_tokens,
num_sampled_tokens=num_sampled,
copy_stream=self.output_copy_stream,
copy_event=self.output_copy_event,
)
@ -911,8 +1003,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# This sequencing may slightly reduce latency as async D2H copy does not
# need to wait for the postprocess to finish.
self.postprocess(
input_batch, sampler_output.sampled_token_ids, num_sampled_tokens
input_batch, sampler_output.sampled_token_ids, num_sampled, num_rejected
)
if self.do_spec_decode:
_ = self.propose_draft(
input_batch,
sampling_metadata,
hidden_states,
None, # aux_hidden_states
num_sampled,
num_rejected,
)
if self.use_async_scheduling:
return async_output

View File

@ -100,8 +100,9 @@ def _gumbel_sample_kernel(
mask=mask,
other=float("-inf"),
)
logits = logits.to(tl.float32)
temp = tl.load(temp_ptr + req_idx)
temp = tl.load(temp_ptr + req_idx).to(tl.float32)
if temp != 0.0:
# Calculate the seed for gumbel noise.
seed = tl.load(seeds_ptr + req_idx)
@ -116,7 +117,7 @@ def _gumbel_sample_kernel(
# Apply temperature.
if APPLY_TEMPERATURE:
# NOTE(woosuk): Use div_rn to match the behavior of torch.
logits = tl.div_rn(logits, temp.to(tl.float32))
logits = tl.div_rn(logits, temp)
# Apply gumbel noise.
logits = tl.where(mask, logits + gumbel_noise, float("-inf"))

View File

@ -0,0 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm.config import VllmConfig
def init_speculator(
vllm_config: VllmConfig,
device: torch.device,
):
speculative_config = vllm_config.speculative_config
assert speculative_config is not None
if speculative_config.use_eagle():
from vllm.v1.worker.gpu.spec_decode.eagle import EagleSpeculator
return EagleSpeculator(vllm_config, device)
raise NotImplementedError(f"{speculative_config.method} is not supported yet.")

View File

@ -0,0 +1,209 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import torch.nn as nn
from vllm.config import VllmConfig
from vllm.config.compilation import CUDAGraphMode
from vllm.forward_context import set_forward_context
from vllm.model_executor.model_loader import get_model
from vllm.triton_utils import tl, triton
from vllm.v1.worker.gpu.input_batch import InputBatch
from vllm.v1.worker.gpu.sampler import gumbel_sample
from vllm.v1.worker.gpu.states import SamplingMetadata
class EagleSpeculator:
def __init__(self, vllm_config: VllmConfig, device: torch.device):
self.vllm_config = vllm_config
self.device = device
self.speculative_config = vllm_config.speculative_config
assert self.speculative_config is not None
self.method = self.speculative_config.method
self.num_speculative_steps = self.speculative_config.num_speculative_tokens
self.draft_model_config = self.speculative_config.draft_model_config
self.scheduler_config = vllm_config.scheduler_config
self.max_num_reqs = self.scheduler_config.max_num_seqs
self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
self.input_ids = torch.zeros(
self.max_num_tokens, dtype=torch.int32, device=device
)
self.positions = torch.zeros(
self.max_num_tokens, dtype=torch.int64, device=device
)
def load_model(self, target_model: nn.Module) -> None:
from vllm.compilation.backends import set_model_tag
with set_model_tag("eagle_head"):
self.model = get_model(
vllm_config=self.vllm_config, model_config=self.draft_model_config
)
share_lm_head = True
if share_lm_head and hasattr(target_model, "lm_head"):
if hasattr(self.model, "lm_head"):
del self.model.lm_head
self.model.lm_head = target_model.lm_head
@torch.inference_mode()
def propose(
self,
input_batch: InputBatch,
sampling_metadata: SamplingMetadata,
# [num_tokens, hidden_size]
last_hidden_states: torch.Tensor,
# num_layers x [num_tokens, hidden_size]
aux_hidden_states: list[torch.Tensor] | None,
# [num_reqs]
num_sampled: torch.Tensor,
# [num_reqs]
num_rejected: torch.Tensor,
# [max_num_reqs, 1]
last_sampled: torch.Tensor,
# [num_reqs]
next_prefill_tokens: torch.Tensor,
) -> torch.Tensor:
# NOTE(woosuk): To avoid CPU-GPU synchronization without CPU knowing the
# number of rejected tokens, we maintain the size of eagle's input_ids and
# hidden_states the same as the target model's. This means, we pad each
# request's query length to include any rejected positions. By doing so,
# we can also reuse the attention metadata (e.g., query_start_loc,
# seq_lens) of the target model.
if aux_hidden_states:
assert self.method == "eagle3"
hidden_states = self.model.combine_hidden_states(
torch.cat(aux_hidden_states, dim=-1)
)
else:
hidden_states = last_hidden_states
# Get the input ids and last token indices for the speculator.
last_token_indices = prepare_eagle_inputs(
self.input_ids,
input_batch,
num_sampled,
num_rejected,
last_sampled,
next_prefill_tokens,
)
input_ids = self.input_ids[: input_batch.num_tokens_after_padding]
# Prefill: Run the eagle speculator with eager mode.
with set_forward_context(
input_batch.attn_metadata,
self.vllm_config,
num_tokens=input_batch.num_tokens_after_padding,
cudagraph_runtime_mode=CUDAGraphMode.NONE,
):
ret_hidden_states = self.model(
input_ids=input_ids,
positions=input_batch.positions,
hidden_states=hidden_states,
)
if self.method == "mtp":
last_hidden_states = ret_hidden_states
hidden_states = ret_hidden_states
else:
last_hidden_states, hidden_states = ret_hidden_states
sample_hidden_states = last_hidden_states[last_token_indices]
logits = self.model.compute_logits(sample_hidden_states)
num_reqs = input_batch.num_reqs
cu_num_logits = input_batch.cu_num_logits[:num_reqs]
temperature = sampling_metadata.temperature[cu_num_logits]
seed = sampling_metadata.seeds[cu_num_logits]
# NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise
# used for draft and target sampling.
pos = input_batch.positions[last_token_indices] + 1
# NOTE(woosuk): For draft sampling, we only consider the temperature
# and ignore the other sampling parameters such as top_k and top_p,
# for simplicity and performance.
# While this may slightly degrade the acceptance rate, it does not
# affect the output distribution after rejection sampling.
draft_tokens = gumbel_sample(
logits, temperature, seed, pos, apply_temperature=True
)
if self.num_speculative_steps == 1:
# Early exit.
return draft_tokens.view(-1, 1)
raise NotImplementedError("num_speculative_steps > 1 is not supported yet.")
@triton.jit
def _prepare_eagle_inputs_kernel(
last_token_indices_ptr,
eagle_input_ids_ptr,
target_input_ids_ptr,
idx_mapping_ptr,
last_sampled_ptr,
next_prefill_tokens_ptr,
num_sampled_ptr,
num_rejected_ptr,
query_start_loc_ptr,
BLOCK_SIZE: tl.constexpr,
):
batch_idx = tl.program_id(0)
query_start = tl.load(query_start_loc_ptr + batch_idx)
query_end = tl.load(query_start_loc_ptr + batch_idx + 1)
query_len = query_end - query_start
# Get the true query length and next token after accounting for rejected tokens.
num_rejected = tl.load(num_rejected_ptr + batch_idx)
query_len -= num_rejected
num_sampled = tl.load(num_sampled_ptr + batch_idx)
if num_sampled > 0:
req_state_idx = tl.load(idx_mapping_ptr + batch_idx)
next_token = tl.load(last_sampled_ptr + req_state_idx).to(tl.int32)
else:
# Chunked prefilling.
# Get the next prefill token.
next_token = tl.load(next_prefill_tokens_ptr + batch_idx)
# Shift target_input_ids by one.
for i in range(1, query_len, BLOCK_SIZE):
block = i + tl.arange(0, BLOCK_SIZE)
mask = block < query_len
input_ids = tl.load(target_input_ids_ptr + query_start + block, mask=mask)
tl.store(eagle_input_ids_ptr + query_start + block - 1, input_ids, mask=mask)
last_token_index = query_start + query_len - 1
tl.store(last_token_indices_ptr + batch_idx, last_token_index)
tl.store(eagle_input_ids_ptr + last_token_index, next_token)
def prepare_eagle_inputs(
eagle_input_ids: torch.Tensor,
input_batch: InputBatch,
# [num_reqs]
num_sampled: torch.Tensor,
# [num_reqs]
num_rejected: torch.Tensor,
# [max_num_reqs, 1]
last_sampled: torch.Tensor,
# [max_num_reqs]
next_prefill_tokens: torch.Tensor,
) -> torch.Tensor:
num_reqs = input_batch.num_reqs
last_token_indices = torch.empty(
num_reqs,
dtype=torch.int64,
device=eagle_input_ids.device,
)
_prepare_eagle_inputs_kernel[(num_reqs,)](
last_token_indices,
eagle_input_ids,
input_batch.input_ids,
input_batch.idx_mapping,
last_sampled,
next_prefill_tokens,
num_sampled,
num_rejected,
input_batch.query_start_loc,
BLOCK_SIZE=1024,
)
return last_token_indices

View File

@ -69,3 +69,15 @@ def rejection_sample(
num_warps=1,
)
return sampled, num_sampled
@torch.compile(dynamic=True)
def get_num_rejected(
cu_num_logits: torch.Tensor,
num_sampled: torch.Tensor,
) -> torch.Tensor:
num_logits = cu_num_logits[1:] - cu_num_logits[:-1]
num_rejected = num_logits - num_sampled
# No token is rejected for chunked prefills.
num_rejected *= num_sampled > 0
return num_rejected

Some files were not shown because too many files have changed in this diff Show More